## Import Library

In [80]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
from datetime import datetime
from scipy.stats import uniform, randint

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,classification_report,ConfusionMatrixDisplay,confusion_matrix,precision_score,recall_score,make_scorer


## Model Inference
> Model yang sudah dilatih akan dicoba pada data yang bukan termasuk ke dalam train-set ataupun test-set. Data ini harus dalam format yang asli, bukan data yang sudah di-scaled.

In [81]:
with open("model.pkl", "rb") as f: # load the model
    model = pickle.load(f)

In [82]:
with open("encoder.pkl", "rb") as f: # load the scaler
    encoder = pickle.load(f)

In [83]:
with open('column_names.pkl', 'rb') as f:
    column_names = pickle.load(f)

In [84]:
ohe = OneHotEncoder(sparse=True, handle_unknown='ignore')


def encode_and_create_dataframe_train(df, column):
    # Fit a separate OneHotEncoder for the column
    transformed_data = ohe.fit_transform(df[[column]])

    # Get feature names for the column
    feature_names = ohe.get_feature_names_out(input_features=[column])

    # Create a DataFrame for the column
    transformed_df = pd.DataFrame(transformed_data.toarray(), 
                                  index=df.index, 
                                  columns=feature_names)
    
    return transformed_df, ohe

from sklearn.preprocessing import OrdinalEncoder


oe = OrdinalEncoder()

def encode_and_create_dataframe_train_ordinal(df, column):
    # Fit and transform the data
    transformed_data = oe.fit_transform(df[[column]])

    # Create a DataFrame for the column
    transformed_df = pd.DataFrame(transformed_data, 
                                  index=df.index, 
                                  columns=[column])
    
    return transformed_df, oe

from random import randrange
from datetime import timedelta
from datetime import datetime

def random_date(start, end):
    """
    This function will return a random datetime between two datetime 
    objects.
    """
    delta = end - start
    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
    random_second = randrange(int_delta)
    return start + timedelta(seconds=random_second)

In [85]:
# Generate random stuff

#INT BLOCK
URL_LENGTH = np.random.randint(16.000000, 159.000000, size=10)
NUMBER_SPECIAL_CHARACTERS = np.random.randint(5.000000, 28.000000, size=10)
CONTENT_LENGTH = np.random.randint(0, 9806.000000, size=10)

d1 = datetime.strptime('1990-07-26', '%Y-%m-%d')
d2 = datetime.strptime('2017-04-14', '%Y-%m-%d')
WHOIS_UPDATED_DATE = random_date(d1, d2)
WHOIS_REGDATE = random_date(d1, d2)

TCP_CONVERSATION_EXCHANGE = np.random.randint(0, 84.000000, size=10)
DIST_REMOTE_TCP_PORT = np.random.randint(0, 20.000000, size=10)
REMOTE_IPS = np.random.randint(0, 16, size=10)
APP_BYTES = np.random.randint(0, 9302, size=10)

SOURCE_APP_PACKETS = np.random.randint(0, 103, size=10)
REMOTE_APP_PACKETS = np.random.randint(0, 99, size=10)
SOURCE_APP_BYTES = np.random.randint(0, 38681, size=10)
REMOTE_APP_BYTES = np.random.randint(0, 10693, size=10)
APP_PACKETS = np.random.randint(0, 103, size=10)
DNS_QUERY_TIMES = np.random.randint(0, 14, size=10)

# Create a DataFrame
data_int = pd.DataFrame({
    'URL_LENGTH': URL_LENGTH,
    'NUMBER_SPECIAL_CHARACTERS': NUMBER_SPECIAL_CHARACTERS,
    'CONTENT_LENGTH': CONTENT_LENGTH,
    'WHOIS_REGDATE': WHOIS_REGDATE,
    'WHOIS_UPDATED_DATE': WHOIS_UPDATED_DATE,
    'TCP_CONVERSATION_EXCHANGE': TCP_CONVERSATION_EXCHANGE,
    'DIST_REMOTE_TCP_PORT': DIST_REMOTE_TCP_PORT,
    'REMOTE_IPS': REMOTE_IPS,
    'APP_BYTES': APP_BYTES,
    'SOURCE_APP_PACKETS': SOURCE_APP_PACKETS,
    'REMOTE_APP_PACKETS': REMOTE_APP_PACKETS,
    'SOURCE_APP_BYTES': SOURCE_APP_BYTES,
    'REMOTE_APP_BYTES': REMOTE_APP_BYTES,
    'APP_PACKETS': APP_PACKETS,
    'DNS_QUERY_TIMES': DNS_QUERY_TIMES
})

# convert ['WHOIS_REGDATE', 'WHOIS_UPDATED_DATE'] to dtype('<M8[ns]')
data_int.WHOIS_REGDATE = data_int.WHOIS_REGDATE.view("float64")
data_int.WHOIS_UPDATED_DATE = data_int.WHOIS_UPDATED_DATE.view("float64")

In [86]:
# new DataFrame
test = pd.DataFrame()

# Unique values
unique_CHARSET = ['ISO-8859-1', 'UTF-8', 'utf-8', 'us-ascii', 'iso-8859-1', 'unknown', 'windows-1252', 'windows-1251']
unique_SERVER = ['Apache', 'cloudflare-nginx', 'other', 'Server', 'GSE', 'nginx', 'unknown', 'Microsoft-HTTPAPI/2.0', 'nginx/1.8.0', 'nginx/1.10.1', 'Microsoft-IIS/7.5', 'YouTubeFrontEnd', 'Apache/2.2.22 (Debian)', 'nginx/1.12.0', 'Microsoft-IIS/6.0', 'Apache/2.4.23 (Unix) OpenSSL/1.0.1e-fips mod_bwlimited/1.4', 'Apache/2.2.14 (FreeBSD) mod_ssl/2.2.14 OpenSSL/0.9.8y DAV/2 PHP/5.2.12 with Suhosin-Patch']
unique_WHOIS_COUNTRY = ['AU', 'CA', 'ES', 'US', 'other', 'unknown', 'PA', 'FR', 'KR', 'CZ', 'JP', 'ru', 'UK', 'CN', 'GB', 'UY']
unique_WHOIS_STATEPRO = ['other', 'Barcelona', 'CA', 'NV', 'Washington', 'unknown', 'Arizona', 'UT', 'NY', 'ON', 'PA', 'FL', 'California', 'PRAHA', 'WA', 'Krasnoyarsk', 'Utah', 'WC1N']

# Generate random columns
test['CHARSET'] = np.random.choice(unique_CHARSET, size=10)
test['SERVER'] = np.random.choice(unique_SERVER, size=10)
test['WHOIS_COUNTRY'] = np.random.choice(unique_WHOIS_COUNTRY, size=10)
test['WHOIS_STATEPRO'] = np.random.choice(unique_WHOIS_STATEPRO, size=10)

In [87]:
# concat data_int to test
data_int_check = data_int.copy()
data_int_check = pd.concat([data_int_check, test], axis=1)
data_int_check

Unnamed: 0,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CONTENT_LENGTH,WHOIS_REGDATE,WHOIS_UPDATED_DATE,TCP_CONVERSATION_EXCHANGE,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,SOURCE_APP_BYTES,REMOTE_APP_BYTES,APP_PACKETS,DNS_QUERY_TIMES,CHARSET,SERVER,WHOIS_COUNTRY,WHOIS_STATEPRO
0,22,12,9528,3.438769e-309,4.918688e-309,76,11,7,1498,65,76,30500,7078,39,6,windows-1251,nginx/1.10.1,UY,NY
1,101,18,4296,3.438769e-309,4.918688e-309,71,0,11,6107,82,62,28937,9735,96,2,iso-8859-1,Apache,FR,ON
2,57,6,2978,3.438769e-309,4.918688e-309,30,8,14,4816,32,96,37663,5532,44,11,windows-1252,Microsoft-IIS/6.0,PA,Barcelona
3,65,22,6546,3.438769e-309,4.918688e-309,21,8,8,2869,42,15,23253,3088,17,5,windows-1251,cloudflare-nginx,UY,Washington
4,78,22,858,3.438769e-309,4.918688e-309,7,6,8,5205,88,72,28719,3804,79,12,windows-1252,Microsoft-IIS/6.0,CN,Krasnoyarsk
5,114,15,8703,3.438769e-309,4.918688e-309,18,12,4,5392,76,23,12971,9725,19,10,ISO-8859-1,Apache/2.2.22 (Debian),GB,other
6,46,19,2736,3.438769e-309,4.918688e-309,21,4,6,6237,64,33,7265,7060,85,9,windows-1252,Microsoft-IIS/7.5,PA,WA
7,69,12,879,3.438769e-309,4.918688e-309,48,17,13,5658,75,10,32302,4814,59,13,windows-1252,Microsoft-IIS/6.0,FR,Krasnoyarsk
8,17,8,7069,3.438769e-309,4.918688e-309,68,2,9,7237,30,55,906,3699,101,3,unknown,GSE,JP,California
9,68,18,3915,3.438769e-309,4.918688e-309,42,9,8,7258,60,30,26128,8915,35,11,ISO-8859-1,nginx/1.12.0,GB,other


In [88]:
# source destination cab_type name long_summary

capped_CHARSET, ohe_CHARSET = encode_and_create_dataframe_train(test, 'CHARSET')
capped_SERVER, ohe_SERVER = encode_and_create_dataframe_train(test, 'SERVER')
capped_WHOIS_COUNTRY, ohe_WHOIS_COUNTRY = encode_and_create_dataframe_train(test, 'WHOIS_COUNTRY')
capped_WHOIS_STATEPRO, ohe_WHOIS_STATEPRO = encode_and_create_dataframe_train(test, 'WHOIS_STATEPRO')

# concat all generated data into test_capped
test_capped = pd.concat([data_int, capped_CHARSET, capped_SERVER, capped_WHOIS_COUNTRY, capped_WHOIS_STATEPRO], axis=1)

test_capped




Unnamed: 0,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CONTENT_LENGTH,WHOIS_REGDATE,WHOIS_UPDATED_DATE,TCP_CONVERSATION_EXCHANGE,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,...,WHOIS_COUNTRY_PA,WHOIS_COUNTRY_UY,WHOIS_STATEPRO_Barcelona,WHOIS_STATEPRO_California,WHOIS_STATEPRO_Krasnoyarsk,WHOIS_STATEPRO_NY,WHOIS_STATEPRO_ON,WHOIS_STATEPRO_WA,WHOIS_STATEPRO_Washington,WHOIS_STATEPRO_other
0,22,12,9528,3.438769e-309,4.918688e-309,76,11,7,1498,65,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,101,18,4296,3.438769e-309,4.918688e-309,71,0,11,6107,82,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,57,6,2978,3.438769e-309,4.918688e-309,30,8,14,4816,32,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,65,22,6546,3.438769e-309,4.918688e-309,21,8,8,2869,42,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,78,22,858,3.438769e-309,4.918688e-309,7,6,8,5205,88,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,114,15,8703,3.438769e-309,4.918688e-309,18,12,4,5392,76,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,46,19,2736,3.438769e-309,4.918688e-309,21,4,6,6237,64,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,69,12,879,3.438769e-309,4.918688e-309,48,17,13,5658,75,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,17,8,7069,3.438769e-309,4.918688e-309,68,2,9,7237,30,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,68,18,3915,3.438769e-309,4.918688e-309,42,9,8,7258,60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [89]:
test_capped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 42 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   URL_LENGTH                     10 non-null     int32  
 1   NUMBER_SPECIAL_CHARACTERS      10 non-null     int32  
 2   CONTENT_LENGTH                 10 non-null     int32  
 3   WHOIS_REGDATE                  10 non-null     float64
 4   WHOIS_UPDATED_DATE             10 non-null     float64
 5   TCP_CONVERSATION_EXCHANGE      10 non-null     int32  
 6   DIST_REMOTE_TCP_PORT           10 non-null     int32  
 7   REMOTE_IPS                     10 non-null     int32  
 8   APP_BYTES                      10 non-null     int32  
 9   SOURCE_APP_PACKETS             10 non-null     int32  
 10  REMOTE_APP_PACKETS             10 non-null     int32  
 11  SOURCE_APP_BYTES               10 non-null     int32  
 12  REMOTE_APP_BYTES               10 non-null     int32 

In [90]:
# reindex to match the training columns
test_capped = test_capped.reindex(columns=column_names)

# Check Missing Values
test_capped.isnull().sum()

# fill null value with zeros
test_capped = test_capped.fillna(0)

test_capped

Unnamed: 0,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CONTENT_LENGTH,WHOIS_REGDATE,WHOIS_UPDATED_DATE,TCP_CONVERSATION_EXCHANGE,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,...,WHOIS_STATEPRO_ON,WHOIS_STATEPRO_PA,WHOIS_STATEPRO_PRAHA,WHOIS_STATEPRO_UT,WHOIS_STATEPRO_Utah,WHOIS_STATEPRO_WA,WHOIS_STATEPRO_WC1N,WHOIS_STATEPRO_Washington,WHOIS_STATEPRO_other,WHOIS_STATEPRO_unknown
0,22,12,9528,3.438769e-309,4.918688e-309,76,11,7,1498,65,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,101,18,4296,3.438769e-309,4.918688e-309,71,0,11,6107,82,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,57,6,2978,3.438769e-309,4.918688e-309,30,8,14,4816,32,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,65,22,6546,3.438769e-309,4.918688e-309,21,8,8,2869,42,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,78,22,858,3.438769e-309,4.918688e-309,7,6,8,5205,88,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,114,15,8703,3.438769e-309,4.918688e-309,18,12,4,5392,76,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,46,19,2736,3.438769e-309,4.918688e-309,21,4,6,6237,64,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,69,12,879,3.438769e-309,4.918688e-309,48,17,13,5658,75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,17,8,7069,3.438769e-309,4.918688e-309,68,2,9,7237,30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,68,18,3915,3.438769e-309,4.918688e-309,42,9,8,7258,60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [91]:
# create prediction
y_pred_inf = model.predict(test_capped)
y_pred_inf

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0])

In [92]:
# concat prediction to test_capped
test_capped['prediction'] = y_pred_inf
test_capped

Unnamed: 0,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CONTENT_LENGTH,WHOIS_REGDATE,WHOIS_UPDATED_DATE,TCP_CONVERSATION_EXCHANGE,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,...,WHOIS_STATEPRO_PA,WHOIS_STATEPRO_PRAHA,WHOIS_STATEPRO_UT,WHOIS_STATEPRO_Utah,WHOIS_STATEPRO_WA,WHOIS_STATEPRO_WC1N,WHOIS_STATEPRO_Washington,WHOIS_STATEPRO_other,WHOIS_STATEPRO_unknown,prediction
0,22,12,9528,3.438769e-309,4.918688e-309,76,11,7,1498,65,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,101,18,4296,3.438769e-309,4.918688e-309,71,0,11,6107,82,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,57,6,2978,3.438769e-309,4.918688e-309,30,8,14,4816,32,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,65,22,6546,3.438769e-309,4.918688e-309,21,8,8,2869,42,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
4,78,22,858,3.438769e-309,4.918688e-309,7,6,8,5205,88,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,114,15,8703,3.438769e-309,4.918688e-309,18,12,4,5392,76,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
6,46,19,2736,3.438769e-309,4.918688e-309,21,4,6,6237,64,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
7,69,12,879,3.438769e-309,4.918688e-309,48,17,13,5658,75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8,17,8,7069,3.438769e-309,4.918688e-309,68,2,9,7237,30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9,68,18,3915,3.438769e-309,4.918688e-309,42,9,8,7258,60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
