## Initial steps

Import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the pipe-delimited test set; row 0 of the file supplies the column names.
testing = pd.read_csv('./cybersecurity_test.csv', sep="|", header=[0])

In [3]:
# Preview the first rows of the freshly loaded frame.
testing.head()

Unnamed: 0,alert_ids,client_code,categoryname,ip,ipcategory_name,ipcategory_scope,parent_category,grandparent_category,overallseverity,timestamp_dist,...,thrcnt_week,thrcnt_day,p6,p9,p5m,p5w,p5d,p8m,p8w,p8d
0,Slg,RLJ,Exploit,MW.YB.50.64,INTERNET,Internet,7,A,3,0,...,298,42,1,0,1,1,1,1,1,1
1,WKM,UZT,Exploit,IJ.NW.77.74,INTERNET,Internet,7,A,5,0,...,11,3,1,0,1,1,1,1,1,1
2,dkm,ZZW,Attack,YT.LB.36.21,INTERNET,Internet,7,A,3,0,...,3601,602,1,0,3,1,1,1,1,1
3,RIX,QXG,Attack,172.BW.LB.105,PRIV-172,Private network,1,A,3,0,...,12,4,1,0,3,1,1,2,1,1
4,qFU,PDU,Exploit,YT.LB.32.110,INTERNET,Internet,7,A,3,258273,...,131,20,1,0,1,1,1,1,1,1


### Fill missing values with 0

In [4]:
def fillmissing(dataset):
    """Replace every missing value in ``dataset`` with 0.

    The frame is modified in place and the same object is returned, so
    callers may either use the return value or keep using their original
    reference.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose NaN/None entries should be replaced.

    Returns
    -------
    pandas.DataFrame
        The very same ``dataset`` object, now without missing values.
    """
    # Fill on the frame itself rather than on ``dataset[column]``: an in-place
    # fill on a column selected via ``[]`` is chained assignment, which warns
    # on recent pandas and does not update the frame under Copy-on-Write.
    dataset.fillna(0, inplace=True)
    return dataset

In [5]:
# Zero-fill the missing entries, then list the resulting column labels.
testing = fillmissing(testing)
print(testing.columns)

Index(['alert_ids', 'client_code', 'categoryname', 'ip', 'ipcategory_name',
       'ipcategory_scope', 'parent_category', 'grandparent_category',
       'overallseverity', 'timestamp_dist', 'start_hour', 'start_minute',
       'start_second', 'weekday', 'correlatedcount', 'n1', 'n2', 'n3', 'n4',
       'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'score', 'srcip_cd', 'dstip_cd',
       'srcport_cd', 'dstport_cd', 'alerttype_cd', 'direction_cd',
       'eventname_cd', 'severity_cd', 'reportingdevice_cd', 'devicetype_cd',
       'devicevendor_cd', 'domain_cd', 'protocol_cd', 'username_cd',
       'srcipcategory_cd', 'dstipcategory_cd', 'isiptrusted', 'untrustscore',
       'flowscore', 'trustscore', 'enforcementscore', 'dstipcategory_dominate',
       'srcipcategory_dominate', 'dstportcategory_dominate',
       'srcportcategory_dominate', 'thrcnt_month', 'thrcnt_week', 'thrcnt_day',
       'p6', 'p9', 'p5m', 'p5w', 'p5d', 'p8m', 'p8w', 'p8d'],
      dtype='object')


### Column types

In [6]:
# Columns treated as string-valued categoricals.
categorical_string_columns = [
    'categoryname',
    'ipcategory_name',
    'ipcategory_scope',
    'parent_category',
    'grandparent_category',
    'weekday',
]

# Columns treated as numeric-valued categoricals (one-hot encoded later on).
categorical_numeric_columns = (
    ['overallseverity', 'start_hour', 'start_minute', 'start_second']
    + ['n%d' % position for position in range(1, 11)]  # 'n1' ... 'n10'
    + ['score']
    + [
        'alerttype_cd', 'direction_cd', 'eventname_cd', 'severity_cd',
        'reportingdevice_cd', 'devicetype_cd', 'devicevendor_cd',
        'srcipcategory_cd', 'dstipcategory_cd',
    ]
    + ['isiptrusted', 'untrustscore', 'flowscore', 'trustscore', 'enforcementscore']
    + [
        'dstipcategory_dominate', 'srcipcategory_dominate',
        'dstportcategory_dominate', 'srcportcategory_dominate',
    ]
    + ['p6', 'p5m', 'p5w', 'p5d', 'p8m', 'p8w', 'p8d']
)

## Variables preprocessing

#### ip

In [7]:
# Keep only the part of each address before the first dot.
testing['ip_prefix'] = testing['ip'].apply(lambda address: address.split('.', 1)[0])

# Prefixes occurring more than 100 times keep their own label; all rarer
# ones are pooled under "other".
vc = testing['ip_prefix'].value_counts()
frequent_prefixes = vc.index[vc > 100]
is_frequent = testing['ip_prefix'].isin(frequent_prefixes)
testing['ip_prefix_aggreg'] = testing['ip_prefix'].where(is_frequent, "other")

# Number of distinct labels left after pooling.
testing['ip_prefix_aggreg'].value_counts().size

25

### Dataset without string columns

In [8]:
# Every column removed to obtain the numeric-only variant of the test set.
string_columns = [
    'alert_ids', 'client_code', 'categoryname', 'ip',
    'ipcategory_name', 'ipcategory_scope', 'parent_category', 'grandparent_category',
    'weekday', 'dstipcategory_dominate', 'srcipcategory_dominate',
    'ip_prefix', 'ip_prefix_aggreg',
]
file_name = "no_strings_test.pkl"
no_strings_testing = testing.drop(columns=string_columns)
no_strings_testing.to_pickle(file_name)

## Encoding

In [9]:
def encode(original, concate, list_of_columns):
    """Append one-hot (dummy) columns for the given columns to a frame.

    Parameters
    ----------
    original : pandas.DataFrame
        Frame the columns to encode are read from.
    concate : pandas.DataFrame
        Frame the dummy columns are appended to (column-wise). It is not
        modified; a new frame is returned.
    list_of_columns : iterable of str
        Names of the columns in ``original`` to one-hot encode. Each dummy
        column is named ``<column>_<value>``.

    Returns
    -------
    pandas.DataFrame
        ``concate`` followed by the dummy columns, in the order of
        ``list_of_columns``. When ``list_of_columns`` is empty, ``concate``
        itself is returned unchanged.
    """
    dummy_frames = [
        pd.get_dummies(original[column], prefix=column)
        for column in list_of_columns
    ]
    if not dummy_frames:
        return concate
    # One concat for all pieces: concatenating inside the loop would re-copy
    # the ever-growing frame on every iteration.
    return pd.concat([concate, *dummy_frames], axis=1)

### Encoding string categorical columns

In [10]:
# String-valued categorical columns.
# NOTE(review): 'dstipcategory_dominate' and 'srcipcategory_dominate' are also
# listed in `categorical_numeric_columns`, so the numerical pass below appends
# their dummy columns a second time — confirm this duplication is intended
# (and matches how the training set was encoded) before changing it.
columns_to_encode = ['categoryname', 'weekday','ipcategory_name', 'ipcategory_scope','grandparent_category', 
                     'dstipcategory_dominate', 'srcipcategory_dominate', 'ip_prefix_aggreg']
encoded_category = encode(testing, no_strings_testing, columns_to_encode)
# Remove any raw source column still present next to its dummies. This must
# act on `encoded_category`: the name `encoded` does not exist until the
# numerical pass below has run.
encoded_category = encoded_category.drop(columns=columns_to_encode, errors="ignore")
print("After categorical encoding: ", len(encoded_category.columns))

# numerical columns
columns_to_encode = categorical_numeric_columns
encoded = encode(testing, encoded_category, columns_to_encode)
# The raw numeric columns are part of `no_strings_testing`, so they are
# dropped here once their dummies have been added.
encoded = encoded.drop(columns=columns_to_encode, errors="ignore")
print("After numerical encoding: " ,len(encoded.columns))

After categorical encoding:  120
After numerical encoding:  417


In [11]:
# Compare the column labels of the encoded test frame with the encoded
# training frame.
# NOTE: unpickling can execute code stored in the file — only load pickles
# that come from a trusted source.
encoded_train = pd.read_pickle("../Datasets/encoded_train.pkl")
train_columns = set(encoded_train.columns)
test_columns = set(encoded.columns)

# Sorted so both lists come out in a reproducible order; a bare set
# difference has a hash-dependent order that can change between runs.
# `key=str` keeps the sort well-defined even for non-string labels.
missing_columns = sorted(train_columns - test_columns, key=str)
print("Missing columns: ", missing_columns)
superabundant_columns = sorted(test_columns - train_columns, key=str)
print("Superabundant columns: ", superabundant_columns)



Missing columns:  ['devicetype_cd_5', 'ip_prefix_aggreg_SF', 'ip_prefix_aggreg_WZ', 'ip_prefix_aggreg_RZ', 'n5', 'categoryname_Suspicious Reputation', 'notified', 'n10', 'ip_prefix_aggreg_TT', 'ip_prefix_aggreg_KB', 'p5m_5', 'reportingdevice_cd_23', 'categoryname_Suspicious Network Activity', 'srcipcategory_dominate_PRIV-CGN', 'reportingdevice_cd_151', 'n9', 'alerttype_cd_10', 'ip_prefix_aggreg_EU', 'n3', 'categoryname_To Be Determined', 'ip_prefix_aggreg_XI', 'reportingdevice_cd_17', 'p6_10', 'ip_prefix_aggreg_NZ', 'categoryname_Suspicious Account Activity', 'ip_prefix_aggreg_LW', 'ip_prefix_aggreg_EB', 'ip_prefix_aggreg_YC', 'reportingdevice_cd_153', 'dstipcategory_cd_4', 'n8', 'n7', 'ip_prefix_aggreg_RQ', 'devicevendor_cd_7', 'ip_prefix_aggreg_ER', 'ip_prefix_aggreg_JQ', 'reportingdevice_cd_154', 'reportingdevice_cd_46', 'devicevendor_cd_4', 'n2', 'ip_prefix_aggreg_100', 'devicetype_cd_4', 'ip_prefix_aggreg_IF', 'ip_prefix_aggreg_MQ', 'ip_prefix_aggreg_RD', 'ip_prefix_aggreg_WA', 'i

In [12]:
# Create each column that exists only in the training frame as an all-zero
# column; 'notified' is deliberately left out.
# NOTE(review): the recorded "Missing columns" output also lists raw features
# such as 'n5' and 'n10'. They were dropped from `encoded` after one-hot
# encoding, so they are re-created here as constant 0 — confirm this matches
# how the training set was built.
for column in (name for name in missing_columns if name != 'notified'):
    encoded[column] = 0

In [13]:
# Number of columns currently in `encoded`.
len(encoded.columns)

472

In [14]:
# Persist the encoded (not yet scaled) test frame.
file_name = "encoded_test.pkl"
encoded.to_pickle(path=file_name)

## Standardization

In [15]:
from sklearn.preprocessing import StandardScaler

def standartize(dataset, target_variable):
    """Standardize every column of ``dataset`` with a ``StandardScaler``.

    A new scaler is fitted on ``dataset`` itself and then applied to it.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to scale; all of its columns are passed to the scaler.
    target_variable : str
        Accepted so existing calls keep working; it is not used here.

    Returns
    -------
    pandas.DataFrame
        The scaled values, carrying the column labels and the row index of
        ``dataset``.
    """
    # NOTE(review): the scaler is fitted on whatever frame is passed in. For
    # a test set, confirm whether the scaler fitted on the training data
    # should be reused instead.
    scaled_features = StandardScaler().fit_transform(dataset)
    # Pass the index along as well; otherwise the result silently falls back
    # to a fresh 0..n-1 index and no longer lines up with ``dataset``.
    return pd.DataFrame(scaled_features, columns=dataset.columns, index=dataset.index)
    

#### No_strings dataset

In [16]:
# Scale the numeric-only frame and store the result.
file_name = "no_strings_normalized_test.pkl"
no_strings_testing_scaled = standartize(no_strings_testing, target_variable='notified')
no_strings_testing_scaled.to_pickle(path=file_name)

  return self.partial_fit(X, y)
  import sys


#### Encoded dataset

In [17]:
# Scale the one-hot encoded frame and store the result.
file_name = "encoded_normalized_test.pkl"
encoded_scaled = standartize(encoded, target_variable='notified')
encoded_scaled.to_pickle(path=file_name)

  return self.partial_fit(X, y)
  import sys
