## Initial steps

Import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [45]:
# Load the pipe-delimited training set; row 0 of the file supplies the column names.
training = pd.read_csv(
    './cybersecurity_training.csv',
    sep="|",
    header=[0],
)

In [46]:
# Preview the first rows to sanity-check that the file was parsed as expected.
training.head()

Unnamed: 0,alert_ids,client_code,notified,categoryname,ip,ipcategory_name,ipcategory_scope,parent_category,grandparent_category,overallseverity,...,thrcnt_week,thrcnt_day,p6,p9,p5m,p5w,p5d,p8m,p8w,p8d
0,Nhq,DPM,0,Attack,YT.LB.32.21,INTERNET,Internet,7,A,3,...,4160,675,1,0,2,1,1,1,1,1
1,XZt,FIN,0,Exploit,192.SL.UK.94,PRIV-192,Private network,1,A,5,...,9,2,4,12,3,2,2,2,1,1
2,bBz,CHP,0,Attack,YT.LB.38.21,INTERNET,Internet,7,A,4,...,3788,628,1,0,2,2,1,2,2,1
3,ZNr,HPS,0,Attack,JX.NY.13.20,INTERNET,Internet,7,A,4,...,565,96,0,0,2,2,2,2,2,2
4,poV,OSC,0,Attack,YT.LB.32.21,INTERNET,Internet,7,A,4,...,2790,632,1,0,1,1,1,1,1,1


In [47]:
# Summarise each column's dtype and non-null count.
training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39427 entries, 0 to 39426
Data columns (total 63 columns):
alert_ids                   39427 non-null object
client_code                 39427 non-null object
notified                    39427 non-null int64
categoryname                39427 non-null object
ip                          39427 non-null object
ipcategory_name             39427 non-null object
ipcategory_scope            39427 non-null object
parent_category             39427 non-null int64
grandparent_category        39427 non-null object
overallseverity             39427 non-null int64
timestamp_dist              39427 non-null int64
start_hour                  39427 non-null int64
start_minute                39427 non-null int64
start_second                39427 non-null int64
weekday                     39427 non-null object
correlatedcount             39427 non-null int64
n1                          7132 non-null float64
n2                          7132 non-null float64

### Fill NA numerical
Find missing values

In [48]:
# Count the NaNs of every column once, then print only the columns that have any.
na_counts = training.isna().sum()
for column, n_missing in na_counts[na_counts > 0].items():
    print(column, n_missing)

n1 32295
n2 32295
n3 32295
n4 32295
n5 32295
n6 32295
n7 32295
n8 32295
n9 32295
n10 32295
score 32295


In [49]:
def fillmissing(dataset):
    """Replace every missing value in ``dataset`` with 0.

    Only columns that actually contain NaN/None are rewritten. The frame is
    modified in place and the same object is returned, so both
    ``fillmissing(df)`` and ``df = fillmissing(df)`` work.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose missing values should be filled.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, now without missing values.
    """
    for column in dataset.columns:
        if dataset[column].isna().any():
            # Assign the filled column back explicitly. Calling
            # ``dataset[column].fillna(0, inplace=True)`` acts on a temporary
            # Series and does not update the frame under Copy-on-Write.
            dataset[column] = dataset[column].fillna(0)
    return dataset

In [50]:
# Fill the missing values with 0, then list the resulting columns.
training = fillmissing(training)
training.columns

Index(['alert_ids', 'client_code', 'notified', 'categoryname', 'ip',
       'ipcategory_name', 'ipcategory_scope', 'parent_category',
       'grandparent_category', 'overallseverity', 'timestamp_dist',
       'start_hour', 'start_minute', 'start_second', 'weekday',
       'correlatedcount', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9',
       'n10', 'score', 'srcip_cd', 'dstip_cd', 'srcport_cd', 'dstport_cd',
       'alerttype_cd', 'direction_cd', 'eventname_cd', 'severity_cd',
       'reportingdevice_cd', 'devicetype_cd', 'devicevendor_cd', 'domain_cd',
       'protocol_cd', 'username_cd', 'srcipcategory_cd', 'dstipcategory_cd',
       'isiptrusted', 'untrustscore', 'flowscore', 'trustscore',
       'enforcementscore', 'dstipcategory_dominate', 'srcipcategory_dominate',
       'dstportcategory_dominate', 'srcportcategory_dominate', 'thrcnt_month',
       'thrcnt_week', 'thrcnt_day', 'p6', 'p9', 'p5m', 'p5w', 'p5d', 'p8m',
       'p8w', 'p8d'],
      dtype='object')

### Column types

In [51]:
# Identifier columns.
to_drop_columns = ['client_code', 'alert_ids']

# Categorical columns grouped with the string-valued features.
categorical_string_columns = [
    'categoryname',
    'ipcategory_name',
    'ipcategory_scope',
    'parent_category',
    'grandparent_category',
    'weekday',
]

# Categorical columns that are one-hot encoded in the "numerical" encoding pass.
categorical_numeric_columns = [
    'overallseverity',
    'start_hour', 'start_minute', 'start_second',
    'score',
    'alerttype_cd', 'direction_cd', 'eventname_cd',
    'isiptrusted',
    'dstipcategory_dominate', 'srcipcategory_dominate',
    'dstportcategory_dominate', 'srcportcategory_dominate',
    'p6',
    'p5m', 'p5w', 'p5d',
    'p8m', 'p8w', 'p8d',
]

# n1 .. n10
binary_columns = ['n' + str(number) for number in range(1, 11)]

## Variables preprocessing

#### ip
Take the first (A) segment of the IP address and turn it into a category. A prefix that occurs in no more than 100 records is marked as the "other" category.

In [52]:
# split IPs to prefix
training['ip_prefix'] = training['ip'].apply(lambda x: ".".join(x.split('.')[:1]))
vc = training['ip_prefix'].value_counts()
training['ip_prefix_aggreg'] = training['ip_prefix'].apply(lambda x: x if vc[x]>100 else "other")
training['ip_prefix_aggreg'].value_counts().size

39

### No string columns dataset

In [53]:
# Columns excluded from the "no strings" variant of the dataset.
string_columns = [
    'alert_ids', 'client_code', 'categoryname', 'ip',
    'ipcategory_name', 'ipcategory_scope', 'parent_category', 'grandparent_category',
    'weekday', 'dstipcategory_dominate', 'srcipcategory_dominate',
    'ip_prefix', 'ip_prefix_aggreg',
]
no_strings_training = training.drop(columns=string_columns)

# Persist the reduced frame.
file_name = "no_strings_train.pkl"
no_strings_training.to_pickle(file_name)

## Encoding

In [56]:
def encode(original, concate, list_of_columns):
    """Append one-hot (dummy) columns for the given columns to a frame.

    Parameters
    ----------
    original : pandas.DataFrame
        Frame that holds the raw columns to be encoded.
    concate : pandas.DataFrame
        Frame the dummy columns are appended to (column-wise). It is not
        modified; a new frame is returned.
    list_of_columns : iterable of str
        Names of the columns of ``original`` to one-hot encode. Each dummy
        column is named ``<column>_<value>``.

    Returns
    -------
    pandas.DataFrame
        ``concate`` followed by the dummy columns, in the order of
        ``list_of_columns``. When ``list_of_columns`` is empty, ``concate``
        itself is returned.
    """
    # ``get_dummies`` receives a Series here, so only ``prefix`` is relevant.
    dummy_frames = [
        pd.get_dummies(original[column], prefix=column)
        for column in list_of_columns
    ]
    if not dummy_frames:
        return concate
    # Concatenate once instead of re-copying the growing frame on every iteration.
    return pd.concat([concate, *dummy_frames], axis=1)

In [57]:
# List the columns available before encoding.
training.columns


Index(['alert_ids', 'client_code', 'notified', 'categoryname', 'ip',
       'ipcategory_name', 'ipcategory_scope', 'parent_category',
       'grandparent_category', 'overallseverity', 'timestamp_dist',
       'start_hour', 'start_minute', 'start_second', 'weekday',
       'correlatedcount', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9',
       'n10', 'score', 'srcip_cd', 'dstip_cd', 'srcport_cd', 'dstport_cd',
       'alerttype_cd', 'direction_cd', 'eventname_cd', 'severity_cd',
       'reportingdevice_cd', 'devicetype_cd', 'devicevendor_cd', 'domain_cd',
       'protocol_cd', 'username_cd', 'srcipcategory_cd', 'dstipcategory_cd',
       'isiptrusted', 'untrustscore', 'flowscore', 'trustscore',
       'enforcementscore', 'dstipcategory_dominate', 'srcipcategory_dominate',
       'dstportcategory_dominate', 'srcportcategory_dominate', 'thrcnt_month',
       'thrcnt_week', 'thrcnt_day', 'p6', 'p9', 'p5m', 'p5w', 'p5d', 'p8m',
       'p8w', 'p8d', 'ip_prefix', 'ip_prefix_aggreg'],


### Encoding string categorical columns

In [58]:
# First pass: one-hot encode the string-valued categorical columns and append
# the dummies to the numeric-only frame.
# NOTE(review): 'dstipcategory_dominate' and 'srcipcategory_dominate' are listed
# here and in categorical_numeric_columns, so their dummy columns are appended
# by both passes - confirm whether the duplicated columns are intended.
columns_to_encode = ['categoryname', 'weekday','ipcategory_name', 'ipcategory_scope','grandparent_category', 
                     'dstipcategory_dominate', 'srcipcategory_dominate', 'ip_prefix_aggreg']
encoded_category = encode(training, no_strings_training, columns_to_encode)
# Drop any raw source column that is still present. This must act on
# `encoded_category`; `encoded` does not exist yet at this point.
encoded_category = encoded_category.drop(columns=columns_to_encode, errors="ignore")
print("After categorical encoding: ", len(encoded_category.columns))

# numerical columns
# Second pass: one-hot encode the numeric categorical columns, then remove
# their raw versions so only the dummy columns remain.
columns_to_encode = categorical_numeric_columns
encoded = encode(training, encoded_category, columns_to_encode)
encoded = encoded.drop(columns=columns_to_encode, errors="ignore")
print("After numerical encoding: " ,len(encoded.columns))

After categorical encoding:  140
After numerical encoding:  366


In [11]:
# Columns that get added to `encoded` as constant-zero columns.
# NOTE(review): a listed name that already exists in `encoded` is overwritten
# with zeros - confirm that none of these are present beforehand.
missing_columns = [
    'n4_1.0', 'n3_0.0', 'n9_0.0', 'alerttype_cd_11',
    'dstipcategory_dominate_LINK-LOCAL', 'n5_1.0', 'untrustscore_8', 'n2_0.0',
    'n5_0.0', 'ip_prefix_aggreg_SC', 'n10_0.0', 'n3_1.0',
    'n4_0.0', 'reportingdevice_cd_144', 'devicevendor_cd_8', 'alerttype_cd_12',
    'p6_11', 'reportingdevice_cd_37', 'reportingdevice_cd_31', 'n6_0.0',
    'eventname_cd_14', 'n2_1.0', 'ipcategory_name_BENCH', 'ip_prefix_aggreg_ON',
    'untrustscore_9', 'srcipcategory_dominate_BENCH', 'n8_0.0', 'p6_12',
    'ip_prefix_aggreg_DK', 'ip_prefix_aggreg_BW', 'n7_0.0', 'n1_1.0',
    'devicetype_cd_6', 'direction_cd_6', 'n6_1.0', 'n9_1.0',
    'ip_prefix_aggreg_MW', 'reportingdevice_cd_28', 'n1_0.0', 'devicetype_cd_7',
]
for column in missing_columns:
    if column == 'notified':
        # Never touch the target column.
        continue
    encoded[column] = 0
len(encoded.columns)

473

In [12]:
# Persist the one-hot encoded training frame as a pickle file.
file_name = "encoded_train.pkl"
encoded.to_pickle(file_name)

## Standardization

In [13]:
from sklearn.preprocessing import StandardScaler

def standartize(dataset, target_variable):
    """Standardise every feature column and leave the target unscaled.

    All columns except ``target_variable`` are passed through
    ``StandardScaler``; the target column(s) are copied over unchanged.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the features and the target.
    target_variable : str or list of str
        Column label(s) excluded from scaling.

    Returns
    -------
    pandas.DataFrame
        The target column(s) first, followed by the scaled feature columns,
        indexed like ``dataset``.
    """
    features = dataset.drop(columns=target_variable)
    scaled_features = StandardScaler().fit_transform(features)
    # Keep the original index. The concat below aligns rows by label, so a
    # fresh 0..n-1 index would misalign features and target (and introduce
    # NaN rows) whenever `dataset` does not carry a default RangeIndex.
    df_scaled_features = pd.DataFrame(
        scaled_features,
        index=features.index,
        columns=features.columns,
    )
    df_scaled = pd.concat([dataset[target_variable], df_scaled_features], axis=1)
    return df_scaled

#### No_strings dataset

In [14]:
# Scale the numeric-only dataset (the 'notified' target stays unscaled) and persist it.
no_strings_training_scaled = standartize(no_strings_training,target_variable='notified')
file_name = "no_strings_normalized_train.pkl"
no_strings_training_scaled.to_pickle(file_name)

  return self.partial_fit(X, y)
  import sys


#### Encoded dataset

In [15]:
# Scale the one-hot encoded dataset (the 'notified' target stays unscaled) and persist it.
encoded_scaled = standartize(encoded,target_variable='notified')
file_name = "encoded_normalized_train.pkl"
encoded_scaled.to_pickle(file_name)

  return self.partial_fit(X, y)
  import sys
