# Unsupervised Learning with Isolation Forests
___

Author: Diego Lopez

Date: 08/15/2022

This file contains an unsupervised clustering implementation on the KDD Cup 1999 Data using Isolation Forests. 

TODO: implement feature extraction

TODO: Implement normalization of features

TODO: implement algorithm

## Step 1: Data Exploration, Dimensionality Reduction, Normalization
___

In [1]:
# Imports
import h2o
from h2o.estimators import H2OIsolationForestEstimator
from h2o.estimators import H2OKMeansEstimator
from h2o.estimators import H2OPrincipalComponentAnalysisEstimator
from sklearn.metrics import silhouette_samples, silhouette_score
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

We need to find the continuous columns in the dataset. Numeric discrete data should not be normalized and should be treated as categorical

Of these columns, we leave out from normalization the following:

protocol type : 1

service : 2

flag : 3

land : 6

wrong_fragment : 7

urgent : 8

logged_in : 11

is_host_login : 20

is_guest_login : 21

In [2]:
# initialize h2o server and set pandas options
# BEWARE: DO NOT PRINT FULL DATAFRAMES!
h2o.init(max_mem_size=5)
pd.set_option("display.max_rows", None, "display.max_columns", None)

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,2 hours 5 mins
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.1.1
H2O_cluster_version_age:,4 months and 20 days !!!
H2O_cluster_name:,H2O_from_python_diego_q23ji5
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4.322 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [3]:
# Load data
train_df = pd.read_csv("../data/raw/train1.csv", index_col=0)
test_df = pd.read_csv("../data/raw/test1.csv", index_col=0)
train = h2o.import_file("../data/raw/train1.csv")
test = h2o.import_file("../data/raw/test1.csv")
# TODO: can this be done when importing with h2o?
test = test[:, 1:]
train = train[:, 1:]

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [4]:
train.head(10)

duration:,protocol_type:,service:,flag:,src_bytes:,dst_bytes:,land:,wrong_fragment:,urgent:,hot:,num_failed_logins:,logged_in:,num_compromised:,root_shell:,su_attempted:,num_root:,num_file_creations:,num_shells:,num_access_files:,num_outbound_cmds:,is_host_login:,is_guest_login:,count:,srv_count:,serror_rate:,srv_serror_rate:,rerror_rate:,srv_rerror_rate:,same_srv_rate:,diff_srv_rate:,srv_diff_host_rate:,dst_host_count:,dst_host_srv_count:,dst_host_same_srv_rate:,dst_host_diff_srv_rate:,dst_host_same_src_port_rate:,dst_host_srv_diff_host_rate:,dst_host_serror_rate:,dst_host_srv_serror_rate:,dst_host_rerror_rate:,dst_host_srv_rerror_rate:,labels,attack
0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0,0,0,0,1,0,0,19,19,1,0,0.05,0.0,0,0,0,0,normal,normal
0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0,0,0,0,1,0,0,29,29,1,0,0.03,0.0,0,0,0,0,normal,normal
0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0,0,0,0,1,0,0,39,39,1,0,0.03,0.0,0,0,0,0,normal,normal
0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0,0,0,0,1,0,0,49,49,1,0,0.02,0.0,0,0,0,0,normal,normal
0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0,0,0,0,1,0,0,59,59,1,0,0.02,0.0,0,0,0,0,normal,normal
0,tcp,http,SF,212,1940,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,1,0,1,1,69,1,0,1.0,0.04,0,0,0,0,normal,normal
0,tcp,http,SF,159,4087,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0,0,0,0,1,0,0,11,79,1,0,0.09,0.04,0,0,0,0,normal,normal
0,tcp,http,SF,210,151,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0,0,0,0,1,0,0,8,89,1,0,0.12,0.04,0,0,0,0,normal,normal
0,tcp,http,SF,212,786,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0,0,0,0,1,0,0,8,99,1,0,0.12,0.05,0,0,0,0,normal,normal
0,tcp,http,SF,210,624,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,18,18,0,0,0,0,1,0,0,18,109,1,0,0.06,0.05,0,0,0,0,normal,normal




In [5]:
# Dataset is imbalanced, however we are doing clustering so this does not matter
train_df['labels'].value_counts()

smurf              280790
neptune            107201
normal              97277
back                 2203
satan                1589
ipsweep              1247
portsweep            1040
warezclient          1020
teardrop              979
pod                   264
nmap                  231
guess_passwd           53
buffer_overflow        30
land                   21
warezmaster            20
imap                   12
rootkit                10
loadmodule              9
ftp_write               8
multihop                7
phf                     4
perl                    3
spy                     2
Name: labels, dtype: int64

In [6]:
# declare columns to ignore: the columns that are categorical 
cols_to_ignore = ["protocol_type:","service:", "flag:", "land:","wrong_fragment:", "urgent:", "logged_in:", "is_host_login:", "is_guest_login:", "labels", "attack"]
cols_to_include = [x for x in train.columns if x not in cols_to_ignore]

In [7]:
test.head()

duration:,protocol_type:,service:,flag:,src_bytes:,dst_bytes:,land:,wrong_fragment:,urgent:,hot:,num_failed_logins:,logged_in:,num_compromised:,root_shell:,su_attempted:,num_root:,num_file_creations:,num_shells:,num_access_files:,num_outbound_cmds:,is_host_login:,is_guest_login:,count:,srv_count:,serror_rate:,srv_serror_rate:,rerror_rate:,srv_rerror_rate:,same_srv_rate:,diff_srv_rate:,srv_diff_host_rate:,dst_host_count:,dst_host_srv_count:,dst_host_same_srv_rate:,dst_host_diff_srv_rate:,dst_host_same_src_port_rate:,dst_host_srv_diff_host_rate:,dst_host_serror_rate:,dst_host_srv_serror_rate:,dst_host_rerror_rate:,dst_host_srv_rerror_rate:,labels,attack
0,udp,private,SF,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1.0,0,0,255,254,1.0,0.01,0.0,0.0,0,0,0,0,normal,normal
0,udp,private,SF,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1.0,0,0,255,254,1.0,0.01,0.0,0.0,0,0,0,0,normal,normal
0,udp,private,SF,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,1.0,0,0,255,254,1.0,0.01,0.0,0.0,0,0,0,0,snmpgetattack,snmpgetattack
0,udp,private,SF,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,1.0,0,0,255,254,1.0,0.01,0.01,0.0,0,0,0,0,snmpgetattack,snmpgetattack
0,udp,private,SF,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,1.0,0,0,255,255,1.0,0.0,0.01,0.0,0,0,0,0,snmpgetattack,snmpgetattack
0,udp,domain_u,SF,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0.5,1,0,10,3,0.3,0.3,0.3,0.0,0,0,0,0,normal,normal
0,udp,private,SF,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1.0,0,0,255,253,0.99,0.01,0.0,0.0,0,0,0,0,normal,normal
0,udp,private,SF,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,1.0,0,0,255,254,1.0,0.01,0.0,0.0,0,0,0,0,snmpgetattack,snmpgetattack
0,tcp,http,SF,223,185,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,4,4,0,0,0,0,1.0,0,0,71,255,1.0,0.0,0.01,0.01,0,0,0,0,normal,normal
0,udp,private,SF,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,1.0,0,0,255,254,1.0,0.01,0.0,0.0,0,0,0,0,snmpgetattack,snmpgetattack




Performing PCA with only the numeric variables

In [8]:
# total number of predictors to go through PCA
len(train.columns) - len(cols_to_ignore)

32

### PCA Model

In [9]:
# initialize model
pca = H2OPrincipalComponentAnalysisEstimator(k=31, transform="standardize")

In [10]:
# train model
pca.train(training_frame=train, ignored_columns=cols_to_ignore)

pca Model Build progress: |



██████████████████████████████████████████████████████| (done) 100%
Model Details
H2OPrincipalComponentAnalysisEstimator :  Principal Components Analysis
Model Key:  PCA_model_python_1662223765577_10


Importance of components: 




Unnamed: 0,Unnamed: 1,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,pc11,pc12,pc13,pc14,pc15,pc16,pc17,pc18,pc19,pc20,pc21,pc22,pc23,pc24,pc25,pc26,pc27,pc28,pc29,pc30,pc31
0,Standard deviation,2.832211,2.117975,1.72809,1.574535,1.235373,1.084345,1.032012,1.012354,0.99986,0.982273,0.977462,0.932189,0.923606,0.879467,0.848387,0.841434,0.624234,0.604831,0.577464,0.384486,0.232358,0.165497,0.137874,0.133516,0.125786,0.083409,0.076594,0.072767,0.038605,0.035826,0.021545
1,Proportion of Variance,0.258755,0.144704,0.096332,0.079973,0.049231,0.037929,0.034356,0.03306,0.032249,0.031125,0.03082,0.028031,0.027518,0.02495,0.023218,0.022839,0.01257,0.011801,0.010757,0.004769,0.001742,0.000884,0.000613,0.000575,0.00051,0.000224,0.000189,0.000171,4.8e-05,4.1e-05,1.5e-05
2,Cumulative Proportion,0.258755,0.403459,0.499791,0.579764,0.628995,0.666924,0.70128,0.73434,0.766589,0.797714,0.828534,0.856566,0.884083,0.909034,0.932252,0.955091,0.967661,0.979462,0.990219,0.994987,0.996729,0.997612,0.998226,0.998801,0.999311,0.999535,0.999725,0.999896,0.999944,0.999985,1.0




ModelMetricsPCA: pca
** Reported on train data. **

MSE: NaN
RMSE: NaN

Scoring History for GramSVD: 


Unnamed: 0,Unnamed: 1,timestamp,duration,iterations
0,,2022-09-03 14:54:42,0.107 sec,0.0




The curse of dimensionality is very real. We want to keep as many columns as possible, however highly dimensional data is not suitable for clustering as data in high dimensions is more prone to being distant. We keep the minimum number of columns needed to account for 95% of the variance in the data

In [11]:
count = 0
i = 1
while count < 0.95:
    count = pca.varimp()[2][i]
    i += 1
print(i)
print(pca.varimp()[2][i])

17
0.967661006332977


18 components account for 95% of the variance in the training data, as such we were able to remove 15 columns that only accounted for 5% of the variance in the data. Pretty good. 

In [12]:
x_embedded = pca.predict(train) # get the full dataset
x_embedded_test = pca.predict(test)
x_embedded = x_embedded[:,:17] # get only the components that account for 95% of the variance
x_embedded_test = x_embedded_test[:,:17]
x_embedded_binded = x_embedded.cbind(train[:, cols_to_ignore]) # combine with the categorical data
x_embedded_test_binded = x_embedded_test.cbind(test[:, cols_to_ignore]) 

pca prediction progress: |███████████████████████████████████████████████████████| (done) 100%
pca prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [13]:
len(x_embedded_test_binded.columns) == len(x_embedded_binded.columns) # sanity check

True

## K-means Clustering
___

For our Kmeans clustering, we want to try multiple K such that we find the one that no longer satisfies our PRE threshold

In [14]:
kmeans = H2OKMeansEstimator(k=35, score_each_iteration=True,nfolds=5, max_iterations=3000, categorical_encoding="one_hot_explicit", estimate_k=True, standardize=False, ignored_columns=['labels', 'attack'], init="PlusPlus")

In [15]:
# is host login gets dropped
kmeans.train(training_frame=x_embedded_binded)

kmeans Model Build progress: |



███████████████████████████████████████████████████| (done) 100%
Model Details
H2OKMeansEstimator :  K-means
Model Key:  KMeans_model_python_1662223765577_11


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_rows,number_of_clusters,number_of_categorical_columns,number_of_iterations,within_cluster_sum_of_squares,total_sum_of_squares,between_cluster_sum_of_squares
0,,494020.0,15.0,0.0,53.0,6561045.0,15636040.0,9074996.0




ModelMetricsClustering: kmeans
** Reported on train data. **

Total Within Cluster Sum of Square Error: 6546783.807669153
Total Sum of Square Error to Grand Mean: 15636041.23016029
Between Cluster Sum of Square Error: 9089257.422491137

Centroid Statistics: 


Unnamed: 0,Unnamed: 1,centroid,size,within_cluster_sum_of_squares
0,,1.0,2.0,4945.304
1,,2.0,367634.0,4094653.0
2,,3.0,25529.0,237399.8
3,,4.0,80421.0,455080.5
4,,5.0,11246.0,847048.5
5,,6.0,12.0,62975.94
6,,7.0,45.0,41420.21
7,,8.0,5076.0,340869.0
8,,9.0,3478.0,145366.7
9,,10.0,63.0,184745.3



ModelMetricsClustering: kmeans
** Reported on cross-validation data. **

Total Within Cluster Sum of Square Error: 8441310.02109546
Total Sum of Square Error to Grand Mean: 15636041.230160372
Between Cluster Sum of Square Error: 7194731.209064912
Centroid stats are not available.

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,betweenss,1438927.4,337990.34,1400973.1,1024379.6,1355850.8,1964917.8,1448515.6
1,mse,,0.0,,,,,
2,rmse,,0.0,,,,,
3,tot_withinss,1688262.0,538155.06,1321099.9,2515839.5,1957368.9,1312756.8,1334244.8
4,totss,3127189.5,357280.44,2722073.0,3540219.2,3313219.8,3277674.5,2782760.5



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,iterations,number_of_clusters,number_of_reassigned_observations,within_cluster_sum_of_squares
0,,2022-09-03 14:55:40,56.170 sec,0.0,0.0,,
1,,2022-09-03 14:55:41,57.568 sec,1.0,1.0,494020.0,32751960.0
2,,2022-09-03 14:55:41,57.673 sec,2.0,1.0,0.0,15636040.0
3,,2022-09-03 14:55:41,57.961 sec,3.0,2.0,4699.0,14346440.0
4,,2022-09-03 14:55:42,58.173 sec,4.0,2.0,923.0,14336470.0
5,,2022-09-03 14:55:42,58.388 sec,5.0,2.0,9923.0,14328220.0
6,,2022-09-03 14:55:42,58.595 sec,6.0,2.0,12818.0,14028990.0
7,,2022-09-03 14:55:42,58.815 sec,7.0,2.0,828.0,13903340.0
8,,2022-09-03 14:55:42,59.029 sec,8.0,2.0,56.0,13903210.0
9,,2022-09-03 14:55:43,59.244 sec,9.0,2.0,7.0,13903210.0



See the whole table with table.as_data_frame()




In [16]:
clusters =kmeans.predict(x_embedded_binded).as_data_frame()

kmeans prediction progress: |████████████████████████████████████████████████████| (done) 100%


In [17]:
train_df['clusters'] = clusters['predict'].to_numpy()
train_df.groupby("clusters")['labels'].value_counts()

clusters  labels         
0         normal                  2
1         smurf              280780
          neptune             86554
          normal                160
          satan                  96
          portsweep              18
          teardrop               14
          pod                    12
2         neptune             20457
          normal               4249
          portsweep             729
          ipsweep                85
          back                    8
          warezclient             1
3         normal              76653
          back                 1995
          teardrop              804
          warezclient           651
          pod                    80
          ipsweep                76
          neptune                44
          nmap                   29
          satan                  29
          buffer_overflow        12
          imap                   12
          smurf                  10
          rootkit                 7
  

### Performing KMeans on test set

In [18]:
clusters_test =kmeans.predict(x_embedded_test_binded).as_data_frame()

kmeans prediction progress: |████████████████████████████████████████████████████| (done) 100%




In [19]:
test_df['clusters'] = clusters_test['predict'].to_numpy()
test_df.groupby("clusters")['labels'].value_counts()

clusters  labels         
0         normal                  2
1         smurf              164081
          neptune             17360
          normal                756
          processtable          223
          apache2               142
          mscan                  22
          saint                   8
          ipsweep                 3
          satan                   3
          snmpguess               1
          xsnoop                  1
2         neptune             40621
          apache2               411
          portsweep             219
          mscan                 158
          httptunnel            117
          normal                 94
          named                   4
          imap                    1
3         normal              54867
          snmpgetattack        7704
          mailbomb             4833
          guess_passwd         3596
          snmpguess            2403
          warezmaster          1432
          back                 1089
  

# TODO: visualizations? (possibly) Or scoring 