In [1]:
%pip install --upgrade scikit-learn==1.0 --quiet

Note: you may need to restart the kernel to use updated packages.


In [30]:
%matplotlib inline

In [91]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # viz
import matplotlib.pyplot as plt # viz
from scipy import stats
import json
from typing import List, Tuple

from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score, balanced_accuracy_score, roc_auc_score, precision_recall_fscore_support
from sklearn import metrics, linear_model

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import warnings
warnings.filterwarnings('ignore')

In [92]:
# Load the event log to analyse. Only this single workbook is used by the
# notebook; the test/validation loaders below are kept disabled.
train_df = pd.read_excel('dummy_aws_events.xlsx')
# test_df = pd.read_csv('input/labelled_testing_data.csv')
# validation_df = pd.read_csv('input/labelled_validation_data.csv')

In [93]:
# Replace every missing value (in all columns) with the literal string "NULL".
# NOTE(review): presumably done so that missing categorical values form their
# own category when frequencies are computed later -- confirm. A missing value
# in a numeric column would also become the string "NULL".
train_df = train_df.fillna("NULL")

In [94]:
# FEATURE ENGINEERING FRAMEWORK FOR CATEGORICAL DATA

def numeric_maker(df, cols, retain_cols=[], replace=False):
    """Frequency-encode categorical columns as log relative frequencies.

    For every column name in ``cols`` a new column ``<name>_numeric`` is
    added whose value, per row, is ``log(count / n_rows)`` where ``count`` is
    the number of rows sharing that row's value in the column and ``n_rows``
    is the total number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    cols : iterable of str
        Names of the columns to encode.
    retain_cols : iterable of str, optional
        Columns from ``cols`` to keep when ``replace`` is True.
    replace : bool, optional
        When True, the original columns in ``cols`` (minus ``retain_cols``)
        are dropped from the result.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the ``*_numeric`` columns appended.
    """
    encoded = df.copy()
    total_rows = len(encoded)

    for name in cols:
        # Number of rows that share this row's category value.
        occurrences = encoded.groupby(name)[name].transform('count')
        encoded[name + "_numeric"] = np.log(occurrences / total_rows)

    if not replace:
        return encoded

    to_drop = list(set(cols) - set(retain_cols))
    return encoded.drop(columns=to_drop)

In [95]:
# HELPER FUNCTIONS FOR EVALUATIONS

def metric_printer(y_label, y_pred):
    """Print weighted precision, recall and F1 for anomaly predictions.

    Parameters
    ----------
    y_label : array-like
        Ground-truth labels encoded as 1 (anomaly) / 0 (normal). Values other
        than 0 and 1 are passed through unchanged.
    y_pred : array-like
        Predictions encoded as -1 (anomaly) / 1 (normal), i.e. the convention
        returned by ``IsolationForest.predict``.

    Returns
    -------
    None
        The three metrics are printed, each with three decimals.
    """
    # np.array always copies, so the caller's labels are never modified and
    # plain Python lists are accepted as well as Series/ndarrays.
    y_true = np.array(y_label)

    # Re-encode the labels into the -1/1 convention used by y_pred. Both masks
    # are computed up front so the second remap cannot touch values written by
    # the first one.
    is_anomaly = y_true == 1
    is_normal = y_true == 0
    y_true[is_anomaly] = -1
    y_true[is_normal] = 1

    # pos_label is only honoured for average="binary"; with a weighted average
    # scikit-learn ignores it (and warns), so it is not passed here.
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    print(f'Precision:\t{precision:.3f}')
    print(f'Recall:\t\t{recall:.3f}')
    print(f'F1-Score:\t{f1:.3f}')

def output_roc_plot(y, pred, estimator_name='Isolation Forest'):
    """Plot the ROC curve for a set of scores and show it.

    Parameters
    ----------
    y : array-like
        True binary labels, passed to ``metrics.roc_curve`` as ``y_true``.
    pred : array-like
        Target scores, passed to ``metrics.roc_curve`` as ``y_score``.
    estimator_name : str, optional
        Name shown in the plot legend. Defaults to ``'Isolation Forest'``,
        the value that was previously hard-coded.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    fpr, tpr, _ = metrics.roc_curve(y, pred)
    roc_auc = metrics.auc(fpr, tpr)
    # Named roc_display so it does not shadow IPython's display() helper.
    roc_display = metrics.RocCurveDisplay(
        fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=estimator_name
    )
    roc_display.plot()
    plt.show()

In [96]:
# SELECT AND ENCODE THE FEATURE COLUMNS FOR ANOMALY DETECTION
#
# Columns are selected by name rather than by position: a later cell appends
# prediction columns to train_df, so positional slices such as columns[:-1]
# would silently pull label/prediction columns into the features if this cell
# were re-run.

ID_COL = 'User-id'
NUMERIC_FEATURE_COLS = ['Event Cost']

categorical_cols_no_ids = ['Source-ip', 'Source', 'Region', 'Event Name', 'Access Level']
categorical_cols_ids = [ID_COL] + categorical_cols_no_ids

# user-id is not a feature. To treat it as one, select
# [ID_COL] + feature_cols and encode categorical_cols_ids instead.
feature_cols = categorical_cols_no_ids + NUMERIC_FEATURE_COLS
train_df_feats = numeric_maker(train_df[feature_cols], categorical_cols_no_ids, replace=True)
train_df_feats
# val_df_feats = numeric_maker(val_df_feats, categorical_cols, replace=True)
# test_df_feats = numeric_maker(test_df_feats, categorical_cols, replace=True)

Unnamed: 0,Event Cost,Source-ip_numeric,Source_numeric,Region_numeric,Event Name_numeric,Access Level_numeric
0,0.1,-1.098612,-0.693147,-0.033902,-1.003302,-1.003302
1,0.1,-1.098612,-0.693147,-0.033902,-1.003302,-1.003302
2,0.1,-1.098612,-0.693147,-0.033902,-1.003302,-1.003302
3,3.0,-1.098612,-0.693147,-0.033902,-2.302585,-1.321756
4,0.2,-1.098612,-0.693147,-0.033902,-1.003302,-1.003302
5,0.1,-1.098612,-0.916291,-0.033902,-1.098612,-1.003302
6,0.1,-1.609438,-0.916291,-0.033902,-1.098612,-1.003302
7,0.1,-1.609438,-0.916291,-0.033902,-1.098612,-1.003302
8,0.1,-1.609438,-0.916291,-0.033902,-1.098612,-1.003302
9,0.4,-1.609438,-0.916291,-0.033902,-1.098612,-1.003302


## ISOLATION FOREST

In [97]:
# Frames whose rows make up the data the forests are fitted on. Only the
# training features are available in this notebook.
frames_to_fit = [train_df_feats]
master_data_set_feats = pd.concat(frames_to_fit)
master_data_set_feats

Unnamed: 0,Event Cost,Source-ip_numeric,Source_numeric,Region_numeric,Event Name_numeric,Access Level_numeric
0,0.1,-1.098612,-0.693147,-0.033902,-1.003302,-1.003302
1,0.1,-1.098612,-0.693147,-0.033902,-1.003302,-1.003302
2,0.1,-1.098612,-0.693147,-0.033902,-1.003302,-1.003302
3,3.0,-1.098612,-0.693147,-0.033902,-2.302585,-1.321756
4,0.2,-1.098612,-0.693147,-0.033902,-1.003302,-1.003302
5,0.1,-1.098612,-0.916291,-0.033902,-1.098612,-1.003302
6,0.1,-1.609438,-0.916291,-0.033902,-1.098612,-1.003302
7,0.1,-1.609438,-0.916291,-0.033902,-1.098612,-1.003302
8,0.1,-1.609438,-0.916291,-0.033902,-1.098612,-1.003302
9,0.4,-1.609438,-0.916291,-0.033902,-1.098612,-1.003302


In [98]:
# Fit one Isolation Forest per contamination level (expected share of
# anomalies). fit() returns the estimator itself, so the fitted models can be
# unpacked straight into their names. random_state is fixed for repeatability.
i_forest_clf_c1, i_forest_clf_c2, i_forest_clf_c3, i_forest_clf_c4 = (
    IsolationForest(contamination=level, random_state=0).fit(master_data_set_feats)
    for level in (0.1, 0.3, 0.5, 0.23)
)

In [101]:
# Output column -> fitted model that produces it.
forest_by_column = {
    'Anomalies/Rare Events c=0.1': i_forest_clf_c1,
    'Anomalies/Rare Events c=0.3': i_forest_clf_c2,
    'Anomalies/Rare Events c=0.5': i_forest_clf_c3,
    'Anomalies/Rare Events c=custom': i_forest_clf_c4,
}

for column_name, forest in forest_by_column.items():
    # predict() yields -1 for outliers and 1 for inliers; store 1 for an
    # anomaly and 0 otherwise.
    is_outlier = forest.predict(train_df_feats) == -1
    train_df[column_name] = is_outlier.astype('int64')

In [102]:
# Show the events together with the four anomaly flag columns added above
# (1 = flagged as anomaly/rare event, 0 = not flagged).
train_df

Unnamed: 0,User-id,Source-ip,Source,Region,Event Name,Access Level,Event Cost,Expert Rules,Anomalies/Rare Events c=0.1,Anomalies/Rare Events c=0.3,Anomalies/Rare Events c=0.5,Anomalies/Rare Events c=custom
0,sid,123,s3,ap-south-1,CreateBucket,FullAccess,0.1,1,0,0,0,0
1,sid,123,s3,ap-south-1,CreateBucket,LimitedAccess,0.1,0,0,0,0,0
2,sid,123,s3,ap-south-1,CreateBucket,LimitedAccess,0.1,0,0,0,0,0
3,sid,123,s3,ap-south-1,DeleteObj,,3.0,0,0,1,1,1
4,sid,123,s3,ap-south-1,CreateBucket,FullAccess,0.2,1,0,0,0,0
5,sid,123,lambda,ap-south-1,Invoke,FullAccess,0.1,1,0,0,1,0
6,sid,124,lambda,ap-south-1,Invoke,FullAccess,0.1,1,0,0,0,0
7,sid,124,lambda,ap-south-1,Invoke,LimitedAccess,0.1,0,0,0,0,0
8,sid,124,lambda,ap-south-1,Invoke,LimitedAccess,0.1,0,0,0,0,0
9,sid,124,lambda,ap-south-1,Invoke,LimitedAccess,0.4,0,0,0,0,0


In [103]:
# Persist the annotated events. With pandas' defaults the DataFrame index is
# written as the first column of the sheet.
train_df.to_excel('results_dummy_events.xlsx')