In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import evalml
import time

In [2]:
df = pd.read_csv('wustl-ehms-2020_with_attacks_categories.csv')
df.head()

Unnamed: 0,Dir,Flgs,SrcAddr,DstAddr,Sport,Dport,SrcBytes,DstBytes,SrcLoad,DstLoad,...,Temp,SpO2,Pulse_Rate,SYS,DIA,Heart_rate,Resp_Rate,ST,Attack Category,Label
0,->,e,10.0.1.172,10.0.1.150,58059,1111,496,186,276914.0,92305.0,...,28.9,0,0,0,0,0,0,0.0,normal,0
1,->,e,10.0.1.172,10.0.1.150,58062,1111,496,186,230984.0,76995.0,...,28.9,0,0,0,0,78,17,0.4,normal,0
2,->,e,10.0.1.172,10.0.1.150,58065,1111,496,186,218470.0,72823.0,...,28.9,89,104,0,0,78,17,0.4,normal,0
3,->,e,10.0.1.172,10.0.1.150,58067,1111,496,186,203376.0,67792.0,...,28.9,89,104,0,0,79,17,0.4,normal,0
4,->,e,10.0.1.172,10.0.1.150,58069,1111,496,186,235723.0,78574.0,...,28.9,89,101,0,0,79,17,0.4,normal,0


In [3]:
df['Attack Category'].value_counts()

Attack Category
normal             14272
Spoofing            1124
Data Alteration      922
Name: count, dtype: int64

In [4]:
df.shape

(16318, 45)

In [5]:
df=df.sample(frac=1)
df.reset_index(drop=True,inplace=True)

In [6]:
df = df.loc[:, df.nunique() > 1]
df.shape

(16318, 35)

In [7]:
df.head(10)

Unnamed: 0,Flgs,Sport,SrcBytes,DstBytes,SrcLoad,DstLoad,SIntPkt,DIntPkt,SIntPktAct,SrcJitter,...,Temp,SpO2,Pulse_Rate,SYS,DIA,Heart_rate,Resp_Rate,ST,Attack Category,Label
0,e,65191,496,186,231613.0,77204.0,4.283,3.27,0.0,3.026486,...,28.4,98,70,148,84,61,0,0.24,normal,0
1,e,63180,496,186,226157.0,75386.0,4.386333,2.6175,0.0,3.311422,...,26.9,98,73,148,84,73,19,0.3,normal,0
2,e,56356,496,186,250105.0,83368.0,3.966333,2.952,0.0,2.802078,...,27.3,98,73,148,84,73,19,0.3,normal,0
3,e,50842,496,186,224113.0,74704.0,4.426333,1.7015,0.0,4.149179,...,27.1,97,96,145,87,87,8,0.08,normal,0
4,e,56229,496,186,254098.0,84699.0,3.904,2.905,0.0,2.7575,...,27.7,98,73,142,72,74,15,0.1,normal,0
5,e,50234,496,186,285605.0,95202.0,3.473333,2.2565,0.0,2.533108,...,26.3,96,71,138,83,71,13,0.14,normal,0
6,e,52322,496,186,93245.0,31082.0,10.638667,7.5745,0.0,4.339137,...,27.2,97,80,145,87,82,23,0.2,normal,0
7,e,51487,496,186,99485.0,33162.0,9.971333,2.904,0.0,10.145375,...,26.4,98,74,134,79,73,17,0.1,normal,0
8,e,60318,496,186,221940.0,73980.0,4.469667,2.3845,0.0,3.069871,...,26.8,98,88,143,82,87,24,0.34,normal,0
9,e,56453,496,186,195109.0,65036.0,5.084333,4.0785,0.0,3.610842,...,27.6,98,73,148,84,73,19,0.3,normal,0


In [8]:
X = df.drop(['Attack Category','Label'],axis=1)
y = df['Label']
y.value_counts()

Label
0    14272
1     2046
Name: count, dtype: int64

In [9]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16318 entries, 0 to 16317
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Flgs        16318 non-null  object 
 1   Sport       16318 non-null  object 
 2   SrcBytes    16318 non-null  int64  
 3   DstBytes    16318 non-null  int64  
 4   SrcLoad     16318 non-null  float64
 5   DstLoad     16318 non-null  float64
 6   SIntPkt     16318 non-null  float64
 7   DIntPkt     16318 non-null  float64
 8   SIntPktAct  16318 non-null  float64
 9   SrcJitter   16318 non-null  float64
 10  DstJitter   16318 non-null  float64
 11  sMaxPktSz   16318 non-null  int64  
 12  dMaxPktSz   16318 non-null  int64  
 13  sMinPktSz   16318 non-null  int64  
 14  Dur         16318 non-null  float64
 15  TotPkts     16318 non-null  int64  
 16  TotBytes    16318 non-null  int64  
 17  Load        16318 non-null  float64
 18  Loss        16318 non-null  int64  
 19  pLoss       16318 non-nul

In [11]:
non_numeric_features = X.select_dtypes(exclude=['number']).columns.tolist()
non_numeric_features

['Flgs', 'Sport', 'SrcMac']

In [12]:
X[non_numeric_features]

Unnamed: 0,Flgs,Sport,SrcMac
0,e,65191,84:3a:4b:0f:5b:94
1,e,63180,84:3a:4b:0f:5b:94
2,e,56356,84:3a:4b:0f:5b:94
3,e,50842,84:3a:4b:0f:5b:94
4,e,56229,84:3a:4b:0f:5b:94
...,...,...,...
16313,M,64312,d8:9e:f3:95:02:75
16314,e,64910,84:3a:4b:0f:5b:94
16315,e,59660,d8:9e:f3:95:02:75
16316,e,57513,84:3a:4b:0f:5b:94


In [13]:
le = LabelEncoder()
X['Sport'] = le.fit_transform(X['Sport'])

In [14]:
X.drop(['Flgs','SrcMac'],inplace=True,axis=1)
X.shape

(16318, 31)

In [15]:
y.value_counts()

Label
0    14272
1     2046
Name: count, dtype: int64

In [17]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_scaled = sc.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [18]:
X_train,X_test,y_train,y_test = train_test_split(np.array(X),y,test_size=0.2,stratify = y)

In [21]:
import time
import tracemalloc
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, mean_absolute_error
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Assuming X_train, X_test, y_train, y_test are already defined

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Perceptron": Perceptron(),
    "Simple MLP": MLPClassifier(max_iter=1000),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier(),
    "SGDClassifier": SGDClassifier()
}

# Initialize results dictionary
results = {
    "Model": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1-Score": [],
    "ROC AUC Score": [],
    "MAE": [],
    "Training Time (s)": [],
    "Memory Used (MB)": []
}

# Train and evaluate each model
for model_name, model in models.items():
    print(f"Training {model_name}...")
    
    # Start tracking memory and time
    tracemalloc.start()
    start_time = time.time()
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    # Stop tracking memory and time
    training_time = time.time() - start_time
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='weighted')
    roc_auc = roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else np.nan
    mae = mean_absolute_error(y_test, y_pred)
    
    # Append results
    results["Model"].append(model_name)
    results["Accuracy"].append(accuracy)
    results["Precision"].append(precision)
    results["Recall"].append(recall)
    results["F1-Score"].append(f1)
    results["ROC AUC Score"].append(roc_auc)
    results["MAE"].append(mae)
    results["Training Time (s)"].append(training_time)
    results["Memory Used (MB)"].append(peak / 10**6)  # Convert to MB


Training Logistic Regression...
Training Random Forest...
Training Decision Tree...
Training Perceptron...
Training Simple MLP...
Training SVM...
Training KNN...
Training SGDClassifier...


In [22]:
# Convert results to DataFrame
results_df = pd.DataFrame(results)
print(results_df)

                 Model  Accuracy  Precision    Recall  F1-Score  \
0  Logistic Regression  0.925551   0.924832  0.720739  0.914091   
1        Random Forest  0.940870   0.941169  0.777674  0.934126   
2        Decision Tree  0.975490   0.975543  0.945142  0.975516   
3           Perceptron  0.695159   0.844167  0.670735  0.744130   
4           Simple MLP  0.948223   0.946450  0.824819  0.944728   
5                  SVM  0.927390   0.928504  0.721790  0.915762   
6                  KNN  0.945772   0.943355  0.842271  0.943737   
7        SGDClassifier  0.925245   0.923977  0.721611  0.913965   

   ROC AUC Score       MAE  Training Time (s)  Memory Used (MB)  
0       0.787296  0.074449           0.134671          0.474484  
1       0.951639  0.059130           6.126889          2.768948  
2       0.945142  0.024510           0.385904          2.329558  
3            NaN  0.304841           0.045876          0.320538  
4       0.953790  0.051777          73.928039          2.803677  


In [26]:
from sklearn.ensemble import HistGradientBoostingClassifier
import time
base_clf = HistGradientBoostingClassifier()
stime=time.time()
base_clf.fit(X_train,y_train)
print('Time is:',time.time()-stime)

Time is: 0.868302583694458
