In [29]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import evalml
import time

In [30]:
df = pd.read_csv('wustl-ehms-2020_with_attacks_categories.csv')
df.head()

Unnamed: 0,Dir,Flgs,SrcAddr,DstAddr,Sport,Dport,SrcBytes,DstBytes,SrcLoad,DstLoad,...,Temp,SpO2,Pulse_Rate,SYS,DIA,Heart_rate,Resp_Rate,ST,Attack Category,Label
0,->,e,10.0.1.172,10.0.1.150,58059,1111,496,186,276914.0,92305.0,...,28.9,0,0,0,0,0,0,0.0,normal,0
1,->,e,10.0.1.172,10.0.1.150,58062,1111,496,186,230984.0,76995.0,...,28.9,0,0,0,0,78,17,0.4,normal,0
2,->,e,10.0.1.172,10.0.1.150,58065,1111,496,186,218470.0,72823.0,...,28.9,89,104,0,0,78,17,0.4,normal,0
3,->,e,10.0.1.172,10.0.1.150,58067,1111,496,186,203376.0,67792.0,...,28.9,89,104,0,0,79,17,0.4,normal,0
4,->,e,10.0.1.172,10.0.1.150,58069,1111,496,186,235723.0,78574.0,...,28.9,89,101,0,0,79,17,0.4,normal,0


In [31]:
df['Attack Category'].value_counts()

Attack Category
normal             14272
Spoofing            1124
Data Alteration      922
Name: count, dtype: int64

In [32]:
df.shape

(16318, 45)

In [33]:
df=df.sample(frac=1)
df.reset_index(drop=True,inplace=True)

In [34]:
df = df.loc[:, df.nunique() > 1]
df.shape

(16318, 35)

In [35]:
df.head(10)

Unnamed: 0,Flgs,Sport,SrcBytes,DstBytes,SrcLoad,DstLoad,SIntPkt,DIntPkt,SIntPktAct,SrcJitter,...,Temp,SpO2,Pulse_Rate,SYS,DIA,Heart_rate,Resp_Rate,ST,Attack Category,Label
0,e,51201,496,186,260915.0,86972.0,3.802,2.3865,0.0,2.831279,...,26.8,99,72,142,72,71,26,0.36,normal,0
1,e,65438,496,186,253557.0,84519.0,3.912333,2.383,0.0,2.153349,...,27.1,98,81,144,78,89,21,0.62,normal,0
2,e,51728,496,186,335816.0,111939.0,2.954,1.9825,0.0,2.136191,...,26.4,96,72,134,79,76,16,0.16,normal,0
3,e,64565,496,186,358468.0,119489.0,2.767333,1.682,0.0,2.07748,...,26.6,98,73,148,84,73,19,0.3,normal,0
4,e,51087,496,186,312901.0,104300.0,3.170333,1.833,0.0,2.441824,...,26.8,99,72,142,72,71,26,0.36,normal,0
5,e,60740,496,186,210392.0,70131.0,4.715,3.138,0.0,3.416618,...,26.7,98,87,143,82,86,15,0.16,normal,0
6,e,51064,496,186,274843.0,91614.0,3.609333,2.462,0.0,2.586163,...,27.2,97,82,145,87,74,20,0.0,normal,0
7,e,62897,496,186,230859.0,76953.0,4.297,2.9715,0.0,3.074972,...,27.0,98,80,129,76,84,27,0.14,Spoofing,1
8,e,64309,496,186,284431.0,94810.0,3.487667,1.7625,0.0,2.865746,...,27.9,98,73,148,84,73,19,0.3,normal,0
9,e,60401,496,186,248975.0,82992.0,3.984333,2.526,0.0,2.950836,...,26.7,99,72,142,72,71,26,0.36,normal,0


In [36]:
X = df.drop(['Attack Category','Label'],axis=1)
y = df['Attack Category']
y.value_counts()

Attack Category
normal             14272
Spoofing            1124
Data Alteration      922
Name: count, dtype: int64

In [37]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16318 entries, 0 to 16317
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Flgs        16318 non-null  object 
 1   Sport       16318 non-null  object 
 2   SrcBytes    16318 non-null  int64  
 3   DstBytes    16318 non-null  int64  
 4   SrcLoad     16318 non-null  float64
 5   DstLoad     16318 non-null  float64
 6   SIntPkt     16318 non-null  float64
 7   DIntPkt     16318 non-null  float64
 8   SIntPktAct  16318 non-null  float64
 9   SrcJitter   16318 non-null  float64
 10  DstJitter   16318 non-null  float64
 11  sMaxPktSz   16318 non-null  int64  
 12  dMaxPktSz   16318 non-null  int64  
 13  sMinPktSz   16318 non-null  int64  
 14  Dur         16318 non-null  float64
 15  TotPkts     16318 non-null  int64  
 16  TotBytes    16318 non-null  int64  
 17  Load        16318 non-null  float64
 18  Loss        16318 non-null  int64  
 19  pLoss       16318 non-nul

In [38]:
non_numeric_features = X.select_dtypes(exclude=['number']).columns.tolist()
non_numeric_features

['Flgs', 'Sport', 'SrcMac']

In [39]:
X[non_numeric_features]

Unnamed: 0,Flgs,Sport,SrcMac
0,e,51201,84:3a:4b:0f:5b:94
1,e,65438,84:3a:4b:0f:5b:94
2,e,51728,84:3a:4b:0f:5b:94
3,e,64565,84:3a:4b:0f:5b:94
4,e,51087,84:3a:4b:0f:5b:94
...,...,...,...
16313,e,57181,84:3a:4b:0f:5b:94
16314,e,64786,84:3a:4b:0f:5b:94
16315,e,62056,84:3a:4b:0f:5b:94
16316,M,57947,d8:9e:f3:95:02:75


In [40]:
le = LabelEncoder()
X['Sport'] = le.fit_transform(X['Sport'])

In [41]:
y = pd.Series(le.fit_transform(y))

In [42]:
X.drop(['Flgs','SrcMac'],inplace=True,axis=1)
X.shape

(16318, 31)

In [43]:
y.value_counts()

2    14272
1     1124
0      922
Name: count, dtype: int64

In [44]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_scaled = sc.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [45]:
X_train,X_test,y_train,y_test = train_test_split(np.array(X),y,test_size=0.2,stratify = y)

In [46]:
import time
import tracemalloc
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, mean_absolute_error
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Assuming X_train, X_test, y_train, y_test are already defined

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Perceptron": Perceptron(),
    "Simple MLP": MLPClassifier(max_iter=1000),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier(),
    "SGDClassifier": SGDClassifier()
}

# Initialize results dictionary
results = {
    "Model": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1-Score": [],
    "ROC AUC Score": [],
    "MAE": [],
    "Training Time (s)": [],
    "Memory Used (MB)": []
}

# Train and evaluate each model
for model_name, model in models.items():
    print(f"Training {model_name}...")
    
    # Start tracking memory and time
    tracemalloc.start() 
    start_time = time.time()
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    try:
        y_pred_proba = model.predict_proba(X_test)
    except Exception as e:
        print(e)
        pass
    
    # Stop tracking memory and time
    training_time = time.time() - start_time
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='weighted')
    roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr') if y_pred_proba is not None else np.nan
    mae = mean_absolute_error(y_test, y_pred)
     
    # Append results
    results["Model"].append(model_name)
    results["Accuracy"].append(accuracy)
    results["Precision"].append(precision)
    results["Recall"].append(recall)
    results["F1-Score"].append(f1)
    results["ROC AUC Score"].append(roc_auc)
    results["MAE"].append(mae)
    results["Training Time (s)"].append(training_time)
    results["Memory Used (MB)"].append(peak / 10**6)  # Convert to MB


Training Logistic Regression...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training Random Forest...
Training Decision Tree...
Training Perceptron...
'Perceptron' object has no attribute 'predict_proba'
Training Simple MLP...
Training SVM...
Training KNN...
Training SGDClassifier...
This 'SGDClassifier' has no attribute 'predict_proba'


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [47]:
# Convert results to DataFrame
results_df = pd.DataFrame(results)
print(results_df)

                 Model  Accuracy  Precision    Recall  F1-Score  \
0  Logistic Regression  0.928615   0.864717  0.662343  0.895491   
1        Random Forest  0.938419   0.933304  0.711115  0.918025   
2        Decision Tree  0.959865   0.961797  0.907290  0.960708   
3           Perceptron  0.885723   0.868313  0.654560  0.876798   
4           Simple MLP  0.944853   0.936734  0.770181  0.934611   
5                  SVM  0.929534   0.934536  0.663728  0.897297   
6                  KNN  0.943015   0.935273  0.793385  0.936973   
7        SGDClassifier  0.929228   0.865301  0.664271  0.896089   

   ROC AUC Score       MAE  Training Time (s)  Memory Used (MB)  
0       0.827341  0.073836           0.998483          0.899439  
1       0.946563  0.062500           6.184080          2.774870  
2       0.926712  0.041054           0.380049          2.172972  
3       0.926712  0.117034           0.222678          0.333142  
4       0.961530  0.055760         100.112719          2.806140  


In [48]:
from sklearn.ensemble import HistGradientBoostingClassifier
import time
base_clf = HistGradientBoostingClassifier()
stime=time.time()
base_clf.fit(X_train,y_train)
print('Time is:',time.time()-stime)

Time is: 2.3150463104248047
