### GridSearch using Decision Trees, Random Forest, XGBoost

In [115]:
import pandas as pd
from sklearn.datasets import fetch_kddcup99

In [136]:
#get cyberattack data from samples
d_bunch = fetch_kddcup99(as_frame = True)

In [137]:
#extract data
Xdf = d_bunch.data
y = d_bunch.target

In [138]:
#check target values
y.value_counts()

b'smurf.'              280790
b'neptune.'            107201
b'normal.'              97278
b'back.'                 2203
b'satan.'                1589
b'ipsweep.'              1247
b'portsweep.'            1040
b'warezclient.'          1020
b'teardrop.'              979
b'pod.'                   264
b'nmap.'                  231
b'guess_passwd.'           53
b'buffer_overflow.'        30
b'land.'                   21
b'warezmaster.'            20
b'imap.'                   12
b'rootkit.'                10
b'loadmodule.'              9
b'ftp_write.'               8
b'multihop.'                7
b'phf.'                     4
b'perl.'                    3
b'spy.'                     2
Name: labels, dtype: int64

In [139]:
#change target to 1s (anything normal) and 0s
y = y.apply(lambda x: 0 if x == b'normal.' else 1)

In [140]:
#verify counts
y.value_counts()

1    396743
0     97278
Name: labels, dtype: int64

In [141]:
#check df columsn
Xdf.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate'],
      dtype='object')

In [142]:
#check data values
Xdf.protocol_type.value_counts()

b'icmp'    283602
b'tcp'     190065
b'udp'      20354
Name: protocol_type, dtype: int64

In [143]:
#check data values
Xdf.service.value_counts()

b'ecr_i'      281400
b'private'    110893
b'http'        64293
b'smtp'         9723
b'other'        7237
               ...  
b'X11'            11
b'tim_i'           7
b'pm_dump'         1
b'tftp_u'          1
b'red_i'           1
Name: service, Length: 66, dtype: int64

In [144]:
#check data values
Xdf.flag.value_counts()

b'SF'        378440
b'S0'         87007
b'REJ'        26875
b'RSTR'         903
b'RSTO'         579
b'SH'           107
b'S1'            57
b'S2'            24
b'RSTOS0'        11
b'S3'            10
b'OTH'            8
Name: flag, dtype: int64

In [145]:
#drop two of the catagory columns and one-hot-encode protocol_type
Xdf_drop = Xdf.drop(['service','flag'], axis=1)
Xdf_one_hot = pd.get_dummies(Xdf_drop, columns = ['protocol_type'])

In [146]:
from sklearn.model_selection import train_test_split

In [147]:
#split the data to train, validataion, test (60/20/20)
X_train, X_test, y_train, y_test = train_test_split(Xdf_one_hot, y, test_size=0.2, random_state=34)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=34)

In [148]:
#verify counts
X_train.shape, X_val.shape, X_test.shape

((296412, 41), (98804, 41), (98805, 41))

### Decision Tree Classifier

In [81]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [82]:
#find best model usng gridsearch
dt_params = {'max_features':[0.8, 0.85, 0.9, 0.95], 'min_samples_split': [2, 4, 6, 8]}
dt = GridSearchCV(DecisionTreeClassifier(), dt_params, cv=5)
dt.fit(X_train, y_train)

In [83]:
#check accuracy
roc_auc_score(y_val, dt.predict(X_val))

0.9993964834186522

### Random Forest Classifier

In [84]:
from sklearn.ensemble import RandomForestClassifier

In [85]:
#find best model using gridsearch
rf_params = {'max_features':[0.8, 0.85, 0.9, 0.95]}
rf = GridSearchCV(RandomForestClassifier(), rf_params, cv=5)
rf.fit(X_train, y_train)

In [87]:
#check accuracy
roc_auc_score(y_val, rf.predict(X_val))

0.9998665153531954

### XGBoost

In [91]:
from xgboost import XGBClassifier

In [109]:
xgbX_train = X_train.astype(float)
xgbX_val = X_val.astype(float)

In [111]:
xgbc = XGBClassifier(objective='binary:logistic',
                          booster='gbtree',
                          eval_metric='auc',
                          tree_method='hist',
                          grow_policy='lossguide',
                          use_label_encoder=False)
xgbc.fit(xgbX_train , y_train)

In [113]:
roc_auc_score(y_val, xgbc.predict(xgbX_val))

0.9998406782304174

In [151]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Define the models
models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier()
}

# Train and evaluate the models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_pred_proba)
    print(f"Model: {name}\nROC-AUC Score: {auc_score}\n")

Model: Logistic Regression
ROC-AUC Score: 0.9965197915019902

Model: Decision Tree
ROC-AUC Score: 0.9995397660108738

