# Training a ML model using CICIoT2023

This notebook shows how a LogisticRegression model can be trained using the CICIoT2023 csv files.

In [1]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from xgboost import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import mlflow
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC,SVC
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score,confusion_matrix,classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support as score

from sklearn.preprocessing import LabelEncoder

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

### Importing Dataset

In [2]:
df=pd.read_csv("part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv")

In [3]:
df=df[:5000]

In [4]:
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
       'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count',
       'syn_count', 'fin_count', 'urg_count', 'rst_count', 
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
       'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
       'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
       'Radius', 'Covariance', 'Variance', 'Weight', 
]
y_column = 'label'

In [32]:
# disable autologging

df=df[:5000]
encoder=LabelEncoder()
y=encoder.fit_transform(df[y_column])
X=df[X_columns]

X_train,X_test, y_train,y_test=train_test_split(X,y,test_size=0.2)
mlflow.xgboost.autolog(disable=True)
version = "v1.0"
data_url = "part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv"
with mlflow.start_run(run_name='XGBoost'):
    mlflow.log_param("data_url",data_url)
    mlflow.log_param("data_version",version)
    mlflow.log_param("input_rows",df.shape[0])
    mlflow.log_param("input_cols",df.shape[1])
    xg = XGBClassifier()
    params = xg.get_params()
    mlflow.set_tag(key= "model", value="XGBClassifier")
    mlflow.log_params(params)
    xg.fit(X_train,y_train)
    train_features_name = f'{X_train=}'.split('=')[0]
    train_label_name = f'{y_train=}'.split('=')[0]
    mlflow.set_tag(key="train_features_name",value= train_features_name)
    mlflow.set_tag(key= "train_label_name",value=train_label_name)
    predicted=xg.predict(X_test)
    precision,recall,fscore,support=score(y_test,predicted,average='macro')
    mlflow.log_metric("Precision_test",precision)
    mlflow.log_metric("Recall_test",recall)
    mlflow.log_metric("F1_score_test",fscore)
    mlflow.xgboost.log_model(xg,artifact_path="ML_models")


In [33]:
df=pd.read_csv("part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv")
df=df[:5000]
encoder=LabelEncoder()
df[y_column]=encoder.fit_transform(df[y_column])
X=df[X_columns]

pipelines = {
    'Random Forest': Pipeline([('RF', RandomForestClassifier())]),
    'SVM': Pipeline([('SVM', SVC())]),
    'Naive Bayes': Pipeline([('NB', MultinomialNB())]),
        'Decision Trie': Pipeline([('DTC', DecisionTreeClassifier())]),
                                'LogisticRegression ': Pipeline([('DTC', LogisticRegression())]),


}
# Définir le nombre de plis pour la validation croisée
n_folds = 5
version="v1"
mlflow.sklearn.autolog(disable=True)
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
for i, (model_name, pipeline) in enumerate(pipelines.items()):
        mlflow.sklearn.autolog(disable=True)
        with mlflow.start_run(run_name=model_name):
                # Log the model name
                mlflow.log_param("model_name", model_name)
                for train_index, test_index in kf.split(df):

                    # Divisez le dataframe en train et test
                            X_train, X_test = df.iloc[train_index], df.iloc[test_index]
                            y_train, y_test = df.iloc[train_index, -1], df.iloc[test_index, -1]
                            mlflow.log_param("data_url",data_url)
                            mlflow.log_param("data_version",version)
                            mlflow.log_param("input_rows",df.shape[0])
                            mlflow.log_param("input_cols",df.shape[1])
                            rf = pipeline.steps[i][1]
                            mlflow.set_tag(key="model", value = "RandomForest")
                            params = rf.get_params()
                            mlflow.log_params(params)
                            rf.fit(X_train,y_train)
                            train_features_name = f'{X_train=}'.split('=')[0]
                            train_label_name = f'{y_train=}'.split('=')[0]
                            mlflow.set_tag(key="train_features_name",value= train_features_name)
                            mlflow.set_tag(key= "train_label_name",value=train_label_name)
                            predicted=rf.predict(X_test)
                            precision,recall,fscore,support=score(y_test,predicted,average='macro')
                            mlflow.log_metric("Precision_test",precision)
                            mlflow.log_metric("Recall_test",recall)
                            mlflow.log_metric("F1_score_test",fscore)
                            mlflow.sklearn.log_model(rf,artifact_path="ML_models")
                            
                            
                           
                         



                       


IndexError: list index out of range

In [14]:
#Reading Pandas Dataframe from mlflow
all_experiments = [exp.experiment_id for exp in mlflow.search_experiments()]
df_mlflow = mlflow.search_runs(experiment_ids=all_experiments,filter_string="metrics.F1_score_test <1")
run_id = df_mlflow.loc[df_mlflow['metrics.F1_score_test'].idxmax()]['run_id']
print(run_id)

63b5cce68f9d49689bf0bef7ad9dd199


In [8]:
df_mlflow

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.F1_score_test,metrics.Recall_test,metrics.Accuracy_test,metrics.Precision_test,...,params.max_cat_threshold,params.validate_parameters,tags.mlflow.runName,tags.model,tags.mlflow.log-model.history,tags.mlflow.user,tags.mlflow.source.type,tags.train_features_name,tags.train_label_name,tags.mlflow.source.name
0,8554fa57e2ae4b76bb0094033bd1b1c6,0,FINISHED,file:///c:/Users/ASUS/Desktop/research_cs/mlru...,2023-10-12 02:40:36.006000+00:00,2023-10-12 02:41:36.169000+00:00,0.052653,0.088444,0.179,0.048772,...,,,LogisticRegression,LogisticRegression,"[{""run_id"": ""8554fa57e2ae4b76bb0094033bd1b1c6""...",ASUS,LOCAL,X_train,y_train,c:\Users\ASUS\.ipython\extensions\envs\myenv\l...
1,e311add786fa45baa81422d22ea91202,0,FINISHED,file:///c:/Users/ASUS/Desktop/research_cs/mlru...,2023-10-12 02:39:44.825000+00:00,2023-10-12 02:40:35.976000+00:00,0.820105,0.819174,0.994,0.83256,...,,,Decision Trie,Decision Trie,"[{""run_id"": ""e311add786fa45baa81422d22ea91202""...",ASUS,LOCAL,X_train,y_train,c:\Users\ASUS\.ipython\extensions\envs\myenv\l...
2,0756607f8e454e41b0f205d3de5090c2,0,FINISHED,file:///c:/Users/ASUS/Desktop/research_cs/mlru...,2023-10-12 02:38:57.786000+00:00,2023-10-12 02:39:44.788000+00:00,0.093726,0.150713,0.178,0.148049,...,,,Naive Bayes,Naive Bayes,"[{""run_id"": ""0756607f8e454e41b0f205d3de5090c2""...",ASUS,LOCAL,X_train,y_train,c:\Users\ASUS\.ipython\extensions\envs\myenv\l...
3,3919aef61e2c4974b7c8a842c6c997bf,0,FINISHED,file:///c:/Users/ASUS/Desktop/research_cs/mlru...,2023-10-12 02:37:26.423000+00:00,2023-10-12 02:38:57.742000+00:00,0.04518,0.08,0.189,0.0359,...,,,SVM,SVM,"[{""run_id"": ""3919aef61e2c4974b7c8a842c6c997bf""...",ASUS,LOCAL,X_train,y_train,c:\Users\ASUS\.ipython\extensions\envs\myenv\l...
4,63b5cce68f9d49689bf0bef7ad9dd199,0,FINISHED,file:///c:/Users/ASUS/Desktop/research_cs/mlru...,2023-10-12 02:36:19.246000+00:00,2023-10-12 02:37:26.401000+00:00,0.886549,0.878205,0.996,0.909016,...,,,Random Forest,Random Forest,"[{""run_id"": ""63b5cce68f9d49689bf0bef7ad9dd199""...",ASUS,LOCAL,X_train,y_train,c:\Users\ASUS\.ipython\extensions\envs\myenv\l...
5,5ec0b0a63eac46f4bcef4873b72864b0,0,FINISHED,file:///c:/Users/ASUS/Desktop/research_cs/mlru...,2023-10-12 02:35:39.961000+00:00,2023-10-12 02:36:16.128000+00:00,0.751117,0.766339,0.981,0.749529,...,,,XGBoost,XGBClassifier,"[{""run_id"": ""5ec0b0a63eac46f4bcef4873b72864b0""...",ASUS,LOCAL,X_train,y_train,c:\Users\ASUS\.ipython\extensions\envs\myenv\l...


In [28]:

#let's call the model from the model registry ( in production stage)
from  mlflow.pyfunc import PyFuncModel

logged_model = f'runs:/{run_id}/ML_models'

# Load model as a PyFuncModel.
# loaded_model = mlflow.pyfunc.load_model(logged_model)
print(loaded_model)
loaded_model = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)


# Predict on a Pandas DataFrame.

# loaded_model.predict(X_test)
if'__predict__' in dir(loaded_model):
    print(True)

mlflow.pyfunc.loaded_model:
  artifact_path: ML_models
  flavor: mlflow.sklearn
  run_id: 63b5cce68f9d49689bf0bef7ad9dd199



NameError: name 'model_info' is not defined

### Classification: 34 (33+1) classes

# Classification: 8 (7+1) classes

In [None]:
dict_7classes = {}
dict_7classes['DDoS-RSTFINFlood'] = 'DDoS'
dict_7classes['DDoS-PSHACK_Flood'] = 'DDoS'
dict_7classes['DDoS-SYN_Flood'] = 'DDoS'
dict_7classes['DDoS-UDP_Flood'] = 'DDoS'
dict_7classes['DDoS-TCP_Flood'] = 'DDoS'
dict_7classes['DDoS-ICMP_Flood'] = 'DDoS'
dict_7classes['DDoS-SynonymousIP_Flood'] = 'DDoS'
dict_7classes['DDoS-ACK_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-UDP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-ICMP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-SlowLoris'] = 'DDoS'
dict_7classes['DDoS-HTTP_Flood'] = 'DDoS'

dict_7classes['DoS-UDP_Flood'] = 'DoS'
dict_7classes['DoS-SYN_Flood'] = 'DoS'
dict_7classes['DoS-TCP_Flood'] = 'DoS'
dict_7classes['DoS-HTTP_Flood'] = 'DoS'


dict_7classes['Mirai-greeth_flood'] = 'Mirai'
dict_7classes['Mirai-greip_flood'] = 'Mirai'
dict_7classes['Mirai-udpplain'] = 'Mirai'

dict_7classes['Recon-PingSweep'] = 'Recon'
dict_7classes['Recon-OSScan'] = 'Recon'
dict_7classes['Recon-PortScan'] = 'Recon'
dict_7classes['VulnerabilityScan'] = 'Recon'
dict_7classes['Recon-HostDiscovery'] = 'Recon'

dict_7classes['DNS_Spoofing'] = 'Spoofing'
dict_7classes['MITM-ArpSpoofing'] = 'Spoofing'

dict_7classes['BenignTraffic'] = 'Benign'

dict_7classes['BrowserHijacking'] = 'Web'
dict_7classes['Backdoor_Malware'] = 'Web'
dict_7classes['XSS'] = 'Web'
dict_7classes['Uploading_Attack'] = 'Web'
dict_7classes['SqlInjection'] = 'Web'
dict_7classes['CommandInjection'] = 'Web'


dict_7classes['DictionaryBruteForce'] = 'BruteForce'

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


ML_models = [
        LogisticRegression(n_jobs=-1),
]

ML_neams = [
        "LogisticRegression",
]


for train_set in tqdm(training_sets):
    d = pd.read_csv("part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv")
    d[X_columns] = scaler.transform(d[X_columns])
    new_y = [dict_7classes[k] for k in d[y_column]]
    d[y_column] = new_y
    
    for model in (ML_models):
        model.fit(d[X_columns], d[y_column])
    del d

In [None]:
y_test = []
preds = {i:[] for i in range(len(ML_models))}
for test_set in tqdm(test_sets):
    d_test = pd.read_csv("part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv")
    d_test[X_columns] = scaler.transform(d_test[X_columns])
    new_y = [dict_7classes[k] for k in d_test[y_column]]
    d_test[y_column] = new_y
    
    y_test += list(d_test[y_column].values)
    
    for i in range(len(ML_models)):
        model = ML_models[i]
        y_pred = list(model.predict(d_test[X_columns]))
        preds[i] = preds[i] + y_pred
        

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
for k,v in preds.items():
    y_pred = v
    print(f"##### {ML_neams[k]} (8 classes) #####")
    print('accuracy_score = ', accuracy_score(y_pred, y_test))
    print('recall_score = ', recall_score(y_pred, y_test, average='macro'))
    print('precision_score = ', precision_score(y_pred, y_test, average='macro'))
    print('f1_score = ', f1_score(y_pred, y_test, average='macro'))
    print()
    print()
    print()

# Classification: 2 (1+1) Classes

In [None]:
dict_2classes = {}
dict_2classes['DDoS-RSTFINFlood'] = 'Attack'
dict_2classes['DDoS-PSHACK_Flood'] = 'Attack'
dict_2classes['DDoS-SYN_Flood'] = 'Attack'
dict_2classes['DDoS-UDP_Flood'] = 'Attack'
dict_2classes['DDoS-TCP_Flood'] = 'Attack'
dict_2classes['DDoS-ICMP_Flood'] = 'Attack'
dict_2classes['DDoS-SynonymousIP_Flood'] = 'Attack'
dict_2classes['DDoS-ACK_Fragmentation'] = 'Attack'
dict_2classes['DDoS-UDP_Fragmentation'] = 'Attack'
dict_2classes['DDoS-ICMP_Fragmentation'] = 'Attack'
dict_2classes['DDoS-SlowLoris'] = 'Attack'
dict_2classes['DDoS-HTTP_Flood'] = 'Attack'

dict_2classes['DoS-UDP_Flood'] = 'Attack'
dict_2classes['DoS-SYN_Flood'] = 'Attack'
dict_2classes['DoS-TCP_Flood'] = 'Attack'
dict_2classes['DoS-HTTP_Flood'] = 'Attack'


dict_2classes['Mirai-greeth_flood'] = 'Attack'
dict_2classes['Mirai-greip_flood'] = 'Attack'
dict_2classes['Mirai-udpplain'] = 'Attack'

dict_2classes['Recon-PingSweep'] = 'Attack'
dict_2classes['Recon-OSScan'] = 'Attack'
dict_2classes['Recon-PortScan'] = 'Attack'
dict_2classes['VulnerabilityScan'] = 'Attack'
dict_2classes['Recon-HostDiscovery'] = 'Attack'

dict_2classes['DNS_Spoofing'] = 'Attack'
dict_2classes['MITM-ArpSpoofing'] = 'Attack'

dict_2classes['BenignTraffic'] = 'Benign'

dict_2classes['BrowserHijacking'] = 'Attack'
dict_2classes['Backdoor_Malware'] = 'Attack'
dict_2classes['XSS'] = 'Attack'
dict_2classes['Uploading_Attack'] = 'Attack'
dict_2classes['SqlInjection'] = 'Attack'
dict_2classes['CommandInjection'] = 'Attack'

dict_2classes['DictionaryBruteForce'] = 'Attack'

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


ML_models = [
        LogisticRegression(n_jobs=-1),
]

ML_neams = [
        "LogisticRegression",
]


for train_set in tqdm(training_sets):
    d = pd.read_csv( "part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv" )
    d[X_columns] = scaler.transform(d[X_columns])
    new_y = [dict_2classes[k] for k in d[y_column]]
    d[y_column] = new_y
    
    for model in (ML_models):
        model.fit(d[X_columns], d[y_column])
    del d

In [None]:
y_test = []
preds = {i:[] for i in range(len(ML_models))}
for test_set in tqdm(test_sets):
    d_test = pd.read_csv( "part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv")
    d_test[X_columns] = scaler.transform(d_test[X_columns])
    new_y = [dict_2classes[k] for k in d_test[y_column]]
    d_test[y_column] = new_y
    
    y_test += list(d_test[y_column].values)
    
    for i in range(len(ML_models)):
        model = ML_models[i]
        y_pred = list(model.predict(d_test[X_columns]))
        preds[i] = preds[i] + y_pred
        

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
for k,v in preds.items():
    y_pred = v
    print(f"##### {ML_neams[k]} (2 classes) #####")
    print('accuracy_score: ', accuracy_score(y_pred, y_test))
    print('recall_score: ', recall_score(y_pred, y_test, average='macro'))
    print('precision_score: ', precision_score(y_pred, y_test, average='macro'))
    print('f1_score: ', f1_score(y_pred, y_test, average='macro'))
    print()
    print()
    print()

In [None]:
correlation_matrix = df.corr()

sns.heatmap(data=correlation_matrix)

In [None]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Create a PCA object
pca = PCA()

# Create a Logistic Regression classifier
logistic_reg = LogisticRegression()

# Create a pipeline with PCA and the classifier
from sklearn.pipeline import Pipeline
pipe = Pipeline(steps=[('pca', pca), ('logistic_reg', logistic_reg)])

# Define the parameter grid for PCA and Logistic Regression
param_grid = {
    'pca__n_components': [1, 2, 3],  # Number of principal components to keep
    'logistic_reg__C': [0.1, 1, 10],
    'logistic_reg__penalty': ['l1', 'l2']
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid_search.fit(df[X_columns], df[y_column])

# Get the best parameters and estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print("Best Parameters:", best_params)
print("Best Estimator:", best_estimator)

# Fit the best estimator on the data
best_estimator.fit(d[X_columns], d[y_column])

# Transform the data with PCA
X_pca = best_estimator.named_steps['pca'].transform(X)

# You can use X_pca for further analysis or modeling with Logistic Regression
