In [1]:
import pandas as pd
pd.options.display.max_columns = None
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from xgboost import XGBClassifier
import mlflow
import datetime
import warnings
warnings.filterwarnings("ignore")

In [2]:
version = "v1.0"
data_url = "part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv"


In [3]:
import sys
sys.path.insert(0, '../backend/src')



In [4]:
#read the data
df = pd.read_csv("part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv")

In [5]:
#shape of the data
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
       'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count',
       'syn_count', 'fin_count', 'urg_count', 'rst_count', 
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
       'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
       'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
       'Radius', 'Covariance', 'Variance', 'Weight', 
]
y_column = 'label'


df=df[:5000]
encoder=LabelEncoder()
y=encoder.fit_transform(df[y_column])
X=df[X_columns]

# 2. Machine Learning Modeling :

## 1. Logistic Regression :
Logistic Regression : It is used in classification use cases where we
want to predict a discrete target ( for example whether something is true
or false). It catches the relationship between the target variable ( class
column in our case ) and the independant features ( clump thickness ,
sizeUniformity .. ) by fitting our data with the Sigmoid function.

In [6]:
from sklearn.model_selection import train_test_split
X_resampled,X_test,y_resampled,y_test=train_test_split(X,y,test_size=0.2)

In [7]:
mlflow.sklearn.autolog(disable=True)

In [8]:

with mlflow.start_run(run_name='LogisticRegression'):
    mlflow.log_param("data_url",data_url)
    mlflow.log_param("data_version",version)
    mlflow.log_param("input_rows",df.shape[0])
    mlflow.log_param("input_cols",df.shape[1])
    #model fitting and training
    lr=LogisticRegression()
    mlflow.set_tag(key= "model",value="LogisticRegression")
    lr.fit(X_resampled,y_resampled)
    train_features_name = f'{X_resampled=}'.split('=')[0]
    train_label_name = f'{y_resampled=}'.split('=')[0]
    mlflow.set_tag(key="train_features_name",value= train_features_name)
    mlflow.set_tag(key= "train_label_name",value=train_label_name)
    predicted=lr.predict(X_test)
    precision,recall,fscore,support=score(y_test,predicted,average='macro')
    mlflow.log_metric("Precision_test",precision)
    mlflow.log_metric("Recall_test",recall)
    mlflow.log_metric("F1_score_test",fscore)
    mlflow.sklearn.log_model(lr,artifact_path="ML_models")


In [9]:
lr=LogisticRegression()
lr.fit(X_resampled,y_resampled)
predicted=lr.predict(X_test)

In [10]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import average_precision_score
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance, to_graphviz

## Model evaluation :
### Evaluation metrics:
* **Precision:**
Precision is one indicator of the quality of the positive predictions
made by the model. **Of all the positive predictions I made, how many of them are truly positive?** \
Precision is defined as follows:

**Remark:**
 * A true positive (TP) is an outcome where the model correctly
predicts the positive class. Similarly, a true negative (TN) is
an outcome where the model correctly predicts the negative class.
 * A false positive (FP) is an outcome where the model incorrectly
predicts the positive class. And a false negative (FN) is an
outcome where the model incorrectly predicts the negative class.

* **Recall:**
Recall, a commonly used performance metric for classification models,
is the fraction of positives that are correctly classified,**Of all the actual positive examples out there, how many of them did I correctly predict to be positive?**

* **F1-score:** To evaluate model performance comprehensively, we should examine both precision and recall. The F1 score serves as a helpful metric that considers both of them.

* **Accuracy:**
    Accuracy is the number of correctly predicted data points out of all the data points.

**To evaluate fraud detection models, the concepts of recall and precision are very important.Recall  meaures how many fraud cases where trully detected while precision evaluates how good the model is at generating as fewer false alarms as possible. For fraud detection, we want to prioritize high recall to leave out as few fraud cases as possible while also having a relatively high precision because too many false alarms can also be a problem!**

In [11]:
print('Classification report:\n', classification_report(y_test, predicted))
conf_mat = confusion_matrix(y_true=y_test, y_pred=predicted)
print('Confusion matrix:\n', conf_mat)
print('Share of Non-Fraud in Test Data:', 1-round(y_test.sum()/len(y_test),4))

Classification report:
               precision    recall  f1-score   support

           1       0.57      0.60      0.59        20
           4       0.00      0.00      0.00         8
           5       0.00      0.00      0.00         1
           6       0.16      1.00      0.28       152
           7       0.00      0.00      0.00        16
           8       0.00      0.00      0.00        98
           9       0.00      0.00      0.00        88
          10       0.00      0.00      0.00        92
          12       0.00      0.00      0.00        86
          13       0.00      0.00      0.00        98
          14       0.00      0.00      0.00       110
          15       0.00      0.00      0.00         8
          16       0.00      0.00      0.00         6
          17       0.00      0.00      0.00         2
          18       0.00      0.00      0.00        35
          19       0.00      0.00      0.00        46
          20       0.00      0.00      0.00        68
   

**As we can see above the recall is around 75% while precision is just 6% which means there is a lot of false positives to handle.**

texte en italique## 2. Random forest:

In [12]:
# disable autologging
mlflow.sklearn.autolog(disable=True)

In [13]:

with mlflow.start_run(run_name='RandomForest'):
    mlflow.log_param("data_url",data_url)
    mlflow.log_param("data_version",version)
    mlflow.log_param("input_rows",df.shape[0])
    mlflow.log_param("input_cols",df.shape[1])
    rf = RandomForestClassifier(random_state=5)
    mlflow.set_tag(key="model", value = "RandomForest")
    params = rf.get_params()
    mlflow.log_params(params)
    rf.fit(X_resampled,y_resampled)
    train_features_name = f'{X_resampled=}'.split('=')[0]
    train_label_name = f'{y_resampled=}'.split('=')[0]
    mlflow.set_tag(key="train_features_name",value= train_features_name)
    mlflow.set_tag(key= "train_label_name",value=train_label_name)
    predicted=rf.predict(X_test)
    precision,recall,fscore,support=score(y_test,predicted,average='macro')
    mlflow.log_metric("Precision_test",precision)
    mlflow.log_metric("Recall_test",recall)
    mlflow.log_metric("F1_score_test",fscore)
    mlflow.sklearn.log_model(rf,artifact_path="ML_models")


In [14]:
rf = RandomForestClassifier(random_state=5)
rf.fit(X_resampled,y_resampled)
predicted=rf.predict(X_test)

In [15]:
print('Classification report:\n', classification_report(y_test, predicted))
conf_mat = confusion_matrix(y_true=y_test, y_pred=predicted)
print('Confusion matrix:\n', conf_mat)
print('Share of Non-Fraud in Test Data:', 1-round(y_test.sum()/len(y_test),4))

Classification report:
               precision    recall  f1-score   support

           1       0.62      1.00      0.77        20
           4       1.00      1.00      1.00         8
           5       0.50      1.00      0.67         1
           6       1.00      0.99      1.00       152
           7       1.00      1.00      1.00        16
           8       1.00      1.00      1.00        98
           9       1.00      1.00      1.00        88
          10       1.00      0.97      0.98        92
          12       0.98      1.00      0.99        86
          13       1.00      1.00      1.00        98
          14       0.99      1.00      1.00       110
          15       1.00      1.00      1.00         8
          16       0.50      0.17      0.25         6
          17       1.00      1.00      1.00         2
          18       1.00      1.00      1.00        35
          19       1.00      1.00      1.00        46
          20       0.97      1.00      0.99        68
   

**The above metrics show that although the Random Forest model has a slightly lower recall, it has much better precision!**

## 3.Xgboost:

In [16]:
# disable autologging
mlflow.xgboost.autolog(disable=True)

In [17]:

with mlflow.start_run(run_name='XGBoost'):
    mlflow.log_param("data_url",data_url)
    mlflow.log_param("data_version",version)
    mlflow.log_param("input_rows",df.shape[0])
    mlflow.log_param("input_cols",df.shape[1])
    xg = XGBClassifier()
    params = xg.get_params()
    mlflow.set_tag(key= "model", value="XGBClassifier")
    mlflow.log_params(params)
    xg.fit(X_resampled,y_resampled)
    train_features_name = f'{X_resampled=}'.split('=')[0]
    train_label_name = f'{y_resampled=}'.split('=')[0]
    mlflow.set_tag(key="train_features_name",value= train_features_name)
    mlflow.set_tag(key= "train_label_name",value=train_label_name)
    predicted=xg.predict(X_test)
    precision,recall,fscore,support=score(y_test,predicted,average='macro')
    mlflow.log_metric("Precision_test",precision)
    mlflow.log_metric("Recall_test",recall)
    mlflow.log_metric("F1_score_test",fscore)
    mlflow.xgboost.log_model(xg,artifact_path="ML_models")


In [18]:
xg = XGBClassifier()
xg.fit(X_resampled,y_resampled)
predicted=xg.predict(X_test)

In [19]:
print('Classification report:\n', classification_report(y_test, predicted))
conf_mat = confusion_matrix(y_true=y_test, y_pred=predicted)
print('Confusion matrix:\n', conf_mat)
print('Share of Non-Fraud in Test Data:', 1-round(y_test.sum()/len(y_test),4))

Classification report:
               precision    recall  f1-score   support

           1       0.65      1.00      0.78        20
           4       1.00      1.00      1.00         8
           5       0.50      1.00      0.67         1
           6       1.00      0.99      1.00       152
           7       1.00      1.00      1.00        16
           8       1.00      1.00      1.00        98
           9       1.00      1.00      1.00        88
          10       1.00      0.98      0.99        92
          12       1.00      1.00      1.00        86
          13       1.00      1.00      1.00        98
          14       1.00      1.00      1.00       110
          15       1.00      0.88      0.93         8
          16       0.60      0.50      0.55         6
          17       0.67      1.00      0.80         2
          18       0.97      0.97      0.97        35
          19       1.00      1.00      1.00        46
          20       0.99      1.00      0.99        68
   

In [20]:

#Reading Pandas Dataframe from mlflow
all_experiments = [exp.experiment_id for exp in mlflow.search_experiments()]
df_mlflow = mlflow.search_runs(experiment_ids=all_experiments,filter_string="metrics.F1_score_test <1")
run_id = df_mlflow.loc[df_mlflow['metrics.F1_score_test'].idxmax()]['run_id']
print(run_id)

4f7d0cbcde724147856e376a1ff0fa89


In [21]:
df_mlflow

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.F1_score_test,metrics.Recall_test,metrics.Precision_test,params.booster,params.random_state,params.scale_pos_weight,params.subsample,params.feature_types,params.missing,params.base_score,params.callbacks,params.max_cat_threshold,params.interaction_constraints,params.device,params.validate_parameters,params.min_child_weight,params.num_parallel_tree,params.verbosity,params.max_leaves,params.colsample_bynode,params.early_stopping_rounds,params.input_cols,params.input_rows,params.sampling_method,params.tree_method,params.objective,params.max_cat_to_onehot,params.eval_metric,params.data_version,params.data_url,params.enable_categorical,params.max_delta_step,params.multi_strategy,params.reg_lambda,params.gamma,params.importance_type,params.colsample_bylevel,params.colsample_bytree,params.max_bin,params.monotone_constraints,params.n_estimators,params.grow_policy,params.reg_alpha,params.max_depth,params.learning_rate,params.n_jobs,params.max_features,params.min_samples_leaf,params.max_samples,params.ccp_alpha,params.class_weight,params.min_weight_fraction_leaf,params.max_leaf_nodes,params.min_impurity_decrease,params.verbose,params.criterion,params.oob_score,params.bootstrap,params.warm_start,params.min_samples_split,tags.mlflow.log-model.history,tags.train_features_name,tags.train_label_name,tags.model,tags.mlflow.source.name,tags.mlflow.user,tags.mlflow.runName,tags.mlflow.source.type
0,3fa3a799a27147678828b7733f68b7a2,0,FINISHED,file:///c:/Users/ASUS/Desktop/research_cs/mlru...,2023-10-15 22:42:26.318000+00:00,2023-10-15 22:42:43.412000+00:00,0.781563,0.811594,0.783414,,,,,,,,,,,,,,,,,,,47,5000,,,binary:logistic,,,v1.0,part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb...,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"[{""run_id"": ""3fa3a799a27147678828b7733f68b7a2""...",X_resampled,y_resampled,XGBClassifier,c:\Users\ASUS\.ipython\extensions\envs\venv\li...,ASUS,XGBoost,LOCAL
1,4f7d0cbcde724147856e376a1ff0fa89,0,FINISHED,file:///c:/Users/ASUS/Desktop/research_cs/mlru...,2023-10-15 22:42:08.173000+00:00,2023-10-15 22:42:21.755000+00:00,0.833919,0.840877,0.854588,,5.0,,,,,,,,,,,,,,,,,47,5000,,,,,,v1.0,part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb...,,,,,,,,,,,100.0,,,,,,sqrt,1.0,,0.0,,0.0,,0.0,0.0,gini,False,True,False,2.0,"[{""run_id"": ""4f7d0cbcde724147856e376a1ff0fa89""...",X_resampled,y_resampled,RandomForest,c:\Users\ASUS\.ipython\extensions\envs\venv\li...,ASUS,RandomForest,LOCAL
2,1cf9b3405d244eacb6f490025aef6fab,0,FINISHED,file:///c:/Users/ASUS/Desktop/research_cs/mlru...,2023-10-15 22:41:49.427000+00:00,2023-10-15 22:42:06.725000+00:00,0.061782,0.094,0.054298,,,,,,,,,,,,,,,,,,,47,5000,,,,,,v1.0,part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"[{""run_id"": ""1cf9b3405d244eacb6f490025aef6fab""...",X_resampled,y_resampled,LogisticRegression,c:\Users\ASUS\.ipython\extensions\envs\venv\li...,ASUS,LogisticRegression,LOCAL


In [24]:

#let's call the model from the model registry ( in production stage)
import mlflow.pyfunc

logged_model = f'runs:/{run_id}/ML_models'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
print(loaded_model)

# Predict on a Pandas DataFrame.


loaded_model.predict(X_test)

mlflow.pyfunc.loaded_model:
  artifact_path: ML_models
  flavor: mlflow.sklearn
  run_id: 4f7d0cbcde724147856e376a1ff0fa89



array([19, 13, 13, 20, 10,  9,  6, 13,  6, 14,  6, 14, 13,  8,  9, 13,  6,
       10, 12,  6,  8, 20, 19,  9, 10, 13, 23,  9,  6,  6,  6,  9, 13,  8,
        6,  6, 13,  9, 20, 10, 20, 14, 19, 13, 10, 14, 24,  9, 14, 12,  8,
       12, 20,  9,  6, 20, 20,  6,  8,  6, 18, 12, 12, 20,  9, 20, 20,  6,
       14, 12, 18,  9,  8, 12,  9, 12,  9, 22,  1, 18, 14, 13,  1, 19, 14,
        6,  9, 13,  6, 14, 23, 13, 14,  9, 14, 12,  6, 23, 14, 14,  6, 13,
        6, 14, 18, 13, 14, 14,  6,  1, 12,  8, 19, 15, 10, 13,  9,  8,  6,
       13, 20, 18,  8, 22,  8, 19, 18, 14,  4, 13, 13,  7, 10, 10, 14,  1,
        6,  9, 20,  6,  9,  9,  6,  8, 10,  8,  9, 18, 12, 12,  6,  6, 20,
        8,  4, 10, 12, 19, 20,  6,  8, 18, 12,  6, 12, 14, 13, 10,  8,  6,
        6, 22,  6, 24, 10,  8, 12,  8,  6, 22, 13, 20,  9, 20,  8, 13,  8,
       14, 20, 22, 24,  9,  1,  4,  6, 10, 10,  1,  9,  6,  1,  6,  6, 14,
       14, 24, 13, 14, 12,  6,  7, 14, 14, 20,  8, 14,  8, 20, 23,  1,  1,
       12, 22, 14,  1,  8