## Download the CSE-CIC-IDS2018 dataset

In [None]:
# Install/upgrade the AWS CLI (comment out the next line if it is already installed)
!../venv/bin/pip install awscli --upgrade

import os
import subprocess

# Define the directory path
directory = "../data/CSE-CIC-IDS2018"

# Make sure the target directory exists: os.listdir() raises FileNotFoundError
# on a missing path, which would break the very first run on a fresh checkout.
os.makedirs(directory, exist_ok=True)

# Only download when the directory is empty, so re-running the cell is cheap.
if not os.listdir(directory):
    # Download data from the AWS S3 bucket without signing the request.
    # `aws s3 sync` always recurses and does not accept `--recursive`
    # (that flag belongs to cp/mv/rm), so it is not passed here.
    # check=True raises CalledProcessError if the CLI exits with an error.
    subprocess.run(
        [
            "aws", "s3", "sync", "--no-sign-request",
            "s3://cse-cic-ids2018/Processed Traffic Data for ML Algorithms/",
            directory,
        ],
        check=True,
    )
    print("Data downloaded successfully.")
else:
    print(f"The directory '{directory}' is not empty. Data download skipped.")

## Data Exploration



In [None]:
import pandas as pd
import os
import sys

sys.path.append(os.path.abspath('../'))

from lib.helper_functions import *

In [None]:
# Reuse the pickled dataframe when it is available; otherwise build it from
# the raw CSV file and pickle it so later runs can skip that step.
dataframe_file = '../data/flowmeter_dataframe.pkl'
exists = os.path.isfile(dataframe_file)
if not exists:
    # NOTE(review): this path has no '../' prefix, unlike dataframe_file above
    # -- confirm how read_clean_combine_csv resolves it.
    directory = 'data/CSE-CIC-IDS2018/'
    # The helper receives an empty dataframe plus the CSV file name.
    df = read_clean_combine_csv(
        directory,
        pd.DataFrame(),
        'Thuesday-20-02-2018_TrafficForML_CICFlowMeter.csv',
    )
    # save dataframe to file for future use
    df.to_pickle(dataframe_file)
else:
    print('dataframe file exists, loading dataframe...')
    df = pd.read_pickle(dataframe_file)
    print('dataframe loaded.')

In [None]:
# Show the column names of the loaded dataframe.
df.columns

In [None]:
# Sum of the per-column memory usage (bytes), converted to MiB.
df.memory_usage().sum() / 1024**2

In [None]:
# Number of rows in the dataframe.
len(df)

In [None]:
# Order the rows by their 'timestamp' column (ascending).
df = df.sort_values(by=['timestamp'])

In [None]:
# Keep only rows whose timestamp is later than 2018-01-01.
# NOTE(review): presumably this discards rows with bogus dates -- confirm.
df = df[df['timestamp'] > pd.to_datetime('2018-01-01')]

In [None]:
# Preview the first rows of the filtered dataframe.
df.head()

In [None]:
# get the number of rows for each label
print(df['label'].value_counts())

In [None]:
# get the share of rows for each label (count divided by total row count)
print(df['label'].value_counts()/len(df))

## Downsample the dataset to at most 100K rows

In [None]:
from sklearn.utils import resample

# Build (or reload from cache) a downsampled copy of the dataset in which
# every class contributes at most an equal share of `total_instances` rows.
dataframe_file = '../data/multiclassification_dataset.pkl'
exists = os.path.isfile(dataframe_file)
if exists:
    print('100k dataframe file exists, loading dataframe...')
    df = pd.read_pickle(dataframe_file)
    print('dataframe loaded.')
else:
    # Calculate class counts
    class_counts = df['label'].value_counts().to_dict()

    # Total desired number of instances
    total_instances = 100000

    # Per-class quota: an equal share of the total, capped at the class size
    # so sampling without replacement never asks for more rows than exist.
    # NOTE: the share is computed over all classes, including the one skipped
    # below, so the final row count can be lower than total_instances.
    per_class_quota = total_instances // len(class_counts)
    downsampled_counts = {
        label: min(count, per_class_quota)
        for label, count in class_counts.items()
    }

    # Downsample each class (except 'Infilteration') to its quota. The pieces
    # are collected first and concatenated once, instead of growing a frame
    # with pd.concat inside the loop (repeated copying, and concatenation
    # with an initially empty DataFrame).
    sampled_parts = [
        resample(df[df['label'] == label],
                 replace=False,
                 n_samples=count,
                 random_state=42)
        for label, count in downsampled_counts.items()
        if label != 'Infilteration'
    ]
    downsampled_data = pd.concat(sampled_parts)

    # Shuffle the downsampled data
    downsampled_data = downsampled_data.sample(frac=1, random_state=42)

    # Check the total number of instances
    print("Total number of instances after downsampling:", len(downsampled_data))

    # Drop the columns that are not used as model inputs and put the
    # remaining columns in alphabetical order.
    df = downsampled_data.drop(['dst_port', 'protocol', 'timestamp', 'cwe_flag_count'], axis=1)
    df.sort_index(axis=1, inplace=True)
    # 'label' is the target, not a feature, so it is excluded from the count.
    print('The totale number of features is:', len(df.columns.difference(['label'])))
    # save dataframe to file for future use
    pd.to_pickle(df, dataframe_file)


In [None]:
# Show the columns that remain in the downsampled dataframe.
df.columns

In [None]:
# Number of rows per label in the downsampled dataframe.
df['label'].value_counts()

## Create the multiclass classification model for threat detection

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the values of the 'label' column with integer codes. The encoder is
# fitted on the column first and then used to transform that same column.
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])
df['label'] = label_encoder.transform(df['label'])
# Row count per encoded label
df['label'].value_counts()

In [None]:
import joblib

# Save the fitted label encoder to a file with joblib.
# NOTE(review): assumes the '../models' directory already exists -- confirm.
label_encoder_file = '../models/label_encoder.joblib'
joblib.dump(label_encoder, label_encoder_file)

In [None]:
from sklearn.model_selection import train_test_split

# Split the dataframe `df` into a feature matrix and a target vector.
y = df['label']                 # Labels
X = df.drop(columns=['label'])  # Features

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the shape of each part to verify the split
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

### Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle

# Load the pickled Random Forest when it exists; otherwise train one on the
# training split and pickle it for future runs.
model_file = '../models/rf_classifier.pkl'

exists = os.path.isfile(model_file)

if exists:
    print('Random Forest model exists, loading model...')
    # NOTE: unpickling can execute arbitrary code -- only load model files
    # that were produced by this notebook or come from a trusted source.
    # The context manager closes the file handle even if loading fails.
    with open(model_file, 'rb') as model_fh:
        rf_classifier = pickle.load(model_fh)
    print('model loaded.')
else:
    print('Random Forest model does not exist, training model...')
    # Create a Random Forest classifier (300 trees, fixed seed)
    rf_classifier = RandomForestClassifier(n_estimators=300, random_state=42)

    # Train the classifier on the training data
    rf_classifier.fit(X_train, y_train)
    print('model trained.')
    # save model to file for future use; the context manager makes sure the
    # data is flushed and the handle closed
    with open(model_file, 'wb') as model_fh:
        pickle.dump(rf_classifier, model_fh)

#### Model Evaluation

In [None]:
# Score the Random Forest on the held-out test split.
y_pred = rf_classifier.predict(X_test)

# Accuracy score of the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Text report built from the same true labels and predictions
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pair every training column with the importance score reported by the
# Random Forest.
feature_importance = rf_classifier.feature_importances_

df_importance = pd.DataFrame(
    {
        'Feature': X_train.columns,
        'Importance': feature_importance,
    }
)

# Highest importance first, so the largest bars are drawn at the top
df_importance_sorted = df_importance.sort_values('Importance', ascending=False)

# Horizontal bar chart: one bar per feature
plt.figure(figsize=(10, 6))
sns.barplot(data=df_importance_sorted, x='Importance', y='Feature')
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()


### Neural Networks

In [None]:
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

# Requires X_train, X_test, y_train, y_test from the train/test split.

# One output unit per distinct label value found in the training targets
num_classes = len(set(y_train))

# Number of feature columns; also used as the width of the first layer
input_shape = X_train.shape[1]

# Layer stack: Dense -> Dropout -> Dense -> BatchNormalization -> softmax
model = keras.Sequential()
model.add(keras.layers.Dense(input_shape, activation='relu'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(32, activation='relu'))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(num_classes, activation='softmax'))

# The sparse categorical cross-entropy loss takes the integer labels
# directly, so y is not one-hot encoded.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=0.0005),
    metrics=['accuracy'],
)

# Train model; 20% of the training data is set aside for validation
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
)

# Evaluate model on the test split
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", test_accuracy)


### XGBoost Classifier

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Requires X_train, X_test, y_train, y_test from the train/test split.

# Derive the number of classes here instead of relying on the Neural Networks
# cell having been run first; otherwise this cell fails with a NameError
# whenever that cell is skipped.
num_classes = len(set(y_train))

# Initialize XGBoost classifier
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=num_classes)

# Define hyperparameters grid (3 x 3 = 9 combinations)
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.5, 0.7, 1]
}

# Initialize GridSearchCV: 5-fold cross-validation, candidates ranked by accuracy
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2)

# Perform Grid Search Cross Validation on the training split only
grid_search.fit(X_train, y_train)

# Get best estimator found by the grid search
best_xgb_classifier = grid_search.best_estimator_

# Make predictions on the held-out test split
y_pred = best_xgb_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print best parameters
print("Best Parameters:", grid_search.best_params_)


### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Requires X_train, X_test, y_train, y_test from the train/test split.

# Logistic regression with an L1 penalty, fitted with the 'saga' solver
logistic_regression = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.001,
    max_iter=1000,  # Increase max_iter if needed
    verbose=1,
)

# Fit on the training split
logistic_regression.fit(X_train, y_train)

# Predict the labels of the test split
y_pred = logistic_regression.predict(X_test)

# Accuracy score of the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Text report built from the same true labels and predictions
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)
