In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Load the three traffic datasets in one pass. The files are read in the order
# listed: the normal-traffic training file, then the abnormal-traffic test
# file, then the normal-traffic test file.
df_normal_train, df_test_abnormal, df_test_normal = (
    pd.read_csv(csv_name)
    for csv_name in ('train_normal.csv', 'test_abnormal.csv', 'test_normal.csv')
)

In [8]:
import nltk

# Make sure the NLTK stop-word corpus is available locally, then keep the
# English list as a set for fast membership checks.
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))

# Stack the training file and both test files into a single frame.
# NOTE(review): because both test files are part of this pool, anything trained
# on df_shuffled has already seen the rows it is later evaluated on - confirm
# this is intended.
df_concat = pd.concat([df_normal_train, df_test_abnormal, df_test_normal])

# Randomly reorder every row (fixed seed for repeatability) and renumber the
# index from zero.
df_shuffled = (
    df_concat
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ravi_Kumar2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
import nltk

# word_tokenize needs the 'punkt' tokenizer models; without this download a
# fresh environment fails with LookupError on the first request.
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' -
# confirm against the installed version.
nltk.download('punkt')


def _tokenize_requests(requests):
    """Normalize an iterable of HTTP request strings.

    Each request is lower-cased, split with nltk.word_tokenize and re-joined
    with single spaces. No punctuation stripping or stop-word removal is
    performed.

    Parameters
    ----------
    requests : iterable of str
        Raw request strings (for example a DataFrame 'request' column).

    Returns
    -------
    list of str
        One normalized string per input request, in input order.
    """
    return [' '.join(nltk.word_tokenize(request.lower())) for request in requests]


# Layout of `preprocessed`: the rows of df_shuffled first, then those of
# df_test_abnormal, then those of df_test_normal - one entry per row, in row
# order.
preprocessed = (
    _tokenize_requests(df_shuffled['request'])
    + _tokenize_requests(df_test_abnormal['request'])
    + _tokenize_requests(df_test_normal['request'])
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ravi_Kumar2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# Learn the bag-of-words vocabulary from every preprocessed request.
vectorizer = CountVectorizer()
vectorizer.fit(preprocessed)
dictionary = vectorizer.vocabulary_

# `preprocessed` holds the tokenized text of df_shuffled, df_test_abnormal and
# df_test_normal back to back, in that order. It is sliced apart below so the
# text being vectorized is the same tokenized text the vocabulary was fitted
# on; transforming the raw 'request' column instead can tokenize differently
# and silently drop tokens that are missing from the vocabulary.
n_shuffled = len(df_shuffled)
n_abnormal = len(df_test_abnormal)
if len(preprocessed) != n_shuffled + n_abnormal + len(df_test_normal):
    raise ValueError(
        "preprocessed does not line up with df_shuffled + df_test_abnormal + "
        "df_test_normal; re-run the preprocessing cell before this one"
    )


def _features_and_labels(documents, labels):
    """Vectorize `documents` and drop the rows whose label is missing.

    Parameters
    ----------
    documents : list of str
        Preprocessed request strings, one per row of `labels`.
    labels : pandas.Series
        Label column aligned row-for-row with `documents`.

    Returns
    -------
    tuple
        (dense count matrix, label Series), both filtered with the same mask
        so they always have the same number of rows.
    """
    has_label = labels.notna().to_numpy()
    features = vectorizer.transform(documents).toarray()[has_label]
    return features, labels[has_label]


# Convert HTTP requests to feature vectors
x_train_normal, y_train_normal = _features_and_labels(
    preprocessed[:n_shuffled], df_shuffled['label']
)

x_test_abnormal, y_test_abnormal = _features_and_labels(
    preprocessed[n_shuffled:n_shuffled + n_abnormal], df_test_abnormal['label']
)

x_test_normal, y_test_normal = _features_and_labels(
    preprocessed[n_shuffled + n_abnormal:], df_test_normal['label']
)

In [11]:
# Split the shuffled dataset into training and testing sets: 20% is held out,
# and the fixed seed keeps the split identical across reruns.
x_train, x_test, y_train, y_test = train_test_split(
    x_train_normal, y_train_normal, test_size=0.2, random_state=42
)

# Candidate classifiers, keyed by the display name used when reporting results.
# Every estimator is seeded with random_state=42 so that Restart & Run All
# reproduces the same numbers.
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC(random_state=42),
    "Multilayer Perceptron": MLPClassifier(random_state=42),
    # The wrapped tree is passed positionally on purpose: the keyword was
    # renamed from `base_estimator` to `estimator` in scikit-learn 1.2 (and the
    # old name later removed), while the first positional slot is the wrapped
    # estimator under both names.
    "Bagging": BaggingClassifier(
        DecisionTreeClassifier(random_state=42), random_state=42
    ),
    "AdaBoost": AdaBoostClassifier(
        DecisionTreeClassifier(random_state=42), random_state=42
    ),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    # passthrough=True feeds the original features to the final estimator
    # alongside the base learners' predictions.
    "Stacking": StackingClassifier(
        estimators=[
            ('dt', DecisionTreeClassifier(random_state=42)),
            ('rf', RandomForestClassifier(random_state=42)),
            ('svm', SVC(random_state=42)),
            ('mlp', MLPClassifier(random_state=42)),
        ],
        final_estimator=DecisionTreeClassifier(random_state=42),
        passthrough=True,
    ),
}

In [12]:
# Train and test the models
from sklearn.metrics import f1_score

# Each model is fitted on the training split and then scored separately on the
# abnormal-traffic and normal-traffic test sets.
# NOTE(review): the x_test / y_test hold-out produced by train_test_split is
# not evaluated here - confirm whether it should be.
# NOTE(review): f1_score is called with its defaults (binary averaging,
# positive label 1) - this assumes 0/1 labels; if a test file contains a single
# class the score may be ill-defined. Verify against the data.
for name, model in models.items():
    model.fit(x_train, y_train)

    # Both prediction vectors are computed from the model that was just fitted
    # before any metric is taken, so no score can pick up predictions left over
    # from a previous iteration (or hit an undefined name on the first one).
    y_pred_abnormal = model.predict(x_test_abnormal)
    y_pred_normal = model.predict(x_test_normal)

    accuracy = accuracy_score(y_test_abnormal, y_pred_abnormal)
    f1 = f1_score(y_test_abnormal, y_pred_abnormal)
    print(f"{name} accuracy on abnormal traffic data: {accuracy}")
    print(f"{name} F1 on abnormal traffic data: {f1}")

    accuracy = accuracy_score(y_test_normal, y_pred_normal)
    f1 = f1_score(y_test_normal, y_pred_normal)
    print(f"{name} accuracy on normal traffic data: {accuracy}")
    print(f"{name} F1 on normal traffic data: {f1}")

    report = classification_report(y_test_abnormal, y_pred_abnormal)
    print(f"{name} classification report on abnormal traffic data:\n{report}")

    report = classification_report(y_test_normal, y_pred_normal)
    print(f"{name} classification report on normal traffic data:\n{report}")
