In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the input CSV, relative to the notebook's working directory.
DATA_PATH = "modified_csic_stage2.csv"

# Load the CSV into a DataFrame, then display its (rows, columns) dimensions.
df = pd.read_csv(DATA_PATH)
df.shape

(43485, 4)

In [3]:
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()

Method             0
payload_length     0
classification     0
URL               39
dtype: int64

In [4]:
# Boolean mask marking the rows whose 'URL' value is missing.
missing_url = df['URL'].isna()
# Index labels of those rows, kept under the original name.
nan_indexes = df.index[missing_url]
# Replace the missing 'URL' entries with an empty string.
df.loc[missing_url, 'URL'] = ''

In [5]:
# Re-count missing values per column to confirm that none remain.
df.isna().sum()

Method            0
payload_length    0
classification    0
URL               0
dtype: int64

In [6]:
# One-hot encode the 'Method' column.
# drop_first=True omits the indicator for the first category, which is implied
# whenever all of the remaining indicators are 0.
# dtype=int is pinned explicitly: the default indicator dtype differs between
# pandas versions (bool from pandas 2.0, uint8 before), so without it the 0/1
# values in these columns depend on the installed pandas.
one_hot_encoded = pd.get_dummies(df['Method'], drop_first=True, dtype=int)
# Replace 'Method' with its indicator columns; drop() returns a new frame, so no
# in-place mutation is involved.
df = pd.concat([df.drop(columns='Method'), one_hot_encoded], axis=1)

In [7]:
# Preview the first rows of df to inspect the columns after encoding 'Method'.
df.head()

Unnamed: 0,payload_length,classification,URL,POST,PUT
0,0,0,index.jsp,0,0
1,68,0,publico/anadir.jsp?id=3&nombre=Vino+Rioja&prec...,1,0
2,63,0,publico/autenticar.jsp?modo=entrar&login=choon...,1,0
3,4,0,publico/caracteristicas.jsp?id=2,1,0
4,0,0,publico/carrito.jsp,0,0


In [26]:
# Class distribution: number of rows for each 'classification' label.
df['classification'].value_counts()

0    28000
1    15485
Name: classification, dtype: int64

In [27]:
# Size of the smallest class, derived from the data instead of being copied by
# hand from the value_counts() output, so the per-class sample size used for
# balancing stays correct if the dataset changes.
min_samples = int(df['classification'].value_counts().min())

In [29]:
# Row masks for the two classification labels.
normal_rows = df['classification'] == 0
anomaly_rows = df['classification'] == 1

# Draw min_samples rows from each class; the fixed seed makes the draw repeatable.
df_normal = df.loc[normal_rows].sample(n=min_samples, random_state=42)
df_anomaly = df.loc[anomaly_rows].sample(n=min_samples, random_state=42)

In [30]:
# Stack the two per-class samples row-wise into one frame
# (row-wise, i.e. axis=0, is pd.concat's default).
df_balanced = pd.concat([df_normal, df_anomaly])

In [31]:
# Check the class distribution of the balanced frame.
df_balanced['classification'].value_counts()

0    15485
1    15485
Name: classification, dtype: int64

In [35]:
from sklearn.model_selection import train_test_split

# Model input is the raw URL text; the target is the classification label.
urls = df_balanced['URL']
labels = df_balanced['classification']

# Hold out 20% of the samples for testing. Stratifying on the labels keeps the
# class proportions equal in both splits; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    urls,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [36]:
# Label counts in the training split.
y_train.value_counts()

0    12388
1    12388
Name: classification, dtype: int64

In [37]:
# Label counts in the test split.
y_test.value_counts()

0    3097
1    3097
Name: classification, dtype: int64

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Pipeline: TF-IDF features extracted from the URL text, fed to a
# k-nearest-neighbours classifier (both with library defaults).
knn_steps = [
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier()),
]
clf = Pipeline(steps=knn_steps)

# Train on the training split.
clf.fit(X_train, y_train)

# Predict labels for the held-out test URLs.
y_pred = clf.predict(X_test)

# Report per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.86      0.91      3097
           1       0.87      0.97      0.92      3097

    accuracy                           0.91      6194
   macro avg       0.92      0.91      0.91      6194
weighted avg       0.92      0.91      0.91      6194



In [40]:
from sklearn.naive_bayes import MultinomialNB

# Pipeline: TF-IDF features extracted from the URL text, fed to a multinomial
# naive Bayes classifier (both with library defaults).
nb_steps = [
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Multi NB', MultinomialNB()),
]
clf = Pipeline(steps=nb_steps)

# Train on the training split.
clf.fit(X_train, y_train)

# Predict labels for the held-out test URLs.
y_pred = clf.predict(X_test)

# Report per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.83      0.89      3097
           1       0.85      0.96      0.90      3097

    accuracy                           0.89      6194
   macro avg       0.90      0.89      0.89      6194
weighted avg       0.90      0.89      0.89      6194



In [42]:
from sklearn.ensemble import RandomForestClassifier # (-) takes too much time

# Pipeline: TF-IDF features extracted from the URL text, fed to a random forest.
# random_state=42 seeds the forest so the reported metrics are reproducible
# (the same seed value is used for the sampling and the train/test split).
# n_jobs=-1 builds the trees on all available cores to shorten the fit time
# noted on the import; it does not affect the fitted model.
clf = Pipeline([
     ('vectorizer_tfidf', TfidfVectorizer()),
     ('Random Forest', RandomForestClassifier(n_jobs=-1, random_state=42))
])

# Train on the training split.
clf.fit(X_train, y_train)

# Predict labels for the held-out test URLs.
y_pred = clf.predict(X_test)

# Report per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.87      0.91      3097
           1       0.88      0.97      0.92      3097

    accuracy                           0.92      6194
   macro avg       0.92      0.92      0.92      6194
weighted avg       0.92      0.92      0.92      6194

