In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("clean_data.csv")
df.shape

(43485, 4)

In [4]:
df.classification.value_counts() #our dataset is imbalanced

0    28000
1    15485
Name: classification, dtype: int64

To account for the class imbalance we will later use oversampling

In [5]:
df.shape

(43485, 4)

In [6]:
df.isnull().sum()

Method             0
payload_length     0
classification     0
URL               39
dtype: int64

In [7]:
#indexes of rows with missing values in the 'url' column
nan_indexes= df.loc[df['URL'].isna()].index
# Replace the NaN values in the 'url' column with an empty string
df.loc[nan_indexes, 'URL']=''

In [8]:
df.isnull().sum()

Method            0
payload_length    0
classification    0
URL               0
dtype: int64

In [9]:
#one-hot-encoding of Method column 
one_hot_encoded = pd.get_dummies(df['Method'], drop_first=True)
df = pd.concat([df, one_hot_encoded], axis=1)
df.drop('Method', axis=1, inplace=True)

In [10]:
df.head() 

Unnamed: 0,payload_length,classification,URL,POST,PUT
0,0,0,index.jsp,0,0
1,68,0,publico/anadir.jsp?id=3&nombre=Vino+Rioja&prec...,1,0
2,63,0,publico/autenticar.jsp?modo=entrar&login=choon...,1,0
3,4,0,publico/caracteristicas.jsp?id=2,1,0
4,0,0,publico/carrito.jsp,0,0


In [11]:
df.isnull().sum()

payload_length    0
classification    0
URL               0
POST              0
PUT               0
dtype: int64

## Oversampling

In [12]:
df_class_1 = df[df['classification'] == 1]
df_class_0 = df[df['classification'] == 0]
df_class_1_over = df_class_1.sample(df['classification'].value_counts()[0], replace=True)
df1 = pd.concat([df_class_0, df_class_1_over], axis=0)

df1.shape
df1['classification'].value_counts()

0    28000
1    28000
Name: classification, dtype: int64

In [13]:
X=df1.drop('classification', axis=1)
y=df1.classification

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test= train_test_split(X,y,test_size=0.2,random_state=0)

In [15]:
X_train.shape

(44800, 4)

In [16]:
X_test.shape

(11200, 4)

In [17]:
type(X_train)

pandas.core.frame.DataFrame

In [18]:
type(X_train.values)

numpy.ndarray

In [19]:
type(y_train)

pandas.core.series.Series

In [20]:
URL_train=X_train.URL 

In [21]:
URL_train.shape

(44800,)

In [22]:
URL_train.isnull().sum()

0

In [97]:
type(URL_train.values)

numpy.ndarray

In [23]:
URL_train = URL_train.astype(str)

In [24]:
#Create bag of words representation using CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer() 

URL_train_cv = v.fit_transform(URL_train.values) #cv for count vect
URL_train_cv

<44800x30492 sparse matrix of type '<class 'numpy.int64'>'
	with 475517 stored elements in Compressed Sparse Row format>

In [25]:
URL_train_cv.shape

(44800, 30492)

In [26]:
v.get_feature_names_out()[30000:30050] #all the words in the vocabulary

array(['whitehead', 'whitehur', 'whiteside', 'whitley', 'whitson',
       'whittinghill', 'whitwort', 'whorf', 'wicht', 'wickens', 'wickham',
       'wide', 'wido', 'widuch', 'wiebe', 'wiedeman', 'wiedlin_steppat',
       'wiele', 'wieman', 'wierzba', 'wiese7', 'wieth', 'wifred',
       'wilbur', 'wildbolz', 'wilde', 'wildhagen_cavender', 'wildt_feng',
       'wiley', 'wilfredo', 'wilfrid', 'wilie', 'wilken', 'wilkinso',
       'willa', 'willeke5', 'willetta', 'william', 'williams9',
       'williams_mellish', 'willie51', 'willingham', 'willison', 'willy',
       'wilmer', 'wilmont', 'wilmot', 'wilson', 'wilton7', 'win'],
      dtype=object)

In [27]:
#dir(v)

In [28]:
#print(v.vocabulary_)

In [31]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(URL_train_cv, y_train) # ken bel URLas a feature 

In [32]:
URL_test_cv = v.transform(X_test.URL)

In [33]:
model.score(URL_test_cv, y_test)

0.9274107142857143

In [34]:
from sklearn.metrics import classification_report

y_pred = model.predict(URL_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.90      0.92      5517
           1       0.91      0.96      0.93      5683

    accuracy                           0.93     11200
   macro avg       0.93      0.93      0.93     11200
weighted avg       0.93      0.93      0.93     11200



## Boosting

In [35]:
from sklearn.ensemble import AdaBoostClassifier 
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(URL_train_cv, y_train)
clf.score(URL_test_cv, y_test) #we got an improved score

0.9453571428571429

## Bagging

Multinomial Naive Bayes

In [51]:
import warnings
from sklearn.ensemble import BaggingClassifier
# Suppress all warnings
warnings.filterwarnings("ignore")

bag_model_nb = BaggingClassifier(
    base_estimator=MultinomialNB(), 
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)
bag_model_nb.fit(URL_train_cv, y_train)
bag_model_nb.oob_score_

0.9576339285714286

In [52]:
bag_model_nb.score(URL_test_cv, y_test)

0.92

In [50]:
from sklearn.model_selection import cross_val_score

bag_model = BaggingClassifier(
    base_estimator=MultinomialNB(), 
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)
scores = cross_val_score(bag_model, URL_train_cv, y_train, cv=5)
scores

array([0.94933036, 0.95334821, 0.95100446, 0.93683036, 0.95625   ])

Logistic Regression

In [43]:
from sklearn.linear_model import LogisticRegression
warnings.filterwarnings("ignore")
bag_model_lr = BaggingClassifier(
    base_estimator=LogisticRegression(), 
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)
bag_model_lr.fit(URL_train_cv, y_train)
bag_model_lr.oob_score_


0.96921875

In [44]:
bag_model_lr.score(URL_test_cv, y_test)

0.96875

Support Vector Machine

In [None]:
from sklearn import svm
warnings.filterwarnings("ignore")

bag_model_svm = BaggingClassifier(
    base_estimator=svm.SVC(), 
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)
bag_model_svm.fit(URL_train_cv, y_train)
bag_model_svm.oob_score_

In [None]:
bag_model_svm.score(URL_test_cv, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import ShuffleSplit 
from sklearn.model_selection import cross_val_score
cv= ShuffleSplit(n_splits=5, test_size=0.2 , random_state=0)
cross_val_score(RandomForestClassifier(), URL_train_cv,y_train, cv=cv)


from sklearn.ensemble import BaggingClassifier

bag_model_random_forest = BaggingClassifier(
    base_estimator=RandomForestClassifier(), 
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)
scores = cross_val_score(bag_model_random_forest, URL_train_cv, y_train, cv=5)
scores

Critiques: This technique (bag of words) is classical technique in NLP but has several downsides: 

1. Sparsity and high dimensionality: the vectors are mainly filled with zeros which is not optimal!
2. Limitations in capturing relationships
3. Not taking into account word frequency
4. Loss of word order