In [76]:
import pandas as pd
import numpy as np

In [77]:
df = pd.read_csv("modified_csic_stage2.csv")
df.shape

(43485, 4)

In [78]:
df.classification.value_counts() #our dataset is imbalanced

0    28000
1    15485
Name: classification, dtype: int64

In [79]:
df.shape

(43485, 4)

In [80]:
df.isnull().sum()

Method             0
payload_length     0
classification     0
URL               39
dtype: int64

In [81]:
#indexes of rows with missing values in the 'url' column
nan_indexes= df.loc[df['URL'].isna()].index
# Replace the NaN values in the 'url' column with an empty string
df.loc[nan_indexes, 'URL']=''

In [82]:
df.isnull().sum()

Method            0
payload_length    0
classification    0
URL               0
dtype: int64

In [83]:
#one-hot-encoding of Method column 
one_hot_encoded = pd.get_dummies(df['Method'], drop_first=True)
df = pd.concat([df, one_hot_encoded], axis=1)
df.drop('Method', axis=1, inplace=True)

In [84]:
df.head() 

Unnamed: 0,payload_length,classification,URL,POST,PUT
0,0,0,index.jsp,0,0
1,68,0,publico/anadir.jsp?id=3&nombre=Vino+Rioja&prec...,1,0
2,63,0,publico/autenticar.jsp?modo=entrar&login=choon...,1,0
3,4,0,publico/caracteristicas.jsp?id=2,1,0
4,0,0,publico/carrito.jsp,0,0


In [85]:
df.isnull().sum()

payload_length    0
classification    0
URL               0
POST              0
PUT               0
dtype: int64

## Oversampling

In [86]:
df_class_1 = df[df['classification'] == 1]
df_class_0 = df[df['classification'] == 0]
df_class_1_over = df_class_1.sample(df['classification'].value_counts()[0], replace=True)
df1 = pd.concat([df_class_0, df_class_1_over], axis=0)

df1.shape
df1['classification'].value_counts()

0    28000
1    28000
Name: classification, dtype: int64

In [87]:
X=df1.drop('classification', axis=1)
y=df1.classification

In [88]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test= train_test_split(X,y,test_size=0.2,random_state=0)

In [89]:
X_train.shape

(44800, 4)

In [90]:
X_test.shape

(11200, 4)

In [91]:
type(X_train)

pandas.core.frame.DataFrame

In [92]:
type(X_train.values)

numpy.ndarray

In [93]:
type(y_train)

pandas.core.series.Series

In [94]:
URL_train=X_train.URL 

In [95]:
URL_train.shape

(44800,)

In [96]:
URL_train.isnull().sum()

0

In [97]:
type(URL_train.values)

numpy.ndarray

In [98]:
URL_train = URL_train.astype(str)

In [99]:
URL_train.dtype

dtype('O')

In [100]:
#Create bag of words representation using CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer() 

URL_train_cv = v.fit_transform(URL_train.values)
URL_train_cv

<44800x30475 sparse matrix of type '<class 'numpy.int64'>'
	with 469685 stored elements in Compressed Sparse Row format>

In [102]:
URL_train_cv.shape

(44800, 30475)

In [103]:
v.get_feature_names_out()[30000:30050] #all the words in the vocabulary

array(['wiele', 'wieman', 'wierzba', 'wiese7', 'wieth', 'wifred',
       'wilbur', 'wildbolz', 'wilde', 'wildhagen_cavender', 'wildt_feng',
       'wiley', 'wilfredo', 'wilfrid', 'wilie', 'wilken', 'wilkinso',
       'willa', 'willeke5', 'willetta', 'william', 'williams9',
       'williams_mellish', 'willie51', 'willingham', 'willison', 'willy',
       'wilmer', 'wilmont', 'wilmot', 'wilson', 'wilton7', 'win',
       'wincott_remeck', 'windham', 'windros', 'winds', 'wines',
       'winfield', 'wing', 'wingard', 'wingfiel', 'wingo1', 'wingrove',
       'winnah', 'winne', 'winnie', 'winningt', 'winny', 'winona'],
      dtype=object)

In [104]:
#dir(v)

In [106]:
print(v.vocabulary_)

{'miembros': 23223, 'editar': 17185, 'jsp': 21055, 'modo': 23384, 'registro': 26245, 'login': 22144, 'chanchla': 14929, 'password': 24945, '4ino57l': 7352, 'nombre': 24146, 'atilio': 12490, 'apellidos': 12104, 'rull': 26847, 'email': 17377, 'ward': 29896, '40fortelingenieria': 5719, 'si': 27641, 'dni': 16537, '42798379w': 6517, 'direccion': 16510, 'calle': 14197, 'virgen': 29754, 'de': 16120, 'bego': 13086, 'f1a': 17941, '19': 2013, 'ciudad': 15177, 'broto': 13795, 'cp': 15682, '09107': 778, 'provincia': 25726, 'segovia': 27400, 'ntc': 24209, '5259660554384998': 7642, 'b1': 12646, 'registrar': 26240, 'publico': 25757, 'vaciar': 29235, 'b2': 12652, 'carrito': 14526, '253f': 2876, 'pagar': 24728, 'insertar': 20600, 'precio': 25640, '2248': 2466, 'b1a': 12649, 'confirmar': 15450, 'caracteristicas': 14405, 'id': 20403, 'beaulieu': 13063, 'recordati87': 26188, 'james': 20823, 'mol': 23414, 'ed': 17117, 'vadim': 29240, '40drae2': 5620, 'ci': 15111, '15495714q': 1564, '2f': 3854, 'coca': 1529

In [109]:
np.where(URL_train_np[0]!=0)

(array([  775,  2014,  5722,  6529,  7372,  7667, 12135, 12518, 12677,
        13115, 13828, 14231, 14961, 15206, 15709, 16148, 16539, 16566,
        17211, 17401, 17962, 21077, 22170, 23260, 23422, 24182, 24245,
        24976, 25761, 26273, 26278, 26880, 27434, 27681, 29794, 29934],
       dtype=int64),)

In [110]:
URL_train_np[:4][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [112]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(URL_train_cv, y_train) # ken bel URLas a feature 

In [113]:
URL_test_cv = v.transform(X_test.URL)

In [114]:
model.score(URL_test_cv, y_test)

0.9241071428571429

In [115]:
from sklearn.metrics import classification_report

y_pred = model.predict(URL_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.89      0.92      5517
           1       0.90      0.95      0.93      5683

    accuracy                           0.92     11200
   macro avg       0.93      0.92      0.92     11200
weighted avg       0.93      0.92      0.92     11200



## Boosting

In [117]:
from sklearn.ensemble import AdaBoostClassifier 
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(URL_train_cv, y_train)
clf.score(URL_test_cv, y_test) #we got an improved score

0.9432142857142857

## Bagging

In [118]:
from sklearn.ensemble import BaggingClassifier

bag_model = BaggingClassifier(
    base_estimator=MultinomialNB(), 
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)
bag_model.fit(URL_train_cv, y_train)
bag_model.oob_score_



0.9549107142857143

In [119]:
bag_model.score(URL_test_cv, y_test)

0.9175

In [120]:
from sklearn.linear_model import LogisticRegression

bag_model = BaggingClassifier(
    base_estimator=LogisticRegression(), 
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)
bag_model.fit(URL_train_cv, y_train)
bag_model.oob_score_


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.9685044642857142

In [121]:
bag_model.score(URL_test_cv, y_test)

0.9692857142857143

In [None]:
from sklearn import svm

bag_model = BaggingClassifier(
    base_estimator=svm.SVC(), 
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)
bag_model.fit(URL_train_cv, y_train)
bag_model.oob_score_



In [None]:
from sklearn.model_selection import cross_val_score

bag_model = BaggingClassifier(
    base_estimator=MultinomialNB(), 
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)
scores = cross_val_score(bag_model, URL_train_cv, y_train, cv=5)
scores

In [None]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import ShuffleSplit 
from sklearn.model_selection import cross_val_score
cv= ShuffleSplit(n_splits=5, test_size=0.2 , random_state=0)
cross_val_score(RandomForestClassifier(), X_train,y_train, cv=cv)


from sklearn.ensemble import BaggingClassifier

bag_model = BaggingClassifier(
    base_estimator=RandomForestClassifier(), 
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)
scores = cross_val_score(bag_model, URL_train_cv, y_train, cv=5)
scores

critiques: This technique (bag of words) creates a sparse representation of each word: the vectors are mainly filled with zeros which is not optimal