# Modeling the CSIC 2010 Dataset for TFM ITI

# Classifiers

### Import libraries

In [50]:
import pandas as pd
import numpy as np

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import metrics

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import tree
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.utils import shuffle

In [69]:
df = pd.read_csv('TR_traffic_dataset.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [70]:
df.tail()

Unnamed: 0,index,method,url,protocol,userAgent,pragma,cacheControl,accept,acceptEncoding,acceptCharset,acceptLanguage,host,connection,contentLength,contentType,cookie,payload,label
222814,35999,POST,http://localhost:8080/tienda1/miembros/editar.jsp,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,close,238.0,application/x-www-form-urlencoded,JSESSIONID=05C0977CEA3691A57DFC74F2A03C3D2C,ciudad=Hontangas,norm
222815,35999,POST,http://localhost:8080/tienda1/miembros/editar.jsp,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,close,238.0,application/x-www-form-urlencoded,JSESSIONID=05C0977CEA3691A57DFC74F2A03C3D2C,cp=08192,norm
222816,35999,POST,http://localhost:8080/tienda1/miembros/editar.jsp,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,close,238.0,application/x-www-form-urlencoded,JSESSIONID=05C0977CEA3691A57DFC74F2A03C3D2C,provincia=Teruel,norm
222817,35999,POST,http://localhost:8080/tienda1/miembros/editar.jsp,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,close,238.0,application/x-www-form-urlencoded,JSESSIONID=05C0977CEA3691A57DFC74F2A03C3D2C,ntc=8231060336250168,norm
222818,35999,POST,http://localhost:8080/tienda1/miembros/editar.jsp,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,close,238.0,application/x-www-form-urlencoded,JSESSIONID=05C0977CEA3691A57DFC74F2A03C3D2C,B1=Registrar,norm


In [71]:
# Remove columns that contain the same values
df = df.drop(['userAgent', 'pragma', 'cacheControl', 'acceptEncoding', 'acceptCharset', 'acceptLanguage'], 1)
df = df.drop(['connection', 'cookie', 'accept', 'protocol'], 1)

# Keep only the port because everything else is localhost
df['port'] = df['host'].str.split(':', expand=True)[1]
df = df.drop(['host'], 1)
df.head()

  df = df.drop(['userAgent', 'pragma', 'cacheControl', 'acceptEncoding', 'acceptCharset', 'acceptLanguage'], 1)
  df = df.drop(['connection', 'cookie', 'accept', 'protocol'], 1)
  df = df.drop(['host'], 1)


Unnamed: 0,index,method,url,contentLength,contentType,payload,label,port
0,0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,,,id=2,anom,8080
1,0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,,,nombre=Jam%F3n+Ib%E9rico,anom,8080
2,0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,,,precio=85,anom,8080
3,0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,,,cantidad=%27%3B+DROP+TABLE+usuarios%3B+SELECT+...,anom,8080
4,0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,,,B1=A%F1adir+al+carrito,anom,8080


In [72]:
# Split the dataset in two to avoid mixed indices
df_anom = df[df['label']=='anom']
df_norm = df[df['label']=='norm']

In [73]:
df2_anom = df_anom[['index', 'payload', 'label']]
df2_anom = df2_anom.dropna()
print(df2_anom.head())

df2_norm = df_norm[['index', 'payload', 'label']]
df2_norm = df2_norm.dropna()
print(df2_norm.head())

   index                                            payload label
0      0                                               id=2  anom
1      0                           nombre=Jam%F3n+Ib%E9rico  anom
2      0                                          precio=85  anom
3      0  cantidad=%27%3B+DROP+TABLE+usuarios%3B+SELECT+...  anom
4      0                             B1=A%F1adir+al+carrito  anom
        index            payload label
119347    234  B2=Vaciar+carrito  norm
119356    243      modo=registro  norm
119357    243      login=faraday  norm
119358    243   password=8Ef6NDo  norm
119359    243       nombre=Xixia  norm


In [74]:
df3_anom = df2_anom[['payload','label']].groupby(df2_anom['index']).agg(lambda x: ' '.join(set(x)))
df3_anom["payload"] = df3_anom['payload'].apply(lambda x: x.replace("=", " "))
print(df3_anom.head())

df3_anom['label'] = 1
print(df3_anom.head())

                                                 payload label
index                                                         
0      nombre Jam%F3n+Ib%E9rico id 2 cantidad %27%3B+...  anom
1      id 2%2F nombre Jam%F3n+Ib%E9rico B1 A%F1adir+a...  anom
3      B1 Entrar pwd 84m3ri156 remember on login bob%...  anom
4      pwd G%2F%2FlAc%2CIAr B1 Entrar login grimshaw ...  anom
5      rememberA on B1 Entrar login grimshaw pwd 84m3...  anom
                                                 payload  label
index                                                          
0      nombre Jam%F3n+Ib%E9rico id 2 cantidad %27%3B+...      1
1      id 2%2F nombre Jam%F3n+Ib%E9rico B1 A%F1adir+a...      1
3      B1 Entrar pwd 84m3ri156 remember on login bob%...      1
4      pwd G%2F%2FlAc%2CIAr B1 Entrar login grimshaw ...      1
5      rememberA on B1 Entrar login grimshaw pwd 84m3...      1


In [75]:
df3_norm = df2_norm[['payload','label']].groupby(df2_norm['index']).agg(lambda x: ' '.join(set(x)))
df3_norm["payload"] = df3_norm['payload'].apply(lambda x: x.replace("=", " "))
print(df3_norm.head())

df3_norm['label'] = 0
print(df3_norm.tail())

                                                 payload label
index                                                         
234                                    B2 Vaciar+carrito  norm
243    ntc 7057231009442608 apellidos Paula+Pladellor...  norm
253    id 2 B1 A%F1adir+al+carrito cantidad 79 precio...  norm
254    B1 Entrar remember off pwd vimbre modo entrar ...  norm
255                                                 id 1  norm
                                                 payload  label
index                                                          
35995                  errorMsg Credenciales+incorrectas      0
35996        precio 6460 B1 Pasar+por+caja modo insertar      0
35997  nombre Florestano email pianka%40noviasteresap...      0
35998                                  B2 Vaciar+carrito      0
35999  apellidos Bernaus ciudad Hontangas cp 08192 dn...      0


In [76]:
df4 = pd.concat([df3_norm, df3_anom])
print(df4.head())
print(df4.describe())
print(df4.label.value_counts())

                                                 payload  label
index                                                          
234                                    B2 Vaciar+carrito      0
243    ntc 7057231009442608 apellidos Paula+Pladellor...      0
253    id 2 B1 A%F1adir+al+carrito cantidad 79 precio...      0
254    B1 Entrar remember off pwd vimbre modo entrar ...      0
255                                                 id 1      0
              label
count  35450.000000
mean       0.550635
std        0.497437
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
1    19520
0    15930
Name: label, dtype: int64


In [77]:
df4_norm = df3_norm.iloc[:5000, :]
df4_anom = df3_anom.iloc[:5000, :]

In [78]:
# Vectorize the payload to view feature importance
vec = TfidfVectorizer(max_df=.65, analyzer='word',ngram_range=(1,2))

X_norm = vec.fit_transform(df4_norm['payload'].dropna())
X_anom = vec.fit_transform(df4_anom['payload'].dropna())

In [79]:
array_norm = X_norm.toarray()
array_anom = X_anom.toarray()

In [80]:
for counter, doc in enumerate(array_anom):
    # construct a dataframe
    tf_idf_tuples = list(zip(vec.get_feature_names(), doc))
    one_doc_as_df = pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)

KeyboardInterrupt: 

In [None]:
one_doc_as_df

In [None]:
for counter, doc in enumerate(array_norm):
    # construct a dataframe
    tf_idf_tuples = list(zip(vec.get_feature_names(), doc))
    one_doc_as_df = pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)

In [None]:
one_doc_as_df

In [81]:
# Vectorize the payload of all the dataset

y = df4['label']
X = vec.fit_transform(df4['payload'].dropna())

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

In [83]:
#Logistic Regression
lgs = LogisticRegression()
lgs.fit(X_train, y_train)
pred = lgs.predict(X_test)

accuracy = metrics.accuracy_score(y_test, pred)
f1_score = metrics.f1_score(y_test, pred)
conf_matrix = metrics.confusion_matrix(y_test, pred)

print(accuracy, f1_score)
print(conf_matrix)

0.9799717912552891 0.9817808570695407
[[3122   29]
 [ 113 3826]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [84]:
#Decesion Tree
dtc = tree.DecisionTreeClassifier()
dtc.fit(X_train, y_train)
pred = dtc.predict(X_test)

accuracy = metrics.accuracy_score(y_test, pred)
f1_score = metrics.f1_score(y_test, pred)
conf_matrix = metrics.confusion_matrix(y_test, pred)

print(accuracy, f1_score)
print(conf_matrix)

0.9746121297602257 0.977005620848237
[[3086   65]
 [ 115 3824]]


In [None]:
#Random Forest
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)

accuracy = metrics.accuracy_score(y_test, pred)
f1_score = metrics.f1_score(y_test, pred)
conf_matrix = metrics.confusion_matrix(y_test, pred)

print(accuracy, f1_score)
print(conf_matrix)

In [None]:
#Linear SVM
svm=LinearSVC(C=1)
svm.fit(X_train, y_train)
pred = svm.predict(X_test)



accuracy = metrics.accuracy_score(y_test, pred)
f1_score = metrics.f1_score(y_test, pred)
conf_matrix = metrics.confusion_matrix(y_test, pred)

print(accuracy, f1_score)
print(conf_matrix)

#### Save model

In [None]:
import pickle

# Save to file in the current working directory
pkl_filename = "pickle_svm.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(svm, file)


#### Load model

In [None]:
# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
    
# Predict target values
score = pickle_model.score(Xtest, Ytest)
print("Test score: {0:.2f} %".format(100 * score))
Ypredict = pickle_model.predict(Xtest)