# Modeling the CSIC 2010 Dataset for TFM ITI

# Classifiers

### Import libraries

In [1]:
import pandas as pd

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import metrics

In [6]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.utils import shuffle

In [4]:
df = pd.read_csv('all_traffic_dataset.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
# Remove columns that contain the same values
df = df.drop(['userAgent', 'pragma', 'cacheControl', 'acceptEncoding', 'acceptCharset', 'acceptLanguage'], 1)
df = df.drop(['connection', 'cookie', 'accept', 'protocol'], 1)

# Keep only the port because everything else is localhost
df['port'] = df['host'].str.split(':', expand=True)[1]
df = df.drop(['host'], 1)
df.head()

  df = df.drop(['userAgent', 'pragma', 'cacheControl', 'acceptEncoding', 'acceptCharset', 'acceptLanguage'], 1)
  df = df.drop(['connection', 'cookie', 'accept', 'protocol'], 1)
  df = df.drop(['host'], 1)


Unnamed: 0,index,method,url,contentLength,contentType,payload,label,port
0,0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,,,id=2,anom,8080
1,0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,,,nombre=Jam%F3n+Ib%E9rico,anom,8080
2,0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,,,precio=85,anom,8080
3,0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,,,cantidad=%27%3B+DROP+TABLE+usuarios%3B+SELECT+...,anom,8080
4,0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,,,B1=A%F1adir+al+carrito,anom,8080


In [9]:
# Split the dataset in two to avoid mixed indices
df_anom = df[df['label']=='anom']
df_norm = df[df['label']=='norm']

In [10]:
df2_anom = df_anom[['index', 'payload', 'label']]
df2_anom = df2_anom.dropna()
print(df2_anom.head())

df2_norm = df_norm[['index', 'payload', 'label']]
df2_norm = df2_norm.dropna()
print(df2_norm.head())

   index                                            payload label
0      0                                               id=2  anom
1      0                           nombre=Jam%F3n+Ib%E9rico  anom
2      0                                          precio=85  anom
3      0  cantidad=%27%3B+DROP+TABLE+usuarios%3B+SELECT+...  anom
4      0                             B1=A%F1adir+al+carrito  anom
        index                 payload label
119586      1                    id=3  norm
119587      1       nombre=Vino+Rioja  norm
119588      1              precio=100  norm
119589      1             cantidad=55  norm
119590      1  B1=A%F1adir+al+carrito  norm


In [11]:
df3_anom = df2_anom[['payload','label']].groupby(df2_anom['index']).agg(lambda x: ' '.join(set(x)))
df3_anom["payload"] = df3_anom['payload'].apply(lambda x: x.replace("=", " "))
print(df3_anom.head())

df3_anom['label'] = 1
print(df3_anom.head())

                                                 payload label
index                                                         
0      cantidad %27%3B+DROP+TABLE+usuarios%3B+SELECT+...  anom
1      nombre Jam%F3n+Ib%E9rico B1 A%F1adir+al+carrit...  anom
3      remember on pwd 84m3ri156 B1 Entrar modo entra...  anom
4      remember on B1 Entrar modo entrar login grimsh...  anom
5      pwd 84m3ri156 B1 Entrar modo entrar login grim...  anom
                                                 payload  label
index                                                          
0      cantidad %27%3B+DROP+TABLE+usuarios%3B+SELECT+...      1
1      nombre Jam%F3n+Ib%E9rico B1 A%F1adir+al+carrit...      1
3      remember on pwd 84m3ri156 B1 Entrar modo entra...      1
4      remember on B1 Entrar modo entrar login grimsh...      1
5      pwd 84m3ri156 B1 Entrar modo entrar login grim...      1


In [12]:
df3_norm = df2_norm[['payload','label']].groupby(df2_norm['index']).agg(lambda x: ' '.join(set(x)))
df3_norm["payload"] = df3_norm['payload'].apply(lambda x: x.replace("=", " "))
print(df3_norm.head())

df3_norm['label'] = 0
print(df3_norm.head())

                                                 payload label
index                                                         
1      cantidad 55 id 3 B1 A%F1adir+al+carrito nombre...  norm
2      pwd d1se3ci%F3n B1 Entrar modo entrar remember...  norm
3                                                   id 2  norm
5                      errorMsg Credenciales+incorrectas  norm
7            B1 Pasar+por+caja precio 2672 modo insertar  norm
                                                 payload  label
index                                                          
1      cantidad 55 id 3 B1 A%F1adir+al+carrito nombre...      0
2      pwd d1se3ci%F3n B1 Entrar modo entrar remember...      0
3                                                   id 2      0
5                      errorMsg Credenciales+incorrectas      0
7            B1 Pasar+por+caja precio 2672 modo insertar      0


In [13]:
df4 = pd.concat([df3_norm, df3_anom])
print(df4.head())
print(df4.describe())
print(df4.label.value_counts())

                                                 payload  label
index                                                          
1      cantidad 55 id 3 B1 A%F1adir+al+carrito nombre...      0
2      pwd d1se3ci%F3n B1 Entrar modo entrar remember...      0
3                                                   id 2      0
5                      errorMsg Credenciales+incorrectas      0
7            B1 Pasar+por+caja precio 2672 modo insertar      0
              label
count  35574.000000
mean       0.550233
std        0.497477
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
1    19574
0    16000
Name: label, dtype: int64


In [14]:
# Vectorize the payload by creating character n-grams
vec = TfidfVectorizer(analyzer='word',ngram_range=(3,3))

y = df4['label']
X = vec.fit_transform(df4['payload'].dropna())

In [15]:
print(X.shape, y.shape)

(35574, 115429) (35574,)


In [16]:
# Use a chi-squared test to extract features
ch2 = SelectKBest(chi2, k=600)
X_train = ch2.fit_transform(X, y)
print(X_train.shape)

(35574, 600)


In [17]:
X1, y1 = shuffle(X_train, y)
offset = int(X1.shape[0] * 0.8)

In [18]:
# Random Forest Classifier
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(X1[:offset], y1[:offset])
pred = clf.predict(X1[offset:,:])

accuracy = metrics.accuracy_score(y1[offset:], pred)
f1_score = metrics.f1_score(y1[offset:], pred)
conf_matrix = metrics.confusion_matrix(y1[offset:], pred)
print(accuracy, f1_score)
print(conf_matrix)

0.8109627547434997 0.7919566898685229
[[3210   28]
 [1317 2560]]


In [19]:
# Gradient Boosting Classifier
clf = GradientBoostingClassifier(n_estimators=1000)
clf.fit(X1[:offset], y1[:offset])
pred = clf.predict(X1[offset:,:].toarray())

accuracy = metrics.accuracy_score(y1[offset:], pred)
f1_score = metrics.f1_score(y1[offset:], pred)
conf_matrix = metrics.confusion_matrix(y1[offset:], pred)
print(accuracy, f1_score)
print(conf_matrix)

0.8053408292340126 0.8440139655366595
[[1983 1255]
 [ 130 3747]]
