# Modeling the CSIC 2010 Dataset for TFM ITI

# Classifiers

### Import libraries

In [6]:
import pandas as pd

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import metrics

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import tree
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.utils import shuffle

In [9]:
df = pd.read_csv('all_traffic_dataset.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [10]:
# Remove columns that contain the same values
df = df.drop(['userAgent', 'pragma', 'cacheControl', 'acceptEncoding', 'acceptCharset', 'acceptLanguage'], 1)
df = df.drop(['connection', 'cookie', 'accept', 'protocol'], 1)

# Keep only the port because everything else is localhost
df['port'] = df['host'].str.split(':', expand=True)[1]
df = df.drop(['host'], 1)
df.head()

  df = df.drop(['userAgent', 'pragma', 'cacheControl', 'acceptEncoding', 'acceptCharset', 'acceptLanguage'], 1)
  df = df.drop(['connection', 'cookie', 'accept', 'protocol'], 1)
  df = df.drop(['host'], 1)


Unnamed: 0,index,method,url,contentLength,contentType,payload,label,port
0,0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,,,id=2,anom,8080
1,0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,,,nombre=Jam%F3n+Ib%E9rico,anom,8080
2,0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,,,precio=85,anom,8080
3,0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,,,cantidad=%27%3B+DROP+TABLE+usuarios%3B+SELECT+...,anom,8080
4,0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,,,B1=A%F1adir+al+carrito,anom,8080


In [11]:
# Split the dataset in two to avoid mixed indices
df_anom = df[df['label']=='anom']
df_norm = df[df['label']=='norm']

In [12]:
df2_anom = df_anom[['index', 'payload', 'label']]
df2_anom = df2_anom.dropna()
print(df2_anom.head())

df2_norm = df_norm[['index', 'payload', 'label']]
df2_norm = df2_norm.dropna()
print(df2_norm.head())

   index                                            payload label
0      0                                               id=2  anom
1      0                           nombre=Jam%F3n+Ib%E9rico  anom
2      0                                          precio=85  anom
3      0  cantidad=%27%3B+DROP+TABLE+usuarios%3B+SELECT+...  anom
4      0                             B1=A%F1adir+al+carrito  anom
        index                 payload label
119586      1                    id=3  norm
119587      1       nombre=Vino+Rioja  norm
119588      1              precio=100  norm
119589      1             cantidad=55  norm
119590      1  B1=A%F1adir+al+carrito  norm


In [13]:
df3_anom = df2_anom[['payload','label']].groupby(df2_anom['index']).agg(lambda x: ' '.join(set(x)))
df3_anom["payload"] = df3_anom['payload'].apply(lambda x: x.replace("=", " "))
print(df3_anom.head())

df3_anom['label'] = 1
print(df3_anom.head())

                                                 payload label
index                                                         
0      B1 A%F1adir+al+carrito nombre Jam%F3n+Ib%E9ric...  anom
1      B1 A%F1adir+al+carrito cantidad 49 nombre Jam%...  anom
3      pwd 84m3ri156 modo entrar B1 Entrar login bob%...  anom
4      modo entrar B1 Entrar login grimshaw pwd G%2F%...  anom
5      pwd 84m3ri156 modo entrar B1 Entrar rememberA ...  anom
                                                 payload  label
index                                                          
0      B1 A%F1adir+al+carrito nombre Jam%F3n+Ib%E9ric...      1
1      B1 A%F1adir+al+carrito cantidad 49 nombre Jam%...      1
3      pwd 84m3ri156 modo entrar B1 Entrar login bob%...      1
4      modo entrar B1 Entrar login grimshaw pwd G%2F%...      1
5      pwd 84m3ri156 modo entrar B1 Entrar rememberA ...      1


In [14]:
df3_norm = df2_norm[['payload','label']].groupby(df2_norm['index']).agg(lambda x: ' '.join(set(x)))
df3_norm["payload"] = df3_norm['payload'].apply(lambda x: x.replace("=", " "))
print(df3_norm.head())

df3_norm['label'] = 0
print(df3_norm.head())

                                                 payload label
index                                                         
1      B1 A%F1adir+al+carrito id 3 cantidad 55 nombre...  norm
2      remember off pwd d1se3ci%F3n modo entrar B1 En...  norm
3                                                   id 2  norm
5                      errorMsg Credenciales+incorrectas  norm
7            modo insertar B1 Pasar+por+caja precio 2672  norm
                                                 payload  label
index                                                          
1      B1 A%F1adir+al+carrito id 3 cantidad 55 nombre...      0
2      remember off pwd d1se3ci%F3n modo entrar B1 En...      0
3                                                   id 2      0
5                      errorMsg Credenciales+incorrectas      0
7            modo insertar B1 Pasar+por+caja precio 2672      0


In [15]:
df4 = pd.concat([df3_norm, df3_anom])
print(df4.head())
print(df4.describe())
print(df4.label.value_counts())

                                                 payload  label
index                                                          
1      B1 A%F1adir+al+carrito id 3 cantidad 55 nombre...      0
2      remember off pwd d1se3ci%F3n modo entrar B1 En...      0
3                                                   id 2      0
5                      errorMsg Credenciales+incorrectas      0
7            modo insertar B1 Pasar+por+caja precio 2672      0
              label
count  35574.000000
mean       0.550233
std        0.497477
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
1    19574
0    16000
Name: label, dtype: int64


In [16]:
# Vectorize the payload by creating character n-grams
vec = TfidfVectorizer(analyzer='word',ngram_range=(3,3))

y = df4['label']
X = vec.fit_transform(df4['payload'].dropna())

In [17]:
print(X.shape, y.shape)

(35574, 114048) (35574,)


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

In [19]:
#Logistic Regression
lgs = LogisticRegression()
lgs.fit(X_train, y_train)
pred = lgs.predict(X_test)

accuracy = metrics.accuracy_score(y_test, pred)
f1_score = metrics.f1_score(y_test, pred)
conf_matrix = metrics.confusion_matrix(y_test, pred)

print(accuracy, f1_score)
print(conf_matrix)

0.907659873506676 0.9187384044526903
[[2744  444]
 [ 213 3714]]


In [23]:
#Decesion Tree
dtc = tree.DecisionTreeClassifier()
dtc.fit(X_train, y_train)
pred = dtc.predict(X_test)

accuracy = metrics.accuracy_score(y_test, pred)
f1_score = metrics.f1_score(y_test, pred)
conf_matrix = metrics.confusion_matrix(y_test, pred)

print(accuracy, f1_score)
print(conf_matrix)

0.9319747013352073 0.9352941176470588
[[3133   55]
 [ 429 3498]]


In [25]:
#Random Forest
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)

accuracy = metrics.accuracy_score(y_test, pred)
f1_score = metrics.f1_score(y_test, pred)
conf_matrix = metrics.confusion_matrix(y_test, pred)

print(accuracy, f1_score)
print(conf_matrix)

0.9366127898805341 0.9394874547162216
[[3163   25]
 [ 426 3501]]


In [24]:
#Linear SVM
svm=LinearSVC(C=1)
svm.fit(X_train, y_train)
pred = svm.predict(X_test)



accuracy = metrics.accuracy_score(y_test, pred)
f1_score = metrics.f1_score(y_test, pred)
conf_matrix = metrics.confusion_matrix(y_test, pred)

print(accuracy, f1_score)
print(conf_matrix)

0.9420941672522839 0.9446682782702123
[[3186    2]
 [ 410 3517]]


#### Save model

In [44]:
import pickle

# Save to file in the current working directory
pkl_filename = "pickle_svm.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(svm, file)


#### Load model

In [None]:
# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
    
# Predict target values
score = pickle_model.score(Xtest, Ytest)
print("Test score: {0:.2f} %".format(100 * score))
Ypredict = pickle_model.predict(Xtest)