In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader

In [2]:
device = 'cuda:0' if torch.cuda.is_available else 'cpu'
device

'cuda:0'

In [3]:
!ls

1dcnn_csic2010.ipynb README.md
LICENSE              csic_database.csv


# Data Load

In [4]:
data = pd.read_csv('csic_database.csv')
print(data.columns)
data.head(5)

Index(['Unnamed: 0', 'Method', 'User-Agent', 'Pragma', 'Cache-Control',
       'Accept', 'Accept-encoding', 'Accept-charset', 'language', 'host',
       'cookie', 'content-type', 'connection', 'lenght', 'content',
       'classification', 'URL'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,Method,User-Agent,Pragma,Cache-Control,Accept,Accept-encoding,Accept-charset,language,host,cookie,content-type,connection,lenght,content,classification,URL
0,Normal,GET,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=1F767F17239C9B670A39E9B10C3825F4,,close,,,0,http://localhost:8080/tienda1/index.jsp HTTP/1.1
1,Normal,GET,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=81761ACA043B0E6014CA42A4BCD06AB5,,close,,,0,http://localhost:8080/tienda1/publico/anadir.j...
2,Normal,POST,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=933185092E0B668B90676E0A2B0767AF,application/x-www-form-urlencoded,Connection: close,Content-Length: 68,id=3&nombre=Vino+Rioja&precio=100&cantidad=55&...,0,http://localhost:8080/tienda1/publico/anadir.j...
3,Normal,GET,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=8FA18BA82C5336D03D3A8AFA3E68CBB0,,close,,,0,http://localhost:8080/tienda1/publico/autentic...
4,Normal,POST,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=7104E6C68A6BCF1423DAE990CE49FEE2,application/x-www-form-urlencoded,Connection: close,Content-Length: 63,modo=entrar&login=choong&pwd=d1se3ci%F3n&remem...,0,http://localhost:8080/tienda1/publico/autentic...


In [5]:
include =['object', 'float', 'int']
data.describe(include=include)

Unnamed: 0.1,Unnamed: 0,Method,User-Agent,Pragma,Cache-Control,Accept,Accept-encoding,Accept-charset,language,host,cookie,content-type,connection,lenght,content,classification,URL
count,61065,61065,61065,61065,61065,60668,61065,61065,61065,61065,61065,17977,61065,17977,17977,61065.0,61065
unique,2,3,1,1,1,1,1,1,1,2,61065,1,2,382,12091,,13498
top,Normal,GET,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=92418148BF9CBA27D0B66A3DA85E9F74,application/x-www-form-urlencoded,close,Content-Length: 4,B2=Vaciar+carrito,,http://localhost:8080/tienda1/publico/anadir.j...
freq,36000,43088,61065,61065,61065,60668,61065,61065,61065,60668,1,17977,43088,1057,1046,,2441
mean,,,,,,,,,,,,,,,,0.410464,
std,,,,,,,,,,,,,,,,0.491922,
min,,,,,,,,,,,,,,,,0.0,
25%,,,,,,,,,,,,,,,,0.0,
50%,,,,,,,,,,,,,,,,0.0,
75%,,,,,,,,,,,,,,,,1.0,


In [6]:
print('noraml and abnormal ratio:')
print(data['Unnamed: 0'].value_counts())

noraml and abnormal ratio:
Normal       36000
Anomalous    25065
Name: Unnamed: 0, dtype: int64


## Data Selection

- choose X and y data

In [7]:
X = data[['Method','User-Agent','Pragma','Cache-Control', 'Accept','Accept-encoding','language', 'host', 'cookie', 'content-type', 'connection','lenght', 'content','URL']]
y = data['classification']

In [8]:
print(X.columns)
X.head()

Index(['Method', 'User-Agent', 'Pragma', 'Cache-Control', 'Accept',
       'Accept-encoding', 'language', 'host', 'cookie', 'content-type',
       'connection', 'lenght', 'content', 'URL'],
      dtype='object')


Unnamed: 0,Method,User-Agent,Pragma,Cache-Control,Accept,Accept-encoding,language,host,cookie,content-type,connection,lenght,content,URL
0,GET,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate",en,localhost:8080,JSESSIONID=1F767F17239C9B670A39E9B10C3825F4,,close,,,http://localhost:8080/tienda1/index.jsp HTTP/1.1
1,GET,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate",en,localhost:8080,JSESSIONID=81761ACA043B0E6014CA42A4BCD06AB5,,close,,,http://localhost:8080/tienda1/publico/anadir.j...
2,POST,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate",en,localhost:8080,JSESSIONID=933185092E0B668B90676E0A2B0767AF,application/x-www-form-urlencoded,Connection: close,Content-Length: 68,id=3&nombre=Vino+Rioja&precio=100&cantidad=55&...,http://localhost:8080/tienda1/publico/anadir.j...
3,GET,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate",en,localhost:8080,JSESSIONID=8FA18BA82C5336D03D3A8AFA3E68CBB0,,close,,,http://localhost:8080/tienda1/publico/autentic...
4,POST,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate",en,localhost:8080,JSESSIONID=7104E6C68A6BCF1423DAE990CE49FEE2,application/x-www-form-urlencoded,Connection: close,Content-Length: 63,modo=entrar&login=choong&pwd=d1se3ci%F3n&remem...,http://localhost:8080/tienda1/publico/autentic...


In [9]:
X.iloc[0]['Method']

'GET'

In [10]:
X_string = []
cols = X.columns
for i in range(len(X)):
    temp = ''
    for col in cols:
        temp += str(X.iloc[i][col])
    ans = ''
    for t in temp:
        if t.isalpha():
            ans += t.lower()
        else:
            ans += t
    X_string.append(ans)

In [12]:
print('Example: ')
print(X_string[0])

Example: 
getmozilla/5.0 (compatible; konqueror/3.5; linux) khtml/3.5.8 (like gecko)no-cacheno-cachetext/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5x-gzip, x-deflate, gzip, deflateenlocalhost:8080jsessionid=1f767f17239c9b670a39e9b10c3825f4nanclosenannanhttp://localhost:8080/tienda1/index.jsp http/1.1


## Data Preprocessing

### Change alphabet into index
- get index for all alphabets and sign into index
- designate each element into one index num

In [17]:
char_list = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"

char_dict = {}
for idx, cha in enumerate(char_list):
    char_dict[cha] = idx + 1
print(char_dict)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '0': 27, '1': 28, '2': 29, '3': 30, '4': 31, '5': 32, '6': 33, '7': 34, '8': 35, '9': 36, '-': 60, ',': 38, ';': 39, '.': 40, '!': 41, '?': 42, ':': 43, "'": 44, '"': 45, '/': 46, '\\': 47, '|': 48, '_': 49, '@': 50, '#': 51, '$': 52, '%': 53, '^': 54, '&': 55, '*': 56, '~': 57, '`': 58, '+': 59, '=': 61, '<': 62, '>': 63, '(': 64, ')': 65, '[': 66, ']': 67, '{': 68, '}': 69}


In [22]:
def str_to_index(s):
    str2idx = np.zeros(len(s))
    s = s.lower()
    for i in range(len(s)):
        c = s[i]
        if c in char_dict:
            str2idx[i] = char_dict[c]
    return torch.Tensor(str2idx)

In [23]:
print('String to index example: ')
print(str_to_index(X_string[0]))

String to index example: 
tensor([ 7.,  5., 20., 13., 15., 26.,  9., 12., 12.,  1., 46., 32., 40., 27.,
         0., 64.,  3., 15., 13., 16.,  1., 20.,  9.,  2., 12.,  5., 39.,  0.,
        11., 15., 14., 17., 21.,  5., 18., 15., 18., 46., 30., 40., 32., 39.,
         0., 12.,  9., 14., 21., 24., 65.,  0., 11.,  8., 20., 13., 12., 46.,
        30., 40., 32., 40., 35.,  0., 64., 12.,  9., 11.,  5.,  0.,  7.,  5.,
         3., 11., 15., 65., 14., 15., 60.,  3.,  1.,  3.,  8.,  5., 14., 15.,
        60.,  3.,  1.,  3.,  8.,  5., 20.,  5., 24., 20., 46., 24., 13., 12.,
        38.,  1., 16., 16., 12.,  9.,  3.,  1., 20.,  9., 15., 14., 46., 24.,
        13., 12., 38.,  1., 16., 16., 12.,  9.,  3.,  1., 20.,  9., 15., 14.,
        46., 24.,  8., 20., 13., 12., 59., 24., 13., 12., 38., 20.,  5., 24.,
        20., 46.,  8., 20., 13., 12., 39., 17., 61., 27., 40., 36., 38., 20.,
         5., 24., 20., 46., 16., 12.,  1.,  9., 14., 39., 17., 61., 27., 40.,
        35., 38.,  9., 13.,  1.,  7., 

In [12]:
X_idx = []
for x in X_string:
    X_idx.append(str_to_index(x))

In [13]:
X_idx[0]

tensor([ 7.,  5., 20., 13., 15., 26.,  9., 12., 12.,  1., 46., 32., 40., 27.,
         0., 64.,  3., 15., 13., 16.,  1., 20.,  9.,  2., 12.,  5., 39.,  0.,
        11., 15., 14., 17., 21.,  5., 18., 15., 18., 46., 30., 40., 32., 39.,
         0., 12.,  9., 14., 21., 24., 65.,  0., 11.,  8., 20., 13., 12., 46.,
        30., 40., 32., 40., 35.,  0., 64., 12.,  9., 11.,  5.,  0.,  7.,  5.,
         3., 11., 15., 65., 14., 15., 60.,  3.,  1.,  3.,  8.,  5., 14., 15.,
        60.,  3.,  1.,  3.,  8.,  5., 20.,  5., 24., 20., 46., 24., 13., 12.,
        38.,  1., 16., 16., 12.,  9.,  3.,  1., 20.,  9., 15., 14., 46., 24.,
        13., 12., 38.,  1., 16., 16., 12.,  9.,  3.,  1., 20.,  9., 15., 14.,
        46., 24.,  8., 20., 13., 12., 59., 24., 13., 12., 38., 20.,  5., 24.,
        20., 46.,  8., 20., 13., 12., 39., 17., 61., 27., 40., 36., 38., 20.,
         5., 24., 20., 46., 16., 12.,  1.,  9., 14., 39., 17., 61., 27., 40.,
        35., 38.,  9., 13.,  1.,  7.,  5., 46., 16., 14.,  7., 3

- padding `X_idx` into max length

In [14]:
X_idx_padding = pad_sequence(X_idx, batch_first=True)

In [15]:
X_idx_padding[0]

tensor([ 7.,  5., 20.,  ...,  0.,  0.,  0.])

## Split Data into Train, Valid, Test Set

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_idx_padding, torch.Tensor(y), test_size=0.2, shuffle=True, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, shuffle=True, random_state=42)

In [17]:
y_train[:5]

tensor([0., 0., 1., 0., 1.])

In [18]:
batch_size = 128

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=6)

valid_dataset = TensorDataset(X_valid, y_valid)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=6)

test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=6)

# Model

In [29]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(CNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.cnn1 = nn.Sequential(nn.Conv1d(embed_dim, 64, 7),
                                  nn.ReLU(),
                                  nn.MaxPool1d(3))
        self.cnn2 = nn.Sequential(nn.Conv1d(64, 64, 7),
                                  nn.ReLU(),
                                  nn.MaxPool1d(3))
        self.cnn3 = nn.Sequential(nn.Conv1d(64, 64, 7),
                                  nn.ReLU(),
                                  nn.MaxPool1d(3))
        self.cnn4 = nn.Sequential(nn.Conv1d(64, 64, 7),
                                  nn.ReLU(),
                                  nn.MaxPool1d(3))
#         self.cnn5 = nn.Sequential(nn.Conv1d(256, 256, 7),
#                                   nn.BatchNorm1d(256),
#                                   nn.ReLU())
#         self.cnn6 = nn.Sequential(nn.Conv1d(256, 256, 7),
#                                   nn.BatchNorm1d(256),
#                                   nn.ReLU())
        
        self.relu = nn.ReLU()
        
        
        feat_size = 768
        self.fc = nn.Sequential(nn.Linear(feat_size, 128),
                                nn.ReLU(),
                                nn.Dropout(p=0.5),
                                nn.Linear(128, num_class)
#                                 nn.ReLU(),
#                                 nn.Dropout(p=0.5),
#                                 nn.Linear(128, num_class)
                               )
                                
    def forward(self, inp):
        x = self.embedding(inp)
        x = x.permute(0, 2, 1)

        for idx, cnn in enumerate([self.cnn1, self.cnn2, self.cnn3, self.cnn4]): #, self.cnn5, self.cnn6]):
            x = cnn(x)
#         x = self.maxpool1d(x)
        
        x = x.reshape(x.size(0), -1)
#         print(x.shape)
        x = self.fc(x)
        return x
    

# Train & Test

In [30]:
import copy

criterion = nn.CrossEntropyLoss().to(device)
test_acces = []
for _ in range(1):
    model = CNN(len(alpha_dict), 64, 2).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
    epoch = 10
    
    best_acc = 0
    best_weight = copy.deepcopy(model.state_dict())
    val_acces = []
    for i in range(epoch):
        total_acc, total_loss, total_cnt = 0., 0., 0

        for X, y in train_loader:
            model.train()
            X, y = X.long().to(device), y.long().to(device)


            output = model(X)
            loss = criterion(output, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_acc += (output.cpu().argmax(1) == y.cpu()).sum().item()
            total_loss += loss.item()
            total_cnt += y.size(0)

        print('Epoch: {}, Accuracy: {:.4f}, Loss: {:.4f}'.format(
        i+1, total_acc/total_cnt, total_loss/len(train_loader))
        )

        with torch.no_grad():
            model.eval()

            val_acc, val_loss, val_cnt = 0., 0., 0
            for X, y in valid_loader:
                X, y = X.long().to(device), y.long().to(device)

                predicted = model(X)
                loss = criterion(predicted, y)

                val_acc += (predicted.cpu().argmax(1) == y.cpu()).sum().item()
                val_loss += loss.item()
                val_cnt += y.size(0)

        torch.cuda.empty_cache()

        val_acc = val_acc/val_cnt
        val_acces.append(val_acc)
        print('Validation --- Accuracy: {:.4f}, Loss: {:.4f}'.format(
        val_acc, val_loss/len(valid_loader))
        )
        if val_acc > best_acc:
            best_acc = val_acc
            best_weight = copy.deepcopy(model.state_dict())
            print("------best_acc is updated.--------")

    model.load_state_dict(best_weight)

    model.eval()

    test_acc, test_loss, test_cnt = 0., 0., 0
    for X, y in test_loader:
        X, y = X.long().to(device), y.long().to(device)
        predicted = model(X)
        loss = criterion(predicted, y)

        test_acc += (predicted.cpu().argmax(1) == y.cpu()).sum().item()
        test_loss += loss.item()
        test_cnt += y.size(0)

        torch.cuda.empty_cache()
    
    print('================================================================')
    print('Test --- Accuracy: {:.4f}, Loss: {:.4f}'.format(
    test_acc/test_cnt, test_loss/len(test_loader))
    )
    print('================================================================')
    test_acces.append(test_acc/test_cnt)

Epoch: 1, Accuracy: 0.9040, Loss: 0.1841
Validation --- Accuracy: 0.9625, Loss: 0.0886
------best_acc is updated.--------
Epoch: 2, Accuracy: 0.9798, Loss: 0.0540
Validation --- Accuracy: 0.9828, Loss: 0.0506
------best_acc is updated.--------
Epoch: 3, Accuracy: 0.9869, Loss: 0.0369
Validation --- Accuracy: 0.9685, Loss: 0.0746
Epoch: 4, Accuracy: 0.9897, Loss: 0.0308
Validation --- Accuracy: 0.9849, Loss: 0.0380
------best_acc is updated.--------
Epoch: 5, Accuracy: 0.9921, Loss: 0.0249
Validation --- Accuracy: 0.9867, Loss: 0.0381
------best_acc is updated.--------
Epoch: 6, Accuracy: 0.9930, Loss: 0.0212
Validation --- Accuracy: 0.9881, Loss: 0.0402
------best_acc is updated.--------
Epoch: 7, Accuracy: 0.9950, Loss: 0.0155
Validation --- Accuracy: 0.9887, Loss: 0.0417
------best_acc is updated.--------
Epoch: 8, Accuracy: 0.9955, Loss: 0.0137
Validation --- Accuracy: 0.9910, Loss: 0.0350
------best_acc is updated.--------
Epoch: 9, Accuracy: 0.9966, Loss: 0.0101
Validation --- Acc