In [1]:
import spacy
import torch
from torchtext import data    
from libs import sql_tokenizer
import libs
import numpy as np
import pandas as pd

In [None]:
#Reproducing same results
SEED = 2019

#Torch
torch.manual_seed(SEED)

#Cuda algorithms
torch.backends.cudnn.deterministic = True  

In [None]:
TEXT = data.Field(tokenize='spacy',batch_first=True,include_lengths=True)
LABEL = data.LabelField(dtype = torch.float,batch_first=True)

In [None]:
fields = [("id", None), ('text',TEXT),('label', LABEL)]

In [None]:
training_data=data.TabularDataset(path = 'csv_files/safe_xss_sql.csv',format = 'csv',fields = fields,skip_header = True)

In [None]:

#print preprocessed text
#print(vars(training_data.examples[0]))
for i,dt in enumerate(training_data):
    if dt.label == "label":
        print("ID:",i)
        print(dt.text)
        print(dt.label)
        #print(training_data.__dir__())
        del training_data.examples[i]
        print("-"*50)
    if(len(dt.text) <= 0):
        training_data[i].text = "<blank>"
        #print(i,training_data[i].text,training_data[i].label)

In [None]:
import random
train_data, valid_data = training_data.split(split_ratio=0.7, random_state = random.seed(SEED))
#initialize glove embeddings
# uncomment to use pretrained glove and comment the other one
TEXT.build_vocab(train_data,min_freq=3,vectors = "fasttext.simple.300d")  
#TEXT.build_vocab(train_data,min_freq=3)
LABEL.build_vocab(train_data)
print("Size of TEXT vocabulary:",len(TEXT.vocab))
print("Size of LABEL vocabulary:",len(LABEL.vocab))
print(LABEL.vocab.stoi)
#Commonly used words
#print(TEXT.vocab.freqs.most_common(100))  
#print(TEXT.vocab.stoi)   

In [60]:
import pickle
def save_vocab(vocab, path):
    
    output = open(path, 'wb')
    pickle.dump(vocab, output)
    output.close()

save_vocab(TEXT,"cleaned-version/pickles/vocab.pt")
save_vocab(LABEL, "cleaned-version/pickles/label.pt")

In [None]:
with open("cleaned-version/pickles/vocab.pt", 'rb') as pickle_file:
    TEXT_ = pickle.load(pickle_file)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

#set batch size
BATCH_SIZE = 64

#Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

In [None]:
import torch.nn as nn

class classifier(nn.Module):
    
    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        
        #Constructor
        super().__init__()          
        
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        #dense layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        #activation function
#        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
      
#packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        
        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
                
        #hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs= dense_outputs#self.act(dense_outputs)
        
        return outputs

In [None]:
#LABEL.vocab.stoi['safe'] = 0
#LABEL.vocab.stoi['xss'] = 1 
#LABEL.vocab.stoi['sql'] = 2 
#LABEL.vocab.stoi.pop("label")


In [19]:
size_of_vocab = len(TEXT.vocab)
embedding_dim = 300 # 100
num_hidden_nodes = 32
num_output_nodes = len(LABEL.vocab.stoi)
num_layers = 2
bidirection = True
dropout = 0.2

#instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional = True, dropout = dropout)

In [20]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

#Initialize the pretrained embedding
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

classifier(
  (embedding): Embedding(9656, 300)
  (lstm): LSTM(300, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=64, out_features=3, bias=True)
)
The model has 3,007,587 trainable parameters
torch.Size([9656, 300])


In [21]:
import torch.optim as optim

#define optimizer and loss
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

#define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    rounded_pred = torch.round(preds)
    _,pred_label = torch.max(rounded_pred, dim = 1)
    correct = (pred_label == y).float()
    acc = correct.sum() / len(correct)
    return acc
    
#push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [22]:
def train(model, iterator, optimizer, criterion):
    
    #initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    #set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        #resets the gradients after every batch
        optimizer.zero_grad()   
        
        #retrieve text and no. of words
        text, text_lengths = batch.text   
        
        #convert to 1D tensor
        predictions = model(text, text_lengths).squeeze()  
 
        #compute the loss
        y_tensor = torch.tensor(batch.label, dtype=torch.long, device=device)
        loss = criterion(predictions, y_tensor)        
        
        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        #backpropage the loss and compute the gradients
        loss.backward()       
        
        #update the weights
        optimizer.step()      
        
        #loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [23]:
def evaluate(model, iterator, criterion):
    
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()
    
    #deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            #retrieve text and no. of words
            text, text_lengths = batch.text
            
            #convert to 1d tensor
            predictions = model(text, text_lengths).squeeze()
            
            #compute loss and accuracy
            y_tensor = torch.tensor(batch.label, dtype=torch.long, device=device)
            loss = criterion(predictions, y_tensor)      
            acc = binary_accuracy(predictions, batch.label)
            
            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [24]:
N_EPOCHS = 7
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    #train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    #evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')



	Train Loss: 0.797 | Train Acc: 56.80%
	 Val. Loss: 0.286 |  Val. Acc: 93.98%
	Train Loss: 0.119 | Train Acc: 96.82%
	 Val. Loss: 0.055 |  Val. Acc: 97.92%
	Train Loss: 0.041 | Train Acc: 98.29%
	 Val. Loss: 0.034 |  Val. Acc: 98.55%
	Train Loss: 0.031 | Train Acc: 98.99%
	 Val. Loss: 0.035 |  Val. Acc: 98.96%
	Train Loss: 0.029 | Train Acc: 99.24%
	 Val. Loss: 0.025 |  Val. Acc: 99.54%
	Train Loss: 0.024 | Train Acc: 99.47%
	 Val. Loss: 0.021 |  Val. Acc: 99.48%
	Train Loss: 0.019 | Train Acc: 99.60%
	 Val. Loss: 0.015 |  Val. Acc: 99.77%


In [39]:
import time
import os
# lets save the model
def save_model():
    models_path = "saved_weights"
    md_val_acc = "%.2f"%(valid_acc*100)
    model_name = "Acc "+md_val_acc+".pt"
    full_path = os.path.join(models_path, model_name)
    torch.save(model.state_dict(),full_path)
    print("SAVED\n",model_name)

In [40]:
save_model()

SAVED
 Acc 99.77.pt


In [41]:
import spacy

nlp = spacy.load('en')
pred_2_lbl = {num:key for key,num in LABEL.vocab.stoi.items()}
def predict(model,sentence):
    
    tokenized = [tok.text for tok in nlp.tokenizer(sql_tokenizer(sentence))] # tokenize the sentence
    print(tokenized)
    #indexed = [TEXT.vocab.stoi[t] for t in tokenized] # convert to integer sequence
    indexed = []
    for t in tokenized:
        tt = TEXT.vocab.stoi[t]
        if tt != 0:
            indexed.append(tt)
        
    print(indexed)
    length = [len(indexed)] #compute no. of words
    tensor = torch.LongTensor(indexed).to(device) # convert to tensor
    tensor = tensor.unsqueeze(1).T
    length_tensor = torch.LongTensor(length)
    prediction = model(tensor,length_tensor)
    pred_lbl = np.argmax(prediction.detach().numpy())
    print('\n')
    print('predicted threat type:',pred_2_lbl[pred_lbl])
    return prediction

In [42]:
pred = predict(model,""" SELECT * FROM items
WHERE owner = 'wiley'
AND itemname = 'name' OR 'a'='a'; """)

[' ', 'SELECT', 'STAR', 'FROM', 'items', '\n', 'where', 'OWNER', 'EQ', 'SQUT', 'wiley', 'SQUT', 'AND', 'itemname', 'EQ', 'SQUT', 'NAME', 'SQUT', 'OR', 'SQUT', 'A', 'SQUT', 'EQ', 'SQUT', 'A', 'SQUT', 'SMCLN']
[148, 105, 44, 1830, 42, 24, 6018, 24, 5, 42, 24, 503, 24, 49, 24, 4, 24, 42, 24, 4, 24, 19]


predicted threat type: injection


In [43]:
#np.argmax(pred.detach().numpy())

In [44]:
#LABEL.vocab.stoi.items()

# Safe Text

In [45]:
i_good = predict(model,'im good')
i_good

['i', 'm', 'good']
[11, 1790, 67]


predicted threat type: Safe


tensor([[ 2.5017, -1.1686, -1.4168]], grad_fn=<AddmmBackward>)

In [46]:
i_good.detach().numpy()[0]

array([ 2.5016747, -1.1686461, -1.4167845], dtype=float32)

In [47]:
pred_2_lbl

{0: 'Safe', 1: 'injection', 2: 'xss'}

# Sql detection

In [48]:
predict(model,'group_concat(namapemohon,0x3a,email),3,4,5,6 from pendaftaran_user ')

['group_concat', 'LPRN', 'namapemohon', 'CMMA', '0x3a', 'CMMA', 'email', 'RPRN', 'CMMA', 'INT', 'CMMA', 'INT', 'CMMA', 'INT', 'CMMA', 'INT', 'FROM', 'pendaftaran_user']
[26, 20, 20, 7682, 17, 20, 13, 20, 13, 20, 13, 20, 13, 44]


predicted threat type: injection


tensor([[-2.9238,  5.0265, -2.0399]], grad_fn=<AddmmBackward>)

In [49]:
predict(model, """
<SCRIPT type="text/javascript">
var adr = '../evil.php?cakemonster=' + escape(document.cookie);
</SCRIPT>
""")

['LT', 'script', 'TYPE', 'EQ', 'DQUT', 'TEXT', 'SLSH', 'javascript', 'DQUT', 'GT', 'var', 'adr', 'EQ', 'SQUT', 'DOT', 'DOT', 'SLSH', 'evil', 'DOT', 'php', 'QSTN', 'cakemonster', 'EQ', 'SQUT', 'PLUS', 'ESCAPE', 'LPRN', 'DOCUMENT', 'DOT', 'cookie', 'RPRN', 'SMCLN', ' ', 'LT', 'SLSH', 'script', 'GT']
[62, 88, 505, 42, 47, 819, 35, 268, 47, 65, 2085, 42, 24, 2, 2, 35, 1211, 2, 1009, 42, 24, 174, 1632, 26, 859, 2, 1210, 17, 19, 62, 35, 88, 65]


predicted threat type: xss


tensor([[-2.6121, -1.3265,  4.5491]], grad_fn=<AddmmBackward>)

In [50]:
predict(model,"this is safe text lmao hema dufi")

['this', 'IS', 'safe', 'TEXT', 'lmao', 'hema', 'dufi']
[14, 9, 1889, 819, 8329]


predicted threat type: Safe


tensor([[ 4.1226, -2.3482, -2.1836]], grad_fn=<AddmmBackward>)

In [51]:
predict(model,"hema shko here today")

['hema', 'shko', 'here', 'today']
[168, 666]


predicted threat type: Safe


tensor([[ 2.4138, -1.4912, -1.1647]], grad_fn=<AddmmBackward>)

In [52]:
# fixing this problem using pretrained vocabs like globe.6.b

In [58]:
predict(model, "hello did you write hema's script")

['hello', 'did', 'you', 'WRITE', 'hema', 'SQUT', 's', 'script']
[139, 33, 845, 24, 16, 88]


predicted threat type: Safe


tensor([[ 2.2955, -3.4369,  0.7129]], grad_fn=<AddmmBackward>)