# Cyber Security Project (CP8320)
**Title: Detection of SQL injection with a Machine Learning approach**

**Name : Urmi Patel (501064008)**

Press Shift+Enter to run the notebook one cell at a time.

# Import necessary libraries

In [None]:
import sys
import pandas as pd
import numpy as np
import glob
import time
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import tree
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
import pickle
from sklearn.metrics import accuracy_score
from keras.models import load_model


# Add/import dataset

When you run this cell, click the "Choose Files" button and select the provided dataset file (`sqli.csv`). Once the cell has finished running, the dataset file will have been uploaded and is available to this notebook.

In [None]:
from google.colab import files

# Ask the user to pick file(s), then report the name and byte size of each upload.
uploaded = files.upload()
for filename, content in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(name=filename, length=len(content)))

In [None]:
# Load sqli.csv from /content into a DataFrame, decoding the file as UTF-16.
df = pd.read_csv('/content/sqli.csv',encoding='utf-16')

In [None]:
# Preview the first rows of the loaded DataFrame.
df.head()

# Vectorization

In [None]:
# Bag-of-words encoder for the 'Sentence' column: English stop words are removed,
# terms occurring in more than 70% of the rows are ignored (max_df=0.7), and the
# vocabulary is capped at 4096 terms (the later reshape relies on 4096 = 64 * 64).
vector = CountVectorizer( max_df=0.7, max_features=4096, stop_words=stopwords.words('english'))
# Cast every entry to a unicode string before fitting, then densify the sparse count matrix.
new_sent = vector.fit_transform(df['Sentence'].values.astype('U')).toarray()

In [None]:
# Print the learned vocabulary: a mapping from each unique term to its column index.
print("Sentences: ", vector.vocabulary_)

In [None]:
# Number of distinct terms the vectorizer kept.
len(vector.vocabulary_)

In [None]:
# Show the dense term-count matrix produced by the vectorizer.
print(new_sent)

In [None]:
# Reshape every feature row into a 64x64 single-channel "image" for the CNN.
# This requires exactly 64 * 64 = 4096 features per row. The number of rows is
# inferred (-1) instead of being hard-coded, so any dataset size works.
new_sent = new_sent.reshape(-1, 64, 64, 1)
new_sent.shape

In [None]:
# Features are the reshaped count matrix; targets come from the 'Label' column.
X, y = new_sent, df['Label']

In [None]:
# Inspect the feature array that will be split into train and test sets.
print(X)

# Training Testing Splits

In [None]:
# Split into training and testing data: 20% is held out for testing, and the
# fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Flatten each sample into a 1-D feature vector for the scikit-learn classifiers.
# reshape(n, -1) infers the flattened width, so it does not depend on the exact
# image dimensions and leaves X_train / X_test themselves untouched.
trainX = X_train.reshape(X_train.shape[0], -1)
testX = X_test.reshape(X_test.shape[0], -1)


# Naive Bayes

In [None]:
# Fit Gaussian Naive Bayes on the flattened training vectors, then label the test set.
nb = GaussianNB().fit(trainX, y_train)
pred_nb = nb.predict(testX)

# SVM (Support Vector Machine)

In [None]:
# Fit a support vector classifier (gamma='auto') and label the test set.
clf = SVC(gamma='auto').fit(trainX, y_train)
pred_svm = clf.predict(testX)

# Logistic Regression

In [None]:
# Fit logistic regression with a fixed seed and label the test set.
clff = LogisticRegression(random_state=0)
clff.fit(trainX, y_train)
pred_lr = clff.predict(testX)

# KNN (K Nearest Neighbour)

In [None]:
# Fit a 3-nearest-neighbours classifier and label the test set.
kn = KNeighborsClassifier(n_neighbors=3).fit(trainX, y_train)
pred_knn = kn.predict(testX)

# DT (Decision Tree)

In [None]:
# Fit a decision tree with default settings and label the test set.
dt = tree.DecisionTreeClassifier().fit(trainX, y_train)
pred_dt = dt.predict(testX)

# Neural Network

In [None]:
# CNN binary classifier over the 64x64x1 "image" form of each vectorized sentence.
model = Sequential()

# Conv2D and MaxPooling2D are imported directly from tensorflow.keras.layers;
# no `layers` module is imported, so the classes are referenced by bare name.
model.add(Conv2D(64, (3, 3), activation='relu', input_shape=(64, 64, 1)))
model.add(MaxPooling2D((2, 2)))

model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))

# Dense head ending in a single sigmoid unit that outputs a score in (0, 1).
model.add(Flatten())
model.add(Dense(64,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# during training, and the layer-by-layer summary is printed afterwards.
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

In [None]:
# Train for 10 epochs in batches of 32.
# NOTE(review): the test split is also used as validation data here, so the
# per-epoch validation numbers are not independent of the final test metrics.
classifier_nn = model.fit(X_train,y_train,epochs=10,validation_data=(X_test, y_test),batch_size=32)

In [None]:
 # Raw model outputs for the test set; they are thresholded into 0/1 labels in a later cell.
 pred=model.predict(X_test)

# Prediction

In [None]:
# Threshold the scores in place: values above 0.5 become class 1, the rest class 0.
# Boolean-mask assignment replaces the element-by-element index loop.
pred[pred > 0.5] = 1
pred[pred <= 0.5] = 0

In [None]:
# Persist the trained CNN and the fitted vectorizer so they can be reloaded later.
model.save('my_model_cnn.h5')
with open('vect_cnn_2', 'wb') as vectorizer_file:
    pickle.dump(vector, vectorizer_file)

In [None]:
def accuracy_function(tp,tn,fp,fn):
    """Return the fraction of correct predictions.

    Args:
        tp: Number of true positives.
        tn: Number of true negatives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        (tp + tn) / (tp + tn + fp + fn), or 0.0 when all four counts are zero
        (nothing was evaluated), instead of raising ZeroDivisionError.
    """
    total = tp + tn + fp + fn
    if total == 0:
        return 0.0
    return (tp + tn) / total

In [None]:
def precision_function(tp,fp):
    """Return the share of predicted positives that are truly positive.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.

    Returns:
        tp / (tp + fp), or 0.0 when nothing was predicted positive
        (tp + fp == 0), instead of raising ZeroDivisionError.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive

In [None]:
def recall_function(tp,fn):
    """Return the share of actual positives that were detected.

    Args:
        tp: Number of true positives.
        fn: Number of false negatives.

    Returns:
        tp / (tp + fn), or 0.0 when there are no actual positives
        (tp + fn == 0), instead of raising ZeroDivisionError.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive

## confusion matrix

In [None]:
def confusion_matrix(truth,predicted):
    """Tally binary classification outcomes and derive summary metrics.

    Walks ``truth`` and ``predicted`` in lockstep. Samples whose true label is
    1 count as a true positive when the prediction equals the label and as a
    false negative otherwise; samples whose true label is 0 count as a true
    negative or false positive in the same way. Samples with any other true
    label are ignored.

    Returns:
        A tuple ``(accuracy, precision, recall)`` computed from the tallies by
        ``accuracy_function``, ``precision_function`` and ``recall_function``.
    """
    tallies = {"tp": 0, "tn": 0, "fp": 0, "fn": 0}

    for actual, guess in zip(truth, predicted):
        if actual == 1:
            outcome = "tp" if guess == actual else "fn"
        elif actual == 0:
            outcome = "tn" if guess == actual else "fp"
        else:
            # Labels other than 0 and 1 do not contribute to any tally.
            continue
        tallies[outcome] += 1

    return (
        accuracy_function(tallies["tp"], tallies["tn"], tallies["fp"], tallies["fn"]),
        precision_function(tallies["tp"], tallies["fp"]),
        recall_function(tallies["tp"], tallies["fn"]),
    )

## Accuracy, Precision & Recall

In [None]:
# Score the thresholded CNN predictions against the held-out test labels.
accuracy,precision,recall=confusion_matrix(y_test,pred)

In [None]:
# Report the metrics most recently stored in accuracy / precision / recall.
print(" For CNN \n Accuracy : {0} \n Precision : {1} \n Recall : {2}".format(accuracy, precision, recall))

In [None]:
# Score and report the Naive Bayes predictions on the test set.
accuracy,precision,recall=confusion_matrix(y_test,pred_nb)
print(" For Naive Bayes Accuracy : {0} \n Precision : {1} \n Recall : {2}".format(accuracy, precision, recall))

In [None]:
# Score and report the SVM predictions on the test set.
accuracy,precision,recall=confusion_matrix(y_test,pred_svm)
print(" For SVM Accuracy : {0} \n Precision : {1} \n Recall : {2}".format(accuracy, precision, recall))

In [None]:
# Score and report the KNN predictions on the test set.
accuracy,precision,recall=confusion_matrix(y_test,pred_knn)
print(" For KNN Accuracy : {0} \n Precision : {1} \n Recall : {2}".format(accuracy, precision, recall))


In [None]:
# Score and report the logistic regression predictions on the test set.
accuracy,precision,recall=confusion_matrix(y_test,pred_lr)
print(" For Logistic Regression Accuracy : {0} \n Precision : {1} \n Recall : {2}".format(accuracy, precision, recall))

In [None]:
# Score and report the decision tree predictions on the test set.
accuracy,precision,recall=confusion_matrix(y_test,pred_dt)
print(" For Decision Tree Accuracy : {0} \n Precision : {1} \n Recall : {2}".format(accuracy, precision, recall))

# Testing Result using CNN

In [None]:
# Reload the saved CNN and the fitted vectorizer from disk.
mymodel = tf.keras.models.load_model('my_model_cnn.h5')
# NOTE: unpickling can execute arbitrary code; only load a file you produced yourself.
# The context manager closes the file handle once the vectorizer is loaded.
with open("vect_cnn_2", 'rb') as vectorizer_file:
    myvectorizer = pickle.load(vectorizer_file)

def clean_data(input_val):
    """Normalize a raw input string before it is vectorized.

    Applies a fixed sequence of literal substring replacements: newlines are
    removed, ``%20`` becomes a space, ``=`` and parentheses are padded with
    spaces, and a hand-picked set of numeric literals is rewritten to the
    token ``numeric``.

    The rules run strictly in the order listed, each on the output of the
    previous one. Order matters: a later rule can match text produced by an
    earlier one, which is why the ``" 8 "`` rule appears twice.

    Args:
        input_val: The string to normalize.

    Returns:
        The normalized string.
    """
    substitutions = (
        ('\n', ''),
        ('%20', ' '),
        ('=', ' = '),
        ('((', ' (( '),
        ('))', ' )) '),
        ('(', ' ( '),
        (')', ' ) '),
        ('1 ', 'numeric'),
        (' 1', 'numeric'),
        ("'1 ", "'numeric "),
        (" 1'", " numeric'"),
        ('1,', 'numeric,'),
        (" 2 ", " numeric "),
        (' 3 ', ' numeric '),
        (' 3--', ' numeric--'),
        (" 4 ", ' numeric '),
        (" 5 ", ' numeric '),
        (' 6 ', ' numeric '),
        (" 7 ", ' numeric '),
        (" 8 ", ' numeric '),
        ('1234', ' numeric '),
        ("22", ' numeric '),
        (" 8 ", ' numeric '),
        (" 200 ", ' numeric '),
        ("23 ", ' numeric '),
        ('"1', '"numeric'),
        ('1"', '"numeric'),
        ("7659", 'numeric'),
        (" 37 ", ' numeric '),
        (" 45 ", ' numeric '),
    )

    cleaned = input_val
    for needle, replacement in substitutions:
        cleaned = cleaned.replace(needle, replacement)
    return cleaned

def predict_sqli_attack():
    """Interactively classify user-supplied strings with the reloaded CNN.

    Repeatedly prompts for a line of input, normalizes it with ``clean_data``,
    vectorizes it with ``myvectorizer`` and scores it with ``mymodel``. A score
    above 0.5 is reported as a possible SQL injection; anything else is
    reported as safe. Entering ``0`` ends the session.

    The prompt loop is iterative rather than recursive, so a long session
    cannot hit Python's recursion limit, and the quit sentinel is checked
    before any vectorizing or model inference is done.
    """
    separator = "=" * 20

    while True:
        print(separator)
        input_val = input("Give me some data to work on : ")
        print(separator)

        if input_val == '0':
            print(separator)
            print( " see you next time ")
            return

        features = myvectorizer.transform([clean_data(input_val)]).toarray()
        # The CNN takes one 64x64 single-channel "image" per sample.
        features = features.reshape(1, 64, 64, 1)
        result = mymodel.predict(features)

        print(separator)

        # One sample was scored, so the first element of the flattened output
        # is the model's score for this input.
        score = result.ravel()[0]
        if score > 0.5:
            print("ALERT - This can be SQL injection")
        else:
            print("yes, it seems to be safe")

        print(separator)

## Results

In [None]:
# Start the interactive prompt; enter 0 at the prompt to stop.
predict_sqli_attack()

NameError: ignored