<h1 style="color:red;"> SQL Injection + </h1>
<span> Chou@ibCher+</span>


In [1]:
import pandas as pd
import os
from sklearn.metrics import accuracy_score, classification_report
import joblib
# machine learning models
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
# ensemble learning
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, StackingClassifier



In [None]:
# Resolve the base directory used for all data/model paths: two levels above
# the kernel's working directory. This depends on where the kernel was started.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, '..', '..'))
# The value shown is the resolved base directory, not the working directory,
# so the label says so.
print('Project root:', parent_dir)

# Define the file path
df_path = os.path.join(parent_dir, 'data', 'processed', 'sqli-by-chou@ibcher+.csv')

# Load only the three columns this notebook uses
df = pd.read_csv(df_path, usecols=['payload', 'is_malicious', 'injection_type'])

# Show how many rows fall under each injection type
print('Data distribution')
print(df['injection_type'].value_counts())


Current directory: c:\Users\LENOVO\Documents\GitHub\WAF_AI-AI
Data distribution
injection_type
LEGAL    15257
SQL       3288
Name: count, dtype: int64


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Reload the raw dataset so this cell can be re-run on its own without
# stacking the cleaning steps on an already-cleaned frame.
df = pd.read_csv(df_path, usecols=['payload', 'is_malicious', 'injection_type'])

# Fill missing payloads with '' BEFORE casting to str. Casting first turns NaN
# into the literal string 'nan', so the fill never matched anything and
# missing rows slipped past the empty-payload filter below.
df['payload'] = df['payload'].fillna('').astype(str)

# Remove any empty data points
df = df[df['payload'] != '']

# Remove any duplicate payloads (the first occurrence of each is kept)
df = df.drop_duplicates(subset=['payload'])

# Tokenizer handed to the vectorizer: payloads are cut on whitespace only, so
# quotes, operators and comment markers stay attached to their tokens.
def custom_tokenizer(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens.

    Leading and trailing whitespace is ignored; an empty or all-whitespace
    string yields an empty list.
    """
    return text.split()

# Target labels come straight from the cleaned frame
y = df['is_malicious']

# Bag-of-words counts over whitespace tokens, covering 1- to 3-token n-grams
count_vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    ngram_range=(1, 3),
    min_df=1,
)

# Learn the vocabulary and build the sparse count matrix in one pass.
# NOTE(review): the vocabulary is fitted on every row, including rows that end
# up in the test split below — consider fitting on the training payloads only.
X = count_vectorizer.fit_transform(df['payload'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [4]:
# Build and fit the multinomial Naive Bayes model in one step
# (fit returns the estimator itself)
nb_classifier = MultinomialNB().fit(X_train, y_train)

# Score the held-out rows
y_pred = nb_classifier.predict(X_test)

# Overall accuracy plus the per-class precision/recall/F1 table
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:', report, sep='\n')



Accuracy: 0.9599090133636622
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      3062
           1       1.00      0.69      0.82       455

    accuracy                           0.96      3517
   macro avg       0.98      0.85      0.90      3517
weighted avg       0.96      0.96      0.96      3517



In [5]:
# Initialize the classifiers
svm_classifier = SVC()
logistic_regression_classifier = LogisticRegression(max_iter=1000)
# Seed the tree so its results are repeatable across runs; an unseeded tree
# may pick different splits when several are equally good. 42 matches the
# random_state used for the train/test split and the ensemble models.
decision_tree_classifier = DecisionTreeClassifier(random_state=42)

# Train the classifiers on the same training matrix
svm_classifier.fit(X_train, y_train)
logistic_regression_classifier.fit(X_train, y_train)
decision_tree_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred_svm = svm_classifier.predict(X_test)
y_pred_logistic = logistic_regression_classifier.predict(X_test)
y_pred_tree = decision_tree_classifier.predict(X_test)

# Evaluate the classifiers: overall accuracy ...
accuracy_svm = accuracy_score(y_test, y_pred_svm)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
accuracy_tree = accuracy_score(y_test, y_pred_tree)

# ... and the per-class precision/recall/F1 tables
report_svm = classification_report(y_test, y_pred_svm)
report_logistic = classification_report(y_test, y_pred_logistic)
report_tree = classification_report(y_test, y_pred_tree)

print(f'SVM Accuracy: {accuracy_svm}')
print('SVM Classification Report:')
print(report_svm)

print(f'Logistic Regression Accuracy: {accuracy_logistic}')
print('Logistic Regression Classification Report:')
print(report_logistic)

print(f'Decision Tree Accuracy: {accuracy_tree}')
print('Decision Tree Classification Report:')
print(report_tree)

SVM Accuracy: 0.9599090133636622
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      3062
           1       0.99      0.69      0.82       455

    accuracy                           0.96      3517
   macro avg       0.98      0.85      0.90      3517
weighted avg       0.96      0.96      0.96      3517

Logistic Regression Accuracy: 0.9556440147853285
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      3062
           1       1.00      0.66      0.79       455

    accuracy                           0.96      3517
   macro avg       0.98      0.83      0.88      3517
weighted avg       0.96      0.96      0.95      3517

Decision Tree Accuracy: 0.9596246801251066
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      3062
     

<h2 style="color:red;">Ensemble learning</h2>

In [None]:

# Bagging
# The base learner is passed positionally. Its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4; the first positional slot works on both old and new releases.
bagging_classifier = BaggingClassifier(DecisionTreeClassifier(), n_estimators=50, random_state=42)
bagging_classifier.fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred_bagging = bagging_classifier.predict(X_test)
accuracy_bagging = accuracy_score(y_test, y_pred_bagging)
report_bagging = classification_report(y_test, y_pred_bagging)

print(f'Bagging Accuracy: {accuracy_bagging}')
print('Bagging Classification Report:')
print(report_bagging)

# Boosting (AdaBoost)
# The base learner is passed positionally. Its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4; the first positional slot works on both old and new releases.
boosting_classifier = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=50, random_state=42)
boosting_classifier.fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred_boosting = boosting_classifier.predict(X_test)
accuracy_boosting = accuracy_score(y_test, y_pred_boosting)
report_boosting = classification_report(y_test, y_pred_boosting)

print(f'Boosting Accuracy: {accuracy_boosting}')
print('Boosting Classification Report:')
print(report_boosting)

# Random Forest: 100 trees, seeded for repeatable results
random_forest_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out rows and summarise
y_pred_rf = random_forest_classifier.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)

print(f'Random Forest Accuracy: {accuracy_rf}')
print('Random Forest Classification Report:', report_rf, sep='\n')

# Stacking: a random forest and a probability-enabled SVC feed a logistic
# regression that combines their outputs.
rf_member = ('rf', RandomForestClassifier(n_estimators=50, random_state=42))
svc_member = ('svr', SVC(probability=True, random_state=42))
estimators = [rf_member, svc_member]

stacking_classifier = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
)
stacking_classifier.fit(X_train, y_train)

# Score the held-out rows and summarise
y_pred_stacking = stacking_classifier.predict(X_test)
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
report_stacking = classification_report(y_test, y_pred_stacking)

print(f'Stacking Accuracy: {accuracy_stacking}')
print('Stacking Classification Report:', report_stacking, sep='\n')



Bagging Accuracy: 0.959340346886551
Bagging Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      3062
           1       1.00      0.69      0.81       455

    accuracy                           0.96      3517
   macro avg       0.98      0.84      0.90      3517
weighted avg       0.96      0.96      0.96      3517





Boosting Accuracy: 0.959340346886551
Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      3062
           1       1.00      0.69      0.81       455

    accuracy                           0.96      3517
   macro avg       0.98      0.84      0.90      3517
weighted avg       0.96      0.96      0.96      3517

Random Forest Accuracy: 0.9587716804094398
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      3062
           1       1.00      0.68      0.81       455

    accuracy                           0.96      3517
   macro avg       0.98      0.84      0.89      3517
weighted avg       0.96      0.96      0.96      3517

Stacking Accuracy: 0.960762013079329
Stacking Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      3062
           1       

<h2>Saving the model and vectorizer</h2>

In [None]:

# Both artifacts are written to the 'WAF_AI' folder under the base directory
output_dir = os.path.join(parent_dir, 'WAF_AI')
modelPathSaving = os.path.join(output_dir, 'nb.pkl')
vectorizerPathSaving = os.path.join(output_dir, 'vectorizer.pkl')

# Persist the fitted Naive Bayes model and the fitted vectorizer.
# NOTE(review): the vectorizer was built with the notebook-defined
# `custom_tokenizer`. Pickle stores functions by reference, so whatever loads
# vectorizer.pkl presumably needs a function of that name importable from the
# same module path — confirm in the consuming application.
joblib.dump(nb_classifier, modelPathSaving)
joblib.dump(count_vectorizer, vectorizerPathSaving)

In [23]:
# Sample inputs: the first five are ordinary strings, the rest are injection payloads
sql_injections = [
    'verve',
    'helllo chouaib',
    'username',
    'password',
    'bounjour',
    "1' OR '1'='1",
    "1' OR '1'='1' --",
    "1' OR '1'='1' ({",
    "1' OR '1'='1' /*",
    "1' OR '1'='1' #",
    "1' OR '1'='1' AND '1'='1",
    "1' OR '1'='1' AND '1'='2",
    "1' OR '1'='1' UNION SELECT NULL, NULL",
    "1' OR '1'='1' UNION SELECT username, password FROM users",
    "1' OR '1'='1' UNION SELECT table_name, column_name FROM information_schema.columns"
]

# Encode the samples with the already-fitted vectorizer (dense array form)
sql_injections_vectorized = count_vectorizer.transform(sql_injections).toarray()

# Classify every sample with the Naive Bayes model
predictions = nb_classifier.predict(sql_injections_vectorized)

# Report each input next to its predicted label
for sql, label in zip(sql_injections, predictions):
    print(f"SQL Injection: {sql} -> Prediction: {label}")

SQL Injection: verve -> Prediction: 0
SQL Injection: helllo chouaib -> Prediction: 0
SQL Injection: username -> Prediction: 0
SQL Injection: password -> Prediction: 0
SQL Injection: bounjour -> Prediction: 0
SQL Injection: 1' OR '1'='1 -> Prediction: 1
SQL Injection: 1' OR '1'='1' -- -> Prediction: 1
SQL Injection: 1' OR '1'='1' ({ -> Prediction: 1
SQL Injection: 1' OR '1'='1' /* -> Prediction: 1
SQL Injection: 1' OR '1'='1' # -> Prediction: 1
SQL Injection: 1' OR '1'='1' AND '1'='1 -> Prediction: 1
SQL Injection: 1' OR '1'='1' AND '1'='2 -> Prediction: 1
SQL Injection: 1' OR '1'='1' UNION SELECT NULL, NULL -> Prediction: 1
SQL Injection: 1' OR '1'='1' UNION SELECT username, password FROM users -> Prediction: 1
SQL Injection: 1' OR '1'='1' UNION SELECT table_name, column_name FROM information_schema.columns -> Prediction: 1
