In [None]:
# --- Cell 1: Setup & Data ---

# Install the Kaggle API
!pip install kaggle

# Import the 'files' module to upload your Kaggle API key
from google.colab import files
print("--- Please upload your kaggle.json file ---")
uploaded = files.upload()

# Move the API key to the correct folder
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
print("✅ Kaggle API key configured.")

# Download the dataset from Kaggle
!kaggle datasets download -d sajid576/sql-injection-dataset

# Unzip the file (the -o flag overwrites without asking)
!unzip -o sql-injection-dataset.zip

print("✅ Dataset downloaded and unzipped as 'Modified_SQL_Dataset.csv'")

--- Please upload your kaggle.json file ---


Saving Modified_SQL_Dataset.csv to Modified_SQL_Dataset (1).csv
cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
✅ Kaggle API key configured.
Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 10, in <module>
    sys.exit(main())
             ^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/kaggle/cli.py", line 68, in main
    out = args.func(**command_args)
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/kaggle/api/kaggle_api_extended.py", line 1741, in dataset_download_cli
    with self.build_kaggle_client() as kaggle:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/kaggle/api/kaggle_api_extended.py", line 688, in build_kaggle_client
    username=self.config_values['username'],
             ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^
KeyError: 'username'
unzip:  cannot find or open sql-injection-dataset.zip, sql-inje

In [None]:
# --- Cell 2: Imports & Data Prep ---

# Import all libraries at the top
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import set_random_seed

# --- [Weeks 1-2]: Loading & Preprocessing Data ---
print("--- [Weeks 1-2]: Loading & Preprocessing Data ---")

# Keep the untouched frame under its own name; the cleaned frame is derived
# from it rather than mutated in place.
raw_df = pd.read_csv('Modified_SQL_Dataset.csv')
print(f"Dataset loaded. Total queries: {len(raw_df)}")

# Clean data: drop rows with any missing value, then keep the first
# occurrence of each distinct query string.
df = raw_df.dropna().drop_duplicates(subset=['Query'])
print(f"Cleaned dataset size: {len(df)}")
print(df['Label'].value_counts())

# Features are the raw query text, the target is the 0/1 label column.
X, y = df['Query'], df['Label']

# Single stratified 80/20 split shared by ALL models in the notebook.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts
print(f"\nData split: {len(X_train)} training samples, {len(X_test)} testing samples.")
print("✅ Data is clean and split. Ready for model training.")

--- [Weeks 1-2]: Loading & Preprocessing Data ---
Dataset loaded. Total queries: 30919
Cleaned dataset size: 30905
Label
0    19527
1    11378
Name: count, dtype: int64

Data split: 24724 training samples, 6181 testing samples.
✅ Data is clean and split. Ready for model training.


In [None]:
# --- Cell 3: Baseline ML Models ---
#
# Fits three classical classifiers on character n-gram TF-IDF features and
# writes accuracy, classification report and confusion matrix for each one
# to 'baseline_accuracy_report.txt'.

print("\n--- [Weeks 3-4]: Training Baseline ML Models ---")

# Same seed as the train/test split so the reported numbers (and the model
# exported later) are reproducible across runs.
RANDOM_STATE = 42

# Explicit label order keeps the report rows and confusion-matrix axes fixed
# even if a model happens to predict only one class.
CLASS_LABELS = [0, 1]
CLASS_NAMES = ['Benign (0)', 'Malicious (1)']

# Step 1: Feature Extraction (TF-IDF)
print("Creating TF-IDF vectors...")
tfidf_vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5), max_features=10000)
# Fit on the training split only; the test split is transformed with the
# vocabulary learned from training data.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
print("TF-IDF vectors created.")

# Step 2: Train and Evaluate Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # Random Forest is stochastic (bootstrap + feature sub-sampling), so it is seeded.
    "Random Forest": RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=RANDOM_STATE),
    "SVM (Support Vector Machine)": SVC()
}

# This file will be your "Baseline accuracy report" deliverable
with open("baseline_accuracy_report.txt", "w", encoding="utf-8") as report_file:
    report_file.write("--- BASELINE MODEL EVALUATION REPORT ---\n\n")

    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train_tfidf, y_train)
        y_pred = model.predict(X_test_tfidf)

        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(
            y_test, y_pred, labels=CLASS_LABELS, target_names=CLASS_NAMES
        )
        matrix = confusion_matrix(y_test, y_pred, labels=CLASS_LABELS)

        print(f"✅ {name} - Test Accuracy: {accuracy * 100:.2f}%")

        # Write to report
        report_file.write(f"--- Model: {name} ---\n")
        report_file.write(f"Test Accuracy: {accuracy * 100:.2f}%\n")
        report_file.write(report)
        report_file.write("Confusion Matrix:\n")
        report_file.write(str(matrix))
        report_file.write("\n" + "="*50 + "\n\n")

print("\n✅ [Deliverable] 'baseline_accuracy_report.txt' created.")


--- [Weeks 3-4]: Training Baseline ML Models ---
Creating TF-IDF vectors...
TF-IDF vectors created.

Training Logistic Regression...
✅ Logistic Regression - Test Accuracy: 99.09%

Training Random Forest...
✅ Random Forest - Test Accuracy: 99.69%

Training SVM (Support Vector Machine)...
✅ SVM (Support Vector Machine) - Test Accuracy: 99.56%

✅ [Deliverable] 'baseline_accuracy_report.txt' created.


In [None]:
# --- Cell 4: Deep Learning Model ---
#
# Trains a word-level 1D-CNN on the same train/test split used by the
# baseline models and appends its metrics to 'baseline_accuracy_report.txt'.

print(" Deep Learning Model (1D-CNN) ---")

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation,
# dropout masks and batch shuffling repeat between runs.
# NOTE(review): some GPU kernels can still be non-deterministic - confirm if
# bit-exact reproducibility is required.
set_random_seed(42)

# Step 1: Text Preprocessing for Deep Learning
MAX_VOCAB_SIZE = 20000
MAX_SEQUENCE_LENGTH = 100

# Vocabulary is learned from the training split only.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, char_level=False)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Every query becomes exactly MAX_SEQUENCE_LENGTH token ids.
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH)
print("Data tokenized and padded for DL model.")

# Step 2: Define and Train the CNN Model
EMBEDDING_DIM = 64
model_cnn = Sequential([
    # `input_length` is deprecated in Keras 3; the input shape is supplied
    # through build() below instead.
    Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=EMBEDDING_DIM),
    Dropout(0.3),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Build explicitly so summary() can report concrete output shapes and
# parameter counts before training starts.
model_cnn.build(input_shape=(None, MAX_SEQUENCE_LENGTH))
model_cnn.summary()

print("\nTraining CNN model...")
history = model_cnn.fit(
    X_train_pad,
    y_train,
    epochs=5,
    batch_size=128,
    validation_split=0.1,
    verbose=1
)
print("✅ CNN model trained.")

# --- Final Model Evaluation & Comparison ---
print("\n---  Evaluating DL Model ---")

loss, accuracy = model_cnn.evaluate(X_test_pad, y_test)
print(f"✅ Deep Learning (CNN) - Test Accuracy: {accuracy * 100:.2f}%")

# predict() returns one sigmoid probability per row with shape (n, 1);
# threshold at 0.5 and flatten to a 1-D label vector for the sklearn metrics.
y_pred_proba_cnn = model_cnn.predict(X_test_pad)
y_pred_cnn = (y_pred_proba_cnn > 0.5).astype(int).ravel()

cnn_report = classification_report(y_test, y_pred_cnn, target_names=['Benign (0)', 'Malicious (1)'])
cnn_cm = confusion_matrix(y_test, y_pred_cnn)

# 3. Append CNN results to the main report
# NOTE: opened in append mode, so re-running this cell without re-running
# Cell 3 adds another CNN section to the file.
with open("baseline_accuracy_report.txt", "a", encoding="utf-8") as report_file:
    report_file.write("--- Model: Deep Learning (1D-CNN) ---\n")
    report_file.write(f"Test Accuracy: {accuracy * 100:.2f}%\n")
    report_file.write(cnn_report)
    report_file.write("Confusion Matrix:\n")
    report_file.write(str(cnn_cm))
    report_file.write("\n" + "="*50 + "\n\n")

print("\n✅ [Deliverable] CNN results added to 'baseline_accuracy_report.txt'.")


 Deep Learning Model (1D-CNN) ---
Data tokenized and padded for DL model.





Training CNN model...
Epoch 1/5
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - accuracy: 0.8555 - loss: 0.3285 - val_accuracy: 0.9923 - val_loss: 0.0314
Epoch 2/5
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9931 - loss: 0.0344 - val_accuracy: 0.9951 - val_loss: 0.0246
Epoch 3/5
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9958 - loss: 0.0207 - val_accuracy: 0.9943 - val_loss: 0.0211
Epoch 4/5
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9969 - loss: 0.0145 - val_accuracy: 0.9939 - val_loss: 0.0237
Epoch 5/5
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9977 - loss: 0.0097 - val_accuracy: 0.9956 - val_loss: 0.0232
✅ CNN model trained.

---  Evaluating DL Model ---
[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9922 - loss: 0.0281
✅ Deep Lear

In [None]:
#Cell 5: Saving the best model for the API
#
# Persists the Random Forest classifier and the TF-IDF vectorizer it was
# trained with, so the API can load both and reproduce the same pipeline.

import joblib

print("Saving the best performing model")

# 1. The Random Forest instance fitted in Cell 3.
best_model = models["Random Forest"]

# 2. The TF-IDF vectorizer, also fitted in Cell 3. It must be saved alongside
#    the model because the model was trained on the features it produces.
vectorizer = tfidf_vectorizer

# 3. Write each object to its own joblib file, in (object, filename) order.
artifacts = (
    (best_model, 'rf_model.joblib'),
    (vectorizer, 'tfidf_vectorizer.joblib'),
)
for artifact, filename in artifacts:
    joblib.dump(artifact, filename)

# Confirmation messages, printed one per line.
summary_lines = (
    "\n✅ Success! Your files are saved:",
    "1. rf_model.joblib (The trained Random Forest model)",
    "2. tfidf_vectorizer.joblib (The fitted text vectorizer)",
    "\nLook in the file browser (refresh it!) to see your files.",
)
for line in summary_lines:
    print(line)

Saving the best performing model

✅ Success! Your files are saved:
1. rf_model.joblib (The trained Random Forest model)
2. tfidf_vectorizer.joblib (The fitted text vectorizer)

Look in the file browser (refresh it!) to see your files.
