In [1]:
import pandas as pd
import numpy as np
import urllib.parse
import re
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
def load_dataset():
    print("Loading Data...")
    # SQLi
    sqli_df = pd.read_csv('data/SQLiV3.csv', on_bad_lines='skip')
    clean_sqli = []
    for _, row in sqli_df.iterrows():
        values = [v for v in row.tolist() if pd.notnull(v)]
        if len(values) < 2: continue
        label = values[-1]
        sentence = ", ".join([str(v) for v in values[:-1]])
        if str(label) in ['0', '1', '0.0', '1.0']:
            clean_sqli.append({'Sentence': sentence, 'Label': int(float(label))})
    df_sqli = pd.DataFrame(clean_sqli)
    df_sqli['Type'] = df_sqli['Label'].apply(lambda x: 'SQLi' if x == 1 else 'Benign')

    # XSS
    xss_df = pd.read_csv('data/XSS_dataset.csv')
    xss_df['Type'] = xss_df['Label'].apply(lambda x: 'XSS' if x == 1 else 'Benign')

    # Merge
    full_df = pd.concat([df_sqli[['Sentence', 'Type']], xss_df[['Sentence', 'Type']]], ignore_index=True)
    full_df.drop_duplicates(subset=['Sentence'], inplace=True)
    return full_df

In [3]:
def preprocess_text(text):
    text = str(text)
    try: text = urllib.parse.unquote(text)
    except: pass
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [4]:
if __name__ == "__main__":
\
    df = load_dataset()
    df['Clean_Sentence'] = df['Sentence'].apply(preprocess_text)
    X = df['Clean_Sentence']
    
    le = LabelEncoder()
    y = le.fit_transform(df['Type'])  # 0,1,2 labels


    models = [
        ('Logistic Regression', LogisticRegression(class_weight='balanced', max_iter=1000, n_jobs=-1)),
        ('Linear SVM', LinearSVC(class_weight='balanced', dual=False, max_iter=1000)),
        ('Random Forest', RandomForestClassifier(n_estimators=100, class_weight='balanced', n_jobs=-1)),
    ]

    print(f"\n{'MODEL':<25} | {'ACCURACY':<10} | {'F1 SCORE':<10} | {'RECALL':<10}")
    print("-" * 65)

    # Shared objects
    tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 4), max_features=5000)
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scoring = ['accuracy', 'f1_macro', 'recall_macro']


    for name, model in models:
        pipeline = Pipeline([('tfidf', tfidf), ('clf', model)])

        results = cross_validate(
            pipeline, X, y,
            cv=kfold,
            scoring=scoring,
            n_jobs=-1,
            error_score='raise'
        )

        acc = results['test_accuracy'].mean()
        f1 = results['test_f1_macro'].mean()
        rec = results['test_recall_macro'].mean()

        print(f"{name:<25} | {acc:.4f}     | {f1:.4f}     | {rec:.4f}")

 
    X_tfidf = tfidf.fit_transform(X)

    xgb = XGBClassifier(
        eval_metric="mlogloss",
        n_jobs=-1,
        tree_method="hist"
    )

    xgb_results = cross_validate(
        xgb, X_tfidf, y,
        cv=kfold,
        scoring=scoring,
        n_jobs=1,           # IMPORTANT FIX
        error_score='raise'
    )

    xgb_acc = xgb_results['test_accuracy'].mean()
    xgb_f1  = xgb_results['test_f1_macro'].mean()
    xgb_rec = xgb_results['test_recall_macro'].mean()

    print(f"{'XGBoost':<25} | {xgb_acc:.4f}     | {xgb_f1:.4f}     | {xgb_rec:.4f}")

    print("-" * 65)


Loading Data...

MODEL                     | ACCURACY   | F1 SCORE   | RECALL    
-----------------------------------------------------------------
Logistic Regression       | 0.9952     | 0.9951     | 0.9942
Linear SVM                | 0.9966     | 0.9966     | 0.9958
Random Forest             | 0.9965     | 0.9964     | 0.9957
XGBoost                   | 0.9964     | 0.9964     | 0.9952
-----------------------------------------------------------------


In [5]:

print("\nTraining Final Model...")
    
    
selected_model = LinearSVC(class_weight='balanced', dual=False, max_iter=1000)

    
final_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char', ngram_range=(2, 4), max_features=5000)),
    ('clf', selected_model)
])
    
final_pipeline.fit(X, y)
    
    # Save Bundle (Pipeline + Encoder)
bundle = {"pipeline": final_pipeline, "encoder": le}
    
with open('chameleon_brain.pkl', 'wb') as f:
    pickle.dump(bundle, f)
        
print("Saved to chameleon_brain.pkl")
 


Training Final Model...
Saved to chameleon_brain.pkl
