# Import Libraries
First, we need to import the required libraries. This will include libraries for data manipulation, model training, and evaluation, as well as Flask for deployment.

In [56]:
# Import essential libraries (standard library first, then third-party)
import pickle
from pathlib import Path

import gunicorn
import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostClassifier
from flask import Flask, request, jsonify, render_template
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


# Confirm that every import in this cell succeeded: a failing import raises
# before this line is reached, so seeing the message means all names are bound.
print("Libraries imported successfully")

Libraries imported successfully


# Load Dataset
Next, we need to load the dataset. We'll assume the dataset is in a CSV file.

In [57]:
# Read the raw table from disk; the path is relative to the notebook's
# working directory.
csv_path = 'datasets.csv'
data = pd.read_csv(csv_path)

# Preview the first five rows (the default for head) as the cell's output
data.head(n=5)

Unnamed: 0,Index,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,0,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,2,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,3,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,4,-1,0,-1,1,-1,-1,1,1,-1,...,1,1,1,1,1,-1,1,-1,-1,1


# Preprocessing
Extract features (X) and labels (y) from the dataset.

In [58]:
# Drop the 'Index' column so it is not used as a feature.
# errors='ignore' keeps this cell re-runnable: `data` is rebound here, so on a
# second execution 'Index' is already gone and a plain drop would raise KeyError.
data = data.drop(columns=['Index'], errors='ignore')

# Assuming the 'class' column is the target variable:
# X holds every remaining column, y holds the 'class' labels.
X = data.drop(columns=['class'])
y = data['class']

# Display the shape of the features and labels
print("Features shape:", X.shape)
print("Labels shape:", y.shape)

Features shape: (11054, 30)
Labels shape: (11054,)


# Description of preprocessed data 

In [59]:
# Summary statistics per column, transposed so each column becomes one row
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
UsingIP,11054.0,0.313914,0.949495,-1.0,-1.0,1.0,1.0,1.0
LongURL,11054.0,-0.633345,0.765973,-1.0,-1.0,-1.0,-1.0,1.0
ShortURL,11054.0,0.738737,0.674024,-1.0,1.0,1.0,1.0,1.0
Symbol@,11054.0,0.700561,0.713625,-1.0,1.0,1.0,1.0,1.0
Redirecting//,11054.0,0.741632,0.670837,-1.0,1.0,1.0,1.0,1.0
PrefixSuffix-,11054.0,-0.734938,0.678165,-1.0,-1.0,-1.0,-1.0,1.0
SubDomains,11054.0,0.064049,0.817492,-1.0,-1.0,0.0,1.0,1.0
HTTPS,11054.0,0.25104,0.911856,-1.0,-1.0,1.0,1.0,1.0
DomainRegLen,11054.0,-0.336711,0.941651,-1.0,-1.0,-1.0,1.0,1.0
Favicon,11054.0,0.628551,0.777804,-1.0,1.0,1.0,1.0,1.0


# Train-Test Split
Split the dataset into training and testing sets.

In [60]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition, features first and then labels
for caption, part in (
    ("Training features shape:", X_train),
    ("Testing features shape:", X_test),
    ("Training labels shape:", y_train),
    ("Testing labels shape:", y_test),
):
    print(caption, part.shape)

Training features shape: (8843, 30)
Testing features shape: (2211, 30)
Training labels shape: (8843,)
Testing labels shape: (2211,)


# Initialize Classifiers
Initialize various classifiers for the ensemble model.

In [61]:
# Seed shared by every stochastic estimator below so that training, and hence
# the reported metrics, are repeatable after Restart & Run All. The value
# matches the random_state used for the train/test split.
RANDOM_STATE = 42

# Initialize the classifiers as (name, estimator) pairs for VotingClassifier.
# KNN, logistic regression and Gaussian naive Bayes are left unseeded here.
models = [
    ('gbc', GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ('catboost', CatBoostClassifier(verbose=0, random_seed=RANDOM_STATE)),
    ('xgboost', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                                  random_state=RANDOM_STATE)),
    ('mlp', MLPClassifier(max_iter=300, random_state=RANDOM_STATE)),
    ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
    # probability=True is required for soft voting (predict_proba)
    ('svc', SVC(probability=True, random_state=RANDOM_STATE)),
    ('dt', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('knn', KNeighborsClassifier()),
    ('lr', LogisticRegression(max_iter=100000)),
    ('nb', GaussianNB())
]

# Create a Voting Classifier; 'soft' voting combines the members' predicted
# class probabilities rather than their hard labels.
ensemble_model = VotingClassifier(estimators=models, voting='soft')

# Print a message indicating successful initialization
print("Classifiers initialized successfully")

Classifiers initialized successfully


# Train the Model
Train the ensemble model using the training data.

In [62]:
# Fit every member of the voting ensemble on the training partition
ensemble_model.fit(X=X_train, y=y_train)

# Signal that fitting completed without raising
print("Model trained successfully")



Model trained successfully


# Evaluate the Model
Evaluate the model on the test data.

In [63]:
# Label the held-out rows with the fitted ensemble
y_pred = ensemble_model.predict(X_test)

# Score the predictions against the true test labels: a per-class
# precision/recall/F1 table and the overall accuracy
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Show the overall accuracy first, then the per-class table
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.968340117593849
Classification Report:
               precision    recall  f1-score   support

          -1       0.97      0.96      0.96       976
           1       0.97      0.98      0.97      1235

    accuracy                           0.97      2211
   macro avg       0.97      0.97      0.97      2211
weighted avg       0.97      0.97      0.97      2211



# Export my trained model
The trained ensemble model is serialized with `pickle` and written to `models/ensemble_model.pkl`.

In [64]:
# Define the file path where you want to save the model
model_filename = 'models/ensemble_model.pkl'

# open() does not create missing parent directories, so make sure the target
# folder exists first; otherwise the export fails with FileNotFoundError on a
# fresh checkout, after the model has already been trained.
Path(model_filename).parent.mkdir(parents=True, exist_ok=True)

# Export the trained model to a file.
# NOTE: pickle files can execute arbitrary code when loaded; only load this
# file from a trusted location.
with open(model_filename, 'wb') as file:
    pickle.dump(ensemble_model, file)

# Print a message indicating successful export
print("Model exported successfully as", model_filename)

Model exported successfully as models/ensemble_model.pkl
