In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import tqdm

import pickle

from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import roc_curve, auc, PrecisionRecallDisplay
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from warnings import simplefilter
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 

# from keras_visualizer import visualizer 

In [3]:
# What version of Python do you have?
import sys

import tensorflow.keras
import pandas as pd
import sklearn as sk
import tensorflow as tf
import platform

# Summarise the runtime environment (OS, library versions, GPU visibility)
# so the recorded results can be matched to the setup that produced them.
gpu = bool(tf.config.list_physical_devices('GPU'))
gpu_status = "available" if gpu else "NOT AVAILABLE"

framework_versions = (
    f"Python Platform: {platform.platform()}",
    f"Tensor Flow Version: {tf.__version__}",
    f"Keras Version: {tensorflow.keras.__version__}",
)
toolchain_versions = (
    f"Python {sys.version}",
    f"Pandas {pd.__version__}",
    f"Scikit-Learn {sk.__version__}",
)

for line in framework_versions:
    print(line)
print()
for line in toolchain_versions:
    print(line)
print("GPU is", gpu_status)

Python Platform: macOS-13.0.1-arm64-arm-64bit
Tensor Flow Version: 2.12.0
Keras Version: 2.12.0

Python 3.10.10 | packaged by conda-forge | (main, Mar 24 2023, 20:12:31) [Clang 14.0.6 ]
Pandas 2.0.0
Scikit-Learn 1.2.2
GPU is available


### Load the training set and testing set

In [4]:
# Directory holding the imputed train/test CSV files. Kept in a single place
# so the notebook can be pointed at another location by editing one line.
DATA_DIR = '/Users/yihongan/Desktop'

imputed_train_data = pd.read_csv(f'{DATA_DIR}/lr_imputed_train.csv')
imputed_test_data = pd.read_csv(f'{DATA_DIR}/lr_imputed_test.csv')

# Last expression of the cell: rich display of the training frame.
imputed_train_data

Unnamed: 0,url,age,num_suspicious_words,has_ip_address,is_url_shortened,length,subdomain_ratio,num_dots,num_www,num_dcom,...,num_semic,num_comma,login_time,num_ex_links,num_redirects,num_img,num_iframe,num_hidden,ext_tot_ratio,status
0,https://www.crossstitch.pk/,17.266665,0,0,0,27,0.666667,2,1,0,...,0,0,1.296597,247.0,1.0,95.0,1.0,1.0,247.0,0
1,http://www.wikiwand.com/en/LATAM_Chile,9.200000,0,0,0,38,0.421053,2,1,1,...,0,0,0.093795,91.0,1.0,3.0,0.0,1.0,91.0,0
2,http://www.iconmeals.com/,9.065753,0,0,0,25,0.680000,2,1,1,...,0,0,1.031102,105.0,1.0,20.0,0.0,1.0,105.0,0
3,http://calvaryepiscopalbatavia.org/wp-includes...,7.057534,1,0,0,68,0.397059,1,0,0,...,0,0,0.640474,0.0,0.0,0.0,0.0,0.0,0.0,1
4,www.sharpaviation.com.au/index.php/pt_home,27.353425,0,0,0,42,0.571429,4,1,1,...,0,0,5.873612,76.0,1.0,11.0,1.0,1.0,76.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14155,http://lbcpzonasegurabeta.rf.gd/,9.638965,0,0,0,32,0.750000,2,0,0,...,0,0,0.734203,0.0,0.0,0.0,0.0,0.0,0.0,1
14156,https://www.e-serviceparts.info/landingpages/d...,4.575342,0,0,0,125,0.184000,2,1,0,...,0,0,0.046956,0.0,0.0,0.0,0.0,0.0,0.0,1
14157,rzr.online.fr/resume.htm,14.227397,0,0,0,24,0.541667,3,0,0,...,0,0,1.753927,1.0,0.0,0.0,0.0,0.0,1.0,0
14158,en.wikipedia.org/wiki/Aldus_Corporation,22.191781,0,0,0,39,0.410256,2,0,0,...,0,0,0.298482,330.0,1.0,11.0,0.0,4.0,330.0,0


### Preprocess both textual and numerical features

In [5]:
# --- Input shapes ---------------------------------------------------------
# Shape reserved for the text input; it is not referenced again in this cell.
text_input_shape = (100,)
# Number of numeric features = all columns minus two. With the slices below,
# the two excluded columns are column 0 and the last column
# (presumably `url` and `status` -- confirm against the CSV header).
num_input_shape = (imputed_train_data.shape[1] - 2,)

# Positional layout shared by the train and test frames.
n_numeric = num_input_shape[0]
label_position = n_numeric + 1            # column right after the features
numeric_columns = slice(1, label_position)

# --- Response -------------------------------------------------------------
y_train = imputed_train_data.iloc[:, label_position].values
y_test = imputed_test_data.iloc[:, label_position].values.squeeze()

# --- Numerical data -------------------------------------------------------
num_train = imputed_train_data.iloc[:, numeric_columns].values
num_test = imputed_test_data.iloc[:, numeric_columns].values

# Fit the scaler on the training split only, then reuse the learned
# mean / standard deviation to transform the test split.
mean_sd_sc = StandardScaler()
num_train_std = mean_sd_sc.fit_transform(num_train)
num_test_std = mean_sd_sc.transform(num_test)

In [6]:
def confusion_matrix_heatmap(model, y_true, y_pred):
    """Draw a row-normalised confusion matrix as an annotated heatmap.

    A new 8x8 inch figure is created and left as the current matplotlib
    figure; nothing is returned.

    Parameters
    ----------
    model : object
        Interpolated into the plot title to identify the classifier
        (typically a short name string).
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_true``.

    Notes
    -----
    The tick labels are hard-coded for two classes, so the matrix is always
    built for ``labels=[0, 1]``. This keeps it 2x2 even when one class is
    missing from ``y_true``/``y_pred``; otherwise the matrix shape would
    no longer match the two tick labels.
    Assumes the encoding 0 = legitimate, 1 = phishing, as the tick-label
    order implies -- confirm against the `status` column.
    """
    class_labels = [0, 1]
    # normalize='true' divides each row by the number of actual samples of
    # that class, so cells read as per-class rates.
    matrix = confusion_matrix(y_true, y_pred, labels=class_labels, normalize='true')

    plt.figure(figsize=(8,8))
    sns.heatmap(matrix,
                fmt='.1%',
                annot=True,
                cmap="Blues",
                xticklabels=['Predicted legitimate', 'Predicted phishing'],
                yticklabels=['Actual legitimate', 'Actual phishing'],
                annot_kws={"size": 14})
    plt.title(f'Normalized Confusion Matrix-{model}', {'fontsize':18})

###  KNN Classification Model

In [14]:
# Tune the number of neighbours for a Euclidean KNN classifier.
# The scaler lives inside the pipeline, so GridSearchCV re-fits it on the
# training portion of every fold; the raw (unscaled) features are passed in.
knn_pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(metric='euclidean'))

# Candidate k values: odd numbers 1, 3, ..., 19.
knn_parameters = [{
    'kneighborsclassifier__n_neighbors': np.arange(1, 20, 2)
}]
knn_grid_search = GridSearchCV(knn_pipe, knn_parameters, scoring='accuracy', cv=5, verbose=3, refit=True)
knn_grid_search.fit(num_train, y_train)

print("Best Accuracy: {}" .format(knn_grid_search.best_score_))
# Report the selected k itself; `best_estimator_` would print the whole
# fitted pipeline, which does not match the label.
print("Best num of neighbors: {}" .format(knn_grid_search.best_params_['kneighborsclassifier__n_neighbors']))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END kneighborsclassifier__n_neighbors=1;, score=0.917 total time=   0.1s
[CV 2/5] END kneighborsclassifier__n_neighbors=1;, score=0.915 total time=   0.1s
[CV 3/5] END kneighborsclassifier__n_neighbors=1;, score=0.911 total time=   0.1s
[CV 4/5] END kneighborsclassifier__n_neighbors=1;, score=0.914 total time=   0.1s
[CV 5/5] END kneighborsclassifier__n_neighbors=1;, score=0.920 total time=   0.1s
[CV 1/5] END kneighborsclassifier__n_neighbors=3;, score=0.920 total time=   0.1s
[CV 2/5] END kneighborsclassifier__n_neighbors=3;, score=0.923 total time=   0.1s
[CV 3/5] END kneighborsclassifier__n_neighbors=3;, score=0.921 total time=   0.1s
[CV 4/5] END kneighborsclassifier__n_neighbors=3;, score=0.922 total time=   0.1s
[CV 5/5] END kneighborsclassifier__n_neighbors=3;, score=0.929 total time=   0.1s
[CV 1/5] END kneighborsclassifier__n_neighbors=5;, score=0.920 total time=   0.1s
[CV 2/5] END kneighborsclassifier__n_

In [21]:
# Pull the winning neighbour count out of the grid-search results and show it.
best_knn_params = knn_grid_search.best_params_
best_k = best_knn_params['kneighborsclassifier__n_neighbors']
best_k

3

In [22]:
# Final KNN model with the tuned k, trained on the full training set.
# It is fit on `num_train_std` (features already standardised by
# `mean_sd_sc`), so inputs at prediction time need the same scaling.
KNN_final = KNeighborsClassifier(metric='euclidean', n_neighbors=best_k)
KNN_final.fit(X=num_train_std, y=y_train)

In [23]:
# Persist the fitted KNN model. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
# Only the classifier is saved; the scaler used to build its training
# features is not part of this file.
with open("/Users/yihongan/Desktop/KNN.pkl", 'wb') as knn_file:
    pickle.dump(KNN_final, knn_file)

### SVM Classification Model

In [19]:
# Tune C and gamma for an RBF-kernel SVM.
# The scaler lives inside the pipeline, so GridSearchCV re-fits it on the
# training portion of every fold; the raw (unscaled) features are passed in.
svm_pipe = make_pipeline(StandardScaler(), SVC(probability=True, kernel='rbf'))

# 4 x 4 grid over powers of ten for both hyper-parameters (16 candidates).
svm_parameters = [{
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma': [0.1, 1, 10, 100],
}]

svm_grid_search = GridSearchCV(svm_pipe, svm_parameters, scoring='accuracy', cv=5, verbose=3 , n_jobs=4, refit=True)
svm_grid_search.fit(num_train, y_train)

print("Best Accuracy: {}" .format(svm_grid_search.best_score_))
# Report the tuned hyper-parameters; the previous label ("num of neighbors")
# was carried over from the KNN cell and does not apply to an SVM.
print("Best parameters: {}" .format(svm_grid_search.best_params_))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 3/5] END ........svc__C=0.1, svc__gamma=0.1;, score=0.891 total time=  10.0s
[CV 4/5] END ........svc__C=0.1, svc__gamma=0.1;, score=0.905 total time=  10.2s
[CV 2/5] END ........svc__C=0.1, svc__gamma=0.1;, score=0.901 total time=  10.3s
[CV 1/5] END ........svc__C=0.1, svc__gamma=0.1;, score=0.904 total time=  10.4s
[CV 5/5] END ........svc__C=0.1, svc__gamma=0.1;, score=0.902 total time=  10.8s
[CV 2/5] END ..........svc__C=0.1, svc__gamma=1;, score=0.746 total time=  20.0s
[CV 1/5] END ..........svc__C=0.1, svc__gamma=1;, score=0.732 total time=  20.1s
[CV 3/5] END ..........svc__C=0.1, svc__gamma=1;, score=0.728 total time=  20.0s
[CV 4/5] END ..........svc__C=0.1, svc__gamma=1;, score=0.730 total time=  20.2s
[CV 5/5] END ..........svc__C=0.1, svc__gamma=1;, score=0.745 total time=  20.3s
[CV 1/5] END .........svc__C=0.1, svc__gamma=10;, score=0.646 total time=  34.8s
[CV 2/5] END .........svc__C=0.1, svc__gamma=10;

In [20]:
# Show the (C, gamma) combination that achieved the best mean CV accuracy.
svm_grid_search.best_params_

{'svc__C': 10, 'svc__gamma': 0.1}

In [24]:
# Final RBF SVM trained on the full training set with the tuned
# hyper-parameters. They are read from the grid search (as the KNN cell does
# with `best_k`) rather than copied by hand, so a re-run of the search cannot
# leave stale values here.
svm_best_params = svm_grid_search.best_params_

# `probability=True` makes SVC shuffle the data internally for its probability
# estimates; a fixed seed (same value as the random-forest cell) keeps
# `predict_proba` reproducible across runs.
# The model is fit on `num_train_std` (features already standardised by
# `mean_sd_sc`), so inputs at prediction time need the same scaling.
svm_final = SVC(
    probability=True,
    kernel='rbf',
    C=svm_best_params['svc__C'],
    gamma=svm_best_params['svc__gamma'],
    random_state=4012,
)
svm_final.fit(num_train_std, y_train)

In [25]:
# Persist the fitted SVM model. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
# Only the classifier is saved; the scaler used to build its training
# features is not part of this file.
with open("/Users/yihongan/Desktop/SVM.pkl", 'wb') as svm_file:
    pickle.dump(svm_final, svm_file)

###  Random Forest Classification Model

In [26]:
# Random forest with library-default hyper-parameters and a fixed seed so the
# fitted trees are reproducible. Trained on the standardised training features.
rf = RandomForestClassifier(random_state=4012)
rf.fit(X=num_train_std, y=y_train)

In [29]:
# Persist the fitted random forest. The context manager guarantees the file
# is flushed and closed even if pickling fails part-way.
# Only the classifier is saved; the scaler used to build its training
# features is not part of this file.
with open("/Users/yihongan/Desktop/RF.pkl", 'wb') as rf_file:
    pickle.dump(rf, rf_file)