In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

## Data Description

In [2]:
# Load the phishing dataset; the relative path is resolved against the
# notebook's current working directory.
DATA_FILE = 'Phishing_paper1.csv'
df = pd.read_csv(DATA_FILE)

# Preview the first rows as a quick sanity check of the load.
df.head()

FileNotFoundError: ignored

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
df.info()

## Data Preprocessing

In [None]:
# Name of the label column in df.
target_cols = 'Phishing Status'

# Class-balance check: one bar per distinct label value.
# Title and axis labels are set so the figure is readable on its own.
df[target_cols].value_counts().plot(
    kind='bar',
    title='Class distribution',
    xlabel=target_cols,
    ylabel='Number of samples',
);

In [None]:
# Summary statistics, transposed so each described column becomes a row.
df.describe().T

## Machine Learning Model Building

In [None]:
from sklearn.model_selection import train_test_split as tts

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier


from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.utils.class_weight import compute_class_weight

In [None]:
# Select the label by name (target_cols) instead of by position, so the
# feature matrix can never silently contain the target if the CSV column
# order differs from "label last".
X = df.drop(columns=[target_cols]).values
y = df[target_cols].values

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = tts(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

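Reference for Weka J48's `minNumObj` parameter (mapped to `min_samples_leaf` in the model cell below): https://stackoverflow.com/questions/21762161/what-does-the-minnumobj-parameter-do-in-j48-classifier-weka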

scikit-learn `VotingClassifier` documentation: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html

In [None]:
%%time
clf1 = DecisionTreeClassifier(min_samples_leaf=8) # MinNumObj = 8
clf2 = RandomForestClassifier(max_depth=15) # Max_depth = 15

clf = VotingClassifier(estimators=[('DT', clf1), ('RF', clf2)], 
                       voting='soft')

clf.fit(X_train, y_train)

## Evaluation Procedure

In [None]:
# Predict labels for the held-out set.
y_pred = clf.predict(X_test)

# accuracy_score takes (y_true, y_pred); pass the arguments in that order so
# this call matches the other metric calls in the notebook. The value itself
# is unchanged because accuracy is symmetric in its two arguments.
accuracy = accuracy_score(y_test, y_pred)

print(f'Model accuracy {accuracy:.3f}')

In [None]:
# Confusion matrix of the held-out predictions, drawn as an annotated heatmap.
cf_matrix = confusion_matrix(y_test, y_pred)

# Tick labels and per-cell captions, listed in row-major order of the matrix.
# NOTE(review): this ordering assumes the non-phishing class sorts first in
# the label values — confirm against the dataset.
categories = ['Non-Phishing', 'Phishing']
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
group_values = [str(count) for count in cf_matrix.flatten()]

# Each cell shows its caption with the raw count on the line below;
# reshape(2, 2) requires exactly two classes.
labels = np.asarray(
    [f'{caption}\n{count}' for caption, count in zip(group_names, group_values)]
).reshape(2, 2)

sns.heatmap(
    cf_matrix,
    annot=labels,
    fmt='',  # annotations are already formatted strings
    cmap='Blues',
    xticklabels=categories,
    yticklabels=categories,
)

plt.title("Confusion Matrix", fontdict={'size': 18}, pad=20)
plt.xlabel("Predicted values", fontdict={'size': 14}, labelpad=10)
plt.ylabel("Actual values", fontdict={'size': 14}, labelpad=10)

plt.show()

## Result

In [None]:
# Per-class precision, recall, F1 and support for the held-out predictions.
print(
    classification_report(
        y_test,
        y_pred,
        target_names=categories,
    )
)