In [1]:
import pandas as pd

# Read the phishing dataset from the CSV file located next to this notebook.
data = pd.read_csv(filepath_or_buffer="./dataset_phishing.csv")

In [3]:
from sklearn.preprocessing import LabelEncoder

# Encoder that is later fitted on the `status` column to produce integer codes.
le_status = LabelEncoder()

In [4]:
# Fit the encoder on the `status` column and store the resulting integer codes
# in a new `status_n` column. This adds the column to `data` in place.
data["status_n"] = le_status.fit_transform(data["status"])

In [5]:
# Target vector: the integer-encoded label column.
Y = data["status_n"]

# Feature matrix: every column of `data` except the raw and the encoded label.
# NOTE(review): any non-numeric column still present in `data` at this point
# ends up in X -- confirm such columns are removed before the models are fitted.
X = data.drop(columns=["status", "status_n"])

In [6]:
# Coerce every column to numeric; values that cannot be parsed become NaN.
df = data.apply(pd.to_numeric, errors='coerce')

# Correlation of each column with the encoded target.
correlations = df.corrwith(df['status_n'])

# Get the 10 features most strongly correlated (by absolute value) with the
# target. The target's correlation with itself is removed by name instead of
# by slicing off the first entry, so the result stays correct even if the
# target does not happen to sort first.
top_correlations = correlations.drop('status_n').abs().nlargest(10)

top_correlations

google_index        0.731171
page_rank           0.511137
nb_www              0.443468
ratio_digits_url    0.356395
domain_in_title     0.342807
nb_hyperlinks       0.342628
phish_hints         0.335393
domain_age          0.331889
ip                  0.321698
nb_qm               0.294319
dtype: float64

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and therefore every score computed from it, repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 50 entropy-criterion trees. random_state is fixed so that
# the bootstrap samples and feature sub-sampling are the same on every run.
model_rfc = RandomForestClassifier(
    n_estimators=50, criterion="entropy", random_state=42
)
model_rfc.fit(X_train, Y_train)

In [13]:
# Mean accuracy of the random forest on the held-out split.
model_rfc.score(X=X_test, y=Y_test)

0.9663167104111986

In [14]:
# Random-forest class predictions for the held-out rows.
y_predicted = model_rfc.predict(X=X_test)

In [16]:
%matplotlib inline
from sklearn.metrics import confusion_matrix

# Confusion matrix of the random-forest predictions: rows are the true labels,
# columns are the predicted labels.
cm = confusion_matrix(y_true=Y_test, y_pred=y_predicted)
cm

array([[1088,   35],
       [  42, 1121]])

In [17]:
# Unpack the 2x2 confusion matrix row by row: the first row holds the counts
# for true label 0, the second row those for true label 1.
(TN, FP), (FN, TP) = confusion_matrix(Y_test, y_predicted)

# Precision: share of predicted positives that are correct.
predicted_positive = TP + FP
Precision = TP / predicted_positive

# Recall: share of actual positives that were found.
actual_positive = TP + FN
Recall = TP / actual_positive

# F1: harmonic mean of precision and recall.
precision_recall_product = Precision * Recall
F1_score = 2 * precision_recall_product / (Precision + Recall)

print("Precision: {:.3f}".format(Precision))
print("Recall: {:.3f}".format(Recall))
print("F1 Score: {:.3f}".format(F1_score))

Precision: 0.970
Recall: 0.964
F1 Score: 0.967


In [27]:
import pickle

# Serialize the trained random forest to disk. The `with` block guarantees the
# file handle is flushed and closed, even if pickling raises.
with open('model_rfc.pkl', 'wb') as model_file:
    pickle.dump(model_rfc, model_file)

In [28]:
from sklearn import tree

# Single entropy-criterion decision tree, trained on the same split as the
# random forest. random_state is fixed so tie-breaking between equally good
# splits is the same on every run.
model_dtc = tree.DecisionTreeClassifier(criterion="entropy", random_state=42)
model_dtc.fit(X_train, Y_train)

In [29]:
# Mean accuracy of the decision tree on the held-out split.
model_dtc.score(X=X_test, y=Y_test)

0.937007874015748

In [30]:
import pickle

# Serialize the trained decision tree to disk. The `with` block guarantees the
# file handle is flushed and closed, even if pickling raises.
with open('model_dtc.pkl', 'wb') as model_file:
    pickle.dump(model_dtc, model_file)