In [102]:
import csv
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
import numpy as np

In [103]:
dataset = pd.read_csv('data/dataset_full.csv')

print(dataset.columns)
print(dataset.shape)

Index(['qty_dot_url', 'qty_hyphen_url', 'qty_underline_url', 'qty_slash_url',
       'qty_questionmark_url', 'qty_equal_url', 'qty_at_url', 'qty_and_url',
       'qty_exclamation_url', 'qty_space_url',
       ...
       'qty_ip_resolved', 'qty_nameservers', 'qty_mx_servers', 'ttl_hostname',
       'tls_ssl_certificate', 'qty_redirects', 'url_google_index',
       'domain_google_index', 'url_shortened', 'phishing'],
      dtype='object', length=112)
(88647, 112)


In [104]:
dataset.isnull().any()

qty_dot_url             False
qty_hyphen_url          False
qty_underline_url       False
qty_slash_url           False
qty_questionmark_url    False
                        ...  
qty_redirects           False
url_google_index        False
domain_google_index     False
url_shortened           False
phishing                False
Length: 112, dtype: bool

In [105]:
train_data, test_data, train_labels, test_labels = train_test_split(
    dataset.iloc[:, 0:97], 
    dataset.iloc[:, -1], 
    test_size=0.2
)

In [106]:
train_data.shape, train_labels.shape, test_data.shape, test_labels.shape

((70917, 97), (70917,), (17730, 97), (17730,))

In [107]:
print(train_data)

       qty_dot_url  qty_hyphen_url  qty_underline_url  qty_slash_url  \
3927             2               0                  0              0   
2698             2               0                  0              0   
23859            2               0                  0              3   
65134            9               2                  1              7   
58287            2               0                  0              0   
...            ...             ...                ...            ...   
59851            1               0                  1              1   
70284            2               0                  0              0   
82536            2               1                  0              3   
7274             1               0                  0              0   
23610            2               0                  0              0   

       qty_questionmark_url  qty_equal_url  qty_at_url  qty_and_url  \
3927                      0              0           0          

In [108]:
dataset.iloc[:, -1]

0        1
1        1
2        0
3        1
4        0
        ..
88642    0
88643    0
88644    1
88645    1
88646    0
Name: phishing, Length: 88647, dtype: int64

In [109]:
clf = tree.DecisionTreeClassifier()
clf.fit(train_data, train_labels)

DecisionTreeClassifier()

In [110]:
pred_labels_val = cross_val_predict(clf, train_data, train_labels, cv=10)
conf_mat = confusion_matrix(train_labels, pred_labels_val)
print(conf_mat)

[[44668  1682]
 [ 3461 21106]]


In [111]:
pred_labels_test = clf.predict(test_data)

In [112]:
print("Accuracy:", metrics.accuracy_score(test_labels, pred_labels_test))      # Calculating the accuracy of the model
print("Precision:", metrics.precision_score(test_labels, pred_labels_test))    # Calculating the precision of the model
print("Recall:", metrics.recall_score(test_labels, pred_labels_test)) 

Accuracy: 0.9279187817258884
Precision: 0.9281383737517832
Recall: 0.8560855263157895


In [113]:
clf.predict([test_data.iloc[2773]])

array([1])

In [114]:
clf.predict(test_data.iloc[2773].values.reshape(1,-1))

array([1])

In [115]:
for i in range(1, 1000):
    l = test_data.iloc[i].values.tolist()
    print(clf.predict(np.array(l).reshape(1,-1)))

[0]
[1]
[1]
[0]
[1]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[1]
[0]
[0]
[0]
[1]
[0]
[0]
[1]
[0]
[0]
[1]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[1]
[1]
[0]
[0]
[0]
[0]
[1]
[0]
[1]
[1]
[1]
[0]
[1]
[0]
[1]
[1]
[1]
[1]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[1]
[1]
[0]
[1]
[0]
[1]
[0]
[0]
[1]
[1]
[0]
[1]
[0]
[1]
[0]
[1]
[0]
[1]
[1]
[1]
[1]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[1]
[1]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[1]
[0]
[1]
[0]
[1]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[1]
[1]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[1]
[1]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[1]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[1]
[0]
[1]
[1]
[1]
[0]
[1]
[0]
[0]
[0]
[1]
[1]
[0]
[1]
[1]
[1]
[1]
[0]
[0]
[1]
[0]
[1]
[0]
[0]
[0]
[1]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[1]
[1]
[0]
[0]
[0]
[0]
[1]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[1]
[1]
[0]
[0]
[1]
[0]
[1]
[1]
[1]
[0]
[0]
[0]
[1]
[1]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[1]
[1]
[0]
[1]
[1]


In [116]:
res = clf.predict(test_data.iloc[2773].values.reshape(1,-1))
type(res[0])

numpy.int64

In [117]:
with open('model.pkl','wb') as f:     # save
    pickle.dump(clf,f)
    
with open('./model.pkl', 'rb') as f:    # load
    new_clf = pickle.load(f)

In [118]:
res = new_clf.predict(test_data.iloc[2773].values.reshape(1,-1))
res[0]

1