# Setup

In [1]:
from collections import Counter, defaultdict
from urllib.parse import urlparse

import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import project3utils

In [2]:
df = pd.read_csv('data/phishtank2018-05-02_verified_online.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32660 entries, 0 to 32659
Data columns (total 8 columns):
phish_id             32660 non-null int64
url                  32660 non-null object
phish_detail_url     32660 non-null object
submission_time      32660 non-null object
verified             32660 non-null object
verification_time    32660 non-null object
online               32660 non-null object
target               32660 non-null object
dtypes: int64(1), object(7)
memory usage: 2.0+ MB


In [4]:
df_counts = pd.DataFrame(list(df.head(10000).url.apply(project3utils.get_features_from_url)), dtype=np.float64)

In [5]:
df_counts.mean().sort_values(ascending=False)

path_total_chars             31.182200
domain_total_chars           15.571100
path_consonants              15.438500
domain_consonants             8.812400
path_vowels                   7.630500
domain_vowels                 5.028300
path_digits                   4.675100
subdomain_total_chars         3.568800
path_entropy                  2.370844
domain_entropy                2.294947
path_char_e                   2.201500
subdomain_consonants          2.184000
path_char_slash               2.075000
path_char_i                   1.648300
path_char_a                   1.635800
domain_char_o                 1.577600
path_char_o                   1.543200
path_char_n                   1.501000
path_char_c                   1.410200
path_char_p                   1.399100
path_char_d                   1.280500
domain_char_c                 1.172800
domain_char_e                 1.169900
path_char_t                   1.159500
domain_char_dot               1.121900
domain_char_a            

In [6]:
valid_df = pd.read_csv('data/bing_search_results.csv')
valid_df.head()

Unnamed: 0,search_term,market,url
0,management,en-US,https://www.merriam-webster.com/dictionary/man...
1,management,en-US,https://en.wikipedia.org/wiki/Management
2,management,en-US,http://www.businessdictionary.com/definition/m...
3,management,en-US,https://www.forbes.com/management/
4,management,en-US,https://www.thebalancecareers.com/management-4...


In [7]:
valid_df_counts = pd.DataFrame(list(valid_df.url.apply(project3utils.get_features_from_url)), dtype=np.float64)

In [8]:
valid_df_counts.mean().sort_values(ascending=False)

path_total_chars             17.755000
domain_total_chars           14.238000
path_consonants               9.506100
domain_consonants             7.967700
path_vowels                   5.500600
domain_vowels                 5.077200
subdomain_total_chars         3.452400
subdomain_consonants          2.861700
domain_entropy                2.252793
subdomain_char_w              2.020500
path_entropy                  1.939410
domain_char_o                 1.692800
path_char_e                   1.584900
path_char_i                   1.319000
domain_char_c                 1.283500
path_char_t                   1.235300
path_char_a                   1.130400
domain_char_dot               1.114300
domain_char_e                 1.097500
path_char_s                   1.045200
path_char_o                   1.028100
path_char_r                   1.011800
domain_char_m                 1.007100
domain_char_i                 1.005600
path_char_n                   0.978400
domain_char_r            

In [9]:
df_counts['is_phishing'] = 1
valid_df_counts['is_phishing'] = 0

data = pd.concat([df_counts, valid_df_counts], ignore_index=True)
data.corr()['is_phishing'].sort_values(ascending=False)

is_phishing               1.000000
subdomain_empty_string    0.465914
path_ends_in_php          0.437556
path_char_p               0.365491
path_char_slash           0.359659
path_char_dot             0.354019
path_total_chars          0.269768
path_digits               0.255262
path_entropy              0.236865
path_char_c               0.235424
domain_char_0             0.235233
path_char_d               0.232536
path_consonants           0.230433
domain_char_p             0.229802
path_char_b               0.224619
domain_digits             0.221201
path_char_7               0.216139
path_char_x               0.216090
path_char_1               0.208978
path_char_0               0.202616
path_char_f               0.201525
path_char_4               0.201338
domain_char_l             0.196191
path_char_3               0.191680
path_char_m               0.190423
domain_char_s             0.189819
path_char_8               0.185932
path_char_h               0.185613
path_char_5         

# Modeling

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.externals import joblib

In [11]:
models = []
X = data[project3utils.numeric_feature_columns]
y = data['is_phishing']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=987)
meta_model_X_train = None
meta_model_X_test = None
for i, feature_columns in enumerate([project3utils.nonpath_feature_columns, project3utils.path_feature_columns]):
    Xa_train, Xa_test = X_train[feature_columns], X_test[feature_columns]
    # model = DummyClassifier()
    # model = LogisticRegression()
    # model = LinearSVC()
    # model = SVC()
    # model = DecisionTreeClassifier()
    model = RandomForestClassifier(n_estimators=80, criterion='gini')
    # model = GaussianNB()
    model.fit(Xa_train, y_train)
    predictions = model.predict(Xa_test)
    print('### {} Model ###'.format(['Domain', 'Path'][i]))
    print("Accuracy: {}".format(accuracy_score(y_test, predictions)))
    print(classification_report(y_test, predictions))
    models.append(model)
    if i == 0:
        meta_model_X_train = np.array([model.predict(Xa_train)]).T
        meta_model_X_test = np.array([model.predict(Xa_test)]).T
    else:
        meta_model_X_train = np.append(meta_model_X_train, np.array([model.predict(Xa_train)]).T, 1)
        meta_model_X_test = np.append(meta_model_X_test, np.array([model.predict(Xa_test)]).T, 1)

meta_model_X_train = np.append(meta_model_X_train, X_train, 1)
meta_model_X_test = np.append(meta_model_X_test, X_test, 1)
meta_model = SVC()
meta_model.fit(meta_model_X_train, y_train)
predictions = meta_model.predict(meta_model_X_test)
print('### Meta Model ###')
print("Accuracy: {}".format(accuracy_score(y_test, predictions)))
print(classification_report(y_test, predictions))

### Domain Model ###
Accuracy: 0.9323333333333333
             precision    recall  f1-score   support

          0       0.94      0.93      0.93      3035
          1       0.93      0.93      0.93      2965

avg / total       0.93      0.93      0.93      6000

### Path Model ###
Accuracy: 0.909
             precision    recall  f1-score   support

          0       0.88      0.95      0.91      3035
          1       0.95      0.87      0.90      2965

avg / total       0.91      0.91      0.91      6000

### Meta Model ###
Accuracy: 0.9465
             precision    recall  f1-score   support

          0       0.97      0.93      0.95      3035
          1       0.93      0.97      0.95      2965

avg / total       0.95      0.95      0.95      6000



In [12]:
def predict_for_url(url):
    features = project3utils.get_features_from_url(url)
    numeric_features = [features[c] for c in project3utils.numeric_feature_columns]
    nonpath_features = [features[c] for c in project3utils.nonpath_feature_columns]
    path_features = [features[c] for c in project3utils.path_feature_columns]
    domain_prediction = models[0].predict([nonpath_features])
    path_prediction = models[1].predict([path_features])
    meta_prediction = meta_model.predict([[domain_prediction, path_prediction] + numeric_features])
    return {
        'domain': domain_prediction[0],
        'path': path_prediction[0],
        'meta': meta_prediction[0]
    }

In [18]:
predict_for_url('https://app.calculist.io/login')

{'domain': 0, 'meta': 0, 'path': 1}

In [14]:
joblib.dump(models[0], 'domain_model.pkl')
joblib.dump(models[1], 'path_model.pkl')
joblib.dump(meta_model, 'meta_model.pkl')

['meta_model.pkl']