# Setup

In [1]:
from collections import Counter, defaultdict
from urllib.parse import urlparse

import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import project3utils

In [2]:
df = pd.read_csv('data/phishtank2018-05-02_verified_online.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32660 entries, 0 to 32659
Data columns (total 8 columns):
phish_id             32660 non-null int64
url                  32660 non-null object
phish_detail_url     32660 non-null object
submission_time      32660 non-null object
verified             32660 non-null object
verification_time    32660 non-null object
online               32660 non-null object
target               32660 non-null object
dtypes: int64(1), object(7)
memory usage: 2.0+ MB


In [4]:
df_counts = pd.DataFrame(list(df.head(10000).url.apply(project3utils.get_features_from_url)), dtype=np.float64)

In [5]:
df_counts.mean().sort_values(ascending=False)

path_total_chars             31.182200
domain_total_chars           15.571100
path_consonants              15.438500
domain_consonants             8.812400
path_vowels                   7.630500
domain_vowels                 5.028300
path_digits                   4.675100
subdomain_total_chars         3.568800
path_entropy                  2.370844
domain_entropy                2.294947
path_char_e                   2.201500
subdomain_consonants          2.184000
path_char_slash               2.075000
path_char_i                   1.648300
path_char_a                   1.635800
domain_char_o                 1.577600
path_char_o                   1.543200
path_char_n                   1.501000
path_char_c                   1.410200
path_char_p                   1.399100
path_char_d                   1.280500
domain_char_c                 1.172800
domain_char_e                 1.169900
path_char_t                   1.159500
domain_char_dot               1.121900
domain_char_a            

In [6]:
with open('data/scraped_urls.txt', 'r') as file:
    valid_df = pd.DataFrame({'url': [url.strip() for url in file]})
valid_df.tail()

Unnamed: 0,url
10039,http://www.radiopublic.com
10040,http://status.mailchimp.com
10041,http://game.xiaomi.com/
10042,https://us.diablo3.com/
10043,http://www.esrb.org/confirm/blizzard-confirmat...


In [7]:
valid_df_counts = pd.DataFrame(list(valid_df.url.apply(project3utils.get_features_from_url)), dtype=np.float64)

In [8]:
valid_df_counts.mean().sort_values(ascending=False)

path_total_chars             15.126145
domain_total_chars           12.772800
path_consonants               7.230884
domain_consonants             7.164576
subdomain_total_chars         5.058045
domain_vowels                 4.433393
path_vowels                   4.151235
subdomain_consonants          3.558542
domain_entropy                2.191288
path_digits                   1.611410
domain_char_o                 1.489745
subdomain_vowels              1.351752
path_entropy                  1.325770
path_char_e                   1.185982
subdomain_char_w              1.177220
domain_char_dot               1.105337
path_char_i                   1.033851
domain_char_c                 1.027579
path_char_a                   0.948029
domain_char_i                 0.883015
domain_char_a                 0.865691
domain_char_e                 0.862007
domain_char_m                 0.859518
subdomain_entropy             0.844500
path_char_t                   0.790621
path_char_s              

In [9]:
df_counts['is_phishing'] = 1
valid_df_counts['is_phishing'] = 0

data = pd.concat([df_counts, valid_df_counts], ignore_index=True)
data.corr()['is_phishing'].sort_values(ascending=False)

is_phishing               1.000000
subdomain_empty_string    0.459840
path_entropy              0.456112
path_ends_in_php          0.430053
path_char_slash           0.418053
path_char_p               0.382408
path_char_dot             0.350024
path_consonants           0.297406
path_total_chars          0.290490
path_char_c               0.288355
path_keyword_login        0.286396
domain_total_chars        0.283019
path_char_o               0.265302
domain_consonants         0.261279
path_char_n               0.255089
path_vowels               0.254044
path_char_h               0.253158
path_char_m               0.239106
path_char_l               0.237881
domain_char_0             0.233911
path_char_d               0.231249
path_char_x               0.219283
path_char_e               0.215113
domain_digits             0.209968
path_char_f               0.205992
path_char_g               0.201615
domain_entropy            0.199857
path_digits               0.191017
path_char_b         

# Modeling

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.externals import joblib

In [11]:
models = []
X = data[project3utils.numeric_feature_columns]
y = data['is_phishing']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=987)
meta_model_X_train = None
meta_model_X_test = None
for i, feature_columns in enumerate([project3utils.nonpath_feature_columns, project3utils.path_feature_columns]):
    Xa_train, Xa_test = X_train[feature_columns], X_test[feature_columns]
    # model = DummyClassifier()
    # model = LogisticRegression()
    # model = LinearSVC()
    # model = SVC()
    # model = DecisionTreeClassifier()
    model = RandomForestClassifier(n_estimators=80, criterion='gini')
    # model = GaussianNB()
    model.fit(Xa_train, y_train)
    predictions = model.predict(Xa_test)
    print('### {} Model ###'.format(['Domain', 'Path'][i]))
    print("Accuracy: {}".format(accuracy_score(y_test, predictions)))
    print(classification_report(y_test, predictions))
    models.append(model)
    if i == 0:
        meta_model_X_train = np.array([model.predict(Xa_train)]).T
        meta_model_X_test = np.array([model.predict(Xa_test)]).T
    else:
        meta_model_X_train = np.append(meta_model_X_train, np.array([model.predict(Xa_train)]).T, 1)
        meta_model_X_test = np.append(meta_model_X_test, np.array([model.predict(Xa_test)]).T, 1)

meta_model_X_train = np.append(meta_model_X_train, X_train, 1)
meta_model_X_test = np.append(meta_model_X_test, X_test, 1)
meta_model = SVC()
meta_model.fit(meta_model_X_train, y_train)
predictions = meta_model.predict(meta_model_X_test)
print('### Meta Model ###')
print("Accuracy: {}".format(accuracy_score(y_test, predictions)))
print(classification_report(y_test, predictions))

### Domain Model ###
Accuracy: 0.9100432324575989
             precision    recall  f1-score   support

          0       0.91      0.91      0.91      3034
          1       0.91      0.91      0.91      2980

avg / total       0.91      0.91      0.91      6014

### Path Model ###
Accuracy: 0.8967409378117726
             precision    recall  f1-score   support

          0       0.88      0.92      0.90      3034
          1       0.91      0.87      0.89      2980

avg / total       0.90      0.90      0.90      6014

### Meta Model ###
Accuracy: 0.9220152976388427
             precision    recall  f1-score   support

          0       0.94      0.90      0.92      3034
          1       0.90      0.94      0.92      2980

avg / total       0.92      0.92      0.92      6014



In [12]:
def predict_for_url(url):
    features = project3utils.get_features_from_url(url)
    numeric_features = [features[c] for c in project3utils.numeric_feature_columns]
    nonpath_features = [features[c] for c in project3utils.nonpath_feature_columns]
    path_features = [features[c] for c in project3utils.path_feature_columns]
    domain_prediction = models[0].predict([nonpath_features])
    path_prediction = models[1].predict([path_features])
    meta_prediction = meta_model.predict([[domain_prediction, path_prediction] + numeric_features])
    return {
        'domain': domain_prediction[0],
        'path': path_prediction[0],
        'meta': meta_prediction[0]
    }

In [13]:
predict_for_url('http://q.pet-dog-cat-supply-store.com')

{'domain': 0, 'meta': 0, 'path': 0}

In [14]:
joblib.dump(models[0], 'domain_model.pkl')
joblib.dump(models[1], 'path_model.pkl')
joblib.dump(meta_model, 'meta_model.pkl')

['meta_model.pkl']

In [15]:
from sklearn.neural_network import MLPClassifier

In [16]:
# def url_as_vector(url):
#     vec = [0] * 64
#     for i in range(min(len(url), len(vec))):
#         vec[i] = ord(url[i])
#     return vec
# phish_vectors = pd.DataFrame(list(df.head(10000).url.apply(url_as_vector)), dtype=np.float64)
# phish_vectors = pd.concat([phish_vectors, df_counts[project3utils.numeric_feature_columns]], axis=1)
# phish_vectors['is_phishing'] = 1
# valid_vectors = pd.DataFrame(list(valid_df.url.apply(url_as_vector)), dtype=np.float64)
# valid_vectors = pd.concat([valid_vectors, valid_df_counts[project3utils.numeric_feature_columns]], axis=1)
# valid_vectors['is_phishing'] = 0
# data = pd.concat([phish_vectors, valid_vectors], ignore_index=True)
# X = data.drop(['is_phishing'], axis=1)
# y = data['is_phishing']
# X.head()

In [17]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=789)

In [25]:
nn_clf = MLPClassifier(hidden_layer_sizes=(2 * X.shape[1],), activation='relu', max_iter=400, tol=0.00001)
nn_clf.fit(X_train, y_train)
predictions = nn_clf.predict(X_test)
print("Accuracy: {}".format(accuracy_score(y_test, predictions)))
print(classification_report(y_test, predictions))

Accuracy: 0.9298303957432658
             precision    recall  f1-score   support

          0       0.93      0.93      0.93      3034
          1       0.93      0.93      0.93      2980

avg / total       0.93      0.93      0.93      6014



In [19]:
# from collections import defaultdict
# feature_weights = defaultdict(dict)
# for c_0 in nn_clf.coefs_:
#     for i, c_1 in enumerate(c_0):
#         for j, c_2 in enumerate(c_1):
#             feature_weights[j][X_train.columns[i]] = c_2

IndexError: index 170 is out of bounds for axis 0 with size 170

In [None]:
# for i in feature_weights.keys():
#     print(i)
#     for f in sorted(feature_weights[i].items(), reverse=True, key=lambda w: w[1])[0:10]:
#         print(i, f)

In [22]:
X.shape

(20044, 170)