# Setup

In [1]:
from collections import Counter, defaultdict
from urllib.parse import urlparse

import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import project3utils

In [2]:
# Load the phishing URL list. Per the file name this is a PhishTank
# "verified online" dump dated 2018-05-02.
df = pd.read_csv('data/phishtank2018-05-02_verified_online.csv')

In [3]:
# Show column names, dtypes and non-null counts of the raw phishing frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32660 entries, 0 to 32659
Data columns (total 8 columns):
phish_id             32660 non-null int64
url                  32660 non-null object
phish_detail_url     32660 non-null object
submission_time      32660 non-null object
verified             32660 non-null object
verification_time    32660 non-null object
online               32660 non-null object
target               32660 non-null object
dtypes: int64(1), object(7)
memory usage: 2.0+ MB


In [4]:
# Build one row of URL features per phishing URL, using only the first
# 10000 rows of the dump; every feature column is forced to float64.
df_counts = pd.DataFrame(
    list(df['url'].head(10000).apply(project3utils.get_features_from_url)),
    dtype=np.float64,
)

In [5]:
# Mean of each URL feature over the phishing sample, largest first.
df_counts.mean().sort_values(ascending=False)

path_total_chars             32.4592
domain_total_chars           15.4995
path_consonants              15.4385
domain_consonants             8.8124
path_vowels                   7.6305
domain_vowels                 5.0283
path_digits                   4.6751
subdomain_total_chars         3.6305
path_char_slash               3.3520
subdomain_consonants          2.1840
domain_char_dot               1.1021
subdomain_vowels              0.9508
path_char_dot                 0.6479
subdomain_empty_string        0.5446
domain_digits                 0.4190
subdomain_digits              0.3649
path_char_hyphen              0.3262
subdomain_is_www              0.1843
https                         0.1730
domain_char_hyphen            0.1365
path_char_underscore          0.1096
subdomain_char_hyphen         0.0705
subdomain_char_dot            0.0599
domain_is_raw_ip              0.0099
path_empty_string             0.0094
path_char_colon               0.0012
domain_char_colon             0.0012
s

In [6]:
# Load the comparison URLs (Bing search results, per the file name); these
# rows are labelled is_phishing = 0 further down.
# NOTE(review): the 'Unnamed: 0' column in the preview suggests the CSV was
# written with its index; index_col=0 would likely drop it — confirm.
valid_df = pd.read_csv('data/bing_search_results.csv')
valid_df.head()

Unnamed: 0,search_term,market,url
0,management,en-US,https://www.merriam-webster.com/dictionary/man...
1,management,en-US,https://en.wikipedia.org/wiki/Management
2,management,en-US,http://www.businessdictionary.com/definition/m...
3,management,en-US,https://www.forbes.com/management/
4,management,en-US,https://www.thebalancecareers.com/management-4...


In [7]:
# Build one row of URL features per search-result URL (all rows are used);
# every feature column is forced to float64.
valid_df_counts = pd.DataFrame(
    list(valid_df['url'].apply(project3utils.get_features_from_url)),
    dtype=np.float64,
)

In [8]:
# Mean of each URL feature over the search-result URLs, largest first.
valid_df_counts.mean().sort_values(ascending=False)

path_total_chars             18.8800
domain_total_chars           14.2380
path_consonants               9.5061
domain_consonants             7.9677
path_vowels                   5.5006
domain_vowels                 5.0772
subdomain_total_chars         3.4524
subdomain_consonants          2.8617
path_char_slash               1.9983
domain_char_dot               1.1143
path_digits                   0.8855
path_char_hyphen              0.6763
subdomain_is_www              0.6564
https                         0.6421
subdomain_vowels              0.5422
path_char_dot                 0.1442
subdomain_empty_string        0.1148
path_char_underscore          0.1055
domain_char_hyphen            0.0584
subdomain_char_dot            0.0224
domain_digits                 0.0204
subdomain_char_hyphen         0.0186
subdomain_digits              0.0075
path_char_colon               0.0011
path_empty_string             0.0000
subdomain_char_colon          0.0000
subdomain_char_slash          0.0000
d

In [9]:
# Attach the class label to each feature frame (1 = phishing, 0 = search result).
# This adds the column to the existing frames in place.
df_counts['is_phishing'] = 1
valid_df_counts['is_phishing'] = 0

# Stack both classes into one frame with a fresh 0..n-1 index, then rank the
# features by their correlation with the label.
data = pd.concat([df_counts, valid_df_counts], ignore_index=True)
data.corr()['is_phishing'].sort_values(ascending=False)

is_phishing                  1.000000
subdomain_empty_string       0.457133
path_char_slash              0.385751
path_char_dot                0.354019
path_total_chars             0.272223
path_digits                  0.255262
domain_digits                0.246363
path_consonants              0.230433
path_vowels                  0.171452
subdomain_digits             0.138797
domain_consonants            0.138340
domain_total_chars           0.129308
subdomain_vowels             0.112652
domain_char_hyphen           0.111730
subdomain_char_hyphen        0.083987
subdomain_char_dot           0.074609
domain_is_raw_ip             0.070531
path_empty_string            0.068718
domain_char_colon            0.024502
subdomain_total_chars        0.017399
subdomain_char_underscore    0.010001
subdomain_char_colon         0.007071
path_char_underscore         0.004351
path_char_colon              0.001231
domain_vowels               -0.011398
domain_char_dot             -0.019637
subdomain_co

# Modeling

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package and fall back to the vendored
# copy only on old installs where joblib is not available on its own.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [11]:
# Feature matrix: the columns named by project3utils.numeric_feature_columns.
# Target: the 0/1 phishing label added above.
X = data[project3utils.numeric_feature_columns]
y = data['is_phishing']

In [12]:
# Hold out half of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=987)

# Alternative estimators, kept for quick comparison:
# model = DummyClassifier()
# model = LogisticRegression()
# model = LinearSVC()
# model = SVC()
# model = DecisionTreeClassifier()

# Seed the forest too: its bootstrap sampling and feature sub-sampling are
# random, so without random_state the scores below (and the model that is
# pickled later) change on every Restart & Run All.
model = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=987)
model.fit(X_train, y_train)

# (accuracy, F1) on the held-out half.
(model.score(X_test, y_test), f1_score(y_test, model.predict(X_test)))

(0.93420000000000003, 0.93370944992947802)

In [13]:
def predict_for_url(url):
    """Return the fitted model's class probabilities for a single URL.

    The URL is turned into a feature list by
    ``project3utils.get_numeric_features_list_from_url`` and handed to the
    notebook-level ``model`` as a one-row batch.

    Args:
        url: The URL string to score.

    Returns:
        Whatever ``model.predict_proba`` yields for that single row.
    """
    # NOTE(review): presumably the feature list is ordered like
    # project3utils.numeric_feature_columns, which the model was fit on — confirm.
    single_row_batch = [project3utils.get_numeric_features_list_from_url(url)]
    return model.predict_proba(single_row_batch)

In [14]:
# Sanity check on a well-known legitimate site.
# NOTE(review): if the probability columns follow the label order (0, 1), the
# recorded output puts 0.77 on the phishing class for this URL — worth
# investigating (e.g. how bare, path-less URLs are represented in training data).
predict_for_url('https://www.wikipedia.org')

array([[ 0.23,  0.77]])

In [15]:
# Persist the fitted model to model.pkl in the working directory.
# The file is pickle-based, so only load it from trusted sources.
joblib.dump(model, 'model.pkl')

['model.pkl']