# Setup

In [1]:
from collections import Counter, defaultdict
from urllib.parse import urlparse

import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import project3utils

In [2]:
# Load the PhishTank export (the "verified_online" dump dated 2018-05-02 per the file name).
df = pd.read_csv(
    filepath_or_buffer='data/phishtank2018-05-02_verified_online.csv',
)

In [3]:
# Print the column names, dtypes, non-null counts and memory footprint of the
# PhishTank frame as a quick sanity check on the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32660 entries, 0 to 32659
Data columns (total 8 columns):
phish_id             32660 non-null int64
url                  32660 non-null object
phish_detail_url     32660 non-null object
submission_time      32660 non-null object
verified             32660 non-null object
verification_time    32660 non-null object
online               32660 non-null object
target               32660 non-null object
dtypes: int64(1), object(7)
memory usage: 2.0+ MB


In [4]:
# Featurize the first 10,000 phishing URLs: each URL is passed through
# project3utils.get_features_from_url and the per-URL results are stacked
# into one frame, with float64 requested as the column dtype.
# NOTE(review): columns named domain/subdomain/full_domain are dropped before
# modelling, so they are presumably non-numeric; recent pandas releases may
# reject the float64 request for such columns -- confirm against the pandas
# version this notebook is run with.
df_counts = pd.DataFrame(
    list(
        df['url'].head(10000).apply(project3utils.get_features_from_url)
    ),
    dtype=np.float64,
)

In [5]:
# Mean of every numeric feature over the phishing sample, largest first.
# numeric_only=True makes the "skip non-numeric columns" behaviour explicit:
# older pandas dropped such columns silently, pandas >= 2.0 raises instead.
# (The frame presumably carries raw domain/subdomain/full_domain values,
# since those columns are dropped before modelling -- TODO confirm.)
df_counts.mean(numeric_only=True).sort_values(ascending=False)

domain_total_chars       15.4995
domain_consonants         8.8124
domain_vowels             5.0283
subdomain_total_chars     3.6305
subdomain_consonants      2.1840
domain_char_o             1.5776
domain_char_c             1.1728
domain_char_e             1.1699
domain_char_a             1.1107
domain_char_dot           1.1021
domain_char_m             0.9641
subdomain_vowels          0.9508
domain_char_i             0.8223
domain_char_s             0.8038
domain_char_t             0.7929
domain_char_r             0.7733
domain_char_n             0.7430
subdomain_char_w          0.6191
domain_char_l             0.5679
domain_char_p             0.5055
domain_digits             0.4190
domain_char_g             0.3959
domain_char_h             0.3809
subdomain_digits          0.3649
domain_char_b             0.3509
domain_char_u             0.3478
domain_char_d             0.3333
domain_char_0             0.3047
subdomain_char_e          0.3045
domain_char_w             0.2002
          

In [6]:
# Load the Bing search results; their URLs are labelled as non-phishing
# (is_phishing = 0) further down.
valid_df = pd.read_csv(filepath_or_buffer='data/bing_search_results.csv')

# Preview the first five rows.
valid_df.head(n=5)

Unnamed: 0,search_term,market,url
0,management,en-US,https://www.merriam-webster.com/dictionary/man...
1,management,en-US,https://en.wikipedia.org/wiki/Management
2,management,en-US,http://www.businessdictionary.com/definition/m...
3,management,en-US,https://www.forbes.com/management/
4,management,en-US,https://www.thebalancecareers.com/management-4...


In [7]:
# Featurize every search-result URL with the same helper used for the
# phishing sample, again requesting float64 columns.
# NOTE(review): columns named domain/subdomain/full_domain are dropped before
# modelling, so they are presumably non-numeric; recent pandas releases may
# reject the float64 request for such columns -- confirm against the pandas
# version this notebook is run with.
valid_df_counts = pd.DataFrame(
    list(
        valid_df['url'].apply(project3utils.get_features_from_url)
    ),
    dtype=np.float64,
)

In [8]:
# Mean of every numeric feature over the search-result sample, largest first.
# numeric_only=True makes the "skip non-numeric columns" behaviour explicit:
# older pandas dropped such columns silently, pandas >= 2.0 raises instead.
# (The frame presumably carries raw domain/subdomain/full_domain values,
# since those columns are dropped before modelling -- TODO confirm.)
valid_df_counts.mean(numeric_only=True).sort_values(ascending=False)

domain_total_chars       14.2380
domain_consonants         7.9677
domain_vowels             5.0772
subdomain_total_chars     3.4524
subdomain_consonants      2.8617
subdomain_char_w          2.0205
domain_char_o             1.6928
domain_char_c             1.2835
domain_char_dot           1.1143
domain_char_e             1.0975
domain_char_m             1.0071
domain_char_i             1.0056
domain_char_r             0.9708
domain_char_a             0.9266
domain_char_t             0.7519
subdomain_is_www          0.6564
https                     0.6421
domain_char_n             0.5744
subdomain_vowels          0.5422
domain_char_s             0.4954
domain_char_d             0.4501
domain_char_u             0.3547
domain_char_g             0.3145
domain_char_l             0.3019
domain_char_y             0.2980
domain_char_w             0.2382
domain_char_b             0.2348
domain_char_h             0.2329
domain_char_p             0.2077
domain_char_f             0.2023
          

In [9]:
# Label the two samples: 1 for the PhishTank URLs, 0 for the Bing results.
# The label is written onto the existing frames, so both keep the extra
# column after this cell has run.
df_counts['is_phishing'] = 1
valid_df_counts['is_phishing'] = 0

# Stack both samples into one frame with a fresh 0..n-1 index.
data = pd.concat([df_counts, valid_df_counts], ignore_index=True)

# Correlation of each feature with the label, strongest positive first.
# Only numeric/bool columns are correlated: older pandas dropped the rest
# implicitly, while pandas >= 2.0 raises on non-numeric columns (presumably
# domain/subdomain/full_domain here -- TODO confirm).
data.select_dtypes(include=['number', 'bool']).corr()['is_phishing'].sort_values(ascending=False)

is_phishing              1.000000
domain_digits            0.246363
domain_char_0            0.232798
domain_char_p            0.229802
domain_char_l            0.196191
domain_char_s            0.189819
subdomain_digits         0.138797
domain_consonants        0.138340
domain_char_h            0.135577
subdomain_char_b         0.130508
domain_total_chars       0.129308
subdomain_char_1         0.119655
subdomain_char_s         0.119461
subdomain_char_f         0.118427
domain_char_b            0.114466
subdomain_vowels         0.112652
domain_char_hyphen       0.111730
subdomain_char_r         0.109337
subdomain_char_v         0.104426
subdomain_char_0         0.103889
subdomain_char_k         0.102632
subdomain_char_2         0.101842
domain_char_n            0.099988
subdomain_char_e         0.097918
domain_char_a            0.097262
subdomain_char_3         0.097034
subdomain_char_o         0.096505
subdomain_char_8         0.095752
domain_char_j            0.091402
subdomain_char

# Modeling

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [11]:
# Target: the 0/1 is_phishing label assigned when the samples were combined.
y = data['is_phishing']

# Features: every remaining column except the label itself and the
# domain / subdomain / full_domain columns.
X = data.drop(
    labels=['is_phishing', 'domain', 'subdomain', 'full_domain'],
    axis='columns',
)

In [12]:
# Hold out half of the rows for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=987)

# Seed the forest as well: its bootstrap sampling and per-split feature
# sub-sampling are random, so without random_state the scores reported below
# change from run to run even though the split is fixed.
model = RandomForestClassifier(n_estimators=10, criterion='gini', random_state=987)
model.fit(X_train, y_train)

# (accuracy, F1 for the phishing class) on the held-out half.
(model.score(X_test, y_test), f1_score(y_test, model.predict(X_test)))

(0.91039999999999999, 0.90857142857142859)

In [13]:
def predict_for_url(url):
    """Return the fitted model's class probabilities for a single URL.

    The URL is featurized with ``project3utils.get_features_from_url`` and the
    resulting values are arranged in the column order of the notebook-level
    training frame ``X`` before being handed to the notebook-level ``model``.

    Parameters
    ----------
    url : str
        The URL to score.

    Returns
    -------
    The output of ``model.predict_proba`` for a single row: one probability
    per class. With the 0/1 labels used in this notebook the columns should
    be [not phishing, phishing] -- confirm via ``model.classes_``.

    Notes
    -----
    Every column of ``X`` is looked up by key in the extracted features, so a
    feature missing from the helper's result surfaces as a lookup error
    (assuming the helper returns a plain mapping).
    """
    extracted = project3utils.get_features_from_url(url)

    # Build the single feature row in exactly the training column order.
    row = []
    for column in X.columns:
        row.append(extracted[column])

    return model.predict_proba([row])

In [20]:
# Spot-check the classifier on a single hand-picked URL; the result holds one
# probability per class for that URL.
predict_for_url('https://smile.amazon.co.uk/com')

array([[ 0.8,  0.2]])