# TF-IDF

A method to identify the importance of a word relative to the entire corpus

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', -1)

In [2]:
df = pd.read_csv('./data/phishing_dataset.csv')
df['uri'] = df.uri.apply(str)
df[['url', 'label']]

Unnamed: 0,url,label
0,"http://stpatricksfc.net/xs/cn/ac38648851b8d41a726d486e13a781eb/?login=&.verify?service=mail&data:text/html;charset=utf-8;base64,PGh0bWw+DQo8c3R5bGU+IGJvZHkgeyBtYXJnaW46IDA7IG92ZXJmbG93OiBoaWRkZW47IH0gPC9zdHlsZT4NCiAgPGlmcmFt",malicious
1,http://freeemailextractor.us/305bd4c64738af203e5544c86393d9d1,malicious
2,http://52.70.129.120/js/.wp.php/rf/myaccount/34622/,malicious
3,http://52.70.129.120/js/.wp.php/rf/myaccount/34622/home?cmd=_account-details&session=626dcf1ea909ba1d031b89440d6ad034&dispatch=0405946fc903c08eb521696f62e245b18eadcfd7,malicious
4,http://ahaslo.org/ayv/pdf/87b6f0ef86c9a996fe35ef9d27354440/,malicious
5,http://syshainc.com/OPC/54cd9ed8eabe2f3d7cf4ca1a60f2f362/,malicious
6,http://syshainc.com/OPC/54cd9ed8eabe2f3d7cf4ca1a60f2f362/pass.php,malicious
7,http://syshainc.com/OPC/54cd9ed8eabe2f3d7cf4ca1a60f2f362/error.php,malicious
8,http://mustardtech.com/live/7da8f0dbfabacc817c1caf46a48a6a4c/login2.php?https://login.live.com/public/IdentifyUser.aspx?LOB=RBGLogon,malicious
9,http://mustardtech.com/live/7da8f0dbfabacc817c1caf46a48a6a4c,malicious


In [3]:
# Instantiate Vectorizer
vectorizer = TfidfVectorizer(max_df=.5, min_df=.001) #, stop_words=['abuse'])
# Fit
features_transformed = vectorizer.fit_transform(df.url)
vocab = vectorizer.vocabulary_

In [4]:
vocab

{'000webhostapp': 0,
 '01': 1,
 '09': 2,
 '10': 3,
 '10065877425': 4,
 '1033': 5,
 '12': 6,
 '120': 7,
 '1252899642': 8,
 '129': 9,
 '13698': 10,
 '13inboxlight': 11,
 '13inboxlightaspxn': 12,
 '13vqcr8bp0gud': 13,
 '14750ef6ccc9d69c326c97bf81a1ac34': 14,
 '162': 15,
 '163': 16,
 '1774256418': 17,
 '185': 18,
 '1error': 19,
 '20': 20,
 '2016': 21,
 '2017': 22,
 '2018': 23,
 '20account': 24,
 '20auto': 25,
 '20dqo8c3r5bgu': 26,
 '20igjvzhkgeybtyxjnaw46ida7ig92zxjmbg93oiboawrkzw47ih0gpc9zdhlszt4nciagpglmcmft': 27,
 '20in': 28,
 '20now': 29,
 '23': 30,
 '230': 31,
 '234': 32,
 '2f': 33,
 '2fwww': 34,
 '3a': 35,
 '3d1774256418': 36,
 '3fn': 37,
 '40b': 38,
 '5183999': 39,
 '51jianli': 40,
 '52': 41,
 '64855': 42,
 '70': 43,
 '_93894574342hdfjsixaoweue5_j1489738549283781331983743fncn_product': 44,
 '_account': 45,
 '_jehfuq_vjoxk0qwhtogydw_product': 46,
 '_pagelabel': 47,
 'abuse': 48,
 'ac': 49,
 'access': 50,
 'account': 51,
 'accounts': 52,
 'accountverification': 53,
 'acct': 54,
 'aces

In [5]:
def top_tfidf(vectorizer, fit_transform_result, topn=20):
    scores = zip(vectorizer.get_feature_names(), np.asarray(fit_transform_result.sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    for item in sorted_scores[0:topn]:
        print("{0:50} Score: {1}".format(item[0], item[1]))

top_tfidf(vectorizer, features_transformed, topn=20)

www                                                Score: 2961.190084302796
php                                                Score: 2797.8760450294217
html                                               Score: 2269.0477926517606
https                                              Score: 2099.234735520745
amp                                                Score: 1959.8398245654153
index                                              Score: 1573.566282176735
wp                                                 Score: 1560.166389374524
000webhostapp                                      Score: 1453.9669529851258
login                                              Score: 1410.6467786604208
net                                                Score: 1049.3759580848232
org                                                Score: 925.8204925515827
htm                                                Score: 914.8349221342077
includes                                           Score: 831.1866703978121
admin 

In [6]:
df[df.url.str.contains('wp')][['url', 'label']]

Unnamed: 0,url,label
2,http://52.70.129.120/js/.wp.php/rf/myaccount/34622/,malicious
3,http://52.70.129.120/js/.wp.php/rf/myaccount/34622/home?cmd=_account-details&session=626dcf1ea909ba1d031b89440d6ad034&dispatch=0405946fc903c08eb521696f62e245b18eadcfd7,malicious
56,http://www.balasdecocoribeirao.com.br/wp-content/themes/twentyfourteen/inc/first/verify/,malicious
60,http://crownfin.co.za/wp-content/plugins/sigma-wp/chase/home/index.php,malicious
243,http://eready.com/wp-content/content/msg-external-home/login.jsp.htm,malicious
268,https://www.birminghamonyork.com.au/mapmusez/vpresxe122/wp-drop2018/,malicious
278,http://www.satcomnet.com/glee/d41d8cd98f00b204e9800998ecf8427e/Up-dating4.php?country.x=-&ACCT.x=ID-PPL=PA324188.166.98.249=ScrPg=109523201732411dacb66f9f7b4f922834829049dc36145ce460af3e500fe6d3S=$1$KsBwpEiC$fu.vGk5yYy7Rbm/kUI6LX0in0bwkJu71CUj6XA2eH5hcQZSf9szKLpmIYv3ToBFl8GyRgOMqadNrVEDtx4PWS7XGmL9qhvYtAk21IJdbDVpyU6HZCl0nxOwr5fiMgE3QzoFaeNTKW8cjuPBR4s82318880498,malicious
292,https://mgmj.com/wp-includes/login.hpe.com/index.html,malicious
353,http://pwfurnishings.com/wp-admin/alibaba/alibaba/index.php?email=info@treescene.com,malicious
391,http://gepc.or.tz/wp/Office/index.html,malicious
