# Jaro Similarity

<img src="images/jaro.png" width="600">

https://en.wikipedia.org/wiki/Jaro–Winkler_distance

# Levenshtein Distance

<img src="images/levenshtein.png" width="800">

https://en.wikipedia.org/wiki/Levenshtein_distance

# Let's Try It

In [1]:
import jellyfish

In [2]:
# Edit distance between a domain and a look-alike in which both 'o' characters
# of "google" have been replaced by the digit '0'.
jellyfish.levenshtein_distance('google.com', 'g00gle.com')

2

In [3]:
# Jaro similarity of the same pair of domains. `jaro_similarity` is the current
# jellyfish name for this metric; the older `jaro_distance` alias was deprecated
# and is no longer available in recent jellyfish releases.
jellyfish.jaro_similarity('google.com', 'g00gle.com')

0.8666666666666667

In [None]:
# Pairs of strings to score with both metrics: an identical pair, a digit-swapped
# look-alike, a longer domain that embeds the original name, a different domain,
# and two short pairs that differ by a single character.
words_to_compare = [(u'google.com', u'google.com'),
                    (u'google.com', u'g00gle.com'),
                    (u'google.com', u'google.badguy.com'),
                    (u'google.com', u'malware.ru'),
                    (u'bit', u'bot'),
                    (u'bitly.bit', u'bitly.bot')]
for word1, word2 in words_to_compare:
    levenshtein = jellyfish.levenshtein_distance(word1, word2)
    # Use jaro_similarity: the deprecated jaro_distance alias is gone from
    # current jellyfish releases and would raise AttributeError there.
    jaro = jellyfish.jaro_similarity(word1, word2)
    print("{} vs. {} Levenshtein: {} Jaro: {}".format(word1, word2, round(levenshtein, 3), round(jaro, 3)))


# Load URL Data

In [4]:
import pandas as pd
# Show full column contents instead of truncating long URLs. None is the
# supported "no limit" value; the legacy -1 was deprecated in pandas 1.0 and is
# rejected by current pandas.
pd.set_option('display.max_colwidth', None)
df = pd.read_csv('./data/phishing_dataset.csv')
# Coerce every uri value to str before it is passed to the string-similarity
# functions later on (presumably some rows have a missing uri -- TODO confirm).
df['uri'] = df.uri.apply(str)
df

Unnamed: 0,label,source,url,protocol,fqdn,port,uri
0,malicious,openphish_1,"http://stpatricksfc.net/xs/cn/ac38648851b8d41a726d486e13a781eb/?login=&.verify?service=mail&data:text/html;charset=utf-8;base64,PGh0bWw+DQo8c3R5bGU+IGJvZHkgeyBtYXJnaW46IDA7IG92ZXJmbG93OiBoaWRkZW47IH0gPC9zdHlsZT4NCiAgPGlmcmFt",http,stpatricksfc.net,80.0,"/xs/cn/ac38648851b8d41a726d486e13a781eb/?login=&.verify?service=mail&data:text/html;charset=utf-8;base64,PGh0bWw+DQo8c3R5bGU+IGJvZHkgeyBtYXJnaW46IDA7IG92ZXJmbG93OiBoaWRkZW47IH0gPC9zdHlsZT4NCiAgPGlmcmFt"
1,malicious,openphish_2,http://freeemailextractor.us/305bd4c64738af203e5544c86393d9d1,http,freeemailextractor.us,80.0,/305bd4c64738af203e5544c86393d9d1
2,malicious,openphish_3,http://52.70.129.120/js/.wp.php/rf/myaccount/34622/,http,52.70.129.120,80.0,/js/.wp.php/rf/myaccount/34622/
3,malicious,openphish_4,http://52.70.129.120/js/.wp.php/rf/myaccount/34622/home?cmd=_account-details&session=626dcf1ea909ba1d031b89440d6ad034&dispatch=0405946fc903c08eb521696f62e245b18eadcfd7,http,52.70.129.120,80.0,/js/.wp.php/rf/myaccount/34622/home?cmd=_account-details&session=626dcf1ea909ba1d031b89440d6ad034&dispatch=0405946fc903c08eb521696f62e245b18eadcfd7
4,malicious,openphish_5,http://ahaslo.org/ayv/pdf/87b6f0ef86c9a996fe35ef9d27354440/,http,ahaslo.org,80.0,/ayv/pdf/87b6f0ef86c9a996fe35ef9d27354440/
5,malicious,openphish_6,http://syshainc.com/OPC/54cd9ed8eabe2f3d7cf4ca1a60f2f362/,http,syshainc.com,80.0,/OPC/54cd9ed8eabe2f3d7cf4ca1a60f2f362/
6,malicious,openphish_7,http://syshainc.com/OPC/54cd9ed8eabe2f3d7cf4ca1a60f2f362/pass.php,http,syshainc.com,80.0,/OPC/54cd9ed8eabe2f3d7cf4ca1a60f2f362/pass.php
7,malicious,openphish_8,http://syshainc.com/OPC/54cd9ed8eabe2f3d7cf4ca1a60f2f362/error.php,http,syshainc.com,80.0,/OPC/54cd9ed8eabe2f3d7cf4ca1a60f2f362/error.php
8,malicious,openphish_9,http://mustardtech.com/live/7da8f0dbfabacc817c1caf46a48a6a4c/login2.php?https://login.live.com/public/IdentifyUser.aspx?LOB=RBGLogon,http,mustardtech.com,80.0,/live/7da8f0dbfabacc817c1caf46a48a6a4c/login2.php?https://login.live.com/public/IdentifyUser.aspx?LOB=RBGLogon
9,malicious,openphish_10,http://mustardtech.com/live/7da8f0dbfabacc817c1caf46a48a6a4c,http,mustardtech.com,80.0,/live/7da8f0dbfabacc817c1caf46a48a6a4c


In [5]:
# Reference string to compare every URI in the dataset against.
malicious_uri = 'www.paypal.com cgi-bin webscrcmd _login-run update.php'
# Score each uri with Jaro similarity and store the result in a new 'jaro' column.
# jaro_similarity replaces the deprecated jaro_distance alias, which is no longer
# available in current jellyfish releases.
df['jaro'] = df.uri.apply(jellyfish.jaro_similarity, args=(malicious_uri,))
# Show the five rows with the highest similarity score.
df[['url', 'jaro']].sort_values('jaro', ascending=False).head(5)

Unnamed: 0,url,jaro
45278,http://www.avgt.com/www.paypal.com/clients/update/cgi-bin/webscr.php?cmd=_login-run,0.795725
32138,http://shinwoodnp.com/admin/auto/www1.paypal.de/cgi-bin/webscr/cmd=_login-run.php?ckattempt=1,0.720123
45279,http://www.floridarentfinders.com/uploads/ws/css/www.paypal.com/cgi-bin/webscrcmd=_login-run/update.php,0.713404
10409,http://skskskks.000webhostapp.com/www.paypal.com/,0.71142
32180,http://shinwoodnp.com/pod/asdr/www.paypal.de/cgi-bin/webscr/cmd=_login-run.php?ckattempt=1,0.708736


# Cosine Similarity

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Two toy 3-component vectors that differ only in the middle component (0 vs. 1).
sentence1 = [1, 0, 5]
sentence2 = [1, 1, 5]
# Each vector is wrapped in a list so it is passed as a single-row 2-D input;
# the result is a 1x1 array holding the similarity of the two rows.
cosine_similarity([sentence1], [sentence2])

array([[0.98130676]])

In [9]:
# A third vector dominated by its first component and with a zero last
# component, compared against sentence1 for contrast with the previous pair.
sentence3 = [200, 7, 0]
cosine_similarity([sentence1], [sentence3])

array([[0.19599612]])

In [10]:
from scipy.spatial.distance import euclidean
# Euclidean distance between the same two vectors that were compared with
# cosine similarity above.
euclidean(sentence1, sentence2)

1.0

In [11]:
# sentence3 is re-assigned here with the same values it was given in the
# cosine-similarity cell above.
sentence3 = [200, 7, 0]
# Euclidean distance from sentence1 to the vector with the large first component.
euclidean(sentence1, sentence3)

199.18584287042088