# Jaro Similarity
<img src="images/jaro.png" width="80%">

https://en.wikipedia.org/wiki/Jaro–Winkler_distance

# Levenshtein Distance
<img src="images/levenshtein.png" width="80%">

https://en.wikipedia.org/wiki/Levenshtein_distance

## Let's Try It!

In [2]:
import jellyfish

In [3]:
# Levenshtein distance
jellyfish.levenshtein_distance('google.com', 'g00gle.com')

2

In [4]:
# Jaro distance
jellyfish.jaro_distance('google.com', 'g00gle.com')

0.8666666666666667

## More!

In [5]:
words_to_compare = [(u'google.com', u'google.com'), 
                    (u'google.com', u'g00gle.com'),
                    (u'google.com', u'google.badguy.com'),
                    (u'google.com', u'malware.ru'),
                    (u'bit', u'bot'),
                    (u'bitly.bit', u'bitly.bot')]
for word1, word2 in words_to_compare:
    levenshtein = jellyfish.levenshtein_distance(word1, word2)
    jaro = jellyfish.jaro_distance(word1, word2)
    print("{} vs. {} Levenshtein: {} Jaro: {}".format(word1, word2, round(levenshtein, 3), round(jaro, 3)))

google.com vs. google.com Levenshtein: 0 Jaro: 1.0
google.com vs. g00gle.com Levenshtein: 2 Jaro: 0.867
google.com vs. google.badguy.com Levenshtein: 7 Jaro: 0.863
google.com vs. malware.ru Levenshtein: 9 Jaro: 0.533
bit vs. bot Levenshtein: 1 Jaro: 0.778
bitly.bit vs. bitly.bot Levenshtein: 1 Jaro: 0.926


## Finding Needles in a Haystack
<img src="images/dataset.png" width="50%">

In [34]:
pd.set_option('large_repr', 'truncate')
pd.set_option('max_rows', 7)

In [35]:
# Load Dataset
import pandas as pd
pd.set_option('display.max_colwidth', 80)
df = pd.read_csv('./data/dataset_medium.csv', dtype={'port': str, 'subdomains':str})
df.fillna('', inplace=True)
df = df[['label', 'url', 'uri', 'subdomains']]
df.head(3)

Unnamed: 0,label,url,uri,subdomains
0,benign,https://coinmarketcap.com/historical/20140223/,/historical/20140223/,
1,benign,https://reddit.com/7wxie4,/7wxie4,
2,benign,https://youtu.be/M4QVkwY65wk,/M4QVkwY65wk,


In [36]:
# Find common malicious
df[(df.label == 'malicious') & \
   (df.subdomains.str.contains('bank'))][['uri', 'subdomains', 'url']]

Unnamed: 0,uri,subdomains,url
950128,/account/verification/D50M74890M8414B93618/qes.php,bankofamerica-com.,http://bankofamerica-com.microdeal.co.uk/account/verification/D50M74890M8414...
950152,/aa892b70d8b0c077329c1baaa6800583/contactinfo.php,bankoffamerica.,http://bankoffamerica.webcindario.com/aa892b70d8b0c077329c1baaa6800583/conta...
950164,/signonline/login.go.php,secure-bankofamerica.com.checking-accounts.,http://secure-bankofamerica.com.checking-accounts.osbirigui.com.br/signonlin...
...,...,...,...
998579,/vx/lang-en-GB/candidate/application/292496,bankcampuscareers.,https://bankcampuscareers.tal.net/vx/lang-en-GB/candidate/application/292496
999288,/web/83bf4421703abebf1de652c240c2ca73f34f880c7ce03caa10fe10a3931a417c4d9c33e...,ebanking-ch-ubs-workbench-workbench-openaction.,http://ebanking-ch-ubs-workbench-workbench-openaction.storrien.com/web/83bf4...
999432,/1bae5192eb41949d09446761a144e460/,www-hellobank-fr.toulone0.,http://www-hellobank-fr.toulone0.beget.tech/1bae5192eb41949d09446761a144e460/


In [37]:
malicious_uri = '/account/verification/D50M74890M8414B93618/qes.php'
df['jaro'] = df.uri.apply(jellyfish.jaro_distance, args=(malicious_uri,))
df[['label', 'url', 'jaro']].sort_values('jaro', ascending=False)[0:10]

Unnamed: 0,label,url,jaro
970632,malicious,https://bankofamerica-com.microdeal.co.uk/account/verification/D50M74890M841...,1.000000
950128,malicious,http://bankofamerica-com.microdeal.co.uk/account/verification/D50M74890M8414...,1.000000
988840,malicious,https://bankofamerica-com.microdeal.co.uk/account/verification/D50M74890M841...,0.961026
...,...,...,...
958875,malicious,https://www.bofadatareview-bofa-com.ga/account/verification/E4M05EN65B112E71...,0.847480
971096,malicious,https://bankofamerica-com.microdeal.co.uk/account/verification/99AMDC2583N1C...,0.847480
956430,malicious,http://bankofamerica-com.microdeal.co.uk/account/verification/EBCMAC90A8D6B4...,0.839350


# TF-IDF

A method to identify the importance of a word relative to the entire corpus

In [9]:
# Load TF-IDF Library
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [10]:
# Let's do TF-IDF just on the malicious dataset
df_malicious = df[df.label=='malicious'][['label', 'url']]
df_malicious.head(5)

Unnamed: 0,label,url
950000,malicious,http://byzcc.com/2017/index.php?email=abuse@rsresponse.co.uk
950001,malicious,http://m.facebook.udir.me/?_fb_noscript=1
950002,malicious,http://www.bloggingfornetworking.com/lmages/71884f5280caef99226832baf70ef980
950003,malicious,http://alsulthanscrap.com/AdminDB/index.html
950004,malicious,http://download-apk-files.ru/api/?name=StickMan%20Gun&amp;icon=http:


In [11]:
# Instantiate Vectorizer
vectorizer = TfidfVectorizer(max_df=.5, min_df=.001)
# Fit
features_transformed = vectorizer.fit_transform(df_malicious.url)
vocab = vectorizer.vocabulary_

In [39]:
vocab

{'https': 23694,
 'www': 40806,
 'islamicaid': 24858,
 'youtu': 41379,
 'be': 12846,
 'sxvia0bto': 36888,
 '79': 7769,
 'limraexim': 26865,
 'export': 20140,
 'diffuser': 17901,
 'data': 17229,
 'aspx': 11624,
 'imgur': 24263,
 'tkkc0av': 37838,
 'jpg': 25431,
 'ul263yb': 38729,
 'png': 31846,
 'en': 19622,
 'wikipedia': 40416,
 'org': 30602,
 'wiki': 40408,
 'issues_affecting_the_single_transferable_vote': 24878,
 'tactical_voting': 37001,
 'magiccards': 27519,
 'info': 24466,
 'query': 32907,
 'ugin': 38680,
 '2c': 3872,
 '20the': 3011,
 '20spirit': 2981,
 '20dragon': 2716,
 'justice': 25563,
 'gov': 22463,
 'usao': 39052,
 'sdca': 34728,
 'pr': 32134,
 'bank': 12560,
 'pleads': 31774,
 'guilty': 22736,
 'pays': 31233,
 'historic': 23352,
 'penalty': 31342,
 'concealing': 16043,
 'anti': 11187,
 'money': 28671,
 'laundering': 26511,
 'failures': 20494,
 'quickship': 32924,
 'quartet': 32896,
 'porcelain': 31988,
 'magnetic': 27526,
 'in': 24322,
 'and': 11078,
 'out': 30718,
 'board'

In [54]:
def top_tfidf(vectorizer, fit_transform_result, topn=20):
    scores = zip(vectorizer.get_feature_names(), np.asarray(fit_transform_result.sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    for item in sorted_scores[0:topn]:
        print("{0:50} Score: {1:.3f}".format(item[0], item[1]))

top_tfidf(vectorizer, features_transformed, topn=20)

https                                              Score: 120044.553
imgur                                              Score: 108367.614
www                                                Score: 53651.943
jpg                                                Score: 49357.261
be                                                 Score: 36253.121
youtu                                              Score: 36190.031
org                                                Score: 28714.850
png                                                Score: 26458.172
wiki                                               Score: 22466.461
en                                                 Score: 22046.400
amp                                                Score: 20219.532
wikipedia                                          Score: 19490.361
twitter                                            Score: 15501.480
html                                               Score: 12948.997
net                                           

In [40]:
# Redo with the entire dataset
features_transformed = vectorizer.fit_transform(df.url)
print(features_transformed.shape)
vocab = vectorizer.vocabulary_
top_tfidf(vectorizer, features_transformed, topn=25)

(1000000, 41796)
https                                              Score: 120044.55274561205
imgur                                              Score: 108367.61358839607
www                                                Score: 53651.94264232749
jpg                                                Score: 49357.26125641032
be                                                 Score: 36253.12061804477
youtu                                              Score: 36190.031198026896
org                                                Score: 28714.850040695368
png                                                Score: 26458.171645913455
wiki                                               Score: 22466.46123095859
en                                                 Score: 22046.399934650282
amp                                                Score: 20219.532093269685
wikipedia                                          Score: 19490.361005623367
twitter                                            Score: 15501

In [51]:
# Load cosine_similarity function
from sklearn.metrics.pairwise import cosine_similarity

# Helper function
def print_top_n(result, df, top_n=5):
    top_n = 5
    sorted_result = result.argsort()[0][(-1*top_n):]
    list(reversed(sorted_result.tolist()))
    for idx in sorted_result:
        print("[{0}] : {1:.3f} : {2}".format(idx, result[0][idx], df.iloc[idx]['url']))

In [43]:
df[((df.label == 'malicious') & df.url.str.contains('paypal'))]['url'].head(5)

950028    https://iad-login.dotomi.com/commonid/match?rurl=https%253A%252F%252Fadfarm....
950158           http://www.safetyking.ae/paypal/us-mmm/545fdad6a2d3ef280ded4edf8cbcd91c/
950286    http://theaccountidlimit.com/support-paypal-custmer-confirm-account-suspende...
950311    http://support.secure.paypal.com.services.d4a0v6qj59pi3.paripoorna.com/CA/1/...
950436    http://shinwoodnp.com/pod/asdr/www.paypal.de/cgi-bin/webscr/cmd=_login-run.p...
Name: url, dtype: object

In [52]:
idx = 950311
result = cosine_similarity(features_transformed[idx:(idx+1)], features_transformed)
print_top_n(result, df)

[989802] : 0.617 : http://www.paypal.com.ca.services.ejlbv633r4yt.nadiastrologymumbai.com/CA/1/signn.php
[998634] : 0.736 : http://customercare.secure.paypal.online.services.nvd8xjw.paripoorna.com/CA/1/signn.php?REDACTED
[976859] : 0.839 : http://support.secure.paypal.online.services.j1avk9h.paripoorna.com/CA/1/signn.php
[983475] : 0.872 : http://support.secure.paypal.com.services.g6nhzlipq9o.paripoorna.com/CA/1/signn.php
[950311] : 1.000 : http://support.secure.paypal.com.services.d4a0v6qj59pi3.paripoorna.com/CA/1/signn.php


# Machine Learning with Naive Bayes


In [21]:
ml_dataset = df[940000:960000]
ml_dataset.label.value_counts()

malicious    10000
benign       10000
Name: label, dtype: int64

In [22]:
from sklearn.naive_bayes import GaussianNB
# TF-IDF
vectorizer = TfidfVectorizer(max_df=.5)
X_data = vectorizer.fit_transform(ml_dataset.url)
vocab = vectorizer.vocabulary_

# Naive Bayes Classifier
cls = GaussianNB()
clf = cls.fit(X_data.toarray(), ml_dataset.label)

In [23]:
# malicious url
malicious_url = 'http://000webhostapp.php/wp-content/plugins/ubh/wells/gzjzty=/myaccount/emailaccess/login'
sample = pd.DataFrame([{'url' : malicious_url}])

vectorizer = TfidfVectorizer(vocabulary=vocab)
sample_tfidf = vectorizer.fit_transform(sample.url)
clf.predict(sample_tfidf.toarray())[0]

'malicious'

In [24]:
# benign url
benign_url = 'https://www.youtube.com/watch?v=svlEfxTyJQE'
sample = pd.DataFrame([{'url': benign_url}])
sample_tfidf = vectorizer.fit_transform(sample.url)
clf.predict(sample_tfidf.toarray())[0]

'benign'