# Simple Naive Bayes Classifier
Below we quickly demonstrate a simple Naive Bayes classifier, using barely enough data

## Import libraries

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB

## Load the data

In [19]:
df_labeled = pd.read_csv('./data/labeled_data.csv')
#df_labeled.fillna(' ', inplace=True)
display(df_labeled.head(3))
display(df_labeled.tail(3))
df_labeled.label.value_counts()

Unnamed: 0,fqdn,label,port,protocol,source,uri,url
0,,benign,,,stratosphere,,http://sc.symcd.com/MFEwTzBNMEswSTAJBgUrDgMCGg...
1,,benign,,,stratosphere,,http://evcs-ocsp.ws.symantec.com/MFEwTzBNMEswS...
2,,benign,,,stratosphere,,http://crl.verisign.com/pca3.crl


Unnamed: 0,fqdn,label,port,protocol,source,uri,url
2208,syshainc.com,malicious,80.0,http,openphish_1098,/OPC/7b1614bd3badbd36f55dc5409253732a/error.php,http://syshainc.com/OPC/7b1614bd3badbd36f55dc5...
2209,syshainc.com,malicious,80.0,http,openphish_1099,/OPC/7b1614bd3badbd36f55dc5409253732a/,http://syshainc.com/OPC/7b1614bd3badbd36f55dc5...
2210,mahartrad.com,malicious,80.0,http,openphish_1100,/hotMAIL%20copy/hotMAIL/Validation/login2.php?...,http://mahartrad.com/hotMAIL%20copy/hotMAIL/Va...


benign       1111
malicious    1100
Name: label, dtype: int64

## TF-IDF and Train Naive Bayes

In [20]:
# TF-IDF
vectorizer = TfidfVectorizer(max_df=.5)
X_data = vectorizer.fit_transform(df_labeled.url)
vocab = vectorizer.vocabulary_

# Naive Bayes Classifier
cls = GaussianNB()
clf = cls.fit(X_data.toarray(), df_labeled.label)

## Use Trained Naive Bayes to Make Decisions

In [14]:
def top_tfidf(vectorizer, fit_transform_result, topn=20):
    scores = zip(vectorizer.get_feature_names(), np.asarray(fit_transform_result.sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    for item in sorted_scores[0:topn]:
        print("{0:50} Score: {1}".format(item[0], item[1]))

In [15]:
# malicious url
malicious_url = 'http://000webhostapp.php/wp-content/plugins/ubh/wells/gzjzty=/myaccount/emailaccess/login'
sample = pd.DataFrame([{'url' : malicious_url}])

vectorizer = TfidfVectorizer(vocabulary=vocab)
sample_tfidf = vectorizer.fit_transform(sample.url)
clf.predict(sample_tfidf.toarray())[0]

'malicious'

In [16]:
top_tfidf(vectorizer, sample_tfidf, topn=5)

000webhostapp                                      Score: 0.30151134457776363
content                                            Score: 0.30151134457776363
emailaccess                                        Score: 0.30151134457776363
gzjzty                                             Score: 0.30151134457776363
login                                              Score: 0.30151134457776363


In [17]:
# benign url
benign_url = 'https://www.youtube.com/watch?v=svlEfxTyJQE'
sample = pd.DataFrame([{'url': benign_url}])
sample_tfidf = vectorizer.fit_transform(sample.url)
clf.predict(sample_tfidf.toarray())[0]

'benign'

In [18]:
top_tfidf(vectorizer, sample_tfidf, topn=5)

https                                              Score: 0.5
watch                                              Score: 0.5
www                                                Score: 0.5
youtube                                            Score: 0.5
00                                                 Score: 0.0
