In [1]:
# EDA Packages
import pandas as pd
import numpy as np
import random
# Machine Learning Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load Url Data 
urls_data = pd.read_csv("urldata.csv")
urls_data.head()

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad


In [3]:
urls_data['label'].value_counts() # Total label Single values

good    344821
bad      75643
Name: label, dtype: int64

In [4]:
urls_data.shape

(420464, 2)

In [5]:
def tokenazier(f):
    tkns_BySlash = str(f.encode('utf-8')).split('/')  # make tokens after splitting by slash
    total_Tokens = []
    for i in tkns_BySlash:
        tokens = str(i).split('-') # make tokens after splitting by dash
        tkns_ByDot = []
        for j in range(0,len(tokens)):
            temp_Tokens = str(tokens[j]).split('.')	# make tokens after splitting by dot
            tkns_ByDot = tkns_ByDot + temp_Tokens
        total_Tokens = total_Tokens + tokens + tkns_ByDot
    total_Tokens = list(set(total_Tokens))	#remove redundant tokens
    if 'com' in total_Tokens:
        total_Tokens.remove('com')	#removing .com since it occurs a lot of times and it should not be included in our features
    return total_Tokens

In [6]:
b = tokenazier('tsurihack.com')
b

["b'tsurihack", "b'tsurihack.com'", "com'"]

In [7]:
# Labels
y = urls_data["label"]
y.head()

0    bad
1    bad
2    bad
3    bad
4    bad
Name: label, dtype: object

In [8]:
# Features
url_list = urls_data["url"]
url_list.head()

0    diaryofagameaddict.com
1          espdesign.com.au
2        iamagameaddict.com
3             kalantzis.net
4     slightlyoffcenter.net
Name: url, dtype: object

In [9]:
# Using Custom Tokenizer
vectorizer = TfidfVectorizer(tokenizer=tokenazier)
print(vectorizer)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function tokenazier at 0x000001C86C2FC288>,
                use_idf=True, vocabulary=None)


In [10]:
# Store vectors into X variable as Our XFeatures
X = vectorizer.fit_transform(url_list)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
#using DecisionTreeClassifier
drugTree = DecisionTreeClassifier(criterion="entropy")
drugTree

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [13]:
# Accuracy of Our Model
drugTree.fit(X_train,y_train)
print("Accuracy ",drugTree.score(X_test, y_test))

Accuracy  0.9667986633845861


In [14]:
coursera_predict = drugTree.predict(vectorizer.transform(["https://www.coursera.org/learn/python-text-mining/lecture/zV9nP/naive-bayes-classifiers"]))
print(coursera_predict)
phishing_predict = drugTree.predict(vectorizer.transform(["louisianasportsman.com"]))
print(phishing_predict)

['good']
['bad']
