In [1]:
#Importing the libraries
import pandas as pd
import numpy as np
import random
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
def sanitization(web):                      # tokenization method for the URL
    web = web.lower()                       #makes url all lowercase 
    token = []
    dot_token_slash = []
    raw_slash = str(web).split('/')
    for i in raw_slash:
        raw1 = str(i).split('-')            # removing slash to get token
        slash_token = []
        for j in range(0,len(raw1)):
            raw2 = str(raw1[j]).split('.')  # removing dot to get the tokens
            slash_token = slash_token + raw2
        dot_token_slash = dot_token_slash + raw1 + slash_token # all tokens
    token = list(set(dot_token_slash))      # to remove same words  
    if 'com' in token:
        token.remove('com')                 # remove com
    return token

In [3]:
#Reading the CSV files of input URLs
url1 = 'data_url.csv'
url_csv1 = pd.read_csv(url1, ',', error_bad_lines=False)
url_df1= pd.DataFrame(url_csv1)              # to convert into data frames        

In [4]:
url_df1.head   #position head of the dataframe

<bound method NDFrame.head of                                                       url label
0                                  diaryofagameaddict.com   bad
1                                        espdesign.com.au   bad
2                                      iamagameaddict.com   bad
3                                           kalantzis.net   bad
4                                   slightlyoffcenter.net   bad
5                                        toddscarwash.com   bad
6                                          tubemoviez.com   bad
7                                                  ipl.hk   bad
8             crackspider.us/toolbar/install.php?pack=exe   bad
9                                         pos-kupang.com/   bad
10                                             rupor.info   bad
11      svision-online.de/mgfi/administrator/component...   bad
12      officeon.ch.ma/office.js?google_ad_format=728x...   bad
13                                            sn-gzzx.com   bad
14        

In [5]:
url_df1 = np.array(url_df1)                   # to convert into array 

In [6]:
url_df1

array([['diaryofagameaddict.com', 'bad'],
       ['espdesign.com.au', 'bad'],
       ['iamagameaddict.com', 'bad'],
       ...,
       ['apple-iclods.org/', 'bad'],
       ['apple-uptoday.org/', 'bad'],
       ['apple-search.info', 'bad']], dtype=object)

In [7]:
#shuffle the data
random.shuffle(url_df1)   

In [8]:
#Split the label and the URL
labels = [d[1] for d in url_df1]             
urls = [d[0] for d in url_df1]   
urls

['diaryofagameaddict.com',
 'espdesign.com.au',
 'iamagameaddict.com',
 'diaryofagameaddict.com',
 'espdesign.com.au',
 'iamagameaddict.com',
 'espdesign.com.au',
 'tubemoviez.com',
 'iamagameaddict.com',
 'slightlyoffcenter.net',
 'diaryofagameaddict.com',
 'rupor.info',
 'diaryofagameaddict.com',
 'crackspider.us/toolbar/install.php?pack=exe',
 'pos-kupang.com/',
 'pos-kupang.com/',
 'slightlyoffcenter.net',
 'outporn.com',
 'xindalawyer.com',
 'espdesign.com.au',
 'xindalawyer.com',
 'freeserials.spb.ru/key/68703.htm',
 'slightlyoffcenter.net',
 'adserving.favorit-network.com/eas?camp=19320;cre=mu&grpid=1738&tag_id=618&nums=FGApbjFAAA',
 'iamagameaddict.com',
 'diaryofagameaddict.com',
 'slightlyoffcenter.net',
 'sn-gzzx.com',
 'zkic.com',
 'kalantzis.net',
 'rupor.info',
 'crackspider.us/toolbar/install.php?pack=exe',
 'adserving.favorit-network.com/eas?camp=19320;cre=mu&grpid=1738&tag_id=618&nums=FGApbjFAAA',
 'kalantzis.net',
 'svision-online.de/mgfi/administrator/components/com_

In [9]:
#Use the Sanitizer function defined above
vectorizer = TfidfVectorizer(tokenizer=sanitization)  # term-frequency and inverse-document-frequency
x = vectorizer.fit_transform(urls)        #fits the data url
x_train, x_test, y_train, y_test = train_test_split(x, labels, test_size=0.2, shuffle= True, random_state=145)

In [10]:
#creates the decision tree classifier with random state set to 0
from sklearn.tree import DecisionTreeClassifier
decision_tree_clf = DecisionTreeClassifier(random_state=0)   

In [11]:
#classifier is fitted to the model
decision_tree_clf.fit(x_train, y_train)      

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

In [12]:
# Logistic regression
logistic_regression = LogisticRegression()     
# fitted to model
logistic_regression.fit(x_train, y_train)                   



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
#gets score from decision tree model
score = decision_tree_clf.score(x_test, y_test)        
print("score: {0:.2f} %".format(100 * score))

score: 99.08 %


In [14]:
#gets score from logistic regression model
score = logistic_regression.score(x_test, y_test)       
print("score: {0:.2f} %".format(100 * score))

score: 98.53 %


In [16]:
#sets vectorizer to variable
vectorizer_save = vectorizer       

#saves logistic regression model to file
file = "pickel_model.pkl"         #file set to file named "pickle_model.pkl"
with open(file, 'wb') as f:       
    pickle.dump(logistic_regression, f)          #dump all the weights from lgr into file
f.close()

#saves vector to file
file2 = "pickel_vector.pkl"      
with open(file2,'wb') as f2:
    pickle.dump(vectorizer_save, f2)   #dump vectorizer save into file
f2.close()

In [17]:
#loads model
file = "pickel_model.pkl"     
with open(file, 'rb') as f1:  
    logistic_regression = pickle.load(f1)
f1.close()

In [18]:
#loads vector
file = "pickel_vector.pkl"    
with open(file, 'rb') as f2:  
    vectorizer = pickle.load(f2)
f2.close()

In [19]:
#urls we want to test
urls = ['google.com/search=VAD3R','www.pakistanifacebook.com','stackoverflow.com','facebook.com']#,'google.com/search=VAD3R','wikipedia.co.uk'


In [20]:
vectorizer = vectorizer
x = vectorizer.transform(urls)
#score = lgr.score(x_test, y_test)
y_predict = decision_tree_clf.predict(x)

In [21]:
print(urls)
print(y_predict)

['google.com/search=VAD3R', 'www.pakistanifacebook.com', 'stackoverflow.com', 'facebook.com']
['bad' 'bad' 'good' 'good']
