In [1]:
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import random
from mylib import tokenize as tk

## Read data from .csv file

In [2]:
dataset_path = 'dataset.csv'    # path to our all urls file
dataset = pd.read_csv(dataset_path,sep=',',encoding='utf-8', error_bad_lines=False)    # reading file
df = pd.DataFrame(dataset)    # converting to a dataframe

df    # print

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad
5,toddscarwash.com,bad
6,tubemoviez.com,bad
7,ipl.hk,bad
8,crackspider.us/toolbar/install.php?pack=exe,bad
9,pos-kupang.com/,bad


In [3]:
df = np.array(df)    # converting it into an array
random.shuffle(df)    # shuffling

In [4]:
target = [d[1] for d in df]    # all labels
corpus = [d[0] for d in df]    # all urls corresponding to a label (either good or bad)

# TF-IDF

In [5]:
vect = TfidfVectorizer(tokenizer=tk, max_features=1000)    # get a vector for each url but use our customized tokenizer
corpus_vect = vect.fit_transform(corpus)    # get the corpus vector

vect.get_feature_names()

['',
 '%20secure%20login_files',
 "'",
 '.https',
 '0',
 '0001',
 '001',
 '01',
 '02',
 '03',
 '04',
 '05',
 '06',
 '07',
 '08',
 '09',
 '0secure0login_files',
 '1',
 '1&amp;email=&amp;',
 '1&amp;fav',
 '1&amp;rand',
 '1&email=&',
 '1&fav',
 '1&rand',
 "1'",
 "1.html'",
 '10',
 '11',
 '12',
 '1252899642&amp;fid',
 '1252899642&amp;fid=1&amp;fav',
 '1252899642&fid',
 '1252899642&fid=1&fav',
 '1252899642=&amp;fid',
 '13',
 '13inboxlight',
 '14',
 '15',
 '16',
 '160624',
 '163',
 '17',
 '1774256418&amp;fid',
 '1774256418&amp;rand',
 '1774256418&fid',
 "1774256418'",
 '18',
 '19',
 '1=&amp;fid',
 '2',
 "2'",
 '20',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2015',
 '2016',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '3',
 '30',
 '31',
 '4',
 '5',
 '6',
 '7',
 '8',
 '80port',
 '9',
 '?http:',
 '?ref=http:',
 '?us',
 '?us.battle.net',
 '_',
 'a',
 'about',
 'ac',
 'access',
 'account',
 'accounts',
 'acesso',
 'acto

## Split train / test data (70:30%)
split into training and testing set 70/30 ratio

In [6]:
test_size = 0.3

data_train, data_test, target_train, target_test = train_test_split(corpus_vect, target, test_size=test_size, random_state=100)

# Create model: using logistic regression

In [7]:
model = LogisticRegression()
model.fit(data_train, target_train)    # run(fit)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Evaluate model : Accuracy

In [8]:
print('Accuracy = %.2f %%' %( model.score(data_test, target_test)*100) )

Accuracy = 92.73 %


# Save model

In [9]:
joblib.dump(model, 'url_clf.model')

['url_clf.model']

In [10]:
joblib.dump(vect, 'url_clf.vect')

['url_clf.vect']