# Import libraries
These libraries will be used for our URL_classification project.

In [None]:
import datetime
import csv
import nltk
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup
from langdetect import detect
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
import pandas as pd

Run this command if importing or using the nltk library raises errors. It opens the NLTK downloader menu with download and update options. If some resources are still missing, install them manually by running nltk.download('resource name'), where 'resource name' is the missing resource named in the error message.

In [None]:
# nltk.download()

# Function for accuracy calculations
A function that calculates and prints the accuracy, precision, recall and F1 score. It also prints the confusion matrix.

In [None]:
def score_calculation(labels, prediction, classifier=None):
    """Print the confusion matrix, classification report and accuracy score.

    Args:
        labels: True class labels, one per sample.
        prediction: Predicted class labels, aligned with ``labels``.
        classifier: Optional estimator whose ``str()`` is printed as a header.
            When omitted, the module-level ``lr`` is used if it exists, so
            existing two-argument calls keep printing the current model.

    Returns:
        None. All results are written to standard output.
    """
    if classifier is None:
        # Existing call sites rely on a global named ``lr``; look it up lazily
        # so that a missing global no longer raises NameError.
        classifier = globals().get('lr')
    if classifier is not None:
        print(str(classifier))

    y_true = pd.Series(labels)
    y_pred = pd.Series(prediction)
    # margins=True adds the "All" row/column holding the totals.
    confusion = pd.crosstab(
        y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
    print('Confusion matrix: \n{}'.format(confusion))
    print(classification_report(labels, prediction))
    print("Accuracy score: {}".format(metrics.accuracy_score(labels, prediction)))

# Setup
Set the path of the file with all URL_classification data and the number of rows to read (limiter);
top - the number of most frequent words stored for each downloaded page.
char_blacklist, stopwords, language_whitelist, domains_whitelist, english_vocab - these variables are used for URL filtering.

In [None]:
# Path of the CSV file with the URL categorization data.
file = 'URL-categorization-DFE.csv'
# Number of CSV rows (URLs) to analyze. Roughly: 1000 ~ 16 min, 2000 ~ 40 min.
# MINIMUM VALUE=150
limiter = 150
# Number of cross-validation folds (passed as `cv` to cross_val_predict). It is
# also the minimum number of pages a category needs in order to be kept.
# Rule of thumb: if limiter >= 2000, keep cv_number <= 5.
cv_number = 2
# Number of most frequent words kept per downloaded page.
top = 15
# The csv module expects files opened with newline='' so that quoted fields
# containing line breaks are parsed correctly.
# NOTE: the handle is intentionally left open here because `reader` is
# consumed by a later cell.
reader = csv.reader(open(file, newline=''), delimiter=',')
header = next(reader)
# Every printable ASCII character (codes 32-126) that is not a letter.
char_blacklist = [chr(i) for i in range(32, 127) if not chr(i).isalpha()]
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(char_blacklist)
language_whitelist = ['en']
# Compared against the last three characters of each URL.
domains_whitelist = ['com', 'org', 'net', '.us', '.uk', '.au', '.ca']
english_vocab = {w.lower() for w in nltk.corpus.words.words()}
# Start time of the run.
print(datetime.datetime.now().time())

Store data to the list.

In [None]:
# Materialize every row still left in the CSV reader into a list.
data = list(reader)

# Downloading and analyzing URL
Download the content of each URL, then filter the pages so that their most frequent words can be analyzed with stop words excluded. Note that this process takes a lot of time: for example, downloading the first 2000 URLs takes about 40 minutes. The limiter value sets how many URLs will be downloaded and analyzed.

In [None]:
# tokens_list[i] holds the word tokens of the i-th accepted page and
# filter_data[i] holds its category; the two lists must stay aligned.
tokens_list = []
filter_data = []
counter = 0  # number of pages accepted so far
print('URL parsing and filtering')
# Slicing caps the number of CSV rows examined at `limiter`.
for url_counter, row in enumerate(data[:limiter]):
    # Skip rows labelled 'Not_working' or whose column 6 value is not above
    # 0.5 (presumably a confidence score - verify against the CSV header).
    if row[5] == 'Not_working' or not float(row[6]) > 0.5:
        continue
    # The domain check only needs the URL, so do it before the slow download.
    if row[-1][-3:] not in domains_whitelist:
        continue
    try:
        url = 'http://' + row[-1]
        # The context manager closes the HTTP response instead of leaking it.
        with urlopen(url, timeout=1) as response:
            html = response.read()
        soup = BeautifulSoup(html, "html.parser")
        # Drop script/style elements so only visible text remains.
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text()
        # Normalize whitespace: strip each line, split on double spaces, drop
        # empty pieces, and lowercase what is left.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk.lower() for chunk in chunks if chunk)
        if detect(text) not in language_whitelist:
            continue
        # Tokenize once and reuse the result.
        tokens = nltk.word_tokenize(text)
        counter += 1
        # Append to both lists back to back so they cannot get out of step.
        tokens_list.append(tokens)
        filter_data.append(row[5])
        print('{} | {} |URL: {}| CATEGORY: {}'.format(url_counter, counter, row[-1], row[5]))
    except Exception:
        # Best-effort crawl: unreachable, unparsable or undetectable pages are
        # skipped. KeyboardInterrupt is not caught, so the run can be stopped.
        continue

# Filtering categories
Determine which categories are suitable for machine learning classifier.

In [None]:
print('Filtering categories')
# (category, number of accepted pages) pairs, most frequent first.
f1 = nltk.FreqDist(filter_data).most_common()
# Keep only categories that have at least `cv_number` pages.
f2 = [category for category, number in f1 if number >= cv_number]
# Sort so that the position (and therefore the numeric label) of each category
# is the same on every run; plain set iteration order for strings can change
# between interpreter sessions.
all_categories = sorted(set(f2))


# Labels
Creating labels for machine learning classifier.

In [None]:
print('CREATING LABELS DATA.')
# Numeric label of a category = its position in all_categories.
category_index = {category: number for number, category in enumerate(all_categories)}
# Walk pages and their categories in lockstep, keeping only pages whose
# category survived the filtering step.
kept = [
    (category_index[category], page_tokens)
    for category, page_tokens in zip(filter_data, tokens_list)
    if category in category_index
]
labels = [label for label, _ in kept]
# Update tokens_list in place so it stays aligned with labels.
tokens_list[:] = [page_tokens for _, page_tokens in kept]
# Number of pages dropped because their category was filtered out.
counter = len(filter_data) - len(labels)
# Plain-list copy of the labels, kept under the original name `save`.
save = labels
# Column vector of shape (n_samples, 1).
labels = np.array(labels).reshape((len(labels), 1))

# Most frequent words
Creating a list of the most frequent words for each downloaded page.

In [None]:
print('CREATING FREQUENT WORDS LIST..')
# Sets give constant-time membership tests inside the per-token filter.
stopword_set = set(stopwords)
blacklist_set = set(char_blacklist)
# Flat list holding, page after page, the `top` most frequent words of each
# page (fewer when a page has fewer distinct qualifying words).
freq_words = []
for tokens in tokens_list:
    # Count tokens that are not stop words, have at least three characters
    # and do not start with a blacklisted character.
    allWordExceptStopDist = nltk.FreqDist(
        w.lower() for w in tokens
        if w not in stopword_set and len(w) >= 3 and w[0] not in blacklist_set)
    mostCommon = allWordExceptStopDist.most_common(top)
    freq_words.extend(word for word, number in mostCommon)

# Features
Creating features list for classifier.

In [None]:
print('CREATING FEATURES DATA...')
# One row per page and `top` columns per page, all initialised to zero.
features = np.zeros((len(tokens_list), len(tokens_list) * top))
# Column of a word = position of its first occurrence in freq_words. A dict
# replaces the repeated linear `in` / `.index()` scans over the list.
word_column = {}
for column, word in enumerate(freq_words):
    word_column.setdefault(word, column)
for index, line in enumerate(tokens_list):
    # Each distinct token needs to be looked up only once per page.
    for word in set(line):
        column = word_column.get(word)
        if column is not None:
            features[index, column] = 1

Display all categories.

In [None]:
# Print each category next to its position in all_categories.
for number, word in zip(range(len(all_categories)), all_categories):
    print('{} {}'.format(number, word))

# Prediction and performance score
Predict values and calculate the score using the Logistic Regression classifier.

In [None]:
# Flatten the (n_samples, 1) label column into a 1-D array. ravel() leaves an
# already flat array unchanged, so re-running this cell no longer fails on
# unpacking labels.shape.
labels = labels.ravel()
print('************ Logistic Regression ************')
# The estimator is bound to the name `lr` because score_calculation reads the
# module-level `lr` when printing the model description.
lr = LogisticRegression()
# Out-of-fold predictions from cv_number-fold cross-validation.
prediction = cross_val_predict(lr, features, labels, cv=cv_number)
score_calculation(labels, prediction)

Predict values and calculate the score using the Decision Tree classifier.

In [None]:
print('************ Decision Tree ************')
# Rebinding `lr` matters: score_calculation reads the module-level `lr` when
# it prints the model description.
lr = DecisionTreeClassifier()
# Out-of-fold predictions from cv_number-fold cross-validation.
prediction = cross_val_predict(estimator=lr, X=features, y=labels, cv=cv_number)
score_calculation(labels=labels, prediction=prediction)

Predict values and calculate the score using the KNeighbors classifier.

In [None]:
print('************ KNeighbors ************')
# Rebinding `lr` matters: score_calculation reads the module-level `lr` when
# it prints the model description.
lr = KNeighborsClassifier(metric="euclidean", n_neighbors=5)
# Out-of-fold predictions from cv_number-fold cross-validation.
prediction = cross_val_predict(estimator=lr, X=features, y=labels, cv=cv_number)
score_calculation(labels=labels, prediction=prediction)

# Finish time of the run.
finished_at = datetime.datetime.now().time()
print(finished_at)