In [1]:
import pandas as pd
import nltk
import ast
import numpy as np
import os
import ast
import urllib.request
from urllib.request import urlopen
from bs4 import BeautifulSoup
import os.path
# Fetch the NLTK resources this notebook relies on: the 'stopwords' and 'words'
# corpora are read through nltk.corpus further down, and 'punkt' provides the
# tokenizer models used by nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('words')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/domantas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/domantas/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/domantas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Dataset creation (only if it does not already exist)
__The dataset is filtered by the following rules:__
1. Main category != Not_working (exclude non-working URLs)
2. Main category:confidence > 0.5 (keep only URLs whose category is likely to be known)
3. Non-responding URLs are excluded.
4. Non-English URLs are excluded.

### Caution, the full data set creation may take ~15 hours.

In [2]:
def no_filter_data():
    """Scrape every usable URL and save its word tokens to ``Datasets/full_data_v3.csv``.

    Reads ``Datasets/URL-categorization-DFE.csv``, keeps the rows whose
    ``main_category`` is not ``'Not_working'`` and whose
    ``main_category:confidence`` is above 0.5, downloads each page, removes
    ``<script>`` and ``<style>`` elements and stores the lower-cased NLTK word
    tokens of the remaining text in a ``tokenized_words`` column.

    Pages that cannot be downloaded, or that produce no tokens, are left out of
    the saved file. Returns nothing; the result is only written to disk.
    """
    file = 'Datasets/URL-categorization-DFE.csv'
    df = pd.read_csv(file)[['main_category', 'main_category:confidence', 'url']]
    # .copy() so that adding a column below does not write into a view of the raw frame.
    df = df[(df['main_category'] != 'Not_working') & (df['main_category:confidence'] > 0.5)].copy()
    df['tokenized_words'] = ''

    # Browser-like headers; built once because they are the same for every request.
    hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
           'Accept-Encoding': 'none',
           'Accept-Language': 'en-US,en;q=0.8',
           'Connection': 'keep-alive'}

    total = len(df)
    for counter, (i, row) in enumerate(df.iterrows(), start=1):
        print("{}, {}/{}".format(row['url'], counter, total))

        try:
            # The request used to reference an undefined name `url`; the resulting
            # NameError was swallowed by a bare `except`, so no page was ever fetched.
            url = str(row['url'])
            # presumably the CSV stores bare host names (the earlier draft of this
            # code prepended 'http://') -- only add a scheme when one is missing.
            if not url.startswith(('http://', 'https://')):
                url = 'http://' + url
            req = urllib.request.Request(url, headers=hdr)
            # The timeout keeps one dead host from stalling the whole run.
            with urlopen(req, timeout=15) as response:
                html = response.read()
        except Exception as error:
            # Best effort: an unreachable or malformed URL is skipped, not fatal.
            print("  skipped ({})".format(error))
            continue

        soup = BeautifulSoup(html, "html.parser")
        for tag in soup(["script", "style"]):
            tag.decompose()
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk.lower() for chunk in chunks if chunk)
        tokens = nltk.word_tokenize(text)

        df.at[i, 'tokenized_words'] = tokens if len(tokens) > 0 else ''

    # Cells hold either '' (nothing fetched) or a token list, never NaN, so the
    # former isnull() filter kept everything; drop the empty ones explicitly.
    df = df[df['tokenized_words'].map(len) > 0]
    df.to_csv("Datasets/full_data_v3.csv")
    
# Scraping is slow, so the data set is only rebuilt when the cached CSV is missing.
if not os.path.isfile("Datasets/full_data_v3.csv"):
    no_filter_data()

### Read the data set and build the stop-word list and English vocabulary used in the following steps

In [3]:
# Load the cached data set and keep only rows that actually have tokens.
df = pd.read_csv("Datasets/full_data_december.csv")
df = df[df['tokenized_words'].notnull()]

# Printable ASCII characters that are not letters: codes 32-64, 91-96 and 123-126.
char_blacklist = [
    chr(code)
    for code in range(32, 127)
    if code <= 64 or (91 <= code <= 96) or code >= 123
]

# NLTK English stop words, followed by the blacklisted single characters.
stopwords = nltk.corpus.stopwords.words('english')
stopwords += char_blacklist

# Lower-cased NLTK word list, as a set for fast membership tests.
english_vocab = {entry.lower() for entry in nltk.corpus.words.words()}

# Keep only English-language webpages
A webpage is considered English if more than 50 % of its tokens are found in the English vocabulary (`english_tolerance = 50`).

In [5]:
# Minimum share (in percent) of tokens that must be English for a page to be kept.
english_tolerance = 50


def _english_share(serialized_tokens):
    """Return the percentage (0-100) of tokens that occur in ``english_vocab``.

    ``serialized_tokens`` is the string form of a token list as stored in the
    ``tokenized_words`` column. An empty list scores 0.0 instead of dividing by zero.
    """
    tokens = ast.literal_eval(serialized_tokens)
    if not tokens:
        return 0.0
    english_words = sum(1 for token in tokens if token.lower() in english_vocab)
    return english_words / len(tokens) * 100


english_confidence = [_english_share(cell) for cell in df['tokenized_words']]
# assign() returns a new frame, so nothing is written into a filtered view of `df`.
df = df.assign(**{'english:confidence': english_confidence})
df = df[df['english:confidence'] > english_tolerance]

# Build the list of most frequent words for each category

In [6]:
# Number of most frequent words kept per category (also the feature-vector width later on).
top = 2500

# Sets give constant-time membership tests; the list versions were scanned
# once per token, which dominates the run time on a large corpus.
stopword_set = set(stopwords)
blacklist_set = set(char_blacklist)

# category -> its `top` most frequent words, most frequent first.
words_frequency = {}
for category in df['main_category'].unique():
    category_rows = df.loc[df['main_category'] == category, 'tokenized_words']

    # Count straight from the rows instead of first copying every token into one big list.
    # A token is kept when it is not a stop word, has at least 3 characters and
    # does not start with a blacklisted (non-letter) character.
    allWordExceptStopDist = nltk.FreqDist(
        w.lower()
        for row in category_rows
        for w in ast.literal_eval(row)
        if w not in stopword_set and len(w) >= 3 and w[0] not in blacklist_set)

    most_common = allWordExceptStopDist.most_common(top)
    words_frequency[category] = [word for word, number in most_common]

### Remove words that are among the 15 most frequent in at least 7 categories

In [13]:
from collections import Counter

# Flatten the 15 leading words of every category into a single list.
words = [
    word
    for category_words in words_frequency.values()
    for word in category_words[:15]
]

# Count how often each word shows up across those per-category top-15 lists.
words_counter = Counter(words)

# Words with a count of 7 or more are treated as uninformative.
words_filter = {word: count for word, count in words_counter.items() if count >= 7}
words_stop = list(words_filter)

# Strip those words from every category's word list.
for category, category_words in words_frequency.items():
    words_frequency[category] = [
        candidate for candidate in category_words if candidate not in words_stop
    ]

In [22]:
# Show the words that were removed, with their count across the per-category top-15 lists.
words_filter

{'new': 22,
 'home': 7,
 'view': 7,
 'read': 13,
 'learn': 7,
 'information': 8,
 'contact': 14,
 'free': 7,
 'news': 9,
 'get': 7}

# Create features and labels for Machine learning training

In [18]:
from collections import Counter

# Sorted category names give every category the same numeric label in every
# kernel session. The previous list(set(...)).index(...) lookup depended on
# string hash ordering (which varies between interpreter runs) and rebuilt the
# set for every row.
category_names = sorted(df['main_category'].unique())
category_to_label = {name: label for label, name in enumerate(category_names)}

# word -> column position, per category. Each word list holds distinct words,
# so this matches list.index() while avoiding a linear scan per lookup.
word_positions = {
    category: {word: position for position, word in enumerate(vocabulary)}
    for category, vocabulary in words_frequency.items()
}

# One row per page, one column per position in its category's word list.
features = np.zeros((df.shape[0], top))
labels = np.zeros(df.shape[0])

# NOTE(review): the columns are looked up in the word list of the row's *own*
# category, so the features depend on the label being predicted -- confirm
# that this is intended before trusting the reported scores.
for counter, (i, row) in enumerate(df.iterrows()):
    category = row['main_category']
    labels[counter] = category_to_label[category]
    positions = word_positions[category]
    page_words = Counter(ast.literal_eval(row['tokenized_words'])).most_common(top)
    for word, word_count in page_words:
        position = positions.get(word)
        if position is not None:
            features[counter][position] = 1

# Create separate training/testing datasets and shuffle them

In [19]:
from scipy.sparse import coo_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Sparse copy of the feature matrix, shuffled together with the dense
# features and the labels using a fixed seed.
X_sparse = coo_matrix(features)
X, X_sparse, y = shuffle(features, X_sparse, labels, random_state=0)

# Hold out 33 % of the shuffled rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Predictions

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier


def _fit_and_report(title, model):
    """Fit ``model`` on the training split and print its test score.

    Prints ``title`` followed by the test-set score and the current run
    settings (``top``, ``english_tolerance``, number of rows in ``df``).

    Returns a ``(predictions, score)`` tuple for the test split.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    accuracy = model.score(X_test, y_test)
    print(title)
    print('Score: ', accuracy)
    print('Top: ', top)
    print('Tolerance: ', english_tolerance)
    print('Dataset length: ', df.shape[0])
    return predicted, accuracy


lr = LogisticRegression()
predictions, score = _fit_and_report('LogisticRegression', lr)
print()

# Seeded so the reported score can be reproduced on a re-run.
dtc = DecisionTreeClassifier(random_state=0)
predictions, score = _fit_and_report('DecisionTreeClassifier', dtc)
print()

# Seeded for the same reason.
clf = LinearSVC(random_state=0)
predictions, score = _fit_and_report('SVM', clf)



LogisticRegression
Score:  0.8388802488335926
Top:  2500
Tolerance:  50
Dataset length:  9740

DecisionTreeClassifier
Score:  0.3244167962674961
Top:  2500
Tolerance:  50
Dataset length:  9740





SVM
Score:  0.728149300155521
Top:  2500
Tolerance:  50
Dataset length:  9740


### Save ML model

In [23]:
import pickle

try:
    # Standalone joblib; the copy bundled as sklearn.externals.joblib is gone
    # from newer scikit-learn releases.
    import joblib
except ImportError:
    from sklearn.externals import joblib

month = 'December'

# Make sure the target folder exists so the dumps below cannot fail on a fresh checkout.
os.makedirs("Models/{}".format(month), exist_ok=True)

# Model trained on the training split only; existing files are never overwritten.
filename = "Models/{}/LR_model_v3_stop_{}.joblib".format(month, month)
if not os.path.isfile(filename):
    joblib.dump(lr, filename)

# Per-category word lists, needed to rebuild the same feature columns later.
words_filename = "Models/{}/word_frequency_v3_stop_{}.picle".format(month, month)
if not os.path.isfile(words_filename):
    # `with` closes the file even if pickling fails.
    with open(words_filename, "wb") as pickle_out:
        pickle.dump(words_frequency, pickle_out)

# Model refitted on the complete shuffled data set (X, y).
# Note: this rebinds `lr`, but only when the file does not exist yet.
filename = "Models/{}/LR_maxtrain_v3.joblib_stop_{}".format(month, month)
if not os.path.isfile(filename):
    from sklearn.linear_model import LogisticRegression
    lr = LogisticRegression()
    lr.fit(X, y)
    joblib.dump(lr, filename)



In [None]:
# import matplotlib.pyplot as plt; plt.rcdefaults()
# import numpy as np
# import matplotlib.pyplot as plt
 
# objects = ('English', 'Italic', 'Russian', 'Japan', 'China', 'Belgium')
# y_pos = np.arange(len(objects))
# performance = [8143,260,646,338,125,100]
 
# plt.bar(y_pos, performance, align='center', alpha=0.5)
# plt.xticks(y_pos, objects)
# plt.ylabel('URLs')
# plt.title('Languages diversity in the data set')
 
# plt.show()
# plt.savefig("language_diversity.png")
# df[df['main_category'] == 'Business_and_Industry']['url']

In [None]:
# import matplotlib.pyplot as plt; plt.rcdefaults()
# import numpy as np
# import matplotlib.pyplot as plt
# from collections import Counter

# words = []
# for category in words_frequency.keys():
#     words.extend(words_frequency[category][0:15])
# words_counter = Counter(words)
# words_filter = {x : words_counter[x] for x in words_counter if words_counter[x] >= 7}
# objects = tuple(words_filter.keys())
# y_pos = np.arange(len(objects))
# performance = list(words_filter.values())

# plt.barh(y_pos, performance, align='center', alpha=1)
# plt.xticks(range(1, max(performance) + 1))
# plt.yticks(y_pos, objects)
# plt.xlabel('Word diversity in categories (TOP 15 words)')
# plt.title('Words diversity in each category TOP 15 most frequent words')
 
# plt.show()
# plt.savefig("words_diversity.png")