# Import libraries
These libraries will be used for our URL_classification project.

In [None]:
import datetime
import csv
import nltk
import numpy as np
import pandas as pd
import ast
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Print the wall-clock time at which the notebook run starts.
start_time = datetime.datetime.now().time()
print(start_time)

Run this cell if you get errors about missing NLTK data when using the nltk library. Calling nltk.download() with no arguments opens the NLTK downloader menu with download and update options. If some resources are still missing, install them manually by running nltk.download('resource name'), where resource name is the missing resource named in the error message.

In [None]:
# Fetch the NLTK resources this notebook relies on: the stop-word lists,
# the English word list and the 'punkt' tokenizer data.
for resource in ('stopwords', 'words', 'punkt'):
    nltk.download(resource)

# Setup
Set the path of the file that holds all URL_classification data and how many lines we want to read (limiter);
top - the number of most frequent words stored for each category.
char_blacklist, stopwords, language_whitelist, blacklist_domain, english_vocab - these variables are used for URL filtering.

In [None]:
# Load the labelled URL dataset, keeping only the category, its confidence
# and the URL itself.
file = 'URL-categorization-DFE.csv'
wanted_columns = ['main_category', 'main_category:confidence', 'url']
df = pd.read_csv(file)[wanted_columns]

# Discard rows labelled 'Not_working' and rows whose category confidence
# is not above 0.5.
is_working = df['main_category'] != 'Not_working'
is_confident = df['main_category:confidence'] > 0.5
df = df[is_working & is_confident]

In [4]:
# Every printable ASCII character (codes 32-126) that is not a letter:
# the space, the digits and all punctuation, in code-point order.
char_blacklist = [chr(code) for code in range(32, 127) if not chr(code).isalpha()]
# English stop words followed by every blacklisted character, so that
# single punctuation/digit tokens are rejected together with stop words.
stopwords = nltk.corpus.stopwords.words('english') + char_blacklist
language_whitelist = ['en']
# Lower-cased English word list; used to discard tokens that are not
# English words.
english_vocab = {word.lower() for word in nltk.corpus.words.words()}
# URL suffixes of domains that are treated as non-English.
blacklist_domain = [
    '.it', '.ru', '.cn', '.jp', '.tw', '.de',
    '.pl', '.fr', '.hu', '.bg', '.nl',
]

### Filter out all non-English-language domains

In [None]:
# Drop every URL whose text ends with one of the blacklisted suffixes.
# .copy() makes the filtered result an independent frame, so the column
# assignment below and the later per-cell writes with df.at do not trigger
# pandas' SettingWithCopyWarning.
df = df[~df['url'].str.endswith(tuple(blacklist_domain))].copy()
# Placeholder value; the scraping loop replaces it with a list of words
# for every page it manages to process.
df['tokenized_words'] = ''

In [None]:
# Fetch each URL, strip the markup, and store the distinct English,
# non-stop-word tokens found on the page in df['tokenized_words'].

# Sets give O(1) membership tests inside the loop (the originals are lists).
stopword_set = set(stopwords)
blacklisted_chars = set(char_blacklist)
total_rows = len(df)

counter = 0
for i, row in df.iterrows():
    counter += 1
    # Limiter: stop as soon as the counter reaches 50, i.e. only the
    # first 49 rows are fetched.
    if counter >= 50:
        break
    print("{}, {}/{}".format(row['url'], counter, total_rows))

    try:
        # The context manager closes the connection even if read() fails.
        with urlopen('http://' + row['url'], timeout=1) as response:
            html = response.read()
    except Exception:
        # Best effort: unreachable, slow or malformed URLs are skipped.
        # Unlike a bare ``except:``, this still lets KeyboardInterrupt
        # stop the loop.
        continue

    # Remove <script> and <style> elements so only visible text remains.
    soup = BeautifulSoup(html, "html.parser")
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()

    # Normalise whitespace: one non-empty, lower-cased chunk per line.
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk.lower() for chunk in chunks if chunk)

    # Keep only tokens that appear in the English vocabulary.
    filter_text = " ".join(
        w for w in nltk.word_tokenize(text) if w.lower() in english_vocab)

    tokens = nltk.word_tokenize(filter_text)

    allWordExceptStopDist = nltk.FreqDist(
        w.lower() for w in tokens
        if w not in stopword_set and len(w) >= 3 and w[0] not in blacklisted_chars)

    all_words = list(allWordExceptStopDist)

    # Leave the '' placeholder in place when the page yielded no usable
    # words.  (The original test was inverted and skipped every page that
    # DID yield words, so nothing useful was ever stored.)
    if not all_words:
        continue

    df.at[i, 'tokenized_words'] = all_words

In [None]:
# Keep only the rows whose '' placeholder was replaced with a word list.
has_tokens = df['tokenized_words'].ne('')
df = df[has_tokens]

In [None]:
# Print the wall-clock time once the scraping step has finished.
end_time = datetime.datetime.now().time()
print(end_time)

In [261]:
# Continue from the cleaned dataset stored on disk; its 'tokenized_words'
# cells are parsed with ast.literal_eval further down.
cleaned_file = 'cleaned_data.csv'
df = pd.read_csv(cleaned_file)
# NOTE(review): disabled filter that restricts the data to two categories
# -- presumably kept for experiments; confirm before removing.
# df = df[df['main_category'].isin(['Health', 'Food_and_Drink'])]

In [262]:
# For every category, collect the `top` most frequent words over all of
# its pages as a list of (word, count) pairs.
top = 50
words_frequency = {}

# Sets give O(1) membership tests; the originals are lists, so every word
# check used to scan the whole stop-word list.
stopword_set = set(stopwords)
blacklisted_chars = set(char_blacklist)

for category in set(df['main_category'].values):
    category_rows = df[df['main_category'] == category]['tokenized_words'].tolist()
    # Each cell is parsed with ast.literal_eval and iterated as a
    # sequence of words; the words of all rows are flattened into one list.
    all_words = [word for row in category_rows for word in ast.literal_eval(row)]
    allWordExceptStopDist = nltk.FreqDist(
        w.lower() for w in all_words
        if w not in stopword_set and len(w) >= 3 and w[0] not in blacklisted_chars)

    most_common = allWordExceptStopDist.most_common(top)
    words_frequency[category] = most_common

In [263]:
# Replace each category's (word, count) pairs with just the words,
# keeping their order.
for category in set(df['main_category'].values):
    pairs = words_frequency[category]
    words_frequency[category] = [word for word, _count in pairs]

In [264]:
# set(words_frequency['Reference']) & set(words_frequency['Adult']) & set(words_frequency['Arts_and_Entertainment'])\
#     & set(words_frequency['Autos_and_Vehicles']) & set(words_frequency['Beauty_and_Fitness'])\
#     & set(words_frequency['Books_and_Literature']) & set(words_frequency['Business_and_Industry'])\
#     & set(words_frequency['Career_and_Education']) & set(words_frequency['Computer_and_Electronics'])\
#     & set(words_frequency['Finance']) & set(words_frequency['Food_and_Drink'])\
#     & set(words_frequency['Gambling']) & set(words_frequency['Games'])\
#     & set(words_frequency['Health']) & set(words_frequency['Home_and_Garden'])\
#     & set(words_frequency['Internet_and_Telecom']) & set(words_frequency['Law_and_Government'])\
#     & set(words_frequency['News_and_Media']) & set(words_frequency['People_and_Society'])\
#     & set(words_frequency['Pets_and_Animals']) & set(words_frequency['Recreation_and_Hobbies'])\
#     & set(words_frequency['Science']) & set(words_frequency['Shopping'])\
#     & set(words_frequency['Sports']) & set(words_frequency['Travel'])

In [265]:
from collections import Counter
from sklearn.metrics import accuracy_score

In [266]:
# Build a binary feature matrix (one row per page, `top` columns) and a
# numeric label vector (one category index per page).
# NOTE(review): a page's columns are positions in the word list of its
# *own* category, so the features are derived from the label that is being
# predicted -- confirm this is intended.
n_rows = df.shape[0]
features = np.zeros((n_rows, top))
labels = np.zeros(n_rows)

# Computed once; the original rebuilt the set of categories and searched
# it linearly for every single row.
categories = list(set(df['main_category'].values))
category_index = {category: idx for idx, category in enumerate(categories)}

counter = 0
for i, row in df.iterrows():
    category = row['main_category']
    category_words = words_frequency[category]
    # Up to `top` most common words of this page.
    c = [word for word, word_count in Counter(ast.literal_eval(row['tokenized_words'])).most_common(top)]
    labels[counter] = category_index[category]
    for word in c:
        if word in category_words:
            # Mark the column at the word's position in the category list.
            features[counter][category_words.index(word)] = 1
    counter += 1

In [267]:
# Build a sparse COO-format matrix from the dense feature matrix.
from scipy.sparse import coo_matrix
X_sparse = coo_matrix(features)

In [268]:
from sklearn.utils import shuffle

# Shuffle the dense features, the sparse features and the labels together
# with a fixed seed.
shuffled = shuffle(features, X_sparse, labels, random_state=0)
X, X_sparse, y = shuffled

In [317]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the shuffled data for testing, with a fixed seed.
split = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split

In [318]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier, then print its predictions for the
# test set followed by its score on that set.
lr = LogisticRegression().fit(X_train, y_train)  # fit() returns the estimator
predictions = lr.predict(X_test)
score = lr.score(X_test, y_test)
print(predictions, score, sep='\n')

[  2.   2.  12. ...,   7.   7.  13.]
0.421686746988


In [319]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision-tree classifier, then print its predictions for the test
# set followed by its score on that set.
dtc = DecisionTreeClassifier().fit(X_train, y_train)  # fit() returns the estimator
predictions = dtc.predict(X_test)
score = dtc.score(X_test, y_test)
print(predictions, score, sep='\n')

[ 20.   3.   6. ...,   4.  19.  13.]
0.270080321285


In [321]:
from sklearn.svm import SVC

# Fit a support-vector classifier, then print its predictions for the test
# set followed by its score on that set.  As before, the name `svm` ends
# up bound to the fitted classifier.
svm = SVC().fit(X_train, y_train)  # fit() returns the estimator
predictions = svm.predict(X_test)
score = svm.score(X_test, y_test)
print(predictions, score, sep='\n')

[  2.   2.  12. ...,   7.   7.  13.]
0.397590361446


In [273]:
# Print how many rows each category has.
category_column = df['main_category']
for category in set(category_column.values):
    entries = (category_column == category).sum()
    print("Category: {} ; Entries: {}".format(category, entries))

Category: Career_and_Education ; Entries: 263
Category: Games ; Entries: 199
Category: Autos_and_Vehicles ; Entries: 565
Category: Books_and_Literature ; Entries: 573
Category: Arts_and_Entertainment ; Entries: 151
Category: Internet_and_Telecom ; Entries: 282
Category: People_and_Society ; Entries: 426
Category: Science ; Entries: 373
Category: Law_and_Government ; Entries: 587
Category: Recreation_and_Hobbies ; Entries: 143
Category: Pets_and_Animals ; Entries: 129
Category: Reference ; Entries: 578
Category: Food_and_Drink ; Entries: 757
Category: Finance ; Entries: 391
Category: Health ; Entries: 732
Category: Computer_and_Electronics ; Entries: 351
Category: Adult ; Entries: 54
Category: Travel ; Entries: 282
Category: Shopping ; Entries: 207
Category: Gambling ; Entries: 385
Category: Business_and_Industry ; Entries: 295
Category: Home_and_Garden ; Entries: 39
Category: Sports ; Entries: 510
Category: Beauty_and_Fitness ; Entries: 526
Category: News_and_Media ; Entries: 254


In [274]:
Health Food_and_Drink 

SyntaxError: invalid syntax (<ipython-input-274-2c61fd203262>, line 1)