In [110]:
import pandas as pd
import numpy as np
import requests
import json
import re

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt

from sklearn.manifold import TSNE

In [111]:
df = pd.read_csv('/dlabdata1/lugeon/datasets/websites_dmoz_multi.gz', compression='gzip')[['url', 'lang', 'lang_id', 'cat0']]

In [112]:
df_en = df[df.lang_id == 'en']
df_en.shape

(572098, 4)

In [113]:
df_en.reset_index(drop=True, inplace=True)

In [114]:
df_en.cat0.value_counts()

Business      148144
Society        82079
Arts           66721
Shopping       54062
Recreation     46095
Computers      45194
Sports         34890
Science        28138
Health         24218
Reference      21663
Games          10246
Home            6952
News            3696
Name: cat0, dtype: int64

In [115]:
def clean_url(url):
    url = re.sub(r"www.|http://|https://|-|_", '', url)
    return url.split('.')[0]

In [116]:
df_en['clean_url'] = df_en.url.apply(clean_url)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_en['clean_url'] = df_en.url.apply(clean_url)


In [117]:
df_en.head()

Unnamed: 0,url,lang,lang_id,cat0,clean_url
0,www.232analyzer.com,English,en,Computers,232analyzer
1,www.cs-interiors.co.uk,English,en,Business,csinteriors
2,ccc-stl.org,English,en,Society,cccstl
3,www.utahwebdesign.com,English,en,Computers,utahwebdesign
4,www.hampsteadstage.org,English,en,Arts,hampsteadstage


In [118]:
def compute_ngram(s, n):
    return [s[i:i+n] for i in range(0, len(s)-n+1)]

def concat_ngrams(s, ns):
    l = []
    for n in ns:
        l += compute_ngram(s, n)
    return ' '.join(l)

In [119]:
ns = range(3, 6) # from 3 to 5
df_en['ngrams'] = df_en.apply(lambda row: concat_ngrams(row.clean_url, ns), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_en['ngrams'] = df_en.apply(lambda row: concat_ngrams(row.clean_url, ns), axis=1)


In [120]:
df_en.head()

Unnamed: 0,url,lang,lang_id,cat0,clean_url,ngrams
0,www.232analyzer.com,English,en,Computers,232analyzer,232 32a 2an ana nal aly lyz yze zer 232a 32an ...
1,www.cs-interiors.co.uk,English,en,Business,csinteriors,csi sin int nte ter eri rio ior ors csin sint ...
2,ccc-stl.org,English,en,Society,cccstl,ccc ccs cst stl cccs ccst cstl cccst ccstl
3,www.utahwebdesign.com,English,en,Computers,utahwebdesign,uta tah ahw hwe web ebd bde des esi sig ign ut...
4,www.hampsteadstage.org,English,en,Arts,hampsteadstage,ham amp mps pst ste tea ead ads dst sta tag ag...


In [121]:
r = np.random.rand(df_en.shape[0])
mask = r < 0.9
train = df_en[mask]
test = df_en[~mask]

In [155]:
from sklearn.feature_extraction.text import TfidfVectorizer

n_features = 20_000

vectorizer = TfidfVectorizer(max_features=n_features)
x_train = vectorizer.fit_transform(train.ngrams.values)
x_test = vectorizer.transform(test.ngrams.values)

In [156]:
from sklearn import preprocessing

encoder = preprocessing.LabelEncoder()
encoder.fit(df_en.cat0.unique())

y_train = encoder.transform(train.cat0.values)
y_test = encoder.transform(test.cat0.values)

In [157]:
class_counts = pd.Series(y_train).value_counts().sort_index()
class_counts

0      59903
1     132866
2      40635
3       9239
4      21735
5       6265
6       3300
7      41460
8      19440
9      25270
10     48535
11     73955
12     31277
dtype: int64

In [158]:
weights = class_counts.values / class_counts.values.sum()
weights = 1 / weights
weights = weights / weights.sum()
weights = weights * 13
weights

array([0.25623338, 0.11552352, 0.37773221, 1.66134304, 0.706195  ,
       2.44998378, 4.65125709, 0.37021583, 0.78956525, 0.60740595,
       0.31624907, 0.20754714, 0.49074874])

In [159]:
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim

In [160]:
class Embnet(nn.Module):
    def __init__(self):
        super(Embnet, self).__init__()

        self.fc1 = torch.nn.Linear(n_features, 768)
        self.fc2 = torch.nn.Linear(768, 768)
        self.fc2 = torch.nn.Linear(768, 768)
        self.fc3 = torch.nn.Linear(768, 13)

    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(F.relu(x))
        x = self.fc3(F.relu(x))
        return x
    

In [161]:
def accuracy(output, target):
    nb_samples = output.shape[0]
    
    # Convert probability to decision
    output_class = torch.argmax(output, 1)
    
    nb_correct = (output_class == target).sum().item()
    return nb_correct / nb_samples

In [162]:
train_input = torch.FloatTensor(x_train.toarray())
train_target = torch.LongTensor(y_train)
test_input = torch.FloatTensor(x_test.toarray())
test_target = torch.LongTensor(y_test)

In [163]:
train_input.shape, train_target.shape

(torch.Size([513880, 20000]), torch.Size([513880]))

In [None]:
torch.set_num_threads(12)

epochs = 70
batch_size = 256

model = Embnet()
nb_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Number of parameters: {}".format(nb_trainable_params))
# Loss
criterion = nn.CrossEntropyLoss(weight=torch.FloatTensor(weights))
#criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.Adam(model.parameters(), 1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, eps=1e-8, patience=3, verbose=True)

#model.train(True)

for e in range(epochs):
    
    for input, target in zip(train_input.split(batch_size), train_target.split(batch_size)):
                             
        output = model(input)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    with torch.no_grad():
        #model.train(False)
        train_output = model(train_input)
        test_output = model(test_input)
        train_loss = criterion(train_output, train_target)
        train_acc = accuracy(train_output, train_target)
        test_loss = criterion(test_output, test_target)
        test_acc = accuracy(test_output, test_target)
        print("Epoch {}".format(e) +\
              " | Train loss : {:.3f}".format(train_loss) +\
              " | Test loss : {:.3f}".format(test_loss) +\
              " | Train accuracy : {:.3f}".format(train_acc) +\
              " | Test accuracy : {:.3f}".format(test_acc))
        
    scheduler.step(test_acc)

Number of parameters: 15961357
Epoch 0 | Train loss : 1.651 | Test loss : 1.789 | Train accuracy : 0.403 | Test accuracy : 0.374
Epoch 1 | Train loss : 1.504 | Test loss : 1.767 | Train accuracy : 0.428 | Test accuracy : 0.387
Epoch 2 | Train loss : 1.416 | Test loss : 1.791 | Train accuracy : 0.443 | Test accuracy : 0.390
Epoch 3 | Train loss : 1.352 | Test loss : 1.834 | Train accuracy : 0.454 | Test accuracy : 0.393
Epoch 4 | Train loss : 1.301 | Test loss : 1.887 | Train accuracy : 0.463 | Test accuracy : 0.394
Epoch 5 | Train loss : 1.257 | Test loss : 1.948 | Train accuracy : 0.472 | Test accuracy : 0.396
Epoch 6 | Train loss : 1.217 | Test loss : 2.015 | Train accuracy : 0.481 | Test accuracy : 0.397
Epoch 7 | Train loss : 1.179 | Test loss : 2.091 | Train accuracy : 0.490 | Test accuracy : 0.398
Epoch 8 | Train loss : 1.140 | Test loss : 2.176 | Train accuracy : 0.500 | Test accuracy : 0.399
Epoch 9 | Train loss : 1.101 | Test loss : 2.273 | Train accuracy : 0.510 | Test accura