In [1]:
import numpy as np
import pandas as pd

import requests
import csv
import time
from bs4 import BeautifulSoup

import torch
from transformers import BertTokenizer, BertModel

from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm

import ast

In [3]:
# Pieces of the path to the gzipped CSV that is loaded below.
folder = '/dlabdata1/lugeon/'
name = 'websites_alexa_10000_5cat_emb_bert'
ext = '.gz'

# header=0 consumes the file's own header row; the labels in `names` replace it.
csv_path = ''.join([folder, name, ext])
column_names = ['last_id', 'uid', 'emb', 'cat0']
data = pd.read_csv(csv_path, names=column_names, header=0)

In [4]:
# Keep only the rows whose `emb` value is present.
has_embedding = data['emb'].notnull()
data = data[has_embedding]

In [5]:
# Parse every stringified embedding into a NumPy array. Mapping over the single
# column applies the parser once per value; the previous row-wise
# DataFrame.apply(axis=1) built a full row Series for each record first.
data['emb'] = data['emb'].map(lambda text: np.array(ast.literal_eval(text)))

# Copy of each embedding with the component at position 227 removed.
# NOTE(review): 227 is an unexplained magic number, and the outputs displayed
# further down (head(), shape == (38533, 4)) show no `emb_red` column, so they
# appear to predate this line -- confirm the column is still needed.
data['emb_red'] = data['emb'].map(lambda vector: np.delete(vector, 227))

In [6]:
# Preview the first five rows of the frame.
data.head(n=5)

Unnamed: 0,last_id,uid,emb,cat0
0,0,36126,"[-0.5465862154960632, -0.4531528055667877, 0.5...",Arts
1,1,25238,"[-0.5333141684532166, -0.25329938530921936, 0....",Arts
2,2,20371,"[-0.5620501637458801, -0.5124451518058777, 0.8...",Arts
3,3,11312,"[-0.6283508539199829, -0.11153016984462738, 0....",Arts
4,4,16434,"[-0.062120892107486725, -0.3814326822757721, 0...",Arts


In [7]:
# Display the frame's dimensions as (rows, columns).
data.shape

(38533, 4)

In [8]:
# Number of rows for each `cat0` label.
data['cat0'].value_counts()

Arts         7835
Computers    7820
Science      7642
Health       7630
Sports       7606
Name: cat0, dtype: int64

In [9]:
def categorize(s):
    """Translate a category label into its integer class index.

    The index of a label is its position in the ordering
    Health, Science, Arts, Computers, Sports (0 through 4).

    Args:
        s: the category label to encode.

    Returns:
        The integer index of the label, or None when `s` matches none of
        the five known labels.
    """
    known_labels = ('Health', 'Science', 'Arts', 'Computers', 'Sports')
    for index, label in enumerate(known_labels):
        if s == label:
            return index
    return None

In [10]:
# Encode the `cat0` label of every row as its integer class index.
# Mapping over the one column calls categorize once per label; the previous
# row-wise DataFrame.apply(axis=1) built a row Series per record and does not
# yield a plain column when the frame is empty.
data['cat_no'] = data['cat0'].map(categorize)

In [79]:
# Join the per-row embedding arrays end to end along their first axis.
embedding_arrays = data['emb'].values
embeddings = np.concatenate(embedding_arrays)

In [80]:
# Number of components in one embedding vector.
embedding_dim = 768

# Fold the concatenated values into rows of `embedding_dim` columns and cast
# to float32.
# NOTE: `input` shadows the Python builtin of the same name; it is kept
# because the normalization cell below reads this variable.
input = torch.tensor(embeddings).reshape(-1, embedding_dim).float()

In [81]:
from sklearn.preprocessing import normalize

# axis=0 rescales each feature column (not each sample row) to unit norm.
# NOTE(review): the norms are taken over every row, i.e. before the
# train/test split performed in a later cell -- confirm this is intended.
column_scaled = normalize(input, axis=0)
input_norm = torch.FloatTensor(column_scaled)

In [82]:
# Sanity-check the normalized feature matrix. This cell previously read
# `emb_norm`, a name that is never assigned in the notebook (NameError on a
# fresh kernel); the tensor built by the normalization cell is `input_norm`.
input_norm.shape

torch.Size([38533, 768])

In [83]:
# Integer class indices of all rows, as an int64 tensor.
cat_no = data['cat_no'].values
target = torch.tensor(cat_no).to(torch.int64)

In [84]:
# Display the size of the label tensor.
target.shape

torch.Size([38533])

In [85]:
# Random train/test split over row positions.
SPLIT_SEED = 0     # fixed so that the same split is produced on every run
N_TRAIN = 32_000   # rows used for training; all remaining rows are held out

# Shuffle the row positions with a private, seeded generator so the split is
# reproducible and the global NumPy random state is left untouched.
# The row count comes from `input_norm`: the `emb_norm` name used here before
# is never assigned in the notebook and raised NameError on a fresh kernel.
# NOTE: `id` shadows the Python builtin; the name is kept for later cells.
id = np.arange(input_norm.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(id)

tr_id = id[:N_TRAIN]
te_id = id[N_TRAIN:]

train_input_ = input_norm[tr_id]
test_input_ = input_norm[te_id]

train_target_ = target[tr_id]
test_target_ = target[te_id]

In [86]:
# Class balance of the training split: rows per `cat0` label.
data['cat0'].iloc[tr_id].value_counts()

Arts         6537
Computers    6477
Science      6367
Health       6323
Sports       6296
Name: cat0, dtype: int64

In [87]:
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
import time


In [88]:
class Classifier(nn.Module):
    """Fully connected classifier with three hidden layers.

    Each hidden linear layer is followed by dropout and then ReLU; the last
    linear layer produces one raw score (logit) per class, with no softmax.

    Args:
        input_dim: number of features of each input vector (default 768).
        hidden_dim: width of each of the three hidden layers (default 200).
        n_classes: number of output scores (default 5).
        dropout: drop probability shared by all dropout applications
            (default 0.6).

    The defaults reproduce the previously hard-coded architecture, so
    `Classifier()` builds the same network as before.
    """

    def __init__(self, input_dim=768, hidden_dim=200, n_classes=5, dropout=0.6):
        super().__init__()
        # Attribute names are unchanged so existing state dicts still load.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, n_classes)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Return the class logits for a batch `x` of input vectors."""
        # Same operation order as before: linear -> dropout -> ReLU, three times.
        x = F.relu(self.drop(self.fc1(x)))
        x = F.relu(self.drop(self.fc2(x)))
        x = F.relu(self.drop(self.fc3(x)))
        return self.fc4(x)

In [89]:
def accuracy(output, target):
    """Fraction of samples whose highest-scoring class equals the target.

    Args:
        output: tensor of per-class scores, one row per sample.
        target: tensor holding the expected class index of each sample.

    Returns:
        Number of correct predictions divided by the number of rows
        of `output`.
    """
    # The predicted class is the column holding the largest score.
    predicted = torch.argmax(output, 1)
    matches = predicted == target
    return matches.sum().item() / output.shape[0]


In [None]:
epochs = 200
batch_size = 64

model = Classifier()

# Loss: cross-entropy computed on the raw scores returned by the model
criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.Adam(model.parameters(), 1e-4)

# Multiply the learning rate by 0.1 once half of the epochs have elapsed.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=epochs // 2, gamma=0.1)

# Training the model
model.train(True)

for e in range(epochs):

    # The loop variables are deliberately NOT called `input` / `target`: those
    # names hold the full notebook-level tensors, and rebinding them here left
    # the notebook state corrupted after training (re-running the split cell
    # would then slice a single mini-batch).
    batches = zip(train_input_.split(batch_size), train_target_.split(batch_size))
    for batch_input, batch_target in batches:
        output = model(batch_input)
        loss = criterion(output, batch_target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Advance the schedule once per epoch, after the optimizer updates.
    # Without this call the StepLR above never changed the learning rate.
    scheduler.step()

    # Evaluate with dropout disabled and without recording an autograd graph
    # over the full train and test sets.
    model.train(False)
    with torch.no_grad():
        tr_output = model(train_input_)
        te_output = model(test_input_)
        tr_loss = criterion(tr_output, train_target_).item()
    tr_acc = accuracy(tr_output, train_target_)
    te_acc = accuracy(te_output, test_target_)
    model.train(True)

    print("Epoch {}".format(e) +
          " | Train loss : {:.3f}".format(tr_loss) +
          " | Train accuracy : {:.3f}".format(tr_acc) +
          " | Test accuracy : {:.3f}".format(te_acc))

Epoch 0 | Train loss : 1.604 | Train accuracy : 0.213 | Test accuracy : 0.208
Epoch 1 | Train loss : 1.337 | Train accuracy : 0.398 | Test accuracy : 0.399
Epoch 2 | Train loss : 1.269 | Train accuracy : 0.447 | Test accuracy : 0.446
Epoch 3 | Train loss : 1.215 | Train accuracy : 0.495 | Test accuracy : 0.496
Epoch 4 | Train loss : 1.136 | Train accuracy : 0.555 | Test accuracy : 0.561
Epoch 5 | Train loss : 1.051 | Train accuracy : 0.594 | Test accuracy : 0.600
Epoch 6 | Train loss : 1.000 | Train accuracy : 0.614 | Test accuracy : 0.616
Epoch 7 | Train loss : 0.961 | Train accuracy : 0.636 | Test accuracy : 0.637
Epoch 8 | Train loss : 0.930 | Train accuracy : 0.652 | Test accuracy : 0.649
Epoch 9 | Train loss : 0.905 | Train accuracy : 0.667 | Test accuracy : 0.663
Epoch 10 | Train loss : 0.880 | Train accuracy : 0.681 | Test accuracy : 0.678
Epoch 11 | Train loss : 0.856 | Train accuracy : 0.697 | Test accuracy : 0.689
Epoch 12 | Train loss : 0.835 | Train accuracy : 0.707 | Test 