In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim

import ast

In [291]:
from sklearn.metrics import precision_recall_fscore_support

In [2]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
print(device)

cuda:1


In [200]:
torch.set_num_threads(12) # only with CPU

In [17]:
train_data = pd.read_csv('/dlabdata1/lugeon/dmozfinalset/dmoz_en_full_train_finalemb.gz', index_col=0)

In [18]:
test_data = pd.read_csv('/dlabdata1/lugeon/dmozfinalset/dmoz_en_full_valid_finalemb.gz', index_col=0)

In [124]:
train_data.head(5)

Unnamed: 0,url,embedding,error,cat0
0,sunrise.mobileread.com,"[0.04443914070725441, 0.006837127730250359, 0....",ucdk:oo__,Computers
1,firefalcon.blogspot.com,"[-0.0012733626645058393, 0.08161848783493042, ...",ucdk:oo__,Computers
2,www.sarcophagus.co.uk,"[0.09634807705879211, -0.18308357894420624, 0....",ucdk:oooo,Computers
3,www.mjmcom.co.uk,"[-0.0032559819519519806, 0.009568585082888603,...",ucdk:ooo_,Computers
4,joyofandroid.com,,invalid website,Computers


In [19]:
train_data.error.value_counts()

ucdk:oo__          31186
invalid website    21316
ucdk:ooo_          21243
ucdk:oooo          18085
ucdk:oo_o           1486
ucdk:o___           1137
ucdk:o_o_            173
ucdk:o_oo             17
ucdk:o__o              4
ucdk:____              1
Name: error, dtype: int64

In [20]:
test_data.error.value_counts()

ucdk:oo__          8684
invalid website    5868
ucdk:ooo_          5678
ucdk:oooo          4905
ucdk:oo_o           526
ucdk:o___           292
ucdk:o_o_            43
ucdk:o__o             2
ucdk:o_oo             2
Name: error, dtype: int64

In [196]:
def is_valid(error):
    return not(error in ['invalid website', 'ucdk:____', 'ucdk:o___', 'ucdk:o_oo', 'ucdk:o_o_', 'ucdk:o__o'])

In [197]:
train_valid = train_data[train_data.error.apply(is_valid)]
test_valid = test_data[test_data.error.apply(is_valid)]

In [198]:
print("There is {:.2f}% of valid embeddings".format(train_valid.shape[0] / train_data.shape[0]))

There is 0.76% of valid embeddings


In [202]:
train_valid.shape[0], test_valid.shape[0]

(72000, 19793)

In [199]:
train_valid.cat0.value_counts()

Science       6240
Computers     6204
Games         6174
Recreation    6172
Shopping      6093
Sports        6037
Business      6035
Arts          6035
Society       6033
Reference     6025
Health        5917
Home          3714
News          1321
Name: cat0, dtype: int64

In [200]:
test_valid.cat0.value_counts()

Games         1552
Science       1549
Computers     1547
Recreation    1540
News          1536
Business      1530
Home          1530
Arts          1529
Reference     1521
Shopping      1518
Society       1505
Sports        1471
Health        1465
Name: cat0, dtype: int64

In [201]:
train_valid['embedding'] = train_valid.apply(lambda row: np.array(ast.literal_eval(row.embedding)), axis=1)
test_valid['embedding'] = test_valid.apply(lambda row: np.array(ast.literal_eval(row.embedding)), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_valid['embedding'] = train_valid.apply(lambda row: np.array(ast.literal_eval(row.embedding)), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_valid['embedding'] = test_valid.apply(lambda row: np.array(ast.literal_eval(row.embedding)), axis=1)


In [357]:
_test_inputs = np.concatenate(test_valid.embedding.to_numpy()).reshape(test_valid.shape[0], -1)

In [358]:
_train_inputs = np.concatenate(train_valid.embedding.to_numpy()).reshape(train_valid.shape[0], -1)

In [360]:
categories = np.sort(train_valid.cat0.unique()).tolist()

In [361]:
# these are numpy series
_train_targets = train_valid.cat0.apply(categories.index)
_test_targets = test_valid.cat0.apply(categories.index)

In [362]:
np.random.seed(42)
idx = np.arange(_train_inputs.shape[0])
np.random.shuffle(idx)

_train_inputs = _train_inputs[idx]
_train_targets = _train_targets.iloc[idx]

In [363]:
_train_targets

64704     7
19928    11
49236     8
64142    10
80937    12
         ..
48986     8
8094      1
72170    12
1104      2
20703    11
Name: cat0, Length: 72000, dtype: int64

In [364]:
train_inputs = torch.FloatTensor(_train_inputs).to(device)
test_inputs = torch.FloatTensor(_test_inputs).to(device)
train_targets = torch.LongTensor(_train_targets.values).to(device)
test_targets = torch.LongTensor(_test_targets.values).to(device)

In [365]:
train_inputs.shape, train_targets.shape, test_inputs.shape, test_targets.shape

(torch.Size([72000, 3072]),
 torch.Size([72000]),
 torch.Size([19793, 3072]),
 torch.Size([19793]))

In [366]:
class Webnet(nn.Module):
    def __init__(self, features_dim, out_dim, internal_dim):
        super(Webnet, self).__init__()
        
        self.fc1 = torch.nn.Linear(features_dim, internal_dim)
        self.fc2 = torch.nn.Linear(internal_dim, internal_dim)
        self.fc3 = torch.nn.Linear(internal_dim, internal_dim)
        self.fc4 = torch.nn.Linear(internal_dim, out_dim)
        
        self.drop = torch.nn.Dropout(0.5)

    def forward(self, x):
        x = self.fc1(x)
        x = self.drop(x)
        x = self.fc2(F.relu(x))
        x = self.drop(x)
        x = self.fc3(F.relu(x))
        x = self.drop(x)
        x = self.fc4(F.relu(x))
        return x

In [367]:
class DME(nn.Module):
    
    def __init__(self, latent_dim, embeddings_dim):
        """
        the dimensions of the prior embeddings must be passed has a list
        """
        super(DME, self).__init__()
        
        self.n_embeddings = len(embeddings_dim) # number of prior embeddings
        self.transforms = nn.ModuleList([torch.nn.Linear(dim, latent_dim).to(device) for dim in embeddings_dim]) # linear transformation into the latent space
        self.attention = torch.nn.Linear(latent_dim, 1) # linear transformation defining the attention
        self.softmax = nn.Softmax(dim=1) # to normalize the attention 
        
    def forward(self, embeddings):
        """
        The embeddings are given as tuples of the form [samples x embedding_dim]
        """
        
        # first we transform each prior embedding into the latent space
        latent_embeddings = [transform(x) for transform, x in zip(self.transforms, embeddings)] # this is a list
        
        # then we apply the attention transformation to each of the latent embeddings
        att = torch.cat([self.attention(x) for x in latent_embeddings], dim=1) # this is a tensor
        
        # we normalize the attention and reshape for multiplication
        att_coeffs = self.softmax(att).reshape(-1, 1, self.n_embeddings)
        
        # we reshape for multiplication
        latent_embeddings_t = torch.cat(latent_embeddings, dim=1).reshape(-1, self.n_embeddings, latent_dim)
        
        # the reshape is so that we multiply matrices of the form 
        # [#samples x 1 x #embeddings] @ [#samples x #embeddings x latent_dim]
        
        aggregation = att_coeffs.matmul(latent_embeddings_t)
        
        
        return aggregation.squeeze()

In [368]:
class DynamicClassifier(nn.Module):
    
    def __init__(self, latent_dim, out_dim, internal_dim, embeddings_dim):
        super(DynamicClassifier, self).__init__()
        
        self.dme = DME(latent_dim, embeddings_dim)
        self.classifier = Webnet(latent_dim, out_dim, internal_dim)
        self.embeddings_dim = embeddings_dim
        
    def forward(self, x):
        
        splitted_embeddings = x.split(self.embeddings_dim, dim=1)
        latent_embedding = self.dme(splitted_embeddings)
        output = self.classifier(latent_embedding)
        
        return output


In [369]:
counts = _train_targets.value_counts().sort_index().values

weights = torch.Tensor([1 / (x / sum(counts)) for x in counts])
weights = weights / weights.sum() * counts.shape[0]
weights

tensor([0.7602, 0.7602, 0.7395, 0.7431, 0.7754, 1.2353, 3.4730, 0.7433, 0.7615,
        0.7352, 0.7530, 0.7605, 0.7600])

In [370]:
feature_dim = 768
latent_dim = 500
internal_dim = 300
out_dim = len(categories)

splits = [768, 768, 768, 768]


nb_epochs = 100
batch_size = 64




#model = DynamicClassifier(latent_dim, out_dim, internal_dim, splits, binary).to(device)
model = Webnet(4 * feature_dim, out_dim, internal_dim).to(device)
        
nb_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Number of parameters: {}".format(nb_trainable_params))
 
criterion = nn.CrossEntropyLoss(weight=weights.to(device))

optimizer = optim.Adam(model.parameters(), 1e-3)
#scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=int(nb_epochs / 2), gamma=0.1)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=10, verbose=True)

train_acc_hist = []
train_loss_hist = []
test_acc_hist = []
test_loss_hist = []


for e in range(nb_epochs):
    
    model.train()
    
    for input, target in zip(train_inputs.split(batch_size), train_targets.split(batch_size)):
                  
        output = model(input)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    with torch.no_grad():
        
        model.eval()
        
        train_outputs = model(train_inputs)
        test_outputs = model(test_inputs)
        
        train_loss = criterion(train_outputs, train_targets)
        test_loss = criterion(test_outputs, test_targets)

        _, train_preds = torch.max(train_outputs, 1)
        _, test_preds = torch.max(test_outputs, 1)
        
        train_acc = torch.sum(train_preds == train_targets.data).item() / train_targets.shape[0]
        test_acc = torch.sum(test_preds == test_targets.data).item() / test_targets.shape[0]
         
        print("Epoch {}".format(e) +\
              " | Train loss : {:.3f}".format(train_loss) +\
              " | Test loss : {:.3f}".format(test_loss) +\
              " | Train accuracy : {:.3f}".format(train_acc) +\
              " | Test accuracy : {:.3f}".format(test_acc))
        
        
    scheduler.step(test_acc)

            

Number of parameters: 1106413
Epoch 0 | Train loss : 1.362 | Test loss : 1.259 | Train accuracy : 0.576 | Test accuracy : 0.577
Epoch 1 | Train loss : 1.263 | Test loss : 1.234 | Train accuracy : 0.609 | Test accuracy : 0.599
Epoch 2 | Train loss : 1.222 | Test loss : 1.216 | Train accuracy : 0.625 | Test accuracy : 0.608
Epoch 3 | Train loss : 1.171 | Test loss : 1.159 | Train accuracy : 0.637 | Test accuracy : 0.618
Epoch 4 | Train loss : 1.139 | Test loss : 1.187 | Train accuracy : 0.644 | Test accuracy : 0.613
Epoch 5 | Train loss : 1.114 | Test loss : 1.222 | Train accuracy : 0.655 | Test accuracy : 0.623
Epoch 6 | Train loss : 1.101 | Test loss : 1.198 | Train accuracy : 0.660 | Test accuracy : 0.626
Epoch 7 | Train loss : 1.067 | Test loss : 1.172 | Train accuracy : 0.670 | Test accuracy : 0.630


KeyboardInterrupt: 