In [1]:
import torch
import os
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

ModuleNotFoundError: No module named 'torch'

# Load data

Notice we call 'test' the validation set

In [2]:
from pathlib import Path

# get data PATH from current directory
PATH = os.path.join(Path(os.getcwd()).parent, 'data')


PATH_DATASET = os.path.join(PATH, 'datasets')

PATH_VOCAB = os.path.join(PATH, 'vocabularies')

PATH_MODEL = os.path.join(PATH, 'models')

PATH_STATS = os.path.join(PATH, 'stats')

In [3]:
# Load training and test set in pandas dataframe

train = pd.read_hdf(os.path.join(PATH_DATASET,'train_set.h5'))
test = pd.read_hdf(os.path.join(PATH_DATASET,'test_set.h5'))


train_rel = pd.read_hdf(os.path.join(PATH_DATASET,'train_set_rel.h5'))
train_unrel = pd.read_hdf(os.path.join(PATH_DATASET,'train_set_unrel.h5'))

# Load vocabulary

file = open(os.path.join(PATH_VOCAB,'vocab_test.pkl'), 'rb')
vocab = pickle.load(file)


# Load classes dictionary

file = open(os.path.join(PATH,'classes_dict.pkl'), 'rb')
classes_dict = pickle.load(file)

cat_dict = classes_dict['class_stoi']
cat_dict_inv = classes_dict['class_itos']
dict_count = classes_dict['class_count']

# Define my tokenizer

In this model we  will use a Bag of words approach using sk-learn CountVectorizer. We define our own tokenizer.

Originally we have our main vocabulary that represents a biyective function mapping one token with one number. Using that dictionary we have made a new dictionary that maps each group of words that are spelled similarly to the same integer.


for example: 


In [4]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


file = open(os.path.join(PATH_VOCAB,'vocab_test_red.pkl'), 'rb')
vocab_mix = pickle.load(file)

vectorizer = CountVectorizer(analyzer = 'word',
                             strip_accents='unicode',
                             min_df=2
                            )

tokenizer = vectorizer.build_analyzer()


counter = Counter(vocab_mix.values())

a = [x for x in counter if counter[x]>1]



vocab_transf = {}
for key in vocab_mix:
    vocab_transf[vocab_mix[key]] = 'r_{}'.format(key)
    
vocab_transf


def my_tokenizer(string):
    tokens = tokenizer(string)
    list_tokens = []
    for token in tokens:
        if token in vocab_mix:
            num = vocab_mix[token]
            #print(token, vocab_mix[token], counter[num])
            if counter[num]>1:
                list_tokens.append(vocab_transf[vocab_mix[token]])
            list_tokens.append(token)

    #print(list_tokens, string)
    return list_tokens

# Vectorize

Make BoW using my tokenizer. We should notice that we don't put explicitly a 'vocabulary' in the CountVectorizer object. However, the vocabulary is implicitly restricted by the tokenizer we use.

In [5]:
# Define the vectorizer to generate the BoW

vectorizer_tok = CountVectorizer(
                             analyzer = 'word',
                             strip_accents='unicode',
                             tokenizer=my_tokenizer
                            )


# Reset indexes of the training set before vectorizer
train = train.reset_index()

# Vectorize the training and test_set
vectors_train = vectorizer_tok.fit_transform(train.title)

vectors_test = vectorizer_tok.transform(test.title)


y_train = train.category.values

X_test = vectors_test
y_test = test.category.values

We save the vocabulary output of the vectorizer.

In [6]:
vocab = vectorizer_tok.vocabulary_

with open(os.path.join(PATH_VOCAB,'vocab_test_mytk.pkl'), 'wb') as f:
    pickle.dump(vocab, f)
    
vectors_train.shape

(19500000, 122547)

# Util functions

We define some util functions

- Functions to sample in a balanced way and priortizing reliable examples

## Sampling

In [7]:
# Split training set by categories
list_train_rel = list(train_rel.groupby('category'))
list_train_unrel = list(train_unrel.groupby('category'))

class_rel = train_rel.groupby('category').count().title.index
class_unrel = train_unrel.groupby('category').count().title.index


file = open(os.path.join(PATH,'classes_weights.pkl'), 'rb')
weights = pickle.load(file)


def numero(weight, num_examples, proporcion):
    
    if weight==0:
        return 0
        
    return int(min(max(1, round(num_examples*weight*proporcion)), num_examples-1))


def index_sample(list_train_rel, list_train_unrel, num_samples, proporcion):
    list_idx = []
    
    # Unreliable indexes
    
    for i in range(len(class_unrel)):
        # Convention index
        idx = cat_dict[class_unrel[i]]
        number = num_samples - numero(weights[idx,2], num_samples, proporcion)
        list_idx += list(np.random.choice(list_train_unrel[i][1].index, size=number))
    
    # Reliable indexes
    for i,category in enumerate(class_rel):
        
        idx = cat_dict[class_rel[i]]
        number = numero(weights[idx,2], num_samples, proporcion)
        list_idx += list(np.random.choice(list_train_rel[i][1].index, size=number))
    
    
    return list_idx

## Predictions in CPU

Functions to calculate bacc and accuracy using CPU.

We train the model using GPU, but we don't have enough memory to run the predictions in the validation set while training. So that, at the end of each epoch we set the model to the cpu, make predictions in the validation set and then set the model to gpu and continue training

In [8]:
import torch
import numpy as np
from scipy.sparse import coo_matrix
from numpy import array
from sklearn.metrics import balanced_accuracy_score, accuracy_score


def report(model, X_test, y_test):
    
    x_tensor = csr_to_tensor(X_test)
    model_cpu = model.to('cpu')
    outputs = model_cpu(x_tensor)
    _, predicted = torch.max(outputs.data, 1)
    predicted.cpu().numpy()
    y_label = []
    for val in y_test:
        y_label.append(cat_dict[val])
    y_label = np.array(y_label)
    
    return balanced_accuracy_score(y_label, predicted), accuracy_score(y_label, predicted)


def csr_to_tensor(X_sample):
    coo = coo_matrix(X_sample)

    values = coo.data
    indices = np.vstack((coo.row, coo.col))

    i = torch.LongTensor(indices)
    v = torch.FloatTensor(values)
    shape = coo.shape

    return torch.sparse.FloatTensor(i, v, torch.Size(shape))

def numerate(y):
    index = []
    for i in y:
        index.append(cat_dict[i])
    return np.array(index)


# Training

In [9]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import numpy as np
import pickle
from sklearn.utils import shuffle




input_size = vectors_train.shape[1]
num_classes = len(train.category.unique())
H = num_classes

H1 = round(1.6*num_classes)

vec_proporcion = [20]
vec_num_epochs = [100] 
vec_learning_rate = [0.0003]

total_step = 10

for learning_rate, num_epochs, prop in zip(vec_learning_rate, vec_num_epochs, vec_proporcion):

    ## Define model
    model = torch.nn.Sequential(
          torch.nn.Linear(input_size, H1),
          torch.nn.ReLU(),
          torch.nn.Dropout(p=0.75),
          torch.nn.Linear(H1, num_classes),
    ).to(device)
    
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()  
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    # Some training stats
    model_stats = {}
    model_stats['epoch'] = []
    model_stats['avg_loss'] = []
    model_stats['accuracy'] = []
    model_stats['bacc'] = []

    # Record maximum BACC value
    max_bacc = 0
    num_max = 0
    
    
    # Epochs aren't 'real epochs' in which we use all the training set. 
    # Instead each epochs is a collection of 1000 mini-batches generated using the sampling defined above
    # Each epoch only defines when and how many times we evaluate the model using the validation set.
    for epoch in range(num_epochs):
        
        model.train()
        
        list_loss = []
        
        for i in range(1000):
            
            # List of index of the minibatch using the function defined above
            list_idx = index_sample(list_train_rel, list_train_unrel, 10, prop)
            
            X = vectors_train[list_idx, :]
            y = y_train[list_idx]
            
            X_sample, y_sample = shuffle(X, y)

            x_tensor = csr_to_tensor(X_sample).to(device)

            labels = torch.from_numpy(numerate(y_sample)).long().to(device)

            outputs = model(x_tensor)
            #images.to('cpu')
            #torch.cuda.empty_cache()
            
            
            loss = criterion(outputs, labels)
            lo = loss.item()
            list_loss.append(lo)
            # Test
            #outputs.to('cpu')
            #torch.cuda.empty_cache()

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (i+1) % 100 == 0:
                print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                       .format(epoch+1, num_epochs, i+1, total_step, lo )) ##ss.item()))

        ## Statistics

        # Bacc and accuracy
        
        bacc, accuracy = report(model.eval(), X_test, y_test)
        print(bacc,accuracy)
        model.to(device)

        # Save in the model_stats dictionary
        model_stats['epoch'].append(epoch)
        model_stats['bacc'].append(bacc)
        model_stats['accuracy'].append(accuracy)
        model_stats['avg_loss'].append(np.array(list_loss).mean())
        
        # We upgrade the model each time bacc > max_bacc
        # Also,  we store the model of the last 5 upgrades
        
        if bacc > max_bacc:
            num_max += 1
            max_bacc = bacc
            torch.save(model, os.path.join(PATH_MODEL,'model_test_mytk_prop{}_lr{}_v{}.pt'.format(prop, learning_rate, num_max%7+1)))


        with open(os.path.join(PATH_STATS,'stats_test_mytk_prop{}_lr{}.pt'.format(prop, learning_rate)), 'wb') as f:
            pickle.dump(model_stats, f)          

0.022619249078842243 0.019652
0.08870892096696123 0.07691
0.16926413889865527 0.149348
0.2613926699196681 0.233256
0.3511625891578535 0.317432
0.42433325755941503 0.390224


RuntimeError: [enforce fail at CPUAllocator.cpp:56] posix_memalign(&data, gAlignment, nbytes) == 0. 12 vs 0
