In [0]:
import datetime
import numpy as np
import os
import pandas as pd
import random

from collections import Counter
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import (DataLoader, RandomSampler, WeightedRandomSampler, SequentialSampler, TensorDataset)

!pip install snorkel
import snorkel
from snorkel.classification import cross_entropy_with_probs

# Storage switch: True reads/writes on a mounted Google Drive (Colab only),
# False uses folders relative to the current working directory.
USE_DRIVE = True

if USE_DRIVE:
    # google.colab is only imported when Drive storage is requested
    from google.colab import drive, files
    drive.mount('/content/drive')
    DATA_FOLDER = '/content/drive/My Drive/Colab Notebooks/data/blogger_age_range/'
    OUT_FOLDER = '/content/drive/My Drive/Colab Notebooks/out/blogger_age_range/'
else:
    DATA_FOLDER = 'data/'
    OUT_FOLDER = 'out/'

# sub-folders of OUT_FOLDER (kept with a trailing slash: they are concatenated into paths)
SAVED_MODELS_FOLDER = 'models/'
SUBMISSIONS_FOLDER = 'submissions/'

# Create the output folders (and any missing parents). Pure Python instead of a
# `!mkdir -p` shell escape, so the cell does not depend on shell quoting of paths
# containing spaces and also runs outside IPython.
os.makedirs(OUT_FOLDER + SAVED_MODELS_FOLDER, exist_ok=True)
os.makedirs(OUT_FOLDER + SUBMISSIONS_FOLDER, exist_ok=True)

# run on the GPU when one is visible to torch, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [0]:
# Column names that load_data assigns to the two CSV columns:
# BLOG is the blog text column, CLASS is the label column.
BLOG = 'blog'
CLASS = 'class'

In [0]:
def load_data(filename, folder='./data/', has_labels=False):
    """
    Reads a two-column CSV file and returns its content as numpy arrays.
    The columns are named positionally (BLOG, then CLASS) through the `names` argument of `pd.read_csv`.
    :param filename: Name of the CSV file to read
    :param folder: Folder containing the file; a trailing separator is optional
    :param has_labels: Whether the CLASS column should be returned along with the BLOG column
    :return: The BLOG column as a numpy array, or a tuple (BLOG column, CLASS column) when has_labels is True
    """
    # os.path.join inserts the separator only when it is missing, so both
    # 'data' and 'data/' work (plain concatenation silently produced 'datafile.csv')
    path = os.path.join(folder, filename)
    dataset = pd.read_csv(path, names=[BLOG, CLASS])

    blogs = dataset[BLOG].to_numpy()
    if has_labels:
        return blogs, dataset[CLASS].to_numpy()
    return blogs

In [0]:
def split_data(data, labels, valid_prop, test_prop=0, seed=1234, dataset_fraction=None):
    """
    Function that takes a dataset and splits it, class by class (stratified), into a training set, a validation set
    and optionally a test set
    :param data: Numpy array with the datapoints of the complete dataset
    :param labels: Numpy array with the label of each datapoint, aligned with `data`
    :param valid_prop: What proportion (expressed as a value from 0 to 1) of each class should be used for the
    validation set
    :param test_prop: What proportion (expressed as a value from 0 to 1) of each class should be used for the test set
    :param seed: Seed given to numpy's global random generator before shuffling
    :param dataset_fraction: Fraction (between 0 and 1) of each class to keep before splitting; when None, the
    notebook-level DATASET_FRACTION is used
    :return: A tuple containing, in that order:
        * The datapoints of the training set
        * The label of each datapoint in the training set
        * The datapoints of the validation set
        * The label of each datapoint in the validation set
        * The datapoints of the test set (only when test_prop is not 0)
        * The label of each datapoint in the test set (only when test_prop is not 0)
        * One sampling weight per training datapoint, in the same order as the training set
    """

    # fall back on the notebook-level setting when no fraction is given
    if dataset_fraction is None:
        dataset_fraction = DATASET_FRACTION

    # we set the seed for reproducibility (the `seed` argument, not a global)
    np.random.seed(seed)

    # shuffle data according to seed
    idx = np.arange(data.shape[0])
    np.random.shuffle(idx)
    data = data[idx]
    labels = labels[idx]

    # retrieve classes
    classes = sorted(set(labels))

    # will be needed to select indices for train/valid/test splits
    idx = np.arange(data.shape[0])
    train_idx, valid_idx, test_idx = [], [], []

    # get split indices for every class
    for i in classes:
        class_idx = idx[labels == i]
        # keep only the first of 1/fraction (nearly) equal chunks of this class
        class_idx = np.array_split(class_idx, int(1 / dataset_fraction))[0]

        n = len(class_idx)
        valid_split_idx = int(valid_prop * n)
        test_split_idx = int((1 - test_prop) * n)

        # layout inside a class: [ validation | training | test ]
        train_idx.append(class_idx[valid_split_idx:test_split_idx])
        valid_idx.append(class_idx[:valid_split_idx])
        test_idx.append(class_idx[test_split_idx:])

    # per-class weight, inversely proportional to the class size in the training set
    train_class_counts = np.asarray(list(map(len, train_idx)))
    train_class_weights = 1 / len(classes) / train_class_counts

    # concatenate indices of all classes
    train_idx = np.concatenate([arr for arr in train_idx])
    valid_idx = np.concatenate([arr for arr in valid_idx])
    test_idx = np.concatenate([arr for arr in test_idx])

    # one weight per training example, repeated class after class
    train_examples_weights = train_class_weights / len(train_idx)
    train_idx_weights = [train_examples_weights[i]
                            for i, count in enumerate(train_class_counts)
                                for _ in range(count)]

    # shuffle the validation and test sets; the training set stays grouped by class
    # so that train_idx_weights keeps lining up with the training examples
    np.random.shuffle(valid_idx)
    np.random.shuffle(test_idx)
    train_data, train_labels = data[train_idx], labels[train_idx]
    valid_data, valid_labels = data[valid_idx], labels[valid_idx]
    test_data, test_labels = data[test_idx], labels[test_idx]

    # value comparison: `is 0` was an identity test that misses 0.0
    if test_prop == 0:
        return train_data, train_labels, valid_data, valid_labels, train_idx_weights
    else:
        return train_data, train_labels, valid_data, valid_labels, test_data, test_labels, train_idx_weights

In [0]:
def prepare_data(tokenizer, raw_data, labels=None, train_idx_weights=None):
    """
    Converts raw texts into fixed-length sequences of token ids and wraps them in a DataLoader.
    Each text is split on whitespace, truncated to MAX_LENGTH words and right-padded with id 0.
    :param tokenizer: Object exposing `convert_words_to_ids(words)` returning a list of ids
    :param raw_data: Iterable of strings, one per example
    :param labels: Optional labels aligned with `raw_data`; when given they are added to the dataset
    :param train_idx_weights: Optional sampling weights, one per example; only used when `labels` is given
    :return: A DataLoader with batches of BATCH_SIZE examples:
        * no labels: batches of (inputs,) in the original order
        * labels and weights: batches of (inputs, labels) drawn with replacement by a WeightedRandomSampler
        * labels without weights: batches of (inputs, labels), uniformly shuffled
    :raises ValueError: if the number of weights differs from the number of examples
    """

    all_inputs = []
    for text in tqdm(raw_data):

        # split input into words (tokens) and truncate to the max length
        words = text.split()[:MAX_LENGTH]

        # pad on the right with id 0 up to the max length
        padding = [0] * (MAX_LENGTH - len(words))
        all_inputs.append(tokenizer.convert_words_to_ids(words) + padding)

    # convert inputs to a torch tensor
    all_inputs = torch.tensor(all_inputs)

    # unlabeled data: sequential dataloader, no label tensor needed
    if labels is None:
        dataset = TensorDataset(all_inputs)
        return DataLoader(dataset, batch_size=BATCH_SIZE)

    # going through a single numpy array avoids the slow list-of-arrays conversion;
    # torch.tensor copies, so the caller's array is not shared with the dataset
    all_labels = torch.tensor(np.asarray(labels)[:len(all_inputs)])
    dataset = TensorDataset(all_inputs, all_labels)

    # labels without weights: plain shuffling instead of failing on len(None)
    if train_idx_weights is None:
        return DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE)

    # the sampler draws indices in range(len(weights)), so the sizes must agree
    if len(train_idx_weights) != len(dataset):
        raise ValueError(
            f'got {len(train_idx_weights)} sampling weights for {len(dataset)} examples')

    sampler = WeightedRandomSampler(train_idx_weights, len(train_idx_weights), replacement=True)
    return DataLoader(dataset, sampler=sampler, batch_size=BATCH_SIZE)

In [0]:
class Tokenizer():
    """
    Whitespace tokenizer mapping words to integer ids.
    Id 0 is reserved for the padding token 'PAD' and id 1 for the unknown-word token 'UNK'; the other ids follow the
    order in which words first appear in the sentences, so the mapping is the same on every run.
    """

    # reserved tokens, always placed first in the vocabulary
    PAD_TOKEN = 'PAD'
    UNK_TOKEN = 'UNK'

    def __init__(self, sentences, threshold=1):
        """
        :param sentences: Iterable of strings used to build the vocabulary
        :param threshold: Minimum number of occurrences for a word to be kept in the vocabulary
        """

        self.threshold = threshold

        vocabulary = self._build_vocabulary(sentences, threshold)

        # create words to ids dictionary
        self.tokens_to_ids = {tok: i for i, tok in enumerate(tqdm(vocabulary))}

    @classmethod
    def _build_vocabulary(cls, sentences, threshold=1):
        """Returns the ordered vocabulary: reserved tokens, then the words occurring at least `threshold` times."""

        # count words sentence by sentence (no giant joined string)
        counts = Counter(word for sentence in sentences for word in sentence.split())

        # A corpus word equal to a reserved token must not be added a second time:
        # the duplicate would take over the reserved id and leave a gap in the ids.
        # Every counted word occurs at least once, so a threshold <= 1 keeps all words.
        reserved = [cls.PAD_TOKEN, cls.UNK_TOKEN]
        words = [word for word, freq in counts.items()
                    if freq >= threshold and word not in reserved]

        return reserved + words

    def convert_words_to_ids(self, tokens):
        """
        :param tokens: Iterable of words
        :return: List with the id of each word; words missing from the vocabulary get the id of 'UNK'
        """

        unknown_id = self.tokens_to_ids[self.UNK_TOKEN]
        return [self.tokens_to_ids.get(tok, unknown_id) for tok in tokens]

In [0]:
def predict(model, dataloader):
    """
    Runs the model in evaluation mode over a dataloader and returns the predicted class of every example.
    :param model: Model exposing `init_hidden(batch_size)` and returning `(scores, hidden)` when called
    :param dataloader: DataLoader whose batches hold the inputs in first position
    :return: A float tensor with one predicted class index per example of `dataloader.dataset`, in the order in
    which the dataloader yields the examples
    """
    model.eval()
    preds = torch.zeros(len(dataloader.dataset))

    # position of the next prediction to write; advancing by the real size of each
    # batch keeps this correct whatever batch size the dataloader was built with
    offset = 0

    with torch.no_grad():
        for batch in tqdm(dataloader):
            # send examples to the device and extract them from the list
            inputs = batch[0].to(device)
            batch_len = inputs.size(0)

            # reinitialize lstm hidden state
            hidden = model.init_hidden(batch_len)

            # batch forward pass and predictions
            logits, hidden = model(inputs, hidden)
            preds[offset:offset + batch_len] = logits.argmax(dim=1)
            offset += batch_len

    return preds

In [0]:
def confusion_matrix(classes, preds, labels, pourcents=False):
    """
    Generates a confusion matrix indicating which class is mixed up with which other class
    :param classes: List of all possible classes in the model
    :param preds: Classification predictions given by a model (array-like)
    :param labels: True labels for the given predictions (array-like, same length as preds)
    :param pourcents: Whether or not to express the values in the confusion matrix in percentages
    :return: A m X m confusion matrix M where M(i,j) is how many points of class i were given class j, or, when
    pourcents is True, the percentage (rounded to one decimal) of the points of class i that were given class j.
    The row of a class that has no point in labels is left at 0.
    """
    # accept lists as well as arrays: the boolean masks below need numpy semantics
    preds = np.asarray(preds)
    labels = np.asarray(labels)

    m = len(classes)
    conf_matrix = np.zeros((m, m))
    for i, ground_truth in enumerate(classes):
        # predictions made for the points of this class, selected once per row
        class_preds = preds[labels == ground_truth]

        # no point of this class: keep the row at zero instead of averaging
        # an empty selection (NaN plus a RuntimeWarning)
        if class_preds.size == 0:
            continue

        for j, predicted in enumerate(classes):
            hits = class_preds == predicted
            if pourcents:
                conf_matrix[i, j] = np.round(100 * np.mean(hits), 1)
            else:
                conf_matrix[i, j] = np.sum(hits)

    return conf_matrix

In [0]:
# load datasets
# read the labelled training file and the unlabelled submission file
training_data, training_labels = load_data('data_train_preprocessed.csv', folder=DATA_FOLDER, has_labels=True)
submission_data = load_data('data_test_preprocessed.csv', folder=DATA_FOLDER)

# distinct labels, in ascending order
classes = sorted(set(training_labels))
num_classes = len(classes)

# number of classes, then the class distribution, each followed by an empty line
print(num_classes, end='\n\n')
print(Counter(training_labels), end='\n\n')

# sizes of the loaded arrays, one per line
print(len(training_data), len(training_labels), len(submission_data), sep='\n')

In [0]:
# --- data settings (read as globals by split_data, prepare_data and predict) ---
BATCH_SIZE = 256
MAX_LENGTH = 512
THRESHOLD = 5
DATASET_FRACTION = 1  # fraction of training examples to use (between 0 and 1)

USE_ONEHOT = True

# --- reproducibility: seed numpy and torch, make cudnn deterministic ---
SEED = 1234
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# --- train / validation split (10% of each class goes to validation) ---
train_data, train_labels, valid_data, valid_labels, train_idx_weights = split_data(
    training_data, training_labels, valid_prop=.1, seed=SEED)
print(len(train_data), len(valid_data), sep='\n', end='\n\n')

if USE_ONEHOT:
    # smoothed one-hot targets for the train set: the true class gets
    # 1 - LABEL_SMOOTHING, the remaining mass is shared by the other classes
    LABEL_SMOOTHING = .1
    num_examples = train_labels.shape[0]

    one_hot_helper = np.full((num_examples, num_classes), LABEL_SMOOTHING / (num_classes - 1))
    one_hot_helper[np.arange(num_examples), train_labels] = 1 - LABEL_SMOOTHING
    train_labels = one_hot_helper

# --- vocabulary built from the training texts only ---
tokenizer = Tokenizer(train_data, THRESHOLD)
print('', len(tokenizer.tokens_to_ids), '', sep='\n')

# --- dataloaders: weighted sampling for training, unlabeled inputs for validation ---
train_dataloader = prepare_data(tokenizer, train_data, train_labels, train_idx_weights)
valid_dataloader = prepare_data(tokenizer, valid_data)

In [0]:
class BloggerLSTM(nn.Module):
    """
    Text classifier: word embeddings -> (bi)LSTM -> one or two linear layers.
    `forward` returns raw, unnormalized class scores (logits), which is what `nn.CrossEntropyLoss` and snorkel's
    `cross_entropy_with_probs` expect; use `predict_proba` when probabilities are needed.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout, fc_size, bidirectional, class_cnt):
        """
        :param vocab_size: Number of rows of the embedding table
        :param embed_size: Size of the word embeddings
        :param hidden_size: Size of the LSTM hidden state (per direction)
        :param num_layers: Number of stacked LSTM layers
        :param dropout: Dropout passed to nn.LSTM
        :param fc_size: Size of the intermediate linear layer, or None for a single linear layer
        :param bidirectional: Whether the LSTM is bidirectional
        :param class_cnt: Number of output classes
        """

        super(BloggerLSTM, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.fc_size = fc_size
        self.bidirectional = bidirectional
        # the LSTM output is twice as wide when both directions are concatenated
        self.bidi_mult = 2 if bidirectional else 1

        self.hidden = None

        self.word_embeddings = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size,
                            hidden_size,
                            num_layers=num_layers,
                            dropout=dropout,
                            bidirectional=bidirectional,
                            batch_first=True)

        if fc_size is not None:
            self.fc1 = nn.Linear(hidden_size * self.bidi_mult, fc_size)
            self.relu = nn.ReLU()
            self.fc2 = nn.Linear(fc_size, class_cnt)
        else:
            self.fc = nn.Linear(hidden_size * self.bidi_mult, class_cnt)

        # only used by predict_proba; it has no parameters, so checkpoints are unaffected
        self.softmax = nn.Softmax(dim=1)

    def init_hidden(self, batch_size):
        """Returns zeroed (h_0, c_0) states for a batch, on the device and with the dtype of the model parameters."""
        weight = next(self.parameters())
        shape = (self.num_layers * self.bidi_mult, batch_size, self.hidden_size)
        return weight.new_zeros(shape), weight.new_zeros(shape)

    def forward(self, inputs, hidden):
        """
        :param inputs: Tensor of token ids, batch first
        :param hidden: Initial (h_0, c_0) LSTM states, e.g. from init_hidden
        :return: A tuple (logits, hidden) with the unnormalized class scores of every example and the final LSTM states
        """

        embeddings = self.word_embeddings(inputs)
        output, hidden = self.lstm(embeddings, hidden)

        # LSTM output at the last position of every sequence.
        # NOTE(review): for inputs padded on the right this position is a padding
        # step - consider packing the sequences or using the final hidden states.
        out = output[:, -1]

        if self.fc_size is not None:
            out = self.fc2(self.relu(self.fc1(out)))
        else:
            out = self.fc(out)

        # No softmax here: the cross-entropy losses apply log-softmax themselves,
        # and a second softmax squashes the scores and weakens the gradients.
        # The argmax of the scores is the same with or without softmax.
        return out, hidden

    def predict_proba(self, inputs, hidden):
        """Same as forward, but the scores are turned into class probabilities with a softmax."""
        logits, hidden = self.forward(inputs, hidden)
        return self.softmax(logits), hidden

In [0]:
# --- architecture ---
EMBED_SIZE = 400
HIDDEN_SIZE = 512
NUM_LAYERS = 3
DROPOUT = .0
FC_SIZE = 256
BIDIRECTIONAL = True

# --- optimisation ---
# NOTE: these two values are not passed to the optimizer below, which is built
# with Adam's default settings; LEARNING_RATE is still used in checkpoint file names.
LEARNING_RATE = .00002
WEIGHT_DECAY = .0005

# the embedding table needs one row per token known to the tokenizer
vocab_size = len(tokenizer.tokens_to_ids)

# build the model, then move it to the selected device
model = BloggerLSTM(vocab_size, EMBED_SIZE, HIDDEN_SIZE, NUM_LAYERS, DROPOUT, FC_SIZE, BIDIRECTIONAL, num_classes)
model = model.to(device)

# Adam with default hyper-parameters
optimizer = optim.Adam(model.parameters())

# soft (one-hot / smoothed) targets need the probabilistic cross-entropy,
# integer targets use the standard one
criterion = cross_entropy_with_probs if USE_ONEHOT else nn.CrossEntropyLoss()

In [0]:
# Training loop: NUM_EPOCHS passes over train_dataloader, each followed by an
# evaluation on the validation set. A checkpoint is written every time the
# validation accuracy improves on the best value seen so far.
NUM_EPOCHS = 20

# best validation accuracy seen so far
best_accuracy = 0

for epoch in range(NUM_EPOCHS):
    # predict() switches the model to eval mode, so training mode is restored every epoch
    model.train()
    # sum (not mean) of the batch losses of this epoch
    epoch_loss = 0
    
    for batch_idx, batch in enumerate(tqdm(train_dataloader)):
        # get tuple examples from the batch
        batch = tuple(t.to(device) for t in batch)
        inputs, labels = batch
        
        # reinitialize lstm hidden state
        hidden = model.init_hidden(inputs.size(0))

        # forward pass
        logits, hidden = model(inputs, hidden)
        loss = criterion(logits, labels)

        # backward pass
        loss.backward()
        optimizer.step()

        # reset gradients (after the step, so they are clear for the next batch) and accumulate loss
        optimizer.zero_grad()
        epoch_loss += loss.item()

    # compute validation accuracy (valid_labels are the class indices returned by split_data)
    preds = predict(model, valid_dataloader)
    accuracy = np.mean(preds.numpy() == valid_labels)
    
    print(f'\nEpoch {epoch+1}: loss={epoch_loss}')
    print(f'Validation accuracy = {accuracy}\n')

    # row-wise percentages: how the examples of each true class were classified
    conf_matrix = confusion_matrix(classes, preds.numpy(), valid_labels, pourcents=True)
    print(conf_matrix)
    print()
    
    # save model if best accuracy yet
    # NOTE: `path` is only assigned here; the cell that reloads the model reads it,
    # so it always points at the most recently saved (i.e. best) checkpoint.
    # NOTE(review): the file name embeds LEARNING_RATE, which may not be the rate
    # the optimizer actually uses - check how the optimizer is constructed.
    if accuracy > best_accuracy:
        model_save_name = f'model_{LEARNING_RATE}_{MAX_LENGTH}_{BATCH_SIZE}_{NUM_EPOCHS}_{accuracy}.pt'
        path = OUT_FOLDER + SAVED_MODELS_FOLDER + model_save_name
        torch.save(model.state_dict(), path)
        best_accuracy = accuracy

In [0]:
# Rebuild the network and load the weights of the last checkpoint written by the
# training loop (`path`), which is the one with the best validation accuracy.
model = BloggerLSTM(vocab_size, EMBED_SIZE, HIDDEN_SIZE, NUM_LAYERS, DROPOUT, FC_SIZE, BIDIRECTIONAL, num_classes).to(device)

# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only runtime
model.load_state_dict(torch.load(path, map_location=device))

# compute validation accuracy
preds = predict(model, valid_dataloader)
accuracy = np.mean(preds.numpy() == valid_labels)

# report which checkpoint is evaluated; the epoch number and loss of the last
# training epoch do not describe this model
print(f'\nBest checkpoint: {path}')
print(f'Validation accuracy = {accuracy}\n')

# raw counts: how many examples of each true class were given each predicted class
conf_matrix = confusion_matrix(classes, preds.numpy(), valid_labels, pourcents=False)
print(conf_matrix)
print()