In [0]:
import datetime
import numpy as np
import os
import pandas as pd
import random

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import (DataLoader, RandomSampler, WeightedRandomSampler, SequentialSampler, TensorDataset)

!pip install pytorch_pretrained_bert
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from pytorch_pretrained_bert.modeling import BertLayerNorm

"""
!pip install snorkel
import snorkel
from snorkel.classification import cross_entropy_with_probs
"""

# Toggle between Google Drive storage (Colab) and plain local folders.
USE_DRIVE = True

if USE_DRIVE:
    # google.colab only exists inside a Colab runtime, hence the local import
    from google.colab import drive, files
    drive.mount('/content/drive')
    DATA_FOLDER = '/content/drive/My Drive/Colab Notebooks/data/blogger_age_range/'
    OUT_FOLDER = '/content/drive/My Drive/Colab Notebooks/out/blogger_age_range/'
else:
    DATA_FOLDER = '/data/'
    OUT_FOLDER = '/out/'

# sub-folders of OUT_FOLDER receiving model checkpoints and submission files
SAVED_MODELS_FOLDER = 'models/'
SUBMISSIONS_FOLDER = 'submissions/'

# Create the output folders in pure Python rather than through `!mkdir -p`:
# no shell interpolation of the paths, and a failure raises instead of
# passing silently.
os.makedirs(OUT_FOLDER + SAVED_MODELS_FOLDER, exist_ok=True)
os.makedirs(OUT_FOLDER + SUBMISSIONS_FOLDER, exist_ok=True)

# run on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [0]:
# Column names given to the two CSV columns when load_data reads a file.
BLOG = 'blog'    # column returned as the datapoints
CLASS = 'class'  # column returned as the labels when has_labels is True

In [0]:
def load_data(filename, folder='./data/', has_labels=False):
    """
    Read a two-column CSV file and return its content as numpy arrays.

    The columns are named BLOG and CLASS explicitly, so the first row of the
    file is read as data rather than as a header.

    :param filename: Name of the CSV file to read
    :param folder: Folder containing the file (with or without a trailing separator)
    :param has_labels: Whether the CLASS column should be returned as well
    :return: The BLOG column alone, or a (BLOG column, CLASS column) tuple when
        has_labels is True
    """
    # os.path.join copes with a folder given without its trailing separator,
    # which plain string concatenation does not
    path = os.path.join(folder, filename)
    dataset = pd.read_csv(path, names=[BLOG, CLASS])

    blogs = dataset[BLOG].to_numpy()
    if has_labels:
        return blogs, dataset[CLASS].to_numpy()
    return blogs


In [0]:
def split_data(data, labels, valid_prop, test_prop=0, seed=1234, dataset_fraction=None):
    """
    Function that takes a dataset and splits it, class by class, into a training set, a validation set, and
    (optionally) a test set
    :param data: Numpy array holding the datapoints of the complete dataset
    :param labels: Numpy array holding the label of each datapoint (same length as data)
    :param valid_prop: What proportion (expressed as a value from 0 to 1) of the dataset should be used for the
    validation set
    :param test_prop: What proportion (expressed as a value from 0 to 1) of the dataset should be used for the
    test set
    :param seed: Seed given to numpy's global random generator before shuffling
    :param dataset_fraction: Fraction of each class to keep (between 0 and 1); defaults to the notebook-level
    DATASET_FRACTION when left to None
    :return: A tuple containing, in that order:
        * The datapoints of the training set (grouped by class, in sorted class order)
        * The label of each datapoint in the training set
        * The datapoints of the validation set
        * The label of each datapoint in the validation set
        * The datapoints of the test set (only when test_prop is not 0)
        * The label of each datapoint in the test set (only when test_prop is not 0)
        * A list with one sampling weight per training datapoint, such that every class carries the same
          total weight
    """
    if dataset_fraction is None:
        # fall back on the notebook-level setting
        dataset_fraction = DATASET_FRACTION

    # we set the seed for reproducibility (the argument, not a global)
    np.random.seed(seed)

    # shuffle data according to seed
    idx = np.arange(data.shape[0])
    np.random.shuffle(idx)
    data = data[idx]
    labels = labels[idx]

    # retrieve classes
    classes = sorted(set(labels))

    # will be needed to select indices for train/valid/test splits
    idx = np.arange(data.shape[0])
    train_idx, valid_idx, test_idx = [], [], []

    # get split indices for every class
    for label in classes:
        class_idx = idx[labels == label]
        # keep only the first of the 1/dataset_fraction chunks of this class
        class_idx = np.array_split(class_idx, 1 / dataset_fraction)[0]

        n = len(class_idx)
        valid_split_idx = int(valid_prop * n)
        test_split_idx = int((1 - test_prop) * n)

        # layout inside a class: [ validation | training | test ]
        valid_idx.append(class_idx[:valid_split_idx])
        train_idx.append(class_idx[valid_split_idx:test_split_idx])
        test_idx.append(class_idx[test_split_idx:])

    # per-class weights, inversely proportional to the class size
    train_class_counts = np.asarray([len(arr) for arr in train_idx])
    train_class_weights = 1 / len(classes) / train_class_counts

    # concatenate indices of all classes
    train_idx = np.concatenate(train_idx)
    valid_idx = np.concatenate(valid_idx)
    test_idx = np.concatenate(test_idx)

    # one weight per training example, in the same class-grouped order as train_idx
    train_examples_weights = train_class_weights / len(train_idx)
    train_idx_weights = list(np.repeat(train_examples_weights, train_class_counts))

    # shuffle the datasets; train_idx is left grouped by class on purpose so
    # that it stays aligned with train_idx_weights
    np.random.shuffle(valid_idx)
    np.random.shuffle(test_idx)
    train_data, train_labels = data[train_idx], labels[train_idx]
    valid_data, valid_labels = data[valid_idx], labels[valid_idx]
    test_data, test_labels = data[test_idx], labels[test_idx]

    # value comparison: `is 0` fails for 0.0 and is a SyntaxWarning on recent Pythons
    if test_prop == 0:
        return train_data, train_labels, valid_data, valid_labels, train_idx_weights
    return train_data, train_labels, valid_data, valid_labels, test_data, test_labels, train_idx_weights


In [0]:
def prepare_data(tokenizer, raw_data, labels=None, train_idx_weights=None):
    """
    Tokenize raw texts and wrap the resulting tensors in a DataLoader.

    Every text is tokenized, truncated so that it fits in MAX_LENGTH once
    "[CLS]" and "[SEP]" are added, converted to ids and right-padded with
    zeros up to MAX_LENGTH.

    :param tokenizer: Object providing tokenize() and convert_tokens_to_ids()
    :param raw_data: Iterable of texts
    :param labels: Optional labels, indexed in the same order as raw_data
    :param train_idx_weights: Per-example sampling weights; used only when
        labels are given
    :return: A DataLoader over (inputs, masks, segments) tensors, plus labels
        when provided. With labels, batches are drawn with replacement by a
        WeightedRandomSampler; without, the dataset is read in order.
    """
    has_labels = labels is not None
    token_budget = MAX_LENGTH - 2  # room left once [CLS] and [SEP] are added

    input_ids, attention_masks, segment_ids, targets = [], [], [], []

    for position, text in enumerate(tqdm(raw_data)):
        # slicing leaves sequences that are already short enough untouched
        tokens = ["[CLS]"] + tokenizer.tokenize(text)[:token_budget] + ["[SEP]"]
        n_tokens = len(tokens)
        pad = [0] * (MAX_LENGTH - n_tokens)

        input_ids.append(tokenizer.convert_tokens_to_ids(tokens) + pad)
        attention_masks.append([1] * n_tokens + pad)  # 1 on real tokens, 0 on padding
        segment_ids.append([0] * n_tokens + pad)      # single-segment input
        if has_labels:
            targets.append(labels[position])

    features = (torch.tensor(input_ids),
                torch.tensor(attention_masks),
                torch.tensor(segment_ids))

    if not has_labels:
        return DataLoader(TensorDataset(*features), batch_size=BATCH_SIZE)

    # labelled data: sample examples according to the given weights
    dataset = TensorDataset(*features, torch.tensor(targets))
    sampler = WeightedRandomSampler(train_idx_weights, len(train_idx_weights), replacement=True)
    return DataLoader(dataset, sampler=sampler, batch_size=BATCH_SIZE)


In [0]:
def predict(model, dataloader):
    """
    Run the model over a dataloader and return the predicted class indices.

    :param model: Model called as model(inputs, masks, segments) and returning
        one row of logits per example
    :param dataloader: DataLoader yielding (inputs, masks, segments) batches
    :return: A 1-D tensor created with torch.zeros, holding one predicted
        class index per example of dataloader.dataset, in iteration order
    """
    model.eval()
    preds = torch.zeros(len(dataloader.dataset))
    offset = 0
    with torch.no_grad():
        for batch in tqdm(dataloader):
            # send examples to the device and extract them from tuples
            inputs, masks, segments = (t.to(device) for t in batch)

            # forward pass and predictions
            logits = model(inputs, masks, segments)
            batch_preds = logits.argmax(dim=1)

            # place the batch using its real size instead of the global
            # BATCH_SIZE, so a loader built with another batch size stays aligned
            n_examples = batch_preds.size(0)
            preds[offset:offset + n_examples] = batch_preds
            offset += n_examples

    return preds

In [0]:
# read the labelled training file and the unlabelled submission file
training_data, training_labels = load_data(
    'data_train_preprocessed.csv', folder=DATA_FOLDER, has_labels=True)
submission_data = load_data('data_test_preprocessed.csv', folder=DATA_FOLDER)

# sorted list of distinct classes, and the class -> integer index lookup
classes = sorted(set(training_labels))
class_dict = dict(zip(classes, range(len(classes))))
num_classes = len(class_dict)

# replace every label by its integer index
training_labels = np.asarray([class_dict[label] for label in training_labels])

# summary: mapping, number of classes, blank line, then the dataset sizes
print(class_dict,
      num_classes,
      '',
      len(training_data),
      len(training_labels),
      len(submission_data),
      sep='\n')

In [0]:
# Recommended hyperparameters:
#   Batch size: 16, 32
#   Learning rate (Adam): 5e-5, 3e-5, 2e-5
#   Number of epochs: 2, 3, 4

# --- optimisation ---
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_EPOCHS = 4

# --- model input ---
BERT_KIND = 'bert-base-uncased'
MAX_LENGTH = 128

# --- data ---
DATASET_FRACTION = 1  # fraction of training examples to use (between 0 and 1)

# --- reproducibility: seed numpy and torch, make cuDNN deterministic ---
SEED = 1234
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# hold out 10% of every class for validation
(train_data, train_labels,
 valid_data, valid_labels,
 train_idx_weights) = split_data(training_data, training_labels, valid_prop=.1, seed=SEED)
print(len(train_data), len(valid_data), '', sep='\n')

# tokenizer matching the pretrained model
tokenizer = BertTokenizer.from_pretrained(BERT_KIND)

# training batches are sampled with the class-balancing weights;
# validation data is passed without labels and read in order
train_dataloader = prepare_data(tokenizer, train_data, train_labels, train_idx_weights)
valid_dataloader = prepare_data(tokenizer, valid_data)

In [0]:
# instantiate model
model = BertForSequenceClassification.from_pretrained(BERT_KIND, num_labels=num_classes).to(device)

# optimizer configuration: biases and LayerNorm parameters are exempt from weight decay
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
# The per-group key must be 'weight_decay': that is the name BertAdam reads.
# With 'weight_decay_rate' the setting is ignored and the optimizer default
# is applied to every parameter, including the ones listed in no_decay.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

# instantiate optimizer
# t_total is the number of optimizer steps: len(train_dataloader) counts the
# batches of one epoch, including a final partial batch that the floor of
# dataset_size / BATCH_SIZE would leave out
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=LEARNING_RATE,
                     warmup=.1,
                     t_total=len(train_dataloader) * NUM_EPOCHS)

# instantiate loss function
criterion = nn.CrossEntropyLoss()
#criterion = cross_entropy_with_probs

In [0]:
# best validation accuracy seen so far; a checkpoint is written whenever it improves
best_accuracy = 0

for epoch in range(NUM_EPOCHS):
    model.train()
    epoch_loss = 0

    for batch in tqdm(train_dataloader):
        # move the four tensors of the batch to the device
        inputs, masks, segments, labels = (tensor.to(device) for tensor in batch)

        # forward pass
        logits = model(inputs, masks, segments)
        loss = criterion(logits, labels)

        # backward pass, parameter update, then clear gradients for the next batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # summed (not averaged) over the batches of the epoch
        epoch_loss += loss.item()

    # validation accuracy: fraction of predictions equal to the held-out labels
    preds = predict(model, valid_dataloader)
    accuracy = np.mean(preds.numpy() == valid_labels)

    print(f'\nEpoch {epoch+1}: loss={epoch_loss}')
    print(f'Validation accuracy = {accuracy}\n')

    # nothing to save unless this epoch beats the best accuracy so far
    if not accuracy > best_accuracy:
        continue

    # the file name records the hyperparameters and the accuracy reached
    model_save_name = f'model_{LEARNING_RATE}_{MAX_LENGTH}_{BATCH_SIZE}_{NUM_EPOCHS}_{accuracy}.pt'
    path = OUT_FOLDER + SAVED_MODELS_FOLDER + model_save_name
    torch.save(model.state_dict(), path)
    best_accuracy = accuracy

In [0]:
# Template for reloading a checkpoint from an earlier session: fill in the
# values and move the lines out of the string. As written, `path` is the one
# left by the training loop, i.e. the last (best) checkpoint it saved.
"""
LEARNING_RATE = 
MAX_LENGTH = 
BATCH_SIZE = 
NUM_EPOCHS = 
best_accuracy = 
model_save_name = f'model_{LEARNING_RATE}_{MAX_LENGTH}_{BATCH_SIZE}_{NUM_EPOCHS}_{best_accuracy}.pt'
path = OUT_FOLDER + SAVED_MODELS_FOLDER + model_save_name
"""

# map_location lets a checkpoint saved on a GPU be restored on a CPU-only
# runtime (and the other way round) instead of failing inside torch.load
model.load_state_dict(torch.load(path, map_location=device))

# tokenize the submission texts; no labels, so they are read in order
submission_dataloader = prepare_data(tokenizer, submission_data)

In [0]:
# predict a class index for every submission text and map it back to its label
preds = predict(model, submission_dataloader)
preds_classes = np.asarray([classes[pred] for pred in preds.long()])

# the file name records the hyperparameters and the best validation accuracy
filename = f'submission_{LEARNING_RATE}_{MAX_LENGTH}_{BATCH_SIZE}_{NUM_EPOCHS}_{best_accuracy}.csv'
path = OUT_FOLDER + SUBMISSIONS_FOLDER + filename

# header first, then one "<row index>,<predicted class>" line per example;
# each row carries its own leading newline, so the file has no trailing one
with open(path, 'w+') as file:
    file.write('Id,Category')
    file.writelines(f'\n{i},{pred}' for i, pred in enumerate(preds_classes))