In [None]:
import json
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import clear_output
from pytorch_metric_learning import miners, losses, samplers
from pytorch_metric_learning.utils.accuracy_calculator import AccuracyCalculator
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
warnings.filterwarnings("ignore")

In [None]:
import neptune

# Point the Neptune client at the 'gregrolwes/course-embedder' project.
neptune.init('gregrolwes/course-embedder',)

In [None]:
# Hyperparameters for this run, grouped by what they configure. The merged
# PARAMS dict keeps the original key order and is what the rest of the
# notebook reads from.
_TRAINING_PARAMS = {
    'lr': 0.001,
    'group size': 4,
    'batch size': 32,
}
_MODEL_PARAMS = {
    'token embedding dim': 256,
    'hidden dim': 100,
    'output dim': 256,
    'num layers': 2,
    'bidirectional': True,
    'dropout': 0,
}
PARAMS = {**_TRAINING_PARAMS, **_MODEL_PARAMS}

In [None]:
# Open a Neptune experiment (unnamed) that records PARAMS and the two tags below.
neptune.create_experiment(name="", params=PARAMS, tags=['WOS', '50/50 split'])

In [None]:
import random
import torch
from torch import nn
from torchtext import data
from torchtext.data import TabularDataset, Example

# Make the experiment reproducible: seed every random number generator this
# notebook can draw from (Python's `random`, NumPy's global generator and
# PyTorch), not only PyTorch.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Request deterministic cuDNN kernels and switch off the cuDNN autotuner so the
# same algorithms are selected on every run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# TEXT is the torchtext field for the input text. include_lengths=True is what
# lets the training code unpack `batch.text` into (tensor, lengths).
TEXT = data.Field(include_lengths = True)
# COURSE is the label field; labels are produced as torch.long values.
COURSE = data.LabelField(dtype = torch.long)

In [None]:
# (name, Field) pairs handed to every TabularDataset below; the names become
# the attributes read later as `example.course` and `example.text`.
fields = [('course', COURSE), ('text', TEXT)]

In [None]:
def _load_csv(csv_path):
    """Read one CSV file into a TabularDataset using the shared `fields` list."""
    return TabularDataset(path=csv_path, format='csv', fields=fields)


# all_data is the complete file; train_data / valid_data are the metric-learning splits.
all_data = _load_csv('data/all_WOS.csv')
train_data = _load_csv('data/train_WOS_metric.csv')
valid_data = _load_csv('data/test_WOS_metric.csv')

In [None]:
# The token vocabulary is built from the training split only, with max_size=50000.
TEXT.build_vocab(train_data, max_size=50000)
# The label vocabulary is built from all_data rather than from a single split.
COURSE.build_vocab(all_data)

In [None]:
# Sanity check: report the size of both vocabularies.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in COURSE vocabulary: {len(COURSE.vocab)}")

In [None]:
class TripletIterator(data.Iterator):
    """Iterator whose example order is dictated by an external sampler.

    `sampler` is an iterable of dataset indices. It is stored on the instance
    and re-iterated each time `data()` is called. The base iterator is always
    constructed with shuffle=False, so the sampler alone decides the order.
    """

    def __init__(self, dataset, sampler, **kwargs):
        self.sampler = sampler
        super().__init__(dataset, shuffle=False, **kwargs)

    def data(self):
        """Return the dataset's examples arranged in sampler order."""
        ordered_examples = []
        for index in self.sampler:
            ordered_examples.append(self.dataset[index])
        return ordered_examples

In [None]:
# Integer class id of every training example, in dataset order.
train_targets = [COURSE.vocab.stoi[sample.course] for sample in train_data]
# Samples-per-class comes from PARAMS['group size'] so that the value logged
# with the experiment is the one actually used (it used to be a separate,
# hard-coded 4 that could silently drift from the logged hyperparameter).
sampler = samplers.MPerClassSampler(
    train_targets,
    PARAMS['group size'],
    length_before_new_iter=len(train_data),
)

In [None]:
BATCH_SIZE = PARAMS['batch size']

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Settings shared by the training and validation iterators: batches are sorted
# internally by text length (the model packs padded sequences) and are placed
# on `device`.
_shared_iterator_kwargs = dict(
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,
    device=device,
)

# Training batches follow the order produced by `sampler`.
train_iterator = TripletIterator(
    train_data,
    sampler,
    train=True,
    **_shared_iterator_kwargs)

# Validation batches are never shuffled.
valid_iterator = data.Iterator(
    valid_data,
    shuffle=False,
    train=False,
    **_shared_iterator_kwargs)

In [None]:
class RNN(nn.Module):
    """LSTM encoder that turns a padded batch of token ids into one vector per sequence.

    Args:
        vocab_size: number of rows in the token embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size (per direction).
        output_dim: size of the returned sequence embedding.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence backwards.
        dropout: dropout probability, used between LSTM layers and on the
            token embeddings / final hidden state.
        pad_idx: vocabulary index of the padding token.

    The final linear layer is sized from `bidirectional` (2 * hidden_dim when
    bidirectional, hidden_dim otherwise), so unidirectional configurations are
    supported as well.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.activation = nn.Sigmoid()

    def forward(self, text, text_lengths):
        """Embed a batch of sequences.

        Args:
            text: token ids laid out as (sequence, batch), since the LSTM is
                built without batch_first.
            text_lengths: unpadded length of every sequence in the batch, in
                decreasing order (pack_padded_sequence is called with its
                default enforce_sorted=True).

        Returns:
            Tensor of shape (batch, output_dim) with values in (0, 1) from the
            final sigmoid.
        """
        embedded = self.dropout(self.embedding(text))
        # pack_padded_sequence needs the lengths on the CPU.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu())
        # Only the final hidden state is used, so the per-step outputs are
        # never unpacked.
        _, (hidden, _) = self.rnn(packed_embedded)

        if self.rnn.bidirectional:
            # Last layer: forward direction is hidden[-2], backward is hidden[-1].
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Single direction: the last entry is the top layer's final state.
            hidden = hidden[-1, :, :]

        return self.activation(self.fc(self.dropout(hidden)))

In [None]:
# Model dimensions: the vocabulary fixes the embedding table size, PARAMS the rest.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = PARAMS['token embedding dim']
HIDDEN_DIM = PARAMS['hidden dim']
OUTPUT_DIM = PARAMS['output dim']
N_LAYERS = PARAMS['num layers']
BIDIRECTIONAL = PARAMS['bidirectional']
DROPOUT = PARAMS['dropout']
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [None]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Start the unknown-token and padding rows of the embedding table at zero.
for special_idx in (UNK_IDX, PAD_IDX):
    model.embedding.weight.data[special_idx] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

In [None]:
import torch.optim as optim

# Adam over every model parameter, with the learning rate taken from PARAMS.
optimizer = optim.Adam(model.parameters(), lr=PARAMS['lr'])
# The miner picks the triplets passed to the loss for each batch
# (type_of_triplets="all", miner margin 1.0).
miner = miners.TripletMarginMiner(margin=1.0, type_of_triplets="all")
# Alternative loss, currently disabled:
#criterion = losses.MultiSimilarityLoss(alpha=2, beta=50, base=0.5)
# Triplet margin loss; note its margin (0.1) is not the miner's margin (1.0).
criterion = losses.TripletMarginLoss(margin=0.1).to(device)

In [None]:
# Move the network, the miner and the loss onto the selected device, in that order.
model, miner, criterion = (
    module.to(device) for module in (model, miner, criterion)
)

In [None]:
def progress_bar(current, final, acc, loss, thing):
    """Redraw a text progress bar in the notebook output area.

    Args:
        current: number of steps completed so far.
        final: total number of steps; the bar shows int(100 * current / final) percent.
        acc: accepted for call compatibility but not displayed.
        loss: value printed on the "Loss:" line.
        thing: heading printed above the bar (e.g. the phase name).
    """
    percent = int(100 * (current / final))
    bar = '=' * percent + '>' + ' ' * (100 - percent)

    # Replace the previous rendering instead of appending below it.
    clear_output(wait=True)

    print(thing)
    print('|{}| {}%'.format(bar, percent))
    print("Loss:", loss)
    

In [None]:
# One calculator per k (1, 5, 10); each is restricted to the
# "mean_average_precision_at_r" metric.
accuracy_calculator1, accuracy_calculator5, accuracy_calculator10 = (
    AccuracyCalculator(include=("mean_average_precision_at_r",), k=k)
    for k in (1, 5, 10)
)

In [None]:
def get_acc(embeddings, labels):
    """Score a set of embeddings against themselves with the three calculators.

    Args:
        embeddings: CPU tensor of embeddings; used as both query and reference set.
        labels: CPU tensor with the label of each embedding.

    Returns:
        Tuple with the 'mean_average_precision_at_r' value reported by the
        k=1, k=5 and k=10 calculators, in that order.
    """
    embeddings_np = embeddings.numpy()
    labels_np = labels.numpy()

    calculators = (accuracy_calculator1, accuracy_calculator5, accuracy_calculator10)
    scores = []
    for position, calculator in enumerate(calculators):
        if position > 0:
            # Release cached GPU memory between consecutive evaluations.
            torch.cuda.empty_cache()
        # The trailing True is presumably the "embeddings come from the same
        # source" flag -- confirm against the installed pytorch_metric_learning.
        result = calculator.get_accuracy(embeddings_np,
                                         embeddings_np,
                                         labels_np,
                                         labels_np,
                                         True)
        scores.append(result['mean_average_precision_at_r'])
    return tuple(scores)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch, then score and plot the embeddings it produced.

    For every batch the model embeds the text, the module-level `miner` selects
    triplets, `criterion` is evaluated on them and the optimizer takes a step.
    The embeddings and labels of all batches are kept on the CPU, scored with
    `get_acc`, logged to Neptune and plotted with `map_features`.

    Args:
        model: network called as model(text, text_lengths).
        iterator: batch iterator; each batch exposes `.text` (tensor, lengths)
            and `.course` (labels).
        optimizer: optimizer updating `model`'s parameters.
        criterion: loss called as criterion(embeddings, labels, triplets).

    Returns:
        (mean loss per batch, first value returned by get_acc, matplotlib
        figure of the embedding space).

    Raises:
        ValueError: if the iterator yields no batches.
    """
    epoch_loss = 0

    # Per-batch results are collected in lists and concatenated once after the
    # loop. Growing a tensor with torch.cat on every step copies everything
    # gathered so far each time, and concatenating integer labels onto an empty
    # float tensor does not keep their integer dtype.
    embedding_batches = []
    label_batches = []

    model.train()

    num_batches = len(iterator)
    for i, batch in enumerate(iterator):
        optimizer.zero_grad()

        text, text_lengths = batch.text
        outputs = model(text, text_lengths).squeeze(1)
        triplets = miner(outputs, batch.course)
        loss = criterion(outputs, batch.course, triplets)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        embedding_batches.append(outputs.detach().cpu())
        label_batches.append(batch.course.detach().cpu())

        # i + 1 batches are done at this point, so the bar ends at 100%.
        # No running accuracy is tracked, hence the constant 0.
        progress_bar(i + 1, num_batches, 0, epoch_loss / (i + 1), "TRAINING")

        torch.cuda.empty_cache()

    if not embedding_batches:
        raise ValueError("train() received an iterator that produced no batches")

    train_embeddings = torch.cat(embedding_batches)
    train_labels = torch.cat(label_batches)

    # The metric names say P@k; the values are whatever get_acc returns for the
    # k=1, 5 and 10 calculators.
    train_acc, p_at_5, p_at_10 = get_acc(train_embeddings, train_labels)
    neptune.log_metric('Training P@1', train_acc)
    neptune.log_metric('Training P@5', p_at_5)
    neptune.log_metric('Training P@10', p_at_10)

    embedding_space = map_features(train_embeddings, train_labels, "Training")

    return epoch_loss / num_batches, train_acc, embedding_space

In [None]:
def evaluate(model, iterator, criterion):
    """Run one validation pass, then score and plot the embeddings it produced.

    The model is put in eval mode and gradients are disabled. For every batch
    the model embeds the text, the module-level `miner` selects triplets and
    `criterion` is evaluated on them. The embeddings and labels of all batches
    are kept on the CPU, scored with `get_acc`, logged to Neptune and plotted
    with `map_features`.

    Args:
        model: network called as model(text, text_lengths).
        iterator: batch iterator; each batch exposes `.text` (tensor, lengths)
            and `.course` (labels).
        criterion: loss called as criterion(embeddings, labels, triplets).

    Returns:
        (mean loss per batch, first value returned by get_acc, matplotlib
        figure of the embedding space).

    Raises:
        ValueError: if the iterator yields no batches.
    """
    epoch_loss = 0

    # Per-batch results are collected in lists and concatenated once after the
    # loop. Growing a tensor with torch.cat on every step copies everything
    # gathered so far each time, and concatenating integer labels onto an empty
    # float tensor does not keep their integer dtype.
    embedding_batches = []
    label_batches = []

    model.eval()

    num_batches = len(iterator)
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            text, text_lengths = batch.text
            outputs = model(text, text_lengths).squeeze(1)
            triplets = miner(outputs, batch.course)
            loss = criterion(outputs, batch.course, triplets)

            epoch_loss += loss.item()

            embedding_batches.append(outputs.detach().cpu())
            label_batches.append(batch.course.detach().cpu())

            # i + 1 batches are done at this point, so the bar ends at 100%.
            # No running accuracy is tracked, hence the constant 0.
            progress_bar(i + 1, num_batches, 0, epoch_loss / (i + 1), "VALIDATION")

            torch.cuda.empty_cache()

    if not embedding_batches:
        raise ValueError("evaluate() received an iterator that produced no batches")

    val_embeddings = torch.cat(embedding_batches)
    val_labels = torch.cat(label_batches)

    # The metric names say P@k; the values are whatever get_acc returns for the
    # k=1, 5 and 10 calculators.
    val_acc, p_at_5, p_at_10 = get_acc(val_embeddings, val_labels)
    neptune.log_metric('Validation P@1', val_acc)
    neptune.log_metric('Validation P@5', p_at_5)
    neptune.log_metric('Validation P@10', p_at_10)

    embedding_space = map_features(val_embeddings, val_labels, "Validation")

    return epoch_loss / num_batches, val_acc, embedding_space

In [None]:
def map_features(outputs, labels, phase, num_examples=10000):
    """Plot a 2-D t-SNE projection of the embeddings, coloured by label.

    Args:
        outputs: 2-D array-like of embeddings, one row per example (a CPU
            tensor or a NumPy array).
        labels: 1-D array-like with the numeric label of each row.
        phase: word used in the figure title ("<phase> Embedding Space").
        num_examples: maximum number of randomly chosen rows to project.

    Returns:
        The matplotlib figure that was drawn. The caller owns it and should
        close it once it has been logged or shown.

    Notes:
        * Randomness comes from a private RandomState(42) instead of reseeding
          NumPy's global generator, so calling this function no longer resets
          the random stream used by everything else in the notebook.
        * A PCA used to be fitted here but its result was never used; t-SNE is
          still run on the raw features, exactly as before.
    """
    # Convert explicitly so the rest of the function works on plain arrays
    # whatever array-like type the caller passed in.
    features = np.asarray(outputs)
    targets = np.asarray(labels)

    # Fixed-seed local generator: same subset every call, no global side effect.
    rng = np.random.RandomState(42)
    chosen = rng.permutation(features.shape[0])[:num_examples]

    data_subset = features[chosen]
    label_subset = targets[chosen]

    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=rng)
    tsne_results = tsne.fit_transform(data_subset)

    fig = plt.figure(figsize=(16, 10))
    plt.scatter(
        x=tsne_results[:, 0],
        y=tsne_results[:, 1],
        c=label_subset,
        s=4
    )
    plt.axis('off')
    plt.title("{} Embedding Space".format(phase))
    return fig

In [None]:
N_EPOCHS = 50

# initialize best loss as infinity
best_valid_loss = float('inf')

# torch.save does not create missing directories; make sure the target exists
# up front rather than failing after the first full epoch.
CHECKPOINT_PATH = 'models/course_embedder.pt'
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

for epoch in range(N_EPOCHS):
    train_loss, train_acc, embedding_space = train(model, train_iterator, optimizer, criterion)
    neptune.log_metric('Training Loss', train_loss)
    neptune.log_image('Training Embedding Space', embedding_space, image_name='Epoch {}'.format(epoch+1))
    # Each epoch creates two figures; close them once logged so they do not
    # pile up in memory over the run.
    plt.close(embedding_space)

    valid_loss, valid_acc, embedding_space = evaluate(model, valid_iterator, criterion)
    neptune.log_metric('Validation Loss', valid_loss)
    neptune.log_image('Validation Embedding Space', embedding_space, image_name='Epoch {}'.format(epoch+1))
    plt.close(embedding_space)

    # Keep only the weights with the lowest validation loss seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), CHECKPOINT_PATH)

    print(f'Epoch: {epoch+1}')
    print(f'\tTrain Loss: {train_loss:.3f}\t|\tTrain Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f}\t|\tVal. Acc: {valid_acc*100:.2f}%')
    

In [None]:
# Number of examples in the training split.
len(train_data)