In [0]:
from copy import deepcopy as copy
import numpy as np

import sys
sys.path.append('../Tools')
from Metrics import full_report
from DataBuilder import read_data, user_builder, separate_data, prepare_data

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Loading and preparing data

# read_data -> user_builder -> separate_data yields the three raw splits.
user_records = user_builder(read_data('../Data/city_search.json'))
train_data, dev_data, test_data = separate_data(user_records)

# prepare_data is unpacked as one pair per split followed by the vocabulary.
# Only the test split's second element is kept (it is passed to predict() as
# the mask further down); the other two second elements are discarded.
prepared = prepare_data(train_data, dev_data, test_data, create_iterators=False)
(train_data, _), (dev_data, _), (test_data, test_mask), vocab = prepared

HBox(children=(IntProgress(value=0, max=5777), HTML(value='')))




In [0]:
# Defining the Unigram and Bigram classes

class Unigram:

    '''The Unigram model as the first baseline.

    Every position of a sample receives the same distribution: the relative
    frequency of each token among the training outputs.

    Attributes:
        vocab:  token -> index mapping built from ``vocab.stoi`` with the
                ``<unk>`` and ``<pad>`` entries removed and every remaining
                index shifted down by 2 (assumes those two tokens occupy
                indices 0 and 1 of the original vocabulary -- TODO confirm).
        counts: raw occurrence count per token, accumulated over all fit() calls.
        matrix: ``counts`` normalised to sum to 1 (all zeros before any data is seen).
    '''

    def __init__(self, vocab):
        '''Build the shifted token index and zero-initialise the statistics.

        Args:
            vocab: object exposing a ``stoi`` token -> index mapping that
                   contains both ``<unk>`` and ``<pad>`` (KeyError otherwise).
        '''
        stoi = dict(vocab.stoi)
        del stoi['<unk>'], stoi['<pad>']
        self.vocab = {token: index - 2 for token, index in stoi.items()}
        # Counts are kept separately from the probabilities so that fit() can
        # be called more than once without mixing the two.
        self.counts = np.zeros(len(self.vocab))
        self.matrix = np.zeros(len(self.vocab))

    def fit(self, data):
        '''Accumulate output-token counts from ``data`` and refresh ``matrix``.

        Args:
            data: iterable of samples exposing ``input`` and ``output``
                  sequences; ``output[i]`` is counted for every index ``i``
                  of ``input``.

        Raises:
            KeyError: if an output token is not in ``self.vocab``.
        '''
        for sample in data:
            for i in range(len(sample.input)):
                self.counts[self.vocab[sample.output[i]]] += 1
        total = self.counts.sum()
        # Guard the division: with no observations the distribution stays at
        # zero instead of turning into NaN.
        if total > 0:
            self.matrix = self.counts / total
        else:
            self.matrix = np.zeros_like(self.counts)

    def predict(self, sample, mask=None):
        '''Return a ``(len(sample.input), len(self.vocab))`` probability array.

        Args:
            sample: object exposing ``input`` and, when ``mask`` is given,
                    ``user_id`` (converted with ``int``).
            mask:   optional mapping from integer user id to a 2-D array;
                    its first two columns are dropped to line up with the
                    shifted vocabulary, and every entry equal to 1 zeroes the
                    corresponding probability. The caller's mask is not
                    modified.
        '''
        prob = np.tile(self.matrix, (len(sample.input), 1))
        if mask is not None:
            # Only this sample's user is needed; slicing builds a new view, so
            # there is no reason to deep-copy the whole mask dictionary.
            user_mask = mask[int(sample.user_id)][:, 2:]
            prob[user_mask == 1] = 0
        return prob


class Bigram:

    '''The bigram model as the second baseline.

    The distribution at each position is conditioned on the input token at
    that position: row ``r`` of ``matrix`` is the relative frequency of each
    output token observed after input token ``r``.

    Attributes:
        vocab:  token -> index mapping built from ``vocab.stoi`` with the
                ``<unk>`` and ``<pad>`` entries removed and every remaining
                index shifted down by 2 (assumes those two tokens occupy
                indices 0 and 1 of the original vocabulary -- TODO confirm).
        counts: raw (input token, output token) co-occurrence counts,
                accumulated over all fit() calls.
        matrix: ``counts`` with every row normalised to sum to 1; rows of
                input tokens that were never observed stay all-zero.
    '''

    def __init__(self, vocab):
        '''Build the shifted token index and zero-initialise the statistics.

        Args:
            vocab: object exposing a ``stoi`` token -> index mapping that
                   contains both ``<unk>`` and ``<pad>`` (KeyError otherwise).
        '''
        stoi = dict(vocab.stoi)
        del stoi['<unk>'], stoi['<pad>']
        self.vocab = {token: index - 2 for token, index in stoi.items()}
        size = len(self.vocab)
        # Counts are kept separately from the probabilities so that fit() can
        # be called more than once without mixing the two.
        self.counts = np.zeros((size, size))
        self.matrix = np.zeros((size, size))

    def fit(self, data):
        '''Accumulate (input, output) pair counts from ``data`` and refresh ``matrix``.

        Args:
            data: iterable of samples exposing aligned ``input`` and
                  ``output`` sequences; the pair ``(input[i], output[i])`` is
                  counted for every index ``i`` of ``input``.

        Raises:
            KeyError: if a token is not in ``self.vocab``.
        '''
        for sample in data:
            for i in range(len(sample.input)):
                self.counts[self.vocab[sample.input[i]], self.vocab[sample.output[i]]] += 1
        row_totals = self.counts.sum(axis=1, keepdims=True)
        # Rows with no observations would be 0/0 = NaN; leave them at zero so
        # downstream argmax/metrics never see NaN probabilities.
        self.matrix = np.divide(
            self.counts,
            row_totals,
            out=np.zeros_like(self.counts),
            where=row_totals > 0,
        )

    def predict(self, sample, mask=None):
        '''Return a ``(len(sample.input), len(self.vocab))`` probability array.

        Args:
            sample: object exposing ``input`` and, when ``mask`` is given,
                    ``user_id`` (converted with ``int``).
            mask:   optional mapping from integer user id to a 2-D array;
                    its first two columns are dropped to line up with the
                    shifted vocabulary, and every entry equal to 1 zeroes the
                    corresponding probability. The caller's mask is not
                    modified.

        Raises:
            KeyError: if an input token is not in ``self.vocab``.
        '''
        rows = np.array([self.vocab[token] for token in sample.input], dtype=np.intp)
        # Integer-array indexing returns a copy, so masking below cannot
        # corrupt the stored matrix.
        prob = self.matrix[rows, :]
        if mask is not None:
            # Only this sample's user is needed; slicing builds a new view, so
            # there is no reason to deep-copy the whole mask dictionary.
            user_mask = mask[int(sample.user_id)][:, 2:]
            prob[user_mask == 1] = 0
        return prob

In [0]:
# Training the models on the training and validation data

unigram, bigram = Unigram(vocab), Bigram(vocab)
for baseline in (unigram, bigram):
    baseline.fit(train_data+dev_data)

# Making predictions on the test data using the trained models

# One entry per test sample, collected in test order.
y_true_test, unigram_prob_test, bigram_prob_test = [], [], []
for sample in test_data:
    # Gold labels are expressed in the shifted index space; both models build
    # their token index the same way, so the unigram's mapping serves for both.
    y_true_test.append(np.array([unigram.vocab[token] for token in sample.output]))
    unigram_prob_test.append(unigram.predict(sample, test_mask))
    bigram_prob_test.append(bigram.predict(sample, test_mask))

# Stack the per-sample pieces along the position axis.
y_true_test, unigram_prob_test, bigram_prob_test = (
    np.concatenate(pieces, axis=0)
    for pieces in (y_true_test, unigram_prob_test, bigram_prob_test)
)

# Hard predictions: the most probable label at every position.
unigram_pred_test, bigram_pred_test = (
    np.argmax(probabilities, axis=1)
    for probabilities in (unigram_prob_test, bigram_prob_test)
)

In [9]:
# Reporting the performance

# Distinct label indices each model predicts at least once on the test data.
uni_words = set(unigram_pred_test)
bi_words = set(bigram_pred_test)

# Predicted indices live in the models' shifted index space (original index
# minus 2), so `i+2` converts them back before the lookup in vocab.itos.
print('**************************UNIGRAM Performance**************************\n')
print(full_report(y_true_test, unigram_pred_test, unigram_prob_test))
print('\nThe Unigram predicts %d distinct labels on the test data:\n%s\n' % (len(uni_words), ' - '.join([vocab.itos[i+2] for i in uni_words])))
print('\n**************************BIGRAM Performance***************************\n')
print(full_report(y_true_test, bigram_pred_test, bigram_prob_test))
# The Bigram summary must report the Bigram's own label set (bi_words), not the Unigram's.
print('\nThe Bigram predicts %d distinct labels on the test data:\n%s' % (len(bi_words), ' - '.join([vocab.itos[i+2] for i in bi_words])))

**************************UNIGRAM Performance**************************

Overall Report:
+----------+----------+-------------+--------------+--------+--------+
| Accuracy | Macro-F1 | Weighted-F1 | Average Rank | Hit@5  | MRR@5  |
+----------+----------+-------------+--------------+--------+--------+
|  45.123  |  1.274   |    31.368   |    7.315     | 47.271 | 27.482 |
+----------+----------+-------------+--------------+--------+--------+

Performance on <sos>:
+----------+--------+-----------+---------+
| Accuracy |   F1   | Precision |  Recall |
+----------+--------+-----------+---------+
|  74.991  | 74.991 |   59.988  | 100.000 |
+----------+--------+-----------+---------+

Scores@k:
+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
|        | k = 1  | k = 2  | k = 3  | k = 4  | k = 5  | k = 6  | k = 7  | k = 8  | k = 9  | k = 10 |
+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+