### raw data
* word embedding: GloVe (`../data/glove.6B.100d.txt`)
* doc text: `../data/IMDB.txt`

### preprocess
1. filter out words that are too frequent or too rare
2. stemming
3. document vector aggregation

### model
1. TopK
2. Sklearn
3. Our model

### evaluation
1. F1
2. NDCG

In [1]:
from collections import defaultdict
import math
import numpy as np 
import re
import torch
import torch.nn as nn
from itertools import cycle
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from tqdm.auto import tqdm

from sklearn.metrics import ndcg_score
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt 
import pandas as pd

## Preprocess config

In [2]:
# Preprocessing options; each entry is forwarded to CBowDataset when the dataset is built.
config = {
    "n_document": 10000,              # read only the first n documents
    "min_word_freq_threshold": 20,    # drop words rarer than this in the corpus
    "topk_word_freq_threshold": 500,  # drop this many lowest-IDF words
    "document_vector_agg": 'TF-IDF',  # 'mean' or 'TF-IDF'
    "select_topk_TFIDF": None,        # None keeps every in-vocabulary word as ground truth
}


In [3]:
# Load the pre-trained word embeddings into a {word: list_of_floats} dictionary.
embedding_file = "../data/glove.6B.100d.txt"

word2embedding = dict()
# The vector width is taken from the "<digits>d" part of the file name.
word_dim = int(re.findall(r".(\d+)d",embedding_file)[0])

# Read as UTF-8 explicitly: without `encoding`, open() falls back to the
# platform default, which can fail on the non-ASCII tokens in the file.
with open(embedding_file, "r", encoding="utf-8") as f:
    for line in tqdm(f):
        fields = line.strip().split()
        if not fields:
            # A blank line carries no word; skip it rather than crash on fields[0].
            continue
        word = fields[0]
        embedding = list(map(float, fields[1:]))
        word2embedding[word] = embedding

print("Number of words:%d" % len(word2embedding))

0it [00:00, ?it/s]

Number of words:400000


In [4]:
class Vocabulary:
    """Stemmed vocabulary with corpus statistics, backed by pre-trained embeddings.

    Index 0 is reserved for the ``<UNK>`` token, whose vector is all zeros.
    ``build_vocabulary`` fills ``stoi``/``itos``/``IDF``/``word_vectors``; the
    other methods rely on those being populated.
    """

    def __init__(self, word2embedding, min_word_freq_threshold=0, topk_word_freq_threshold=0):
        """
        Args:
            word2embedding: mapping from word to its embedding vector.
            min_word_freq_threshold: words whose corpus count is below this are dropped.
            topk_word_freq_threshold: number of lowest-IDF words to drop.
        """
        # The low frequency words will be assigned as <UNK> token
        self.itos = {0: "<UNK>"}
        self.stoi = {"<UNK>": 0}

        self.word2embedding = word2embedding
        self.min_word_freq_threshold = min_word_freq_threshold
        self.topk_word_freq_threshold = topk_word_freq_threshold

        self.word_freq_in_corpus = defaultdict(int)
        self.IDF = {}
        self.ps = PorterStemmer()

    def __len__(self):
        """Number of vocabulary entries, including ``<UNK>``."""
        return len(self.itos)

    def _embedding_dim(self):
        """Return the length of an embedding vector (0 when there are no embeddings)."""
        for vector in self.word2embedding.values():
            return len(vector)
        return 0

    def tokenizer_eng(self, text):
        """Strip everything but ASCII letters, digits and spaces, split, then stem each token."""
        text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
        return [self.ps.stem(w) for w in text.strip().split()]

    def build_vocabulary(self, sentence_list):
        """Collect word statistics over ``sentence_list`` and build the index maps.

        Only stemmed tokens that have an embedding are counted. Words below the
        minimum corpus frequency are removed first, then the
        ``topk_word_freq_threshold`` words with the smallest IDF are removed.
        Each word removed in the second step is printed.
        """
        self.doc_freq = defaultdict(int) # number of documents a word appears in
        self.document_num = len(sentence_list)
        # Row 0 is the all-zero <UNK> vector. Its width comes from the embeddings
        # themselves, so the class does not depend on a notebook-level global.
        self.word_vectors = [[0] * self._embedding_dim()]

        for sentence in tqdm(sentence_list, desc="Preprocessing documents"):
            document_words = set()

            for word in self.tokenizer_eng(sentence):
                # pass unknown word
                if word not in self.word2embedding:
                    continue

                self.word_freq_in_corpus[word] += 1
                document_words.add(word)

            for word in document_words:
                self.doc_freq[word] += 1

        # calculate IDF
        print('doc num', self.document_num)
        for word, freq in self.doc_freq.items():
            self.IDF[word] = math.log(self.document_num / (freq+1))

        # delete less freq words
        rare_words = [
            word for word, count in self.word_freq_in_corpus.items()
            if count < self.min_word_freq_threshold
        ]
        for word in rare_words:
            del self.IDF[word]
            del self.word_freq_in_corpus[word]

        # delete too freq words: the smallest IDF values belong to the words
        # that occur in the most documents.
        print('eliminate freq words')
        idf_ascending = sorted(self.IDF.items(), key=lambda item: item[1])
        # Slicing clamps to the vocabulary size, so a threshold larger than the
        # vocabulary removes every word instead of raising IndexError.
        n_remove = max(self.topk_word_freq_threshold, 0)
        for word, _ in idf_ascending[:n_remove]:
            # Print the word that is actually being removed.
            print(word)
            del self.IDF[word]
            del self.word_freq_in_corpus[word]

        # construct word_vectors; indices start at 1 because 0 is <UNK>
        for idx, word in enumerate(self.word_freq_in_corpus, start=1):
            self.word_vectors.append(self.word2embedding[word])
            self.stoi[word] = idx
            self.itos[idx] = word

    def calculate_document_vector(self, sentence_list, agg, select_topk_TFIDF=None):
        """Aggregate the in-vocabulary word embeddings of each document.

        Args:
            sentence_list: raw documents.
            agg: ``'mean'`` (unweighted) or ``'TF-IDF'`` (each occurrence weighted by IDF).
            select_topk_TFIDF: when not None, keep only occurrences of the k words
                with the largest summed IDF in the document.

        Returns:
            ``(document_vectors, document_answers_idx, document_answers_w)``:
            the summed (weighted) embeddings divided by the number of kept
            occurrences, the vocabulary ids of the kept occurrences, and the
            summed weights divided by the same count.

            Documents with no in-vocabulary word are reported with
            ``print('error', ...)`` and skipped, so the returned lists can be
            shorter than ``sentence_list`` and positions after a skipped
            document no longer line up with it.

        Raises:
            ValueError: if ``agg`` is not one of the supported modes.
        """
        if agg not in ('mean', 'TF-IDF'):
            raise ValueError("agg must be 'mean' or 'TF-IDF', got %r" % (agg,))

        document_vectors = []
        document_answers_idx = []
        document_answers_w = []

        for sentence in tqdm(sentence_list, desc="calculate document vectors"):
            document_vector = np.zeros(len(self.word_vectors[0]))
            # pass unknown word
            select_words = [word for word in self.tokenizer_eng(sentence) if word in self.stoi]

            # select topk TFIDF
            if select_topk_TFIDF is not None:
                doc_TFIDF = defaultdict(float)
                for word in select_words:
                    doc_TFIDF[word] += self.IDF[word]

                ranked = sorted(doc_TFIDF.items(), key=lambda x: x[1], reverse=True)
                select_topk_words = {word for word, _ in ranked[:select_topk_TFIDF]}
                select_words = [word for word in select_words if word in select_topk_words]

            if len(select_words) == 0:
                print('error', sentence)
                continue

            total_weight = 0
            # aggregate to doc vectors
            for word in select_words:
                if agg == 'mean':
                    weight = 1
                else:
                    weight = self.IDF[word]
                document_vector += np.asarray(self.word2embedding[word]) * weight
                total_weight += weight

            # Both values are normalised by the occurrence count, not by the weight sum.
            document_vector /= len(select_words)
            total_weight /= len(select_words)

            document_vectors.append(document_vector)
            document_answers_idx.append([self.stoi[token] for token in select_words])
            document_answers_w.append(total_weight)

        return document_vectors, document_answers_idx, document_answers_w

    def numericalize(self, text):
        """Map ``text`` to vocabulary ids; tokens outside the vocabulary map to ``<UNK>``."""
        unk = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk) for token in self.tokenizer_eng(text)]

In [5]:
class CBowDataset(Dataset):
    """Documents, their vocabulary and (document, word, negative word) training triples.

    Construction reads the raw text file (one document per line), builds a
    ``Vocabulary`` on it, computes one vector per document and splits the
    result 80/20 into train and test parts by position.
    """

    def __init__(self, 
                 raw_data_file_path,
                 word2embedding,
                 skip_header = False,
                 n_document = None, # read first n document
                 min_word_freq_threshold = 20, # eliminate less freq words
                 topk_word_freq_threshold = 5, # eliminate smallest k IDF words
                 select_topk_TFIDF = None, # select topk tf-idf as ground-truth
                 document_vector_agg = 'mean',
                 ):

        assert document_vector_agg in ['mean', 'TF-IDF']

        # Raw documents, one per line of the input file.
        self.documents = []
        with open(raw_data_file_path, 'r', encoding='utf-8') as f:
            if skip_header:
                f.readline()
            for raw_line in tqdm(f, desc="Loading documents"):
                # Stop once the first n documents have been read.
                reached_limit = n_document is not None and len(self.documents) >= n_document
                if reached_limit:
                    break
                self.documents.append(raw_line.strip("\n"))

        # Vocabulary over the loaded documents.
        self.vocab = Vocabulary(word2embedding, min_word_freq_threshold, topk_word_freq_threshold)
        self.vocab.build_vocabulary(self.documents)
        self.vocab_size = len(self.vocab)

        # Document vectors, per-document word ids and per-document weights.
        vectors, answers, answer_weights = self.vocab.calculate_document_vector(
            self.documents, document_vector_agg, select_topk_TFIDF
        )
        self.document_vectors = vectors
        self.document_answers = answers
        self.document_answers_w = answer_weights

        # Train part: the leading 80% of the computed documents.
        self.train_split_ratio = 0.8
        self.train_length = int(len(self.document_answers) * self.train_split_ratio)
        split = self.train_length
        self.train_vectors = self.document_vectors[:split]
        self.train_words = self.document_answers[:split]
        self.document_ids = list(range(split))
        self.generator = cycle(self.context_target_generator())
        self.dataset_size = sum(len(word_ids) for word_ids in self.train_words)

        # Test part: the remainder.
        self.test_vectors = self.document_vectors[split:]
        self.test_words = self.document_answers[split:]

    def context_target_generator(self):
        """Yield ``[document_id, word_id, negative_word_id]`` for every training document.

        The document order is shuffled in place first. For each document, every
        distinct word id is paired with a word id that does not occur in it,
        drawn without replacement.
        """
        np.random.shuffle(self.document_ids) # inplace shuffle

        for document_id in self.document_ids:
            positive_ids = set(self.train_words[document_id])
            candidate_ids = list(set(range(self.vocab_size)) - positive_ids)
            sampled_ids = np.random.choice(candidate_ids, size=len(positive_ids), replace=False)
            for positive_id, sampled_id in zip(positive_ids, sampled_ids):
                yield [document_id, positive_id, sampled_id]

    def __getitem__(self, idx):
        """Return the next triple from the generator as float tensors; ``idx`` is not used."""
        document_id, positive_id, sampled_id = next(self.generator)
        document_tensor = torch.FloatTensor(self.document_vectors[document_id])
        positive_tensor = torch.FloatTensor(self.vocab.word_vectors[positive_id])
        negative_tensor = torch.FloatTensor(self.vocab.word_vectors[sampled_id])

        return document_tensor, positive_tensor, negative_tensor

    def __len__(self):
        """Total number of word occurrences in the training documents."""
        return self.dataset_size 


In [6]:
# Build the torch dataset from the raw text file using the preprocessing config.
data_file_path = '../data/IMDB.txt'

print("Building dataset....")
# Every preprocessing option is forwarded under the same keyword name.
dataset_options = {
    name: config[name]
    for name in (
        "n_document",
        "min_word_freq_threshold",
        "topk_word_freq_threshold",
        "document_vector_agg",
        "select_topk_TFIDF",
    )
}
dataset = CBowDataset(
    raw_data_file_path=data_file_path,
    word2embedding=word2embedding,
    skip_header=False,
    **dataset_options,
)
print("Finish building dataset!")
print(f"Number of documents:{len(dataset.documents)}")
print(f"Number of words:{dataset.vocab_size}")

Building dataset....


Loading documents: 0it [00:00, ?it/s]

Preprocessing documents:   0%|          | 0/10000 [00:00<?, ?it/s]

doc num 10000
eliminate freq words
boel
the
and
a
of
to
is
it
thi
in
that
i
for
with
but
as
on
wa
be
film
one
not
are
have
all
an
you
at
by
from
who
like
hi
ha
so
he
time
about
out
there
if
veri
see
good
what
more
they
when
just
some
or
make
watch
great
get
well
my
other
up
can
love
also
which
would
their
will
even
most
her
me
had
much
than
first
do
way
into
play
end
were
no
best
scene
think
been
how
go
look
show
made
she
after
we
year
mani
work
know
too
seen
act
him
them
come
thing
perform
two
life
still
never
take
dont
could
give
say
then
actor
ani
doe
your
where
seem
find
enjoy
want
ever
while
man
did
over
cast
feel
here
such
back
these
part
those
lot
live
tri
role
plot
wonder
interest
use
though
better
through
now
real
off
new
befor
world
should
set
both
quit
again
alway
day
director
star
young
actual
few
own
old
same
doesnt
music
direct
may
excel
right
fact
bit
start
im
turn
whi
between
us
saw
without
thought
person
long
bad
point
down
fan
big
recommend
differ
didnt
around
final
m

calculate document vectors:   0%|          | 0/10000 [00:00<?, ?it/s]

error i think it's one of the greatest movies which are ever made ,  and i've seen many .  .  .  the book is better ,  but it's still a very good movie ! 
error all this talk about this being a bad movie is nonsense .  as a matter of fact this is the best movie i've ever seen .  it's an excellent story and the actors in the movie are some of the best .  i would not give criticism to any of the actors .  that movie is the best and it will always stay that way . 
error smallville episode justice is the best episode of smallville  !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   !   ! 

In [7]:
# Sanity check: rebuild the first test document vector from its word ids and
# IDF weights; the printed difference should be all zeros. This mirrors the
# 'TF-IDF' aggregation, so it only matches when that mode was used.
word_vectors = np.array(dataset.vocab.word_vectors)

# Size the accumulator from the embedding matrix instead of hard-coding 100,
# so the check still works with embeddings of another width.
pred = np.zeros(word_vectors.shape[1])
cnt = 0
for word_idx in dataset.test_words[0]:
    pred += word_vectors[word_idx] * dataset.vocab.IDF[dataset.vocab.itos[word_idx]]
    cnt += 1
print(dataset.test_vectors[0] - pred/cnt)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


In [8]:
## Build dense per-document targets: summed IDF (tfidf_ans) and occurrence counts (onehot_ans)
document_answers = dataset.document_answers

target_shape = (len(document_answers), word_vectors.shape[0])
onehot_ans = np.zeros(target_shape)
tfidf_ans = np.zeros(target_shape)
print(tfidf_ans.shape)

idf_lookup = dataset.vocab.IDF
id_to_word = dataset.vocab.itos
for row, answer_ids in enumerate(tqdm(document_answers)):
    for word_idx in answer_ids:
        # Repeated occurrences accumulate, so a word seen n times gets n * IDF.
        tfidf_ans[row, word_idx] += idf_lookup[id_to_word[word_idx]]
        onehot_ans[row, word_idx] += 1

(9993, 4127)


  0%|          | 0/9993 [00:00<?, ?it/s]

## Top K freq word

In [9]:
# NOTE: despite the name, this is the answer list of every document
# (dataset.document_answers), not only the held-out dataset.test_words split.
test_ans = dataset.document_answers

In [10]:
# Vocabulary words ranked by corpus frequency, most frequent first.
word_freq = sorted(
    dataset.vocab.word_freq_in_corpus.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
word_freq[:10]

[('game', 783),
 ('season', 671),
 ('david', 549),
 ('ladi', 538),
 ('jack', 536),
 ('killer', 527),
 ('sex', 524),
 ('town', 524),
 ('king', 512),
 ('novel', 496)]

In [11]:
def topk_word_evaluation(k=50):
    """Baseline: predict the k most frequent corpus words for every document.

    Reads the notebook globals ``word_freq``, ``test_ans`` and ``dataset``.
    Prints the precision (hits / k) and recall (hits / distinct answer words),
    each averaged over the documents in ``test_ans``. Returns nothing.
    """
    frequent_words = {word for word, _ in word_freq[:k]}

    precisions, recalls = [], []
    for answer_ids in tqdm(test_ans):
        # Duplicated occurrences count once.
        answer_words = [dataset.vocab.itos[word_id] for word_id in set(answer_ids)]
        n_hit = sum(1 for word in answer_words if word in frequent_words)

        precisions.append(n_hit / k)
        recalls.append(n_hit / len(answer_words))

    print('top {} word'.format(k))
    print('percision', np.mean(precisions))
    print('recall', np.mean(recalls))

# Report the frequency baseline at several cutoffs.
for k in (50, 100, 200):
    topk_word_evaluation(k=k)


  0%|          | 0/9993 [00:00<?, ?it/s]

top 50 word
percision 0.035060542379665764
recall 0.06100105864457941


  0%|          | 0/9993 [00:00<?, ?it/s]

top 100 word
percision 0.034443110177123995
recall 0.11986880156935618


  0%|          | 0/9993 [00:00<?, ?it/s]

top 200 word
percision 0.03148153707595318
recall 0.22046597654109124


In [12]:
def topk_word_evaluation_NDCG(k=50):
    """Baseline NDCG: score every word by its corpus-frequency rank.

    Reads the notebook globals ``word_freq``, ``test_ans`` and ``dataset``.
    The same score vector is used for every document; the relevance of a word
    in a document is its IDF summed over its occurrences. Prints the NDCG@k
    averaged over the documents in ``test_ans``. Returns nothing.
    """
    vocab = dataset.vocab
    n_words = len(vocab.word_vectors)

    ranked_ids = [vocab.stoi[word] for word, _ in word_freq if word in vocab.stoi]

    # The most frequent word gets the highest score; unranked ids stay at 0.
    scores = np.zeros(n_words)
    for rank, word_id in enumerate(ranked_ids):
        scores[word_id] = n_words - rank

    NDCGs = []
    for answer_ids in tqdm(test_ans):
        relevance = np.zeros(n_words)
        for word_idx in answer_ids:
            # Id 0 is skipped and contributes no relevance.
            if word_idx != 0:
                relevance[word_idx] += vocab.IDF[vocab.itos[word_idx]]

        NDCGs.append(ndcg_score(relevance.reshape(1,-1), scores.reshape(1,-1), k=k))

    print('top {} NDCG:{}'.format(k, np.mean(NDCGs)))

# Report the frequency-baseline NDCG at several cutoffs (None = full ranking).
for k in (50, 100, 200, None):
    topk_word_evaluation_NDCG(k=k)


  0%|          | 0/9993 [00:00<?, ?it/s]

top 50 NDCG:0.033762099027129774


  0%|          | 0/9993 [00:00<?, ?it/s]

top 100 NDCG:0.050457240136176806


  0%|          | 0/9993 [00:00<?, ?it/s]

top 200 NDCG:0.07732441220294914


  0%|          | 0/9993 [00:00<?, ?it/s]

top None NDCG:0.2833640263103313


## Sklearn

## Our Model

In [13]:
class Custom_Dataset(Dataset):
    """Document vectors and their weight sums, addressed by row index.

    ``tfidf_ans`` is stored as given and is not returned by ``__getitem__``.
    """

    def __init__(self, doc_vectors, doc_w_sum, tfidf_ans):
        self.doc_vectors = torch.FloatTensor(doc_vectors)
        self.doc_w_sum = torch.FloatTensor(doc_w_sum)
        self.tfidf_ans = tfidf_ans
        assert len(doc_vectors) == len(doc_w_sum)

    def __getitem__(self, idx):
        """Return ``(vector, weight_sum, idx)`` for row ``idx``."""
        vector = self.doc_vectors[idx]
        weight_sum = self.doc_w_sum[idx]
        return vector, weight_sum, idx

    def __len__(self):
        """Number of stored document vectors."""
        return len(self.doc_vectors)


In [14]:
class LR(nn.Module):
    """One learnable weight per (document, word) pair, initialised to zero.

    The weights are stored as an embedding table of shape (num_doc, num_words),
    so looking up a document id returns that document's weight row.
    """
    def __init__(self, num_doc, num_words):
        super().__init__()
        weight = torch.zeros(num_doc, num_words)
        # freeze=False keeps the table trainable.
        self.emb = nn.Embedding.from_pretrained(weight, freeze=False)

    def forward(self, doc_ids, word_vectors):
        """Return the weighted sum of word vectors for each requested document.

        Args:
            doc_ids: integer tensor of document row indices.
            word_vectors: tensor whose first dimension is num_words.

        Returns:
            The selected weight rows matrix-multiplied with ``word_vectors``.
        """
        return self.emb(doc_ids) @ word_vectors

In [15]:
def evaluate_NDCG(model, train_loader):
    """Score the model's weight table against the loader's TF-IDF targets.

    Puts ``model`` in eval mode, uses ``model.emb.weight`` as the predicted
    scores and ``train_loader.dataset.tfidf_ans`` as the true relevance.

    Returns:
        dict with keys 'ndcg@50', 'ndcg@100', 'ndcg@200' and 'ndcg@all'.
    """
    model.eval()

    scores = np.array(model.emb.weight.data)
    true_relevance = train_loader.dataset.tfidf_ans

    cutoffs = (
        ('ndcg@50', 50),
        ('ndcg@100', 100),
        ('ndcg@200', 200),
        ('ndcg@all', None),
    )
    return {name: ndcg_score(true_relevance, scores, k=k) for name, k in cutoffs}

In [21]:
# Train on the leading fraction of the documents only.
batch_size = 200
train_size_ratio = 0.1

document_vectors = np.array(dataset.document_vectors)
# Column vector, one weight sum per document.
document_answers_w = np.array(dataset.document_answers_w).reshape(-1, 1)

train_size = int(len(dataset.document_vectors) * train_size_ratio)
print('document num', train_size)

train_dataset = Custom_Dataset(
    doc_vectors=document_vectors[:train_size],
    doc_w_sum=document_answers_w[:train_size],
    tfidf_ans=tfidf_ans[:train_size],
)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=True,
)

document num 999


## start training

In [24]:
# setting
lr = 0.02
momentum = 0.99
weight_decay = 0
nesterov = False # True

n_epoch = 5000

w_sum_reg = 1e-3       # weight of the weight-sum regulariser in the total loss
w_sum_reg_mul = 0.8    # the regulariser targets doc_w_sum * w_sum_reg_mul
w_clip_value = 0       # lower bound applied to every parameter after each step

verbose = True
valid_epoch = 100      # evaluate every valid_epoch epochs

model = LR(num_doc=train_size, num_words=word_vectors.shape[0])
model.train()

word_vectors_tensor = torch.FloatTensor(word_vectors)

opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=nesterov)
criterion = nn.MSELoss(reduction='mean')

results = []
step = 0
for epoch in tqdm(range(n_epoch)):    
    loss_mse_his = []
    loss_w_reg_his = []
    
    # evaluate_NDCG switches the model to eval mode, so re-enable train mode each epoch.
    model.train()
    for data in train_loader:
        doc_embs, doc_w_sum, doc_ids = data
        # MSE loss between the reconstructed and the true document vectors
        pred_doc_embs = model(doc_ids, word_vectors_tensor)     
        loss_mse = criterion(pred_doc_embs, doc_embs)

        # Regulariser: pull each document's weight sum toward a scaled target.
        pred_w_sum = torch.sum(model.emb(doc_ids), dim=1).view(-1, 1)
        loss_w_reg = criterion(pred_w_sum, doc_w_sum * w_sum_reg_mul)
        
        loss = loss_mse + loss_w_reg * w_sum_reg
        
        # Model backwarding
        model.zero_grad()
        loss.backward()
        opt.step()

        loss_mse_his.append(loss_mse.item())
        loss_w_reg_his.append(loss_w_reg.item())

        # Project the weights back to values >= w_clip_value.
        with torch.no_grad():
            for p in model.parameters():
                p.clamp_(min=w_clip_value)

        
    if epoch % valid_epoch == 0:
        res = {}
        res['epoch'] = epoch
        res['loss_mse'] = np.mean(loss_mse_his)
        res['loss_w_reg'] = np.mean(loss_w_reg_his)
        
        res_ndcg = evaluate_NDCG(model, train_loader)
        res.update(res_ndcg)
        # One row per evaluated epoch. The former unconditional append after
        # this block added a stale copy of `res` on every epoch.
        results.append(res)
        
        if verbose:
            for k, v in res.items():
                print(k, v)

  0%|          | 0/5000 [00:00<?, ?it/s]

epoch 0
loss_mse 1.1490628719329834
loss_w_reg 12.81353816986084
ndcg@50 0.2393351073073729
ndcg@100 0.2648776289053164
ndcg@200 0.29479027112870226
ndcg@all 0.4482029312540207
epoch 100
loss_mse 0.006500968988984823
loss_w_reg 9.558284950256347
ndcg@50 0.569417324314581
ndcg@100 0.5995820158353545
ndcg@200 0.6308079438538046
ndcg@all 0.7198786361148665
epoch 200
loss_mse 0.0034469071310013534
loss_w_reg 7.397932720184326
ndcg@50 0.5906546487545926
ndcg@100 0.6189840169533798
ndcg@200 0.6503873998819233
ndcg@all 0.7347971424377536
epoch 300
loss_mse 0.002494489448145032
loss_w_reg 6.34325008392334
ndcg@50 0.601336915754799
ndcg@100 0.6290661594206691
ndcg@200 0.6598518419501244
ndcg@all 0.7421291650375673
epoch 400
loss_mse 0.001994803408160806
loss_w_reg 5.646750640869141
ndcg@50 0.6091635519733181
ndcg@100 0.6365924641499666
ndcg@200 0.6673771276506394
ndcg@all 0.7476708711353401
epoch 500
loss_mse 0.0016836134251207112
loss_w_reg 5.1406303405761715
ndcg@50 0.6148230785715703
ndcg@10

KeyboardInterrupt: 

In [None]:
# Average the logged metrics per epoch, plot them, then show the table.
pd.set_option('display.max_rows', 500)
results_df = pd.DataFrame(results)
results_by_epoch = results_df.groupby(by=['epoch']).mean()
results_by_epoch.plot()
results_by_epoch
# results_df.groupby(by=['epoch']).mean().iloc[-1]

## Quality Check

In [25]:
# Row index of the document to inspect and the number of top words to compare.
doc_id, topk = 4, 50

model

LR(
  (emb): Embedding(999, 4127)
)

In [26]:
import colored
from colored import stylize

word_list = dataset.vocab.itos


def _top_words(weights, k):
    """Return the words for the k largest entries of ``weights``, largest first."""
    return [word_list[word_idx] for word_idx in np.argsort(weights)[::-1][:k]]


def _print_marked(words, reference):
    """Print ``words`` on one line, highlighting those also present in ``reference``."""
    for word in words:
        shown = stylize(word, colored.bg("yellow")) if word in reference else word
        print(shown, end=' ')


gt = _top_words(tfidf_ans[doc_id], topk)
pred = _top_words(model.emb.weight.data[doc_id].numpy(), topk)

print('ground truth')
_print_marked(gt, pred)

print()
print('\nprediction')
_print_marked(pred, gt)


ground truth
[48;5;3mwarren[0m [48;5;3mbrook[0m sailor [48;5;3mmel[0m flesh [48;5;3mslapstick[0m [48;5;3mlesli[0m price [48;5;3mann[0m rent room speak cut inherit richard warp process interpret montag ebert roger 3rd spectacular grin pg g x mislead humour couldnt mankind rapid complex spiritu notion 4th rid dean dimension krell list storm struck hors discern stone cum soap detect current 

prediction
[48;5;3mbrook[0m [48;5;3mwarren[0m [48;5;3mlesli[0m merril [48;5;3mmel[0m worker dana wood [48;5;3mann[0m cabin bacon charli jim cliff walk scarlett bargain [48;5;3mslapstick[0m oil duck chuck columbia farmer swim stream spike jack hickock meadow cunningham student gershwin matthau craig neeson liam hill harrow deeper christin lower bend robin brent bill prison river darker lane regret 

In [27]:
# raw document
# NOTE(review): doc_id indexes the computed document vectors, and
# calculate_document_vector skips documents with no in-vocabulary word, so
# dataset.documents[doc_id] is only the same document while no earlier
# document was skipped — confirm before trusting this for larger doc_id values.
dataset.documents[doc_id]

'this is not the typical mel brooks film .  it was much less slapstick than most of his movies and actually had a plot that was followable .  leslie ann warren made the movie ,  she is such a fantastic ,  under-rated actress .  there were some moments that could have been fleshed out a bit more ,  and some scenes that could probably have been cut to make the room to do so ,  but all in all ,  this is worth the price to rent and see it .  the acting was good overall ,  brooks himself did a good job without his characteristic speaking to directly to the audience .  again ,  warren was the best actor in the movie ,  but  " fume "  and  " sailor "  both played their parts well . '

In [28]:
# NDCG of the selected document only. Note that this rebinds `results`,
# which the training cell uses for its list of per-epoch rows.
results = {}

scores = np.array(model.emb.weight.data)[doc_id].reshape(1, -1)
true_relevance = train_loader.dataset.tfidf_ans[doc_id].reshape(1, -1)

for key, cutoff in (('ndcg@50', 50), ('ndcg@100', 100), ('ndcg@200', 200), ('ndcg@all', None)):
    results[key] = ndcg_score(true_relevance, scores, k=cutoff)

for label, key in (
    ('NDCG top50', 'ndcg@50'),
    ('NDCG top100', 'ndcg@100'),
    ('NDCG top200', 'ndcg@200'),
    ('NDCG ALL', 'ndcg@all'),
):
    print(label, results[key])


NDCG top50 0.6997726360764731
NDCG top100 0.6997726360764731
NDCG top200 0.7617430914633224
NDCG ALL 0.807197545132113
