### raw data
* word embedding: glove
* doc text: ./data/IMDB.txt

### dataset
1. IMDB
2. CNNNews
3. [PubMed](https://github.com/LIAAD/KeywordExtractor-Datasets/blob/master/datasets/PubMed.zip)

### preprocess
1. filter too frequent and less frequent words
2. stemming
3. document vector aggregation

### model
1. TopK
2. Sklearn
3. Our model

### evaluation
1. F1
2. NDCG

In [1]:
import os
from collections import defaultdict
import math
import numpy as np 
import random
import re
import torch
import torch.nn as nn
from itertools import cycle
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from torch.optim.lr_scheduler import MultiStepLR
from tqdm.auto import tqdm

# Used to get the data
from sklearn.metrics import ndcg_score

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
nltk.download('stopwords')

import matplotlib.pyplot as plt 
import matplotlib
matplotlib.use('Agg')

seed = 33
import pandas as pd

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/chrisliu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preprocess config

In [2]:
config = {}

config["dataset"] = "CNN" # "IMDB" "CNN", "PubMed"
config["n_document"] = 100
config["normalize_word_embedding"] = False
config["min_word_freq_threshold"] = 20
config["topk_word_freq_threshold"] = 100
config["document_vector_agg_weight"] = 'IDF' # ['mean', 'IDF', 'uniform', 'gaussian', 'exponential', 'pmi']
config["document_vector_weight_normalize"] = True # weighted sum or mean, True for mean, False for sum 
config["select_topk_TFIDF"] = None # ignore
config["embedding_file"] = "../data/glove.6B.100d.txt"
config["topk"] = [10, 30, 50]


In [3]:
def in_notebook():
    try:
        from IPython import get_ipython
        if 'IPKernelApp' not in get_ipython().config:  # pragma: no cover
            return False
    except ImportError:
        return False
    return True

In [4]:
def load_word2emb(embedding_file):
    word2embedding = dict()
    word_dim = int(re.findall(r".(\d+)d", embedding_file)[0])

    with open(embedding_file, "r") as f:
        for line in tqdm(f):
            line = line.strip().split()
            word = line[0]
            embedding = list(map(float, line[1:]))
            word2embedding[word] = np.array(embedding)

    print("Number of words:%d" % len(word2embedding))

    return word2embedding

word2embedding = load_word2emb(config["embedding_file"])

0it [00:00, ?it/s]

Number of words:400000


In [5]:
def normalize_wordemb(word2embedding):
    # Every word emb should have norm 1
    
    word_emb = []
    word_list = []
    for word, emb in word2embedding.items():
        word_list.append(word)
        word_emb.append(emb)

    word_emb = np.array(word_emb)

    for i in range(len(word_emb)):
        norm = np.linalg.norm(word_emb[i])
        word_emb[i] = word_emb[i] / norm

    for word, emb in tqdm(zip(word_list, word_emb)):
        word2embedding[word] = emb
    return word2embedding

if config["normalize_word_embedding"]:
    normalize_wordemb(word2embedding)

In [6]:
class Vocabulary:
    def __init__(self, word2embedding, config):
        # The low frequency words will be assigned as <UNK> token
        self.itos = {0: "<UNK>"}
        self.stoi = {"<UNK>": 0}
        
        self.word2embedding = word2embedding
        self.config = config

        self.word_freq_in_corpus = defaultdict(int)
        self.IDF = {}
        self.ps = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))
        
        self.word_dim = len(word2embedding['the'])
    def __len__(self):
        return len(self.itos)

    def tokenizer_eng(self, text):
        text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
        text = text.strip().split()
        
        return [self.ps.stem(w) for w in text if w.lower() not in self.stop_words]
    
    def read_raw(self):        
        if self.config["dataset"] == 'IMDB':
            data_file_path = '../data/IMDB.txt'
        elif self.config["dataset"] == 'CNN':
            data_file_path = '../data/CNN.txt'
        elif self.config["dataset"] == 'PubMed':
            data_file_path = '../data/PubMed.txt'
        
        # raw documents
        self.raw_documents = []
        with open(data_file_path,'r',encoding='utf-8') as f:
            for line in tqdm(f, desc="Loading documents"):
                self.raw_documents.append(line.strip("\n"))
                
        return self.raw_documents
    
    def build_vocabulary(self):
        sentence_list = self.raw_documents
        
        self.doc_freq = defaultdict(int) # # of document a word appear
        self.document_num = len(sentence_list)
        self.word_vectors = [[0]*self.word_dim] # unknown word emb
        
        for sentence in tqdm(sentence_list, desc="Preprocessing documents"):
            # for doc_freq
            document_words = set()
            
            for word in self.tokenizer_eng(sentence):
                # pass unknown word
                if word not in self.word2embedding:
                    continue
                    
                # calculate word freq
                self.word_freq_in_corpus[word] += 1
                document_words.add(word)
                
            for word in document_words:
                self.doc_freq[word] += 1
        
        # calculate IDF
        print('doc num', self.document_num)
        for word, freq in self.doc_freq.items():
            self.IDF[word] = math.log(self.document_num / (freq+1))
        
        # delete less freq words:
        delete_words = []
        for word, v in self.word_freq_in_corpus.items():
            if v < self.config["min_word_freq_threshold"]:
                delete_words.append(word)     
        for word in delete_words:
            del self.IDF[word]    
            del self.word_freq_in_corpus[word]    
        
        # delete too freq words
        print('eliminate freq words')
        IDF = [(word, freq) for word, freq in self.IDF.items()]
        IDF.sort(key=lambda x: x[1])

        for i in range(self.config["topk_word_freq_threshold"]):
            print(word)
            word = IDF[i][0]
            del self.IDF[word]
            del self.word_freq_in_corpus[word]
        
        # construct word_vectors
        idx = 1
        for word in self.word_freq_in_corpus:
            self.word_vectors.append(self.word2embedding[word])
            self.stoi[word] = idx
            self.itos[idx] = word
            idx += 1
            
    def init_word_weight(self,sentence_list, agg):
        if agg == 'mean':
            self.word_weight = {word: 1 for word in self.IDF.keys()}
        elif agg == 'IDF':
            self.word_weight = self.IDF
        elif agg == 'uniform':
            self.word_weight = {word: np.random.uniform(low=0.0, high=1.0) for word in self.IDF.keys()}
        elif agg == 'gaussian':
            mu, sigma = 10, 1 # mean and standard deviation
            self.word_weight = {word: np.random.normal(mu, sigma) for word in self.IDF.keys()}
        elif agg == 'exponential':
            self.word_weight = {word: np.random.exponential(scale=1.0) for word in self.IDF.keys()}
        elif agg == 'pmi':
            trigram_measures = BigramAssocMeasures()
            self.word_weight = defaultdict(int)
            corpus = []

            for text in tqdm(sentence_list):
                corpus.extend(text.split())

            finder = BigramCollocationFinder.from_words(corpus)
            for pmi_score in finder.score_ngrams(trigram_measures.pmi):
                pair, score = pmi_score
                self.word_weight[pair[0]] += score
                self.word_weight[pair[1]] += score
                
    def calculate_document_vector(self):
        # Return
        # document_vectors: weighted sum of word emb
        # document_answers_idx: doc to word index list
        # document_answers_wsum: word weight summation, e.g. total TFIDF score of a doc
        
        document_vectors = [] 
        document_answers = []
        document_answers_wsum = []
        
        sentence_list = self.raw_documents
        agg = self.config["document_vector_agg_weight"]
        n_document = self.config["n_document"]
        select_topk_TFIDF = self.config["select_topk_TFIDF"]
        
        self.init_word_weight(sentence_list, agg)
        for sentence in tqdm(sentence_list[:min(n_document, len(sentence_list))], desc="calculate document vectors"):
            document_vector = np.zeros(len(self.word_vectors[0]))
            select_words = []
            for word in self.tokenizer_eng(sentence):
                # pass unknown word
                if word not in self.stoi:
                    continue
                else:
                    select_words.append(word)

            # select topk TDIDF
            if select_topk_TFIDF is not None:
                doc_TFIDF = defaultdict(float)
                for word in select_words:    
                    doc_TFIDF[word] += self.IDF[word]

                doc_TFIDF_l = [(word, TFIDF) for word, TFIDF in doc_TFIDF.items()]
                doc_TFIDF_l.sort(key=lambda x:x[1], reverse=True)
                
                select_topk_words = set(list(map(lambda x:x[0], doc_TFIDF_l[:select_topk_TFIDF])))
                select_words = [word for word in select_words if word in select_topk_words]
            else:
                pass
            
            total_weight = 0
            # aggregate to doc vectors
            for word in select_words:
                document_vector += np.array(self.word2embedding[word]) * self.word_weight[word]
                total_weight += self.word_weight[word]
                
            if len(select_words) == 0:
                print('error', sentence)
                continue
            else:
                if self.config["document_vector_weight_normalize"]:
                    document_vector /= total_weight
                    total_weight = 1
            
            document_vectors.append(document_vector)
            document_answers.append(select_words)
            document_answers_wsum.append(total_weight)
        
        # get answers
        document_answers_idx = []    
        for ans in document_answers:
            ans_idx = []
            for token in ans:
                if token in self.stoi:
                    ans_idx.append(self.stoi[token])                    
            document_answers_idx.append(ans_idx)
        
        self.document_vectors = document_vectors
        self.document_answers_idx = document_answers_idx
        self.document_answers_wsum = document_answers_wsum
        
        return document_vectors, document_answers_idx, document_answers_wsum
        
    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)

        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]
    
    def check_docemb(self):
        word_vectors = np.array(self.word_vectors)
        pred = np.zeros(word_vectors.shape[1])
        cnt = 0

        for word_idx in self.document_answers_idx[0]:
            pred += word_vectors[word_idx] * self.word_weight[self.itos[word_idx]]
            cnt += self.word_weight[self.itos[word_idx]]
        
        if self.config["document_vector_weight_normalize"]:
            pred /= cnt
        assert np.sum(self.document_vectors[0]) - np.sum(pred) == 0

In [7]:
def build_vocab(config, word2embedding):
    # build vocabulary
    vocab = Vocabulary(word2embedding, config)
    vocab.read_raw()
    vocab.build_vocabulary()
    vocab_size = len(vocab)
    # get doc emb
    vocab.calculate_document_vector()
    vocab.check_docemb()
    
    return vocab

vocab = build_vocab(config, word2embedding)

Loading documents: 0it [00:00, ?it/s]

Preprocessing documents:   0%|          | 0/100000 [00:00<?, ?it/s]

doc num 100000
eliminate freq words
hadrian
film
one
like
time
make
good
see
watch
get
even
would
well
much
look
end
act
scene
go
also
way
great
think
dont
first
thing
made
bad
love
could
play
know
say
show
seen
plot
seem
come
mani
take
want
work
never
actor
tri
two
best
ever
year
give
better
life
still
find
perform
part
use
actual
interest
feel
lot
back
man
im
director
real
cast
doesnt
though
enjoy
didnt
noth
start
live
cant
point
set
guy
role
new
turn
thought
old
direct
fact
that
quit
star
day
wonder
around
happen
got
enough
right
effect
world
long
music
without


calculate document vectors:   0%|          | 0/100 [00:00<?, ?it/s]

In [8]:
print("Finish building dataset!")
print(f"Number of documents:{len(vocab.raw_documents)}")
print(f"Number of words:{len(vocab)}")

l = list(map(len, vocab.document_answers_idx))
print("Average length of document:", np.mean(l))

Finish building dataset!
Number of documents:100000
Number of words:13976
Average length of document: 65.79


In [9]:
word_vectors = np.array(vocab.word_vectors)
print("word_vectors:", word_vectors.shape)

document_vectors = np.array(vocab.document_vectors)
print("document_vectors", document_vectors.shape)

document_answers_wsum = np.array(vocab.document_answers_wsum).reshape(-1, 1)
print("document_answers_wsum", document_answers_wsum.shape)

# create weight_ans
document_answers_idx = vocab.document_answers_idx

# random shuffle
shuffle_idx = list(range(len(document_vectors)))
random.Random(seed).shuffle(shuffle_idx)

document_vectors = document_vectors[shuffle_idx]
document_answers_wsum = document_answers_wsum[shuffle_idx]
document_answers_idx = [document_answers_idx[idx] for idx in shuffle_idx]

word_vectors: (13976, 100)
document_vectors (100, 100)
document_answers_wsum (100, 1)


In [10]:
# onthot_ans: word freq matrix
# weight_ans: TFIDF matrix

onehot_ans = np.zeros((len(document_answers_idx), word_vectors.shape[0]))
weight_ans = np.zeros((len(document_answers_idx), word_vectors.shape[0]))
print(weight_ans.shape)

for i in tqdm(range(len(document_answers_idx))):
    for word_idx in document_answers_idx[i]:
        weight_ans[i, word_idx] += vocab.word_weight[vocab.itos[word_idx]]
        onehot_ans[i, word_idx] += 1
        
    if config["document_vector_weight_normalize"]:
        weight_ans[i] /= np.sum(weight_ans[i])

(100, 13976)


  0%|          | 0/100 [00:00<?, ?it/s]

In [11]:
# check
assert np.sum(document_vectors - np.dot(weight_ans, word_vectors) > 1e-10) == 0

## Results

In [12]:
final_results = []
select_columns = ['model']
for topk in config["topk"]:
    select_columns.append('percision@{}'.format(topk))
for topk in config["topk"]:
    select_columns.append('recall@{}'.format(topk))
for topk in config["topk"]:
    select_columns.append('F1@{}'.format(topk))
for topk in config["topk"]:
    select_columns.append('ndcg@{}'.format(topk))
select_columns.append('ndcg@all')
select_columns

['model',
 'percision@10',
 'percision@30',
 'percision@50',
 'recall@10',
 'recall@30',
 'recall@50',
 'F1@10',
 'F1@30',
 'F1@50',
 'ndcg@10',
 'ndcg@30',
 'ndcg@50',
 'ndcg@all']

## setting training size

In [13]:
train_size_ratio = 1
train_size = int(len(document_answers_idx) * train_size_ratio)
train_size

100

## Top K freq word

In [14]:
topk_results = {}

In [15]:
test_ans = document_answers_idx[:train_size]

In [16]:
word_freq = [(word, freq) for word, freq in vocab.word_freq_in_corpus.items()]
word_freq.sort(key=lambda x:x[1], reverse=True)
word_freq[:10]

[('girl', 15618),
 ('origin', 15299),
 ('kill', 15068),
 ('us', 14729),
 ('action', 14559),
 ('horror', 14226),
 ('young', 14197),
 ('fan', 13417),
 ('bit', 13324),
 ('big', 13265)]

In [17]:
def topk_word_evaluation(k=50):
    topk_word = [word for (word, freq) in word_freq[:k]]

    pr, re = [], []
    for ans in tqdm(test_ans):
        ans = set(ans)
        ans = [vocab.itos[a] for a in ans]

        hit = []
        for word in ans:
            if word in topk_word:
                hit.append(word)

        precision = len(hit) / k
        recall = len(hit) / len(ans)
        pr.append(precision)
        re.append(recall)

    pr = np.mean(pr)
    re = np.mean(re)
    f1 = 2 * pr * re / (pr + re) if (pr + re) != 0 else 0
    print('top {} word'.format(k))
    print('percision', np.mean(pr))
    print('recall', np.mean(re))
    print('F1', f1)
    return f1


for topk in config['topk']:
    topk_results["F1@{}".format(topk)] = topk_word_evaluation(k=topk)


  0%|          | 0/100 [00:00<?, ?it/s]

top 10 word
percision 0.11399999999999999
recall 0.023834712657783915
F1 0.03942631272767297


  0%|          | 0/100 [00:00<?, ?it/s]

top 30 word
percision 0.09666666666666664
recall 0.05891858643470809
F1 0.0732134086209844


  0%|          | 0/100 [00:00<?, ?it/s]

top 50 word
percision 0.09140000000000001
recall 0.09227283986795463
F1 0.09183434600340698


In [18]:
def topk_word_evaluation_NDCG(k=50):
    freq_word =[word for (word, freq) in word_freq]
    freq_word_idx = [vocab.stoi[word] for word in freq_word if word in vocab.stoi]
    
    scores = np.zeros(len(vocab.word_vectors))
    for rank, idx in enumerate(freq_word_idx):
        scores[idx] = len(vocab.word_vectors) - rank
    
    NDCGs = []
    
    for ans in tqdm(test_ans):
        weight_ans = np.zeros(len(vocab.word_vectors))
        
        for word_idx in ans:
            if word_idx == 0:
                continue
            word = vocab.itos[word_idx]
            weight_ans[word_idx] += vocab.IDF[word]

        NDCG_score = ndcg_score(weight_ans.reshape(1,-1), scores.reshape(1,-1), k=k)
        NDCGs.append(NDCG_score)

    print('top {} NDCG:{}'.format(k, np.mean(NDCGs)))
    
    return np.mean(NDCGs)


# for topk in config['topk']:
#     topk_results["ndcg@{}".format(topk)] = topk_word_evaluation_NDCG(k=topk)
    
# topk_results["ndcg@all"] = topk_word_evaluation_NDCG(k=None)


In [19]:
topk_results["model"] = "topk"
final_results.append(pd.Series(topk_results))

## Sklearn

In [20]:
from sklearn.linear_model import LinearRegression, Lasso

In [21]:
print(document_vectors.shape)
print(weight_ans.shape)
print(word_vectors.shape)

(100, 100)
(100, 13976)
(13976, 100)


In [22]:
def evaluate_sklearn(pred, ans):
    results = {}
        
    one_hot_ans = np.arange(ans.shape[0])[ans > 0]
    
    for topk in config["topk"]:
        one_hot_pred = np.argsort(pred)[-topk:]
        hit = np.intersect1d(one_hot_pred, one_hot_ans)
        percision = len(hit) / topk
        recall = len(hit) / len(one_hot_ans)
        f1 = 2 * percision * recall / (percision + recall) if (percision + recall) > 0 else 0
        
        results['percision@{}'.format(topk)] = percision
        results['recall@{}'.format(topk)] = recall
        results['F1@{}'.format(topk)] = f1
        
    ans = ans.reshape(1, -1)
    pred = pred.reshape(1, -1)
    for topk in config["topk"]:
        results['ndcg@{}'.format(topk)] = ndcg_score(ans, pred, k=topk)

    results['ndcg@all'] = (ndcg_score(ans, pred, k=None))
    
    return results

### linear regression

In [23]:
results = []

for doc_id, doc_emb in enumerate(tqdm(document_vectors[:train_size])):
    x = word_vectors.T
    y = doc_emb
    
    ans = weight_ans[doc_id]
    model = LinearRegression(fit_intercept=False).fit(x, y)
    r2 = model.score(x, y)

    res = evaluate_sklearn(model.coef_, ans)
    results.append(res)

  0%|          | 0/100 [00:00<?, ?it/s]

In [24]:
results = pd.DataFrame(results).mean()
results['model'] = 'sk-linear-regression'
final_results.append(results)
results

percision@10                   0.341
recall@10                   0.100978
F1@10                       0.140458
percision@30                   0.213
recall@30                   0.171015
F1@30                       0.167621
percision@50                  0.1674
recall@50                   0.219625
F1@50                       0.168138
ndcg@10                     0.363496
ndcg@30                     0.327512
ndcg@50                     0.334415
ndcg@all                    0.563008
model           sk-linear-regression
dtype: object

### lasso

In [25]:
results = []
sk_lasso_epoch = 10000

for doc_id, doc_emb in enumerate(tqdm(document_vectors[:train_size])):
    x = word_vectors.T
    y = doc_emb
    
    ans = weight_ans[doc_id]
    model = Lasso(positive=True, fit_intercept=False, alpha=0.0001, max_iter=sk_lasso_epoch, tol=0).fit(x, y)
    r2 = model.score(x, y)

    res = evaluate_sklearn(model.coef_, ans)
    results.append(res)

  0%|          | 0/100 [00:00<?, ?it/s]

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


In [26]:
results = pd.DataFrame(results).mean()
results['model'] = 'sk-lasso'
final_results.append(results)
results

percision@10        0.53
recall@10       0.200465
F1@10           0.262543
percision@30    0.279667
recall@30       0.280765
F1@30           0.247872
percision@50       0.204
recall@50       0.319288
F1@50           0.221158
ndcg@10         0.540316
ndcg@30         0.468284
ndcg@50         0.465313
ndcg@all        0.621234
model           sk-lasso
dtype: object

## Our Model

In [27]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [28]:
class Custom_Lasso_Dataset(Dataset):
    def __init__(self, 
                 doc_vectors,
                 doc_w_sum,
                 weight_ans
                 ):
        self.doc_vectors = torch.FloatTensor(doc_vectors)
        self.doc_w_sum = torch.FloatTensor(doc_w_sum)
        self.weight_ans = weight_ans
        assert len(doc_vectors) == len(doc_w_sum)
        
    def __getitem__(self, idx):
                
        return self.doc_vectors[idx], self.doc_w_sum[idx], idx

    def __len__(self):
        return len(self.doc_vectors)


In [29]:
class LR(nn.Module):
    """
    Input shape: (N, 3, 64, 64)
    Output shape: (N, )
    """
    def __init__(self, num_doc, num_words):
        super(LR, self).__init__()
        weight = torch.zeros(num_doc, num_words).to(device)
        self.emb = torch.nn.Embedding.from_pretrained(weight, freeze=False)
        
    def forward(self, doc_ids, word_vectors):
        return self.emb(doc_ids) @ word_vectors

In [30]:
def evaluate_Custom_Lasso(model, train_loader):
    results = {}
    model.eval()
    
    scores = np.array(model.emb.cpu().weight.data)
    model.emb.to(device)
    true_relevance = train_loader.dataset.weight_ans

    # F1
    F1s = []
    precisions = []
    recalls = []
    for i in range(true_relevance.shape[0]):
        one_hot_ans = np.arange(true_relevance.shape[1])[true_relevance[i] > 0]
        pred = scores[i]
        
        F1_ = []
        percision_ = []
        recall_ = []
        for topk in config["topk"]:
            one_hot_pred = np.argsort(pred)[-topk:]
            
            hit = np.intersect1d(one_hot_pred, one_hot_ans)
            percision = len(hit) / topk
            recall = len(hit) / len(one_hot_ans)
            
            F1 = 2 * percision * recall / (percision + recall) if (percision + recall) > 0 else 0
            F1_.append(F1)
            percision_.append(percision)
            recall_.append(recall)
            
        F1s.append(F1_)
        precisions.append(percision_)
        recalls.append(recall_)
        
    F1s = np.mean(F1s, axis=0)
    precisions = np.mean(precisions, axis=0)
    recalls = np.mean(recalls, axis=0)
    
    for i, topk in enumerate(config["topk"]):
        results['F1@{}'.format(topk)] = F1s[i]
        results['percision@{}'.format(topk)] = precisions[i]
        results['recall@{}'.format(topk)] = recalls[i]

    # NDCG
    for topk in config["topk"]:
        results['ndcg@{}'.format(topk)] = ndcg_score(true_relevance, scores, k=topk)
    results['ndcg@all'] = ndcg_score(true_relevance, scores, k=None)
    
    return results

In [31]:
batch_size = 100
print('document num', train_size)

train_dataset = Custom_Lasso_Dataset(document_vectors[:train_size], document_answers_wsum[:train_size], weight_ans[:train_size])
train_loader  = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

document num 100


## start training

In [32]:
# setting
lr = 0.1
momentum = 0.999
weight_decay = 0
nesterov = False # True

n_epoch = 10000

w_sum_reg = 1e-2
w_sum_reg_mul = 1
w_clip_value = 0

L1 = 1e-5

verbose = True
valid_epoch = 100

model = LR(num_doc=train_size, num_words=word_vectors.shape[0]).to(device)
model.train()

word_vectors_tensor = torch.FloatTensor(word_vectors).to(device)
    
opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=nesterov)
criterion = nn.MSELoss(reduction='mean')

results = []
step = 0
for epoch in tqdm(range(n_epoch)):    
    loss_mse_his = []
    loss_w_reg_his = []
    
    model.train()

    for data in train_loader:
        doc_embs, doc_w_sum, doc_ids = data
        
        doc_embs = doc_embs.to(device)
        doc_w_sum = doc_w_sum.to(device)
        doc_ids = doc_ids.to(device)
        
        w_reg = doc_w_sum * w_sum_reg_mul
        # w_reg = (torch.ones(doc_embs.size(0), 1) * w_sum_reg_mul).to(device)
        
        # MSE loss
        pred_doc_embs = model(doc_ids, word_vectors_tensor)     
        loss_mse = criterion(pred_doc_embs, doc_embs)

        pred_w_sum = torch.sum(model.emb(doc_ids), axis=1).view(-1, 1)
        loss_w_reg = criterion(pred_w_sum, w_reg)
        
        loss_l1 = torch.sum(torch.abs(model.emb(doc_ids)))
        loss = loss_mse + loss_w_reg * w_sum_reg + loss_l1 * L1
        
        # Model backwarding
        model.zero_grad()
        loss.backward()
        opt.step()

        loss_mse_his.append(loss_mse.item())
        loss_w_reg_his.append(loss_w_reg.item())

        for p in model.parameters():
            p.data.clamp_(w_clip_value, float('inf'))

        
    if epoch % valid_epoch == 0:
        res = {}
        res['epoch'] = epoch
        res['loss_mse'] = np.mean(loss_mse_his)
        res['loss_w_reg'] = np.mean(loss_w_reg_his)
        
        res_ndcg = evaluate_Custom_Lasso(model, train_loader)
        res.update(res_ndcg)
        results.append(res)
        
        if verbose:
            print()
            for k, v in res.items():
                print(k, v)

  0%|          | 0/10000 [00:00<?, ?it/s]


epoch 0
loss_mse 0.05655769631266594
loss_w_reg 1.0
F1@10 0.022739436255302586
percision@10 0.051000000000000004
recall@10 0.01664564410447777
F1@30 0.047511352112411614
percision@30 0.060333333333333294
recall@30 0.04970928685243355
F1@50 0.07689679351510892
percision@50 0.07739999999999995
recall@50 0.0994594691733635
ndcg@10 0.04947000370588328
ndcg@30 0.057901463913454514
ndcg@50 0.07767041814788288
ndcg@all 0.3566580724768421

epoch 100
loss_mse 8.220216841436923e-05
loss_w_reg 0.0026517000515013933
F1@10 0.19868552855822608
percision@10 0.4660000000000001
recall@10 0.14327672144879688
F1@30 0.2224742250524924
percision@30 0.2743333333333332
recall@30 0.2312151506064769
F1@50 0.21302662396518135
percision@50 0.20819999999999989
recall@50 0.28153576639018973
ndcg@10 0.5214077218378206
ndcg@30 0.4531714292296421
ndcg@50 0.45057845844363764
ndcg@all 0.6441837809264502

epoch 200
loss_mse 1.6649612007313408e-05
loss_w_reg 0.002097564982250333
F1@10 0.21961618408486486
percision@10 0.


epoch 1900
loss_mse 5.879484206161578e-07
loss_w_reg 0.0022000744938850403
F1@10 0.2738658534295196
percision@10 0.5670000000000001
recall@10 0.2074447110312699
F1@30 0.27170016234402583
percision@30 0.3176666666666666
recall@30 0.2993156266349564
F1@50 0.23958560158545533
percision@50 0.22699999999999998
recall@50 0.3351657998126924
ndcg@10 0.5851428672103336
ndcg@30 0.508767677234998
ndcg@50 0.4992563334501472
ndcg@all 0.6755133929877316

epoch 2000
loss_mse 5.737636001867941e-07
loss_w_reg 0.0022020807955414057
F1@10 0.2742939298324945
percision@10 0.568
recall@10 0.20771055291742024
F1@30 0.27133166240825485
percision@30 0.31733333333333325
recall@30 0.2989514370842617
F1@50 0.2393424785349686
percision@50 0.2268
recall@50 0.33469437691644016
ndcg@10 0.5841076293244825
ndcg@30 0.5082131932714128
ndcg@50 0.4987648016781867
ndcg@all 0.6758491735465281

epoch 2100
loss_mse 5.477900231198873e-07
loss_w_reg 0.002204054733738303
F1@10 0.2738771701779053
percision@10 0.5660000000000001
r


epoch 3800
loss_mse 4.7457814389417763e-07
loss_w_reg 0.0022167530842125416
F1@10 0.2762747925359972
percision@10 0.5709999999999998
recall@10 0.20905887090198594
F1@30 0.27377499173009895
percision@30 0.3189999999999999
recall@30 0.3017605343685108
F1@50 0.24083903956391187
percision@50 0.22719999999999999
recall@50 0.3377970054347175
ndcg@10 0.5799159903406654
ndcg@30 0.5085773770828247
ndcg@50 0.4983262567632096
ndcg@all 0.678281503091502

epoch 3900
loss_mse 4.764117988997896e-07
loss_w_reg 0.0022159439977258444
F1@10 0.2762747925359972
percision@10 0.5709999999999998
recall@10 0.20905887090198594
F1@30 0.2740784008541955
percision@30 0.31933333333333325
recall@30 0.30202893433677347
F1@50 0.24096927908486993
percision@50 0.22719999999999999
recall@50 0.3380448686825808
ndcg@10 0.5800935734169371
ndcg@30 0.5088453400767861
ndcg@50 0.49856617363878103
ndcg@all 0.6785726657896259

epoch 4000
loss_mse 4.796611960955488e-07
loss_w_reg 0.0022178262006491423
F1@10 0.27667479253599714
pe


epoch 5700
loss_mse 4.6463776470773155e-07
loss_w_reg 0.0022198979277163744
F1@10 0.27806845366601735
percision@10 0.5749999999999998
recall@10 0.2101170088000951
F1@30 0.2743065724571541
percision@30 0.32033333333333325
recall@30 0.3018042600918227
F1@50 0.24198498948074218
percision@50 0.2287999999999999
recall@50 0.33907311490884967
ndcg@10 0.5832826542341473
ndcg@30 0.510048306300283
ndcg@50 0.5000949585661901
ndcg@all 0.6800302280849216

epoch 5800
loss_mse 4.634248966794985e-07
loss_w_reg 0.002219131449237466
F1@10 0.27806845366601735
percision@10 0.5749999999999998
recall@10 0.2101170088000951
F1@30 0.2744584964951351
percision@30 0.32033333333333325
recall@30 0.30197371382649385
F1@50 0.24219775543818897
percision@50 0.22899999999999987
recall@50 0.33930038763612236
ndcg@10 0.583432104387863
ndcg@30 0.5101450859005059
ndcg@50 0.5003023515764339
ndcg@all 0.6804755517846899

epoch 5900
loss_mse 4.644358853056474e-07
loss_w_reg 0.0022192730102688074
F1@10 0.27841933085899984
perc


epoch 7600
loss_mse 4.6443071255453106e-07
loss_w_reg 0.002221336355432868
F1@10 0.2779490271305185
percision@10 0.576
recall@10 0.20994500989459822
F1@30 0.27432853888054387
percision@30 0.31966666666666677
recall@30 0.30230957837319344
F1@50 0.24257903479863252
percision@50 0.22939999999999988
recall@50 0.33963853267317545
ndcg@10 0.5836802142937101
ndcg@30 0.5104434475393468
ndcg@50 0.5006683590476818
ndcg@all 0.6807289469590616

epoch 7700
loss_mse 4.6335523506968457e-07
loss_w_reg 0.0022212478797882795
F1@10 0.27759620661769796
percision@10 0.576
recall@10 0.20968713883013046
F1@30 0.27414996745197245
percision@30 0.3193333333333334
recall@30 0.3021876271536812
F1@50 0.24281713003672775
percision@50 0.22959999999999986
recall@50 0.3399326503202343
ndcg@10 0.5833643291367355
ndcg@30 0.5103659433606872
ndcg@50 0.5007457980343226
ndcg@all 0.6805122801305672

epoch 7800
loss_mse 4.6315187773871003e-07
loss_w_reg 0.0022213971242308617
F1@10 0.27827427541606753
percision@10 0.576999999


epoch 9500
loss_mse 4.637664972051425e-07
loss_w_reg 0.0022203372791409492
F1@10 0.27769776579404243
percision@10 0.5760000000000001
recall@10 0.20975396776266192
F1@30 0.2749898052376281
percision@30 0.3206666666666667
recall@30 0.30280157588315365
F1@50 0.2429976247965542
percision@50 0.2291999999999999
recall@50 0.3404328300153681
ndcg@10 0.5837227731664335
ndcg@30 0.5113307825040025
ndcg@50 0.5020117374572217
ndcg@all 0.6818659254594569

epoch 9600
loss_mse 4.654537804071879e-07
loss_w_reg 0.002221371280029416
F1@10 0.27732177680195436
percision@10 0.5750000000000001
recall@10 0.20951012539812988
F1@30 0.27449377443664374
percision@30 0.32
recall@30 0.3023157018689777
F1@50 0.24305406218632492
percision@50 0.2291999999999999
recall@50 0.3405735924787112
ndcg@10 0.5832735411727799
ndcg@30 0.5109200111182657
ndcg@50 0.5019454304175515
ndcg@all 0.6806168962926098

epoch 9700
loss_mse 4.6256249675025174e-07
loss_w_reg 0.0022204697597771883
F1@10 0.27732177680195436
percision@10 0.5750

In [33]:
pd.set_option('display.max_rows', 500)
results_df = pd.DataFrame(results).set_index('epoch')
results_df

Unnamed: 0_level_0,loss_mse,loss_w_reg,F1@10,percision@10,recall@10,F1@30,percision@30,recall@30,F1@50,percision@50,recall@50,ndcg@10,ndcg@30,ndcg@50,ndcg@all
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,0.0565577,1.0,0.022739,0.051,0.016646,0.047511,0.060333,0.049709,0.076897,0.0774,0.099459,0.04947,0.057901,0.07767,0.356658
100,8.220217e-05,0.002652,0.198686,0.466,0.143277,0.222474,0.274333,0.231215,0.213027,0.2082,0.281536,0.521408,0.453171,0.450578,0.644184
200,1.664961e-05,0.002098,0.219616,0.495,0.160905,0.237151,0.286333,0.252111,0.225536,0.2176,0.303493,0.541894,0.471427,0.469708,0.6534
300,8.199593e-06,0.002056,0.229954,0.511,0.169616,0.24472,0.291333,0.264835,0.231727,0.2228,0.315911,0.549071,0.478819,0.477913,0.654057
400,4.486861e-06,0.00205,0.237516,0.521,0.176136,0.247526,0.293667,0.269039,0.234579,0.2244,0.321612,0.557201,0.48488,0.484384,0.655765
500,2.70461e-06,0.002078,0.248206,0.533,0.186238,0.252684,0.299333,0.274879,0.236445,0.2262,0.325385,0.564945,0.490653,0.488342,0.657767
600,2.824854e-06,0.002098,0.25502,0.544,0.191643,0.256726,0.303,0.280561,0.236005,0.2256,0.325782,0.572093,0.495901,0.491953,0.660538
700,1.79631e-06,0.002115,0.259312,0.551,0.195177,0.260709,0.306667,0.285326,0.237722,0.227,0.329044,0.575137,0.499316,0.494191,0.662025
800,1.451007e-06,0.00213,0.264561,0.558,0.199458,0.263361,0.308667,0.289184,0.23782,0.227,0.329399,0.578882,0.501032,0.494666,0.66281
900,1.194433e-06,0.002142,0.269833,0.564,0.203857,0.264281,0.309333,0.290494,0.238776,0.2276,0.331316,0.581838,0.502053,0.495871,0.664681


In [34]:
results_df['model'] = 'our-lasso'
final_results.append(results_df[select_columns].iloc[-1])

## Quality Check

In [35]:
# select doc_id and k
doc_id = 40
topk = 30

model

LR(
  (emb): Embedding(100, 13976)
)

In [36]:
import colored
from colored import stylize

word_list = vocab.itos

gt = [word_list[word_idx] for word_idx in np.argsort(weight_ans[doc_id])[::-1][:topk]]
pred = [word_list[word_idx] for word_idx in np.argsort(model.emb.cpu().weight.data[doc_id].numpy())[::-1][:topk]]

print('ground truth')
for word in gt:
    if word in pred:
        print(stylize(word, colored.bg("yellow")), end=' ')
    else:
        print(word, end=' ')

print()
print('\nprediction')
for word in pred:
    if word in gt:
        print(stylize(word, colored.bg("yellow")), end=' ')
    else:
        print(word, end=' ')


ground truth
[48;5;3mlibido[0m [48;5;3mbash[0m [48;5;3mneither[0m [48;5;3mmale[0m [48;5;3mtouch[0m [48;5;3mstay[0m [48;5;3mfather[0m [48;5;3mkid[0m [48;5;3mlet[0m flavour robber mount unto 1933 cecil 1932 pepper fujimori someway ahm conceit carpet kali goddess rear kit squint lui jose arab 

prediction
[48;5;3mlibido[0m [48;5;3mbash[0m [48;5;3mmale[0m [48;5;3mneither[0m [48;5;3mstay[0m [48;5;3mtouch[0m [48;5;3mfather[0m [48;5;3mkid[0m [48;5;3mlet[0m your felt not talk might dad nothing win husband do mood her miss drink clinton moment comfort god wanna sexual affair 

In [37]:
# raw document
print()
ps = PorterStemmer()
    
for word in vocab.raw_documents[doc_id].split():
    word_stem = ps.stem(word).lower()

    if word_stem in gt:
        if word_stem in pred:
            print(stylize(word, colored.bg("yellow")), end=' ')
        else:
            print(stylize(word, colored.bg("light_gray")), end=' ')
    else:
        print(word, end=' ')
# print(dataset.documents[doc_id])


i think james cameron might be becoming my favorite director because this is my second review of his movies . anyway , everyone remembers the rms titanic . it was big , fast , and " unsinkable " . . . until april 1912 . it was all over the news and one of the biggest tragedies ever . well james cameron decided to make a movie out of it but star two fictional characters to be in the spotlight instead of the ship . well , onto the main review but [48;5;3mlet[0m me remind you that this is all opinion and zero fact and the only fact that will be present is an event from the film . so our two main characters are jack ( leonardo dicaprio ) and rose ( kate winslet ) . they're not annoying too much but watch this and you'll find out why they could become annoying ( http : //tinyurl . com/ojhoyn ) . the main villain i guess is bad luck , fate , hand of god ( no blasphemy intended ) , or just plain caledon hockley ( billy zane ) . combine all of the above and what do you get ? ! oh yes ! we g

In [38]:
results = {}
   
scores = np.array(model.emb.weight.data)[doc_id].reshape(1, -1)
true_relevance = train_loader.dataset.weight_ans[doc_id].reshape(1, -1)

results['ndcg@50'] = (ndcg_score(true_relevance, scores, k=50))
results['ndcg@100'] = (ndcg_score(true_relevance, scores, k=100))
results['ndcg@200'] = (ndcg_score(true_relevance, scores, k=200))
results['ndcg@all'] = (ndcg_score(true_relevance, scores, k=None))

print('This document ndcg:')
print('ground truth length:', np.sum(weight_ans[doc_id] > 0))
print('NDCG top50', results['ndcg@50'])
print('NDCG top100', results['ndcg@100'])
print('NDCG top200', results['ndcg@200'])
print('NDCG ALL', results['ndcg@all'])


This document ndcg:
ground truth length: 9
NDCG top50 0.9994384705953052
NDCG top100 0.9994384705953052
NDCG top200 0.9994384705953052
NDCG ALL 0.9994384705953052


## Final results

In [39]:
is_notebook = in_notebook()

In [40]:
final_results_df = pd.DataFrame(final_results).reset_index(drop=True)

experiment_dir = './records/dataset-{}-n_document-{}-wdist-{}-filtertopk-{}'.format(
                                        config['dataset'],
                                        config['n_document'],
                                        config["document_vector_agg_weight"],
                                        config["topk_word_freq_threshold"])

print('Saving to directory', experiment_dir)
os.makedirs(experiment_dir, exist_ok=True)

Saving to directory ./records/dataset-IMDB-n_document-100-wdist-IDF-filtertopk-100


In [41]:
final_results_df.to_csv(os.path.join(experiment_dir, 'result.csv'), index=False)

import json
with open(os.path.join(experiment_dir, 'config.json'), 'w') as json_file:
    json.dump(config, json_file)

In [42]:
for feat in final_results_df.set_index('model').columns:
    plt.bar(final_results_df['model'],
            final_results_df[feat], 
            width=0.5, 
            bottom=None, 
            align='center', 
            color=['lightsteelblue', 
                   'cornflowerblue', 
                   'royalblue', 
                   'navy'])
    plt.title(feat)
    plt.savefig(os.path.join(experiment_dir, '{}.png'.format(feat)))
    plt.clf()
    if is_notebook:
        plt.show()

  from ipykernel import kernelapp as app


In [43]:
print(final_results_df)
final_results_df

      F1@10     F1@30     F1@50                 model  percision@10  \
0  0.039426  0.073213  0.091834                  topk           NaN   
1  0.140458  0.167621  0.168138  sk-linear-regression         0.341   
2  0.262543  0.247872  0.221158              sk-lasso         0.530   
3  0.277352  0.273977  0.242109             our-lasso         0.575   

   recall@10  percision@30  recall@30  percision@50  recall@50   ndcg@10  \
0        NaN           NaN        NaN           NaN        NaN       NaN   
1   0.100978      0.213000   0.171015        0.1674   0.219625  0.363496   
2   0.200465      0.279667   0.280765        0.2040   0.319288  0.540316   
3   0.209560      0.319333   0.301890        0.2284   0.339401  0.583037   

    ndcg@30   ndcg@50  ndcg@all  
0       NaN       NaN       NaN  
1  0.327512  0.334415  0.563008  
2  0.468284  0.465313  0.621234  
3  0.510223  0.501191  0.681515  


Unnamed: 0,F1@10,F1@30,F1@50,model,percision@10,recall@10,percision@30,recall@30,percision@50,recall@50,ndcg@10,ndcg@30,ndcg@50,ndcg@all
0,0.039426,0.073213,0.091834,topk,,,,,,,,,,
1,0.140458,0.167621,0.168138,sk-linear-regression,0.341,0.100978,0.213,0.171015,0.1674,0.219625,0.363496,0.327512,0.334415,0.563008
2,0.262543,0.247872,0.221158,sk-lasso,0.53,0.200465,0.279667,0.280765,0.204,0.319288,0.540316,0.468284,0.465313,0.621234
3,0.277352,0.273977,0.242109,our-lasso,0.575,0.20956,0.319333,0.30189,0.2284,0.339401,0.583037,0.510223,0.501191,0.681515
