In [2]:
import os
from collections import defaultdict
import math
import numpy as np 
import re
import torch
import torch.nn as nn
from itertools import cycle
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from torch.optim.lr_scheduler import MultiStepLR
from tqdm.auto import tqdm
from sklearn import preprocessing

# Used to get the data
from sklearn.metrics import ndcg_score

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
nltk.download('stopwords')

import matplotlib.pyplot as plt 
import matplotlib
matplotlib.use('Agg')


import pandas as pd


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
config = {}

config["dataset"] = "CNN" # "IMDB" "CNN", "PubMed"
config["n_document"] = 6852 #6852
config["normalize_word_embedding"] = True
config["min_word_freq_threshold"] = 20
config["topk_word_freq_threshold"] = 100
config["document_vector_agg_weight"] = 'pmi' # ['mean', 'IDF', 'uniform', 'gaussian', 'exponential', 'pmi']
config["select_topk_TFIDF"] = None
config["embedding_file"] = "/content/drive/MyDrive/ML/autoencoder/glove.6B.100d.txt"
config["topk"] = [10, 30, 50]

sk_lasso_epoch = 10000
our_lasso_epoch = 50000
is_notebook = True

In [5]:
# load word embedding
embedding_file = config["embedding_file"]
word2embedding = dict()
word_dim = int(re.findall(r".(\d+)d",embedding_file)[0])

with open(embedding_file,"r") as f:
    for line in tqdm(f):
        line = line.strip().split()
        word = line[0]
        embedding = list(map(float,line[1:]))
        word2embedding[word] = np.array(embedding)

print("Number of words:%d" % len(word2embedding))

0it [00:00, ?it/s]

Number of words:400000


In [6]:
def normalize_wordemb(word2embedding):
    word_emb = []
    word_list = []
    for word, emb in word2embedding.items():
        word_list.append(word)
        word_emb.append(emb)

    word_emb = np.array(word_emb)

    for i in range(len(word_emb)):
        norm = np.linalg.norm(word_emb[i])
        word_emb[i] = word_emb[i] / norm

    for word, emb in tqdm(zip(word_list, word_emb)):
        word2embedding[word] = emb
    return word2embedding

if config["normalize_word_embedding"]:
    normalize_wordemb(word2embedding)

0it [00:00, ?it/s]

In [7]:
class Vocabulary:
    def __init__(self, word2embedding, min_word_freq_threshold=0, topk_word_freq_threshold=0):
        # The low frequency words will be assigned as <UNK> token
        self.itos = {0: "<UNK>"}
        self.stoi = {"<UNK>": 0}
        
        self.word2embedding = word2embedding
        self.min_word_freq_threshold = min_word_freq_threshold
        self.topk_word_freq_threshold = topk_word_freq_threshold
        
        self.word_freq_in_corpus = defaultdict(int)
        self.IDF = {}
        self.ps = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))

    def __len__(self):
        return len(self.itos)

    def tokenizer_eng(self, text):
        text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
        text = text.strip().split()
        
        return [self.ps.stem(w) for w in text if w.lower() not in self.stop_words]

    def build_vocabulary(self, sentence_list):
        self.doc_freq = defaultdict(int) # # of document a word appear
        self.document_num = len(sentence_list)
        self.word_vectors = [[0]*word_dim] # unknown word emb
        
        for sentence in tqdm(sentence_list, desc="Preprocessing documents"):
            # for doc_freq
            document_words = set()
            
            for word in self.tokenizer_eng(sentence):
                # pass unknown word
                if word not in self.word2embedding:
                    continue
                    
                # calculate word freq
                self.word_freq_in_corpus[word] += 1
                document_words.add(word)
                
            for word in document_words:
                self.doc_freq[word] += 1
        
        # calculate IDF
        print('doc num', self.document_num)
        for word, freq in self.doc_freq.items():
            self.IDF[word] = math.log(self.document_num / (freq+1))
        
        # delete less freq words:
        delete_words = []
        for word, v in self.word_freq_in_corpus.items():
            if v < self.min_word_freq_threshold:
                delete_words.append(word)     
        for word in delete_words:
            del self.IDF[word]    
            del self.word_freq_in_corpus[word]    
        
        # delete too freq words
        print('eliminate freq words')
        IDF = [(word, freq) for word, freq in self.IDF.items()]
        IDF.sort(key=lambda x: x[1])

        for i in range(self.topk_word_freq_threshold):
            print(word)
            word = IDF[i][0]
            del self.IDF[word]
            del self.word_freq_in_corpus[word]
        
        # construct word_vectors
        idx = 1
        for word in self.word_freq_in_corpus:
            self.word_vectors.append(self.word2embedding[word])
            self.stoi[word] = idx
            self.itos[idx] = word
            idx += 1
            
    def init_word_weight(self,sentence_list, agg):
        if agg == 'mean':
            self.word_weight = {word: 1 for word in self.IDF.keys()}
        elif agg == 'IDF':
            self.word_weight = self.IDF
        elif agg == 'uniform':
            self.word_weight = {word: np.random.uniform(low=0.0, high=1.0) for word in self.IDF.keys()}
        elif agg == 'gaussian':
            mu, sigma = 10, 1 # mean and standard deviation
            self.word_weight = {word: np.random.normal(mu, sigma) for word in self.IDF.keys()}
        elif agg == 'exponential':
            self.word_weight = {word: np.random.exponential(scale=1.0) for word in self.IDF.keys()}
        elif agg == 'pmi':
            trigram_measures = BigramAssocMeasures()
            self.word_weight = defaultdict(int)
            corpus = []

            for text in tqdm(sentence_list):
                corpus.extend(text.split())

            finder = BigramCollocationFinder.from_words(corpus)
            for pmi_score in finder.score_ngrams(trigram_measures.pmi):
                pair, score = pmi_score
                self.word_weight[pair[0]] += score
                self.word_weight[pair[1]] += score
                
    def calculate_document_vector(self, sentence_list, agg, n_document, select_topk_TFIDF=None):
        document_vectors = []
        document_answers = []
        document_answers_w = []
        
        self.init_word_weight(sentence_list, agg)
        
        for sentence in tqdm(sentence_list[:min(n_document, len(sentence_list))], desc="calculate document vectors"):
            document_vector = np.zeros(len(self.itos), dtype='float32')
            select_words = []        
            for word in self.tokenizer_eng(sentence):
                # pass unknown word
                if word not in self.stoi:
                    continue
                else:
                    select_words.append(word)


            doc_TFIDF = defaultdict(float)
            for word in select_words:    
              document_vector[self.stoi[word]] += self.IDF[word]
            
            # document_vector_s = sorted(range(4086), key = lambda k: document_vector[k], reverse=True)
            # print(document_vector_s[:20])
            # document_vector_s = np.array(document_vector_s, dtype='float32')
            norm_vector = torch.from_numpy(document_vector)
            norm_vector = torch.nn.functional.normalize(norm_vector, dim=0)
            total_weight = 0
            # aggregate to doc vectors
            for word in select_words:
                total_weight += self.word_weight[word]
            
            document_vectors.append(norm_vector)
            document_answers.append(select_words)
            document_answers_w.append(total_weight)
            
        
        # get answers
        document_answers_idx = []    
        for ans in document_answers:
            ans_idx = []
            for token in ans:
                if token in self.stoi:
                    ans_idx.append(self.stoi[token])                    
            document_answers_idx.append(ans_idx)

        return document_vectors, document_answers_idx, document_answers_w
        
    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)

        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]

In [8]:
class DocDataset(Dataset):
    def __init__(self, 
                 raw_data_file_path,
                 word2embedding,
                 skip_header = False,
                 n_document = None, # read first n document
                 min_word_freq_threshold = 20, # eliminate less freq words
                 topk_word_freq_threshold = 5, # eliminate smallest k IDF words
                 select_topk_TFIDF = None, # select topk tf-idf as ground-truth
                 document_vector_agg_weight = 'mean',
                 ):

        assert document_vector_agg_weight in ['mean', 'IDF', 'uniform', 'gaussian', 'exponential', 'pmi']
        
        # raw documents
        self.documents = []
        
        with open(raw_data_file_path,'r',encoding='utf-8') as f:
            if skip_header:
                f.readline()
            for line in tqdm(f, desc="Loading documents"):
                # read firt n document
                # if n_document is not None and len(self.documents) >= n_document:
                #     break    
                self.documents.append(line.strip("\n"))

        # build vocabulary
        self.vocab = Vocabulary(word2embedding, min_word_freq_threshold, topk_word_freq_threshold)
        self.vocab.build_vocabulary(self.documents)
        self.vocab_size = len(self.vocab)

        # calculate document vectors
        self.document_vectors, self.document_answers, self.document_answers_w = self.vocab.calculate_document_vector(self.documents, \
                                                                                           document_vector_agg_weight, n_document, select_topk_TFIDF)
                
        # train-test split
        # training
        self.train_split_ratio = 0.8
        self.train_length = int(len(self.document_answers) * self.train_split_ratio)
        self.train_vectors = self.document_vectors[:self.train_length]
        self.train_words = self.document_answers[:self.train_length]
        self.document_ids = list(range(self.train_length))
        self.generator = cycle(self.context_target_generator())
        self.dataset_size = n_document
        
        # testing
        self.test_vectors = self.document_vectors[self.train_length:]
        self.test_words = self.document_answers[self.train_length:]

    def context_target_generator(self):
        np.random.shuffle(self.document_ids) # inplace shuffle

        # randomly select a document and create its training example
        for document_id in self.document_ids: 
            word_list = set(self.train_words[document_id])
            negative_sample_space = list(set(range(self.vocab_size)) - word_list)
            negative_samples = np.random.choice(negative_sample_space,size=len(word_list),replace = False)
            for word_id, negative_wordID in zip(word_list, negative_samples):
                yield [document_id, word_id, negative_wordID]
                
    def __getitem__(self, idx):
        doc_id, word_id, negative_wordID = next(self.generator)
        doc_id = torch.FloatTensor(self.document_vectors[doc_id])
        word_id = torch.FloatTensor(self.vocab.word_vectors[word_id])
        negative_word = torch.FloatTensor(self.vocab.word_vectors[negative_wordID])

        return self.document_vectors[idx]

    def __len__(self):
        return self.dataset_size

In [9]:
# load and build torch dataset
if config["dataset"] == 'IMDB':
    data_file_path = '/content/drive/MyDrive/ML/autoencoder/IMDB.txt'
elif config["dataset"] == 'CNN':
    data_file_path = '/content/drive/MyDrive/ML/autoencoder/CNN.txt'
elif config["dataset"] == 'PubMed':
    data_file_path = '/content/drive/MyDrive/ML/autoencoder/PubMed.txt'

print("Building dataset....")
dataset = DocDataset(
                    raw_data_file_path=data_file_path,
                    word2embedding=word2embedding,
                    skip_header=False,
                    n_document = config["n_document"],
                    min_word_freq_threshold = config["min_word_freq_threshold"],
                    topk_word_freq_threshold = config["topk_word_freq_threshold"],
                    document_vector_agg_weight = config["document_vector_agg_weight"],
                    select_topk_TFIDF = config["select_topk_TFIDF"]
                    )

Building dataset....


Loading documents: 0it [00:00, ?it/s]

Preprocessing documents:   0%|          | 0/6852 [00:00<?, ?it/s]

doc num 6852
eliminate freq words
decapod
subject
line
organ
write
univers
one
would
use
like
get
know
dont
think
time
say
make
also
want
work
could
good
new
way
go
need
well
even
look
thing
see
anyon
tri
much
thank
may
year
world
right
system
two
problem
question
take
seem
mani
first
pleas
1
post
come
2
call
usa
help
state
point
sinc
mean
find
still
read
email
back
give
ive
might
differ
reason
let
run
sure
day
case
said
doesnt
cant
last
got
interest
tell
person
follow
better
ask
part
never
start
without
put
fact
made
gener
actual
3
number
lot
group
includ
that


  0%|          | 0/6852 [00:00<?, ?it/s]

calculate document vectors:   0%|          | 0/6852 [00:00<?, ?it/s]

In [10]:
print("Finish building dataset!")
print(f"Number of documents:{len(dataset.documents)}")
print(f"Number of words:{dataset.vocab_size}")

l = list(map(len, dataset.document_answers))
print("Average length of document:", np.mean(l))

Finish building dataset!
Number of documents:6852
Number of words:4086
Average length of document: 78.83274956217163


In [11]:
print(len(dataset.document_vectors[0]))
print(dataset.document_vectors[0])
print(len(dataset))
#print(dataset.document_vectors[0])
# print(word2embedding[dataset.vocab.itos[1]])
# print(norm_vector[1].dtype)
# sorted_s = sorted(range(4086), key = lambda k : dataset.document_vectors[0][k], reverse=True) 
# print(sorted_s)

4086
tensor([0.0000, 0.7070, 0.1650,  ..., 0.0000, 0.0000, 0.0000])
6852


In [12]:
model_path='/content/drive/MyDrive/ML/autoencoder'
train_dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [13]:
# Model structure
class Encoder(nn.Module):
    def __init__(self, seq_len, no_features, embedding_size):
        super().__init__()
        
        self.seq_len = seq_len
        self.no_features = no_features    # The number of expected features(= dimension size) in the input x
        self.embedding_size = embedding_size   # the number of features in the embedded points of the inputs' number of features
        self.hidden_size = (2 * embedding_size)  # The number of features in the hidden state h
        self.LSTM1 = nn.LSTM(
            input_size = no_features,
            hidden_size = embedding_size,
            num_layers = 1,
            batch_first=True
        )
        
    def forward(self, x):
        # Inputs: input, (h_0, c_0). -> If (h_0, c_0) is not provided, both h_0 and c_0 default to zero.
        x, (hidden_state, cell_state) = self.LSTM1(x, None)  
        last_lstm_layer_hidden_state = hidden_state[-1,:,:]
        # out = self.sig(last_lstm_layer_hidden_state)
        return last_lstm_layer_hidden_state
    
    
# (2) Decoder
class Decoder(nn.Module):
    def __init__(self, seq_len, no_features, output_size):
        super().__init__()

        self.seq_len = seq_len
        self.no_features = no_features
        self.hidden_size = (2 * no_features)
        self.output_size = output_size
        self.LSTM1 = nn.LSTM(
            input_size = no_features,
            hidden_size = self.hidden_size,
            num_layers = 1,
            batch_first = True
        )

        self.fc = nn.Linear(self.hidden_size, output_size)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = x.unsqueeze(1).repeat(1, self.seq_len, 1)
        x, (hidden_state, cell_state) = self.LSTM1(x, None)
        x = x.reshape((-1, self.seq_len, self.hidden_size))
        x = self.fc(x)
        out = self.relu(x)
        return out

In [34]:
epochs = 50
batch_size = 64
lr = 0.001
device = "cuda" if  torch.cuda.is_available() else "cpu"

model_encoder = Encoder(1, dataset.vocab_size, 12).to(device)
model_decoder = Decoder(1, 12, dataset.vocab_size).to(device)
optimizer_En = torch.optim.Adam(model_encoder.parameters(), lr=lr)
optimizer_De = torch.optim.Adam(model_decoder.parameters(), lr=lr)
loss_function = nn.MSELoss()

In [35]:
# Train
log_loss=[]

for epoch in range(epochs):
    total_loss = 0
    model_encoder.train()
    model_decoder.train()
    for batch, (data) in enumerate(train_dataloader):
        data = data.reshape(-1, 1, dataset.vocab_size)
        data = data.to(device)
        
        optimizer_En.zero_grad()
        optimizer_De.zero_grad()
        # Forward
        codes = model_encoder(data)
        decoded = model_decoder(codes)
        decoded = torch.nn.functional.normalize(decoded, dim=2)
        
        loss = loss_function(decoded, data)
        loss.backward()
        
        optimizer_En.step()
        optimizer_De.step()
        
        total_loss+=loss
        
    total_loss /= len(train_dataloader.dataset)
    log_loss.append(loss)
    if epoch % 1 ==0:
        print('[{}/{}] Loss:'.format(epoch+1, epochs), total_loss.item())
print('[{}/{}] Loss:'.format(epoch+1, epochs), total_loss.item())
plt.plot(log_loss)
torch.save(model_encoder.state_dict(), model_path+'encoder.pth')
torch.save(model_decoder.state_dict(), model_path+'decoder.pth')

[1/50] Loss: 7.191074928414309e-06
[2/50] Loss: 7.055882633721922e-06
[3/50] Loss: 7.040441687422572e-06
[4/50] Loss: 7.039486263238359e-06
[5/50] Loss: 7.035956059553428e-06
[6/50] Loss: 7.032579105725745e-06
[7/50] Loss: 7.0291716838255525e-06
[8/50] Loss: 7.023385478532873e-06
[9/50] Loss: 6.998516710154945e-06
[10/50] Loss: 6.931923962838482e-06
[11/50] Loss: 6.8497574829962105e-06
[12/50] Loss: 6.771485914214281e-06
[13/50] Loss: 6.708078217343427e-06
[14/50] Loss: 6.627753009524895e-06
[15/50] Loss: 6.537624813063303e-06
[16/50] Loss: 6.444904101954307e-06
[17/50] Loss: 6.343394034047378e-06
[18/50] Loss: 6.213174401636934e-06
[19/50] Loss: 6.049510375305545e-06
[20/50] Loss: 5.859304110344965e-06
[21/50] Loss: 5.642348696710542e-06
[22/50] Loss: 5.4257825468084775e-06
[23/50] Loss: 5.214562406763434e-06
[24/50] Loss: 5.0534763431642205e-06
[25/50] Loss: 4.9265677262155805e-06
[26/50] Loss: 4.793902007804718e-06
[27/50] Loss: 4.707913831225596e-06
[28/50] Loss: 4.643589363695355e

In [37]:
model_encoder = Encoder(1, dataset.vocab_size, 12).to(device)
model_decoder = Decoder(1, 12, dataset.vocab_size).to(device)
model_encoder.load_state_dict(torch.load(model_path+'encoder.pth'))
model_decoder.load_state_dict(torch.load(model_path+'decoder.pth'))
model_encoder.eval()
model_decoder.eval()
data = torch.from_numpy(np.array(dataset.document_vectors[0]).reshape(-1, 1, dataset.vocab_size))
data = data.to(device)
codes = model_encoder(data)
decoded = model_decoder(codes)
decoded = torch.nn.functional.normalize(decoded, dim=2)



print(data)
print(decoded)
loss = loss_function(decoded, data)
print(loss)

tensor([[[0.0000, 0.7070, 0.1650,  ..., 0.0000, 0.0000, 0.0000]]])
tensor([[[0.0000, 0.9889, 0.0000,  ..., 0.0000, 0.0000, 0.0000]]],
       grad_fn=<DivBackward0>)
tensor(0.0001, grad_fn=<MseLossBackward>)
