In [181]:
import csv
import pandas as pd
import numpy as np
import string
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import IterableDataset, DataLoader, Dataset

In [4]:
hotel_reviews = pd.read_csv(r"C:\Users\Damja\OneDrive\Damjan\HS22\NLP UZH\Exercise2\tripadvisor_hotel_reviews.csv")
scifi_pth = r"C:\Users\Damja\OneDrive\Damjan\HS22\NLP UZH\Exercise2\scifi.txt"

In [None]:
with open(path, 'r', encoding='utf-8') as foo:
    f = foo.readline()  # it is only '1' line

scifi = f.split()

In [5]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right

def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

### Preprocessing the hotel reviews data
For that, we create a class which contains all the methods we need for preprocessing the data.

In [58]:
class Preprocessing():

    def __init__(self, full_dataset):
        self.full_dataset = full_dataset
        self.word_to_ix = {}
        self.ix_to_word = {}
        self.context_dataset = []
        self.vocab_size = None
    
    def create_context_vocab(self, input_txt):
        """
        Takes preprocessed lists of words and creates a context dataset
        At the same time it creates the word_to_ix and ix_to_word dictionaries for lookup
        """

        for i in range(2, len(input_txt) - 2):
            context = [input_txt[i - 2], input_txt[i - 1],
                       input_txt[i + 1], input_txt[i + 2]]
            target = input_txt[i]
            self.context_dataset.append((context, target))

        # adding words to our vocabulary list    
        input_txt = set(input_txt)
        # get the current n
        for el in input_txt:
            if el not in self.word_to_ix:
                self.word_to_ix[el] = len(self.word_to_ix)
    
    def apply_create_context_vocab(self):
        """
        Applys the create context fct to every row
        """
        for row in range(self.full_dataset.shape[0]):
            self.create_context_vocab(self.full_dataset['Review'][row].split())
        self.ix_to_word = dict((v, k) for k, v in self.word_to_ix.items())
        
        
    def convert_lowercase(self, x):
        x = x.lower()
        return x
        
    def remove_emoji(self, x):
        emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', x)
        
    exclude = string.punctuation
    
    def remove_punc(self, x):
        exclude = string.punctuation
        return x.translate(str.maketrans('', '', exclude))
    
    def remove_special_chars(self, x):
        x = re.sub('[^A-Za-z0-9]+', ' ', x)
        return x

    def remove_one_letter_words(self, x):
        x = re.sub(r'(?:^| )\w(?:$| )', ' ', x).strip()
        return x
    
    # default is to apply all these preprocessing steps
    def apply_preprocessing(self,
                            lowercase=True,
                            remove_emoji=True,
                            remove_punc=True,
                            remove_special_chars=True,
                            remove_one_letter_words=True):
        if lowercase:
            self.full_dataset['Review'] = self.full_dataset['Review'].apply(self.convert_lowercase)
        if remove_emoji:
            self.full_dataset['Review'] = self.full_dataset['Review'].apply(self.remove_emoji)
        if remove_punc:
            self.full_dataset['Review'] = self.full_dataset['Review'].apply(self.remove_punc)
        if remove_special_chars:
            self.full_dataset['Review'] = self.full_dataset['Review'].apply(self.remove_special_chars)
        if remove_one_letter_words:
            self.full_dataset['Review'] = self.full_dataset['Review'].apply(self.remove_one_letter_words)
    

In [59]:
my_data = Preprocessing(hotel_reviews)

In [60]:
#my_data.apply_preprocessing()

In [61]:
my_data.full_dataset

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not experience hotel monaco seattle...,3
3,unique great stay wonderful time hotel monaco ...,5
4,great stay great stay went seahawk game awesom...,5
...,...,...
20486,best kept secret 3rd time staying charm not 5s...,5
20487,great location price view hotel great quick pl...,4
20488,ok just looks nice modern outside desk staff n...,2
20489,hotel theft ruined vacation hotel opened sept ...,1


In [62]:
my_data.apply_create_context_vocab()

In [63]:
len(my_data.word_to_ix)

80380

In [67]:
(my_data.context_dataset)

[(['nice', 'hotel', 'parking', 'got'], 'expensive'),
 (['hotel', 'expensive', 'got', 'good'], 'parking'),
 (['expensive', 'parking', 'good', 'deal'], 'got'),
 (['parking', 'got', 'deal', 'stay'], 'good'),
 (['got', 'good', 'stay', 'hotel'], 'deal'),
 (['good', 'deal', 'hotel', 'anniversary'], 'stay'),
 (['deal', 'stay', 'anniversary', 'arrived'], 'hotel'),
 (['stay', 'hotel', 'arrived', 'late'], 'anniversary'),
 (['hotel', 'anniversary', 'late', 'evening'], 'arrived'),
 (['anniversary', 'arrived', 'evening', 'took'], 'late'),
 (['arrived', 'late', 'took', 'advice'], 'evening'),
 (['late', 'evening', 'advice', 'previous'], 'took'),
 (['evening', 'took', 'previous', 'reviews'], 'advice'),
 (['took', 'advice', 'reviews', 'did'], 'previous'),
 (['advice', 'previous', 'did', 'valet'], 'reviews'),
 (['previous', 'reviews', 'valet', 'parking'], 'did'),
 (['reviews', 'did', 'parking', 'check'], 'valet'),
 (['did', 'valet', 'check', 'quick'], 'parking'),
 (['valet', 'parking', 'quick', 'easy'],

In [68]:
word_to_ix = my_data.word_to_ix
ix_to_word = my_data.ix_to_word

In [350]:
len(my_data.context_dataset)

2032672

In [70]:
# create Dataset class for transforming out dataset
# takes the loaded data and transforms it into a Dataset object to pass to the dataloader
# we could also include the import here but we will not

class CBOW_Dataset(Dataset):
    
    def __init__(self, full_data):
        self.data = full_data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        x = make_context_vector(self.data[idx][0], word_to_ix)
        y = torch.tensor(word_to_ix[self.data[idx][1]], dtype=torch.long)
        return x, y

In [351]:
class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_size, batch_size):
        
        super().__init__()
        self.batch_size = batch_size
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                     embedding_dim=embedding_size)
        self.fc1 = nn.Linear(in_features=embedding_size,
                            out_features=vocab_size)

    def forward(self, inputs):
        x_embedded = sum(self.embedding(inputs).view(4, self.batch_size, self.embedding_size))
        y_out = self.fc1(x_embedded)
        m = nn.LogSoftmax()
        y_out = m(y_out)
        return y_out

### Let's try and train this model

In [352]:
# set the batch size here
BATCH_SIZE = 2048
EMBEDDING_SIZE = 50
data_loader = CBOW_Dataset(my_data.context_dataset)
data_loader = DataLoader(data_loader, batch_size = BATCH_SIZE, drop_last=True)

In [353]:
import torch.optim as optim
vocab_size = len(my_data.word_to_ix)
model = CBOW(vocab_size, embedding_size=EMBEDDING_SIZE, batch_size=BATCH_SIZE)
loss_func = torch.nn.NLLLoss()
lr = 0.001
optimizer = optim.Adam(model.parameters(), lr=lr)

num_epochs = 10

In [354]:
# train loop
num_batches = len(data_loader)

for epoch in range(1, num_epochs+1):
    for i, d in enumerate(data_loader):
        optimizer.zero_grad()
        x, y = d
        y_pred = model(x)        
        loss = loss_func(y_pred, y)
        loss_batch = loss.item()
        loss.backward()
        optimizer.step()
        print(f'Epoch [{epoch}/{num_epochs}], batch: [{i}/{num_batches}, loss: {loss_batch:.4f}]')

  y_out = m(y_out)


Epoch [1/2], batch: [0/993, loss: 11.9325]
Epoch [1/2], batch: [1/993, loss: 11.9569]
Epoch [1/2], batch: [2/993, loss: 11.9584]
Epoch [1/2], batch: [3/993, loss: 11.9193]
Epoch [1/2], batch: [4/993, loss: 11.9034]
Epoch [1/2], batch: [5/993, loss: 11.8822]
Epoch [1/2], batch: [6/993, loss: 11.8552]
Epoch [1/2], batch: [7/993, loss: 11.8495]
Epoch [1/2], batch: [8/993, loss: 11.8650]
Epoch [1/2], batch: [9/993, loss: 11.7946]
Epoch [1/2], batch: [10/993, loss: 11.8412]
Epoch [1/2], batch: [11/993, loss: 11.8232]
Epoch [1/2], batch: [12/993, loss: 11.7485]


KeyboardInterrupt: 

In [None]:
from collections import Counter
target_words = [elem[1] for elem in my_data.context_dataset]
counts = Counter(target_words)

sorted_counts = counts.most_common()
lower_30 = sorted_counts[-76030: -76000 ]
upper_30 = sorted_counts[:30]

In [None]:
upper_30

In [None]:
lower_30

In [None]:
model.eval()

word_list = ["manage", "occupied", "passport", "improve", "hotel", "good", "stayed", "staff", "friendly"]
w_as_ix = [torch.tensor(word_to_ix[word_list[i]], dtype=torch.long).to(device) for i in range(len(word_list))]
embedded_words = [model.embedding(elem) for elem in w_as_ix]

cos = nn.CosineSimilarity()


In [None]:
result_dist = [0 for elem in word_list]
result_words = [None for elem in word_list]

with torch.no_grad():
  for w, ix in word_to_ix.items():  # loop through whole vocab
    cur_embed = model.embedding((torch.tensor(ix, dtype=torch.long)).to(device))  # embedding for the current word in our vocabulary
    for idx, embed in enumerate(embedded_words):  # enumerate to always know which word we're looking at
      if w != word_list[idx]:  # check that we don't compare the same word with itself, otherwise the cosine dist will be always 1
        cur_dist = cos(cur_embed.view(1, 50), embed.view(1,50))
        if cur_dist > result_dist[idx]:
          result_dist[idx] = cur_dist
          result_words[idx] = w


In [None]:
res_comparison = [ (f'Actual word: "{word_list[idx]}"', f'Most Similar Word: "{result_words[idx]}"') for idx in range(len(word_list))]

res_comparison

In [None]:
import torch.nn as nn

def get_closest_word(word, topn=5):
  word_distance = []
  emb = model.embedding
  pdist = nn.PairwiseDistance()
  i = word_to_ix[word]
  lookup_tensor_i = torch.tensor([i], dtype=torch.long).to(device)
  v_i = emb(lookup_tensor_i).to(device)
  for j in range(len(word_to_ix)):
    if j != i:
      lookup_tensor_j = torch.tensor([j], dtype=torch.long).to(device)
      v_j = emb(lookup_tensor_j)
      word_distance.append((ix_to_word[j], float(pdist(v_i, v_j))))
  word_distance.sort(key=lambda x: x[1])
  return word_distance[:topn]