In [1]:
import csv
import pandas as pd
import numpy as np
import string
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import IterableDataset, DataLoader, Dataset
import torch.optim as optim


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
hotel_path = "/content/drive/MyDrive/NLP_ASSIGNMENT2/tripadvisor_hotel_reviews.csv"
scifi_path = "/content/drive/MyDrive/NLP_ASSIGNMENT2/scifi.txt"

Let's import the hotel reviews and the scifi dataset

In [5]:
hotel_reviews = pd.read_csv(hotel_path)

with open(scifi_path, 'r', encoding='utf-8') as foo:
    f = foo.readline()  # it is only '1' line

scifi_dataset = re.split('(?<=[.!?]) +',f)

In [6]:
scifi_dataset = pd.DataFrame(scifi_dataset)
scifi_dataset.columns = ['Review']

In [7]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right

def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

### Preprocessing the data
For that, we create a class which contains all the methods we need for preprocessing the data.

In [8]:
class Preprocessing():

    def __init__(self, full_dataset):
        self.full_dataset = full_dataset
        self.word_to_ix = {}
        self.ix_to_word = {}
        self.context_dataset = []
        self.vocab_size = None
    
    def create_context_vocab(self, input_txt):
        """
        Takes preprocessed lists of words and creates a context dataset
        At the same time it creates the word_to_ix and ix_to_word dictionaries for lookup
        """

        for i in range(2, len(input_txt) - 2):
            context = [input_txt[i - 2], input_txt[i - 1],
                       input_txt[i + 1], input_txt[i + 2]]
            target = input_txt[i]
            self.context_dataset.append((context, target))

        # adding words to our vocabulary list    
        input_txt = set(input_txt)
        # get the current n
        for el in input_txt:
            if el not in self.word_to_ix:
                self.word_to_ix[el] = len(self.word_to_ix)
    
    def apply_create_context_vocab(self):
        """
        Applys the create context fct to every row
        """
        for row in range(self.full_dataset.shape[0]):
            self.create_context_vocab(self.full_dataset['Review'][row].split())
        self.ix_to_word = dict((v, k) for k, v in self.word_to_ix.items())
        
        
    def convert_lowercase(self, x):
        x = x.lower()
        return x
        
    def remove_emoji(self, x):
        emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', x)
        
    exclude = string.punctuation
    
    def remove_punc(self, x):
        exclude = string.punctuation
        return x.translate(str.maketrans('', '', exclude))
    
    def remove_special_chars(self, x):
        x = re.sub('[^A-Za-z0-9]+', ' ', x)
        return x

    def remove_one_letter_words(self, x):
        x = re.sub(r'(?:^| )\w(?:$| )', ' ', x).strip()
        return x
    
    # default is to apply all these preprocessing steps
    def apply_preprocessing(self,
                            lowercase=True,
                            remove_emoji=True,
                            remove_punc=True,
                            remove_special_chars=True,
                            remove_one_letter_words=True):
        if lowercase:
            self.full_dataset['Review'] = self.full_dataset['Review'].apply(self.convert_lowercase)
        if remove_emoji:
            self.full_dataset['Review'] = self.full_dataset['Review'].apply(self.remove_emoji)
        if remove_punc:
            self.full_dataset['Review'] = self.full_dataset['Review'].apply(self.remove_punc)
        if remove_special_chars:
            self.full_dataset['Review'] = self.full_dataset['Review'].apply(self.remove_special_chars)
        if remove_one_letter_words:
            self.full_dataset['Review'] = self.full_dataset['Review'].apply(self.remove_one_letter_words)
    

In [9]:
hotel_data = Preprocessing(hotel_reviews)
scifi_data = Preprocessing(scifi_dataset)

In [10]:
hotel_data.apply_preprocessing()
scifi_data.apply_preprocessing()

In [11]:
hotel_data.apply_create_context_vocab()
scifi_data.apply_create_context_vocab()

In [12]:
print("Unique words in the scifi dataset:", len(scifi_data.word_to_ix))
print("Unique words in the hotel dataset:", len(hotel_data.word_to_ix))

Unique words in the scifi dataset: 200807
Unique words in the hotel dataset: 80380


In [14]:
print("Length of the context data set for the scifi dataset:", len(scifi_data.context_dataset))
print("Length of the context data set for the hotel dataset:", len(hotel_data.context_dataset))

Length of the context data set for the scifi dataset: 10468281
Length of the context data set for the hotel dataset: 2032672


In [15]:
# create Dataset class for transforming out dataset
# takes the loaded data and transforms it into a Dataset object to pass to the dataloader
# we could also include the import here but we will not

class CBOW_Dataset(Dataset):
    
    def __init__(self, full_data, word_to_ix):
        self.data = full_data
        self.word_to_ix = word_to_ix
            
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        x = make_context_vector(self.data[idx][0], self.word_to_ix)
        y = torch.tensor(self.word_to_ix[self.data[idx][1]], dtype=torch.long)
        return x, y

In [16]:
class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_size, batch_size):
        
        super().__init__()
        self.batch_size = batch_size
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                     embedding_dim=embedding_size)
        self.fc1 = nn.Linear(in_features=embedding_size,
                            out_features=vocab_size)

    def forward(self, inputs):
        x_embedded = sum(self.embedding(inputs).view(4, self.batch_size, self.embedding_size))
        y_out = self.fc1(x_embedded)
        m = nn.LogSoftmax()
        y_out = m(y_out)
        return y_out

### Let's try and train this Model

In [17]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [18]:
def training_procedure(dataset, num_epochs, batch_size, embed_size):
    """
    Function that takes in the corresponding dataset (hotel or scifi) and trains it.
    returns the model for later use
    """
    BATCH_SIZE = batch_size
    EMBEDDING_SIZE = embed_size
    
    data_loader = CBOW_Dataset(dataset.context_dataset, dataset.word_to_ix)
    data_loader = DataLoader(data_loader, batch_size = BATCH_SIZE, drop_last=True)

    vocab_size = len(dataset.word_to_ix)
    model = CBOW(vocab_size, embedding_size=EMBEDDING_SIZE, batch_size=BATCH_SIZE).to(device)
    loss_func = torch.nn.NLLLoss()
    lr = 0.001
    optimizer = optim.Adam(model.parameters(), lr=lr)

    num_epochs = num_epochs

    # train loop
    num_batches = len(data_loader)

    for epoch in range(1, num_epochs+1):
        for i, d in enumerate(data_loader):
            optimizer.zero_grad()
            x, y = d
            x, y = x.to(device), y.to(device)
            y_pred = model(x)        
            loss = loss_func(y_pred, y)
            loss_batch = loss.item()
            loss.backward()
            optimizer.step()
            print(f'Epoch [{epoch}/{num_epochs}], batch: [{i}/{num_batches}, loss: {loss_batch:.4f}]')
    return model



In [19]:
hotel_model = training_procedure(hotel_data, 12, 2048, 50)
scifi_model = training_procedure(scifi_data, 2, 2048, 50)



[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
Epoch [2/2], batch: [111/5111, loss: 6.7889]
Epoch [2/2], batch: [112/5111, loss: 6.9631]
Epoch [2/2], batch: [113/5111, loss: 6.8010]
Epoch [2/2], batch: [114/5111, loss: 6.9956]
Epoch [2/2], batch: [115/5111, loss: 7.1966]
Epoch [2/2], batch: [116/5111, loss: 6.9746]
Epoch [2/2], batch: [117/5111, loss: 6.6180]
Epoch [2/2], batch: [118/5111, loss: 6.9051]
Epoch [2/2], batch: [119/5111, loss: 7.7143]
Epoch [2/2], batch: [120/5111, loss: 7.3862]
Epoch [2/2], batch: [121/5111, loss: 7.2865]
Epoch [2/2], batch: [122/5111, loss: 7.2147]
Epoch [2/2], batch: [123/5111, loss: 6.7171]
Epoch [2/2], batch: [124/5111, loss: 6.9872]
Epoch [2/2], batch: [125/5111, loss: 6.8374]
Epoch [2/2], batch: [126/5111, loss: 6.9414]
Epoch [2/2], batch: [127/5111, loss: 7.3128]
Epoch [2/2], batch: [128/5111, loss: 7.0211]
Epoch [2/2], batch: [129/5111, loss: 6.9576]
Epoch [2/2], batch: [130/5111, loss: 7.0845]
Epoch [2/2], batch:

In the following, we will answer questions of Part 2 of the sheet.

In [20]:
from collections import Counter
target_words = [elem[1] for elem in hotel_data.context_dataset]
counts = Counter(target_words)

sorted_counts = counts.most_common()
lower_30 = sorted_counts[-76030: -76000 ]
upper_30 = sorted_counts[:30]

Let's first look at the 30 most common words and pick some of the 9 words from here and then the other words from the least common words.

From the most common, we pick: "hotel", "good", "stayed", "staff", "friendly"

In [21]:
upper_30

[('hotel', 43954),
 ('room', 33917),
 ('not', 30200),
 ('nt', 18306),
 ('great', 17156),
 ('staff', 15853),
 ('good', 15393),
 ('did', 13790),
 ('just', 12099),
 ('stay', 12079),
 ('rooms', 11841),
 ('no', 11370),
 ('nice', 11293),
 ('stayed', 9984),
 ('service', 9486),
 ('location', 9425),
 ('beach', 9416),
 ('night', 9229),
 ('day', 9220),
 ('clean', 9138),
 ('breakfast', 9138),
 ('time', 9125),
 ('food', 8894),
 ('like', 7942),
 ('really', 7571),
 ('resort', 7445),
 ('pool', 7120),
 ('people', 6644),
 ('place', 6579),
 ('friendly', 6414)]

From the least common words, we pick do not pick actually the 'least' common as many of these words have only 1 occurence so it will be very difficult to find similar words. We will search for words that have between 50 and 150. We will pick: "manage", "occupied", "passport", "improve"

In [22]:
lower_30

[('sets', 86),
 ('oct', 86),
 ('occupied', 86),
 ('criticism', 86),
 ('regularly', 86),
 ('frankly', 86),
 ('unlimited', 86),
 ('concerns', 86),
 ('sales', 86),
 ('account', 86),
 ('meters', 86),
 ('passport', 86),
 ('steaks', 86),
 ('stopover', 86),
 ('diet', 86),
 ('washing', 86),
 ('sandals', 86),
 ('brick', 85),
 ('charging', 85),
 ('mildew', 85),
 ('queue', 85),
 ('23rd', 85),
 ('chains', 85),
 ('improve', 85),
 ('training', 85),
 ('airlines', 85),
 ('settled', 85),
 ('according', 85),
 ('manage', 85),
 ('dined', 85)]

Now let's see what are the 5 closest words for each of the words we picked now. The first 4 words in our word list are less frequent, while the last 5 are among the most frequent words. Of course, we have to use the embeddings to measure similarities. That's what is done in the below chunk.

In [23]:
hotel_model.eval()

word_list = ["manage", "occupied", "passport", "improve", "hotel", "good", "stayed", "staff", "friendly"]
w_as_ix = [torch.tensor(hotel_data.word_to_ix[word_list[i]], dtype=torch.long).to(device) for i in range(len(word_list))]
embedded_words = [hotel_model.embedding(elem) for elem in w_as_ix]

Then, for measuring the distance between two embeddings, we use the cosine distance.

We will loop through the whole vocabulary and pick the words which are most similar. They will be always stored in a list and changed if there is a word that is more similar.

In [24]:
result_dist = [0 for elem in word_list]
result_words = [None for elem in word_list]
cos = nn.CosineSimilarity()

with torch.no_grad():
  for w, ix in hotel_data.word_to_ix.items():  # loop through whole vocab
    cur_embed = hotel_model.embedding((torch.tensor(ix, dtype=torch.long)).to(device))  # embedding for the current word in our vocabulary
    for idx, embed in enumerate(embedded_words):  # enumerate to always know which word we're looking at
      if w != word_list[idx]:  # check that we don't compare the same word with itself, otherwise the cosine dist will be always 1
        cur_dist = cos(cur_embed.view(1, 50), embed.view(1,50))
        if cur_dist > result_dist[idx]:
          result_dist[idx] = cur_dist
          result_words[idx] = w

In [25]:
res_comparison = [ (f'Actual word: "{word_list[idx]}"', f'Most Similar Word: "{result_words[idx]}"') for idx in range(len(word_list))]

res_comparison

[('Actual word: "manage"', 'Most Similar Word: "amhsa"'),
 ('Actual word: "occupied"', 'Most Similar Word: "infirst"'),
 ('Actual word: "passport"', 'Most Similar Word: "switchingnot"'),
 ('Actual word: "improve"', 'Most Similar Word: "freebuffet"'),
 ('Actual word: "hotel"', 'Most Similar Word: "beautifulalso"'),
 ('Actual word: "good"', 'Most Similar Word: "soana"'),
 ('Actual word: "stayed"', 'Most Similar Word: "westerner"'),
 ('Actual word: "staff"', 'Most Similar Word: "exept"'),
 ('Actual word: "friendly"', 'Most Similar Word: "benihana"')]

In [27]:
import torch.nn as nn

def get_closest_word(word, model, word_to_ix, ix_to_word, topn=5):
  word_distance = []
  emb = model.embedding
  pdist = nn.PairwiseDistance()
  i = word_to_ix[word]
  lookup_tensor_i = torch.tensor([i], dtype=torch.long).to(device)
  v_i = emb(lookup_tensor_i).to(device)
  for j in range(len(word_to_ix)):
    if j != i:
      lookup_tensor_j = torch.tensor([j], dtype=torch.long).to(device)
      v_j = emb(lookup_tensor_j)
      word_distance.append((ix_to_word[j], float(pdist(v_i, v_j))))
  word_distance.sort(key=lambda x: x[1])
  return word_distance[:topn]

For the hotel dataset, we'll look at the words "manage" and "hotel"

In [29]:
print(get_closest_word(word_list[0], hotel_model, hotel_data.word_to_ix, hotel_data.ix_to_word))

print(get_closest_word(word_list[4], hotel_model, hotel_data.word_to_ix, hotel_data.ix_to_word))


[('amhsa', 6.380679607391357), ('barf', 6.417194843292236), ('carelessness', 6.5769124031066895), ('birkenstocks', 6.644679546356201), ('girlfriendthe', 6.692757606506348)]
[('thoughtsstaff', 6.489668846130371), ('liquour', 6.7385711669921875), ('nicethe', 6.855052471160889), ('centrale', 6.912321090698242), ('zebraprint', 6.951244831085205)]


Let's do the same steps, but for the scifi dataset

In [57]:
target_words = [elem[1] for elem in scifi_data.context_dataset]
counts = Counter(target_words)

sorted_counts = counts.most_common()
lower_30 = sorted_counts[-158030: -158000 ]
upper_30 = sorted_counts[100:130]

For the scifi dataset, the 30 most likely words are almost exlusively 'short' typical words that appear in english, so we take some less frequent words that are adjectives and nouns. 
For the common words, we take: "old", "think", "new", "going" ,"people"

These are 2 verbs, 2 adjectives and 1 noun.

In [58]:
upper_30

[('something', 11976),
 ('make', 11894),
 ('came', 11893),
 ('much', 11827),
 ('still', 11792),
 ('right', 11688),
 ('long', 11658),
 ('going', 11557),
 ('got', 11351),
 ('think', 10901),
 ('looked', 10744),
 ('away', 10613),
 ('new', 10589),
 ('very', 10458),
 ('might', 10435),
 ('come', 10434),
 ('never', 10330),
 ('good', 10052),
 ('thought', 10048),
 ('take', 10039),
 ('himself', 10000),
 ('after', 9887),
 ('people', 9840),
 ('eyes', 9799),
 ('because', 9726),
 ('didnt', 9696),
 ('enough', 9633),
 ('went', 9547),
 ('again', 9381),
 ('old', 9379)]

For the less common words we take: "vegetables", "romance", "sterile", "cursing". These are 2 nouns, 1 adjective and 1 verb.

In [54]:
lower_30

[('urgency', 122),
 ('clustered', 122),
 ('plug', 122),
 ('shivered', 122),
 ('vegetables', 122),
 ('greeting', 122),
 ('linda', 122),
 ('warp', 122),
 ('frost', 122),
 ('player', 122),
 ('marge', 122),
 ('fowler', 122),
 ('norman', 122),
 ('fazzool', 122),
 ('tortured', 121),
 ('reared', 121),
 ('shove', 121),
 ('romance', 121),
 ('audible', 121),
 ('flashlight', 121),
 ('disappointment', 121),
 ('pigs', 121),
 ('leak', 121),
 ('sterile', 121),
 ('defeated', 121),
 ('discarded', 121),
 ('cursing', 121),
 ('surviving', 121),
 ('soap', 121),
 ('childish', 121)]

Now, let's look at what are the closest words in our CBOW model for the scifi dataset.

In [59]:
scifi_model.eval()

word_list_scifi = ["vegetables", "romance", "sterile", "cursing", "old", "think", "new", "going" ,"people"]
w_as_ix = [torch.tensor(scifi_data.word_to_ix[word_list_scifi[i]], dtype=torch.long).to(device) for i in range(len(word_list_scifi))]
embedded_words = [scifi_model.embedding(elem) for elem in w_as_ix]

In [61]:
result_dist_scifi = [0 for elem in word_list_scifi]
result_words_scifi = [None for elem in word_list_scifi]
cos = nn.CosineSimilarity()

with torch.no_grad():
  for w, ix in scifi_data.word_to_ix.items():  # loop through whole vocab
    cur_embed = scifi_model.embedding((torch.tensor(ix, dtype=torch.long)).to(device))  # embedding for the current word in our vocabulary
    for idx, embed in enumerate(embedded_words):  # enumerate to always know which word we're looking at
      if w != word_list_scifi[idx]:  # check that we don't compare the same word with itself, otherwise the cosine dist will be always 1
        cur_dist = cos(cur_embed.view(1, 50), embed.view(1,50))
        if cur_dist > result_dist_scifi[idx]:
          result_dist_scifi[idx] = cur_dist
          result_words_scifi[idx] = w

Let's look at the results:

In [64]:
res_comparison_scifi = [ (f'Actual word: "{word_list_scifi[idx]}"', f'Most Similar Word: "{result_words_scifi[idx]}"') for idx in range(len(word_list_scifi))]

res_comparison_scifi

[('Actual word: "vegetables"', 'Most Similar Word: "desiperafte"'),
 ('Actual word: "romance"', 'Most Similar Word: "manufacturers"'),
 ('Actual word: "sterile"', 'Most Similar Word: "shinier"'),
 ('Actual word: "cursing"', 'Most Similar Word: "mchughs"'),
 ('Actual word: "old"', 'Most Similar Word: "ragsnarled"'),
 ('Actual word: "think"', 'Most Similar Word: "sili"'),
 ('Actual word: "new"', 'Most Similar Word: "outworks"'),
 ('Actual word: "going"', 'Most Similar Word: "semienslaved"'),
 ('Actual word: "people"', 'Most Similar Word: "coujd"')]

Finally, let's look at the nearest neighbors for two words. We'll look at "people" and "old"

In [63]:
print(get_closest_word("people", scifi_model, scifi_data.word_to_ix, scifi_data.ix_to_word))

print(get_closest_word("old", scifi_model, scifi_data.word_to_ix, scifi_data.ix_to_word))


[('ringside', 5.976100921630859), ('stimulated', 6.036586761474609), ('invaders', 6.1958327293396), ('competely', 6.229156017303467), ('lisnng', 6.307568073272705)]
[('ragsnarled', 5.654760837554932), ('doctorpsychologist', 5.683231830596924), ('castrate', 5.784371376037598), ('eggheady', 5.868690013885498), ('soned', 5.99940299987793)]
