In [2]:
import os
import re
import nltk
import pytreebank
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F


import plotly.express as px
import matplotlib.pyplot as plt

from nltk.tree import Tree
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

from collections import Counter

from torch.utils.data import TensorDataset, DataLoader

from sklearn import metrics

# download all the nltk corpora only once
#nltk.download('all')

In [3]:
# set available device
is_cuda = torch.cuda.is_available()

if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU not available, CPU used


## Prepare Data for Analysis

In [4]:
# load the SST corpus in the parenthesis format
dataset = pytreebank.load_sst()

# add Javascript  and CSS to the Ipython Notebook
pytreebank.LabeledTree.inject_visualization_javascript()

# visualize an example
example = dataset["train"][1]
example.display()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
def extract_text_and_label(tree):
    """  Extract text and label text from a given tree"""
    text = tree.to_lines()[0]
    label = tree.label
    return text, label

def extract_dataframe(dataset, category):
    """ Extract text and label from all trees into a dataframe. """
    data = []
    for tree in dataset[category]:
        text, label = extract_text_and_label(tree)
        data.append([text, label])
    
    df = pd.DataFrame(data, columns=["text", "label"])
    return df

In [6]:
# extract text and label dataframes for train,test and validate dataset
df_train = extract_dataframe(dataset, "train")
df_test = extract_dataframe(dataset, "test")
df_validate = extract_dataframe(dataset, "dev")

In [7]:
df_train.head()

Unnamed: 0,text,label
0,The Rock is destined to be the 21st Century 's...,3
1,The gorgeously elaborate continuation of `` Th...,4
2,Singer/composer Bryan Adams contributes a slew...,3
3,You 'd think by now America would have had eno...,2
4,Yet the act is still charming here .,3


In [8]:
x_train = df_train["text"]; y_train = df_train["label"]
x_test = df_test["text"]; y_test = df_test["label"]
x_validate = df_validate["text"]; y_validate = df_validate["label"]

print(f'shape of train  data is {x_train.shape}')
print(f'shape of test data is {x_test.shape}')
print(f'shape of validate data is {x_validate.shape}')

shape of train  data is (8544,)
shape of test data is (2210,)
shape of validate data is (1101,)


##  Data statistics

In [9]:
# concatenate train, test and validate data frames  alonng  axis=0 andwith  additional  column  "data_type"
df_train["data_type"] = "train"
df_test["data_type"] = "test"
df_validate["data_type"] = "validate"

df = pd.concat([df_train, df_test, df_validate], axis=0)
df["sentiment"] = df["label"].replace([0, 1, 2, 3, 4], ["very negative", "negative", "neutral", "positive", "very positive"])
df

Unnamed: 0,text,label,data_type,sentiment
0,The Rock is destined to be the 21st Century 's...,3,train,positive
1,The gorgeously elaborate continuation of `` Th...,4,train,very positive
2,Singer/composer Bryan Adams contributes a slew...,3,train,positive
3,You 'd think by now America would have had eno...,2,train,neutral
4,Yet the act is still charming here .,3,train,positive
...,...,...,...,...
1096,it seems to me the film is about the art of ri...,1,validate,negative
1097,It 's just disappointingly superficial -- a mo...,1,validate,negative
1098,The title not only describes its main characte...,1,validate,negative
1099,Sometimes it feels as if it might have been ma...,2,validate,neutral


In [10]:
# use plotly.express to plot df label count distribution as bar plot
fig = px.histogram(df, x="sentiment", color="data_type", barmode="group", title="Label count distribution")
# order the x-axis by sentiment
fig.update_xaxes(categoryorder="array", categoryarray=["very negative", "negative", "neutral", "positive", "very positive"])
fig.show()    

## Pre-processing text

Tokenization, Removing Stop Words and Lemmatization:
Word tokanization splits a sentence into tokens (words and punctuation)
Lemmatization gets the base form of the word;  NLTK WordNetLemmatizer treats everythins as a noun

In [101]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/elliemcintosh/nltk_data...


True

In [11]:
def preprocess_text(text):
    """ Preprocess string by removing all non-alphanumeric characters, 
    replacing all runs of whitespace with a no space, and digits with no space."""

    # remove all non-alphanumeric characters
    text = re.sub(r"[^\w\s]", "", text)

    # replace all runs of whitespace with a single no space
    text = re.sub(r"\s+", "", text)

    # replace all digits with no space
    text = re.sub(r"\d", "", text)

    return text

def tockenize_text(x_train, x_test):
    """ Create a one-hot dictionaty of tokens for train and test data, tokenize train and test data,
     and return tokenized train and test data,  and vocabulary. """
    word_list = []
    stop_words = set(stopwords.words("english"))
    
    for sentance in x_train:
        for word in sentance.lower().split():
            word = preprocess_text(word)
            if word not in stop_words and word != "":
                word_list.append(word)

    corpus = Counter(word_list)
    # sort the corpus by frequency
    corpus = sorted(corpus, key=corpus.get, reverse=True)
    # create a one-hot dictionary, which maps each word to a unique integer
    one_hot_dict = {word: i for i, word in enumerate(corpus, 1)}

    # tokenize train and test data
    x_train_tokens = [[one_hot_dict[preprocess_text(word)] for word in sentance.lower().split() if preprocess_text(word) in one_hot_dict.keys()] for sentance in x_train]
    x_test_tokens = [[one_hot_dict[preprocess_text(word)] for word in sentance.lower().split() if preprocess_text(word) in one_hot_dict.keys()] for sentance in x_test]

    return x_train_tokens, x_test_tokens, one_hot_dict


In [12]:
x_train_tock, x_test_tock, vocab = tockenize_text(x_train, x_test)

In [13]:
print(f"Length of vocabulary is {len(vocab)}")
# print out the first (sorted of basis of most common) 10 key value paris of vocab, 
# where each word is mapped to a unique integer index
[(k, v) for i,(k,v) in enumerate(vocab.items()) if i < 10]

Length of vocabulary is 16125


[('film', 1),
 ('movie', 2),
 ('nt', 3),
 ('one', 4),
 ('like', 5),
 ('story', 6),
 ('much', 7),
 ('good', 8),
 ('even', 9),
 ('comedy', 10)]

In [14]:
# each sentence of the training/test text is tokenized into a list of integers
x_train_tock[0]

[473,
 3271,
 2296,
 1128,
 31,
 5379,
 156,
 17,
 4099,
 9,
 3272,
 1735,
 2674,
 8001,
 1547,
 5380,
 710,
 8002]

### Analysis of sentence length

Pytorch TensorDatasett and DataLoader classes used for batching and loading data  expect all tensors in a batch to have the same shape.

In [15]:
sentence_lengths = [len(sentence) for sentence in x_train]
# plot the distribution of sentence lengths in train data with plotly.express
fig = px.histogram(x=sentence_lengths, title="Distribution of sentence lengths in train data")
fig.show()

pd.Series(sentence_lengths).describe()

count    8544.000000
mean      102.310393
std        51.430023
min         4.000000
25%        62.000000
50%        98.000000
75%       137.000000
max       267.000000
dtype: float64

## Padding

In [16]:
sen_len = max(len(sentence) for sentence in x_train)
print(f'maximum sentence length is {sen_len}')

maximum sentence length is 267


In [17]:
def padding(sentences, sen_len, padding="pre"):
    """ Pad with zeros  or truncate each sentence of a sentences to a chosen sentence length. """
    features = np.zeros((len(sentences), sen_len), dtype=int)
    for i, sentence in enumerate(sentences):
        if len(sentence) != 0:
            if len(sentence) >= sen_len:
                features[i, :] = np.array(sentence)[:sen_len]
            else:
                if padding == "pre":
                    features[i, -len(sentence):] = np.array(sentence) # pre-padding
                else:
                    features[i, 0:len(sentence)] = np.array(sentence)  # post-padding
    return features

In [18]:
# pad the input sentences in train and test data and turn them into numpy arrays
x_train_tock_pad = padding(x_train_tock, sen_len)
x_test_tock_pad = padding(x_test_tock, sen_len)

# turn labels into numpy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

## Batching and Loading

In [19]:
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(x_train_tock_pad), torch.from_numpy(y_train))
test_data = TensorDataset(torch.from_numpy(x_test_tock_pad), torch.from_numpy(y_test))

# define dataloaders, shuffling the data at each epoch
batch_size = 32

# load batched and shuffled data in parallel with DataLoader object
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, num_workers=2)  
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size, num_workers=2)

In [20]:
x_train_tock_pad.shape

(8544, 267)

In [21]:
# obtain one batch of training data
data_iter = iter(train_loader)
one_batch = next(data_iter)

x_train_batch, y_train_batch = one_batch

print(f"Batch of input sentences: \n{x_train_batch}")
print(f"Batch of sentiment for input sentences: \n {y_train_batch}")

Batch of input sentences: 
tensor([[    0,     0,     0,  ..., 13339, 13340,    15],
        [    0,     0,     0,  ...,     0,   110,  6991],
        [    0,     0,     0,  ...,   997,    81,   651],
        ...,
        [    0,     0,     0,  ...,  1149,  9574,   133],
        [    0,     0,     0,  ...,  1388,    90,     2],
        [    0,     0,     0,  ...,     0,     0,   532]])
Batch of sentiment for input sentences: 
 tensor([1, 3, 3, 3, 4, 3, 3, 1, 1, 3, 4, 2, 0, 4, 3, 0, 0, 1, 3, 1, 2, 2, 3, 2,
        2, 3, 2, 3, 3, 3, 2, 3])


##  Word Embedding Background

Word2Vec is a continuous representaion of a word in N-dim vector of "context words" scores, which indicate how context words are related to the given word. In order to learn the word embbeddings need to train the Skip-Gram or Continuous Bag Of Words models on large text data: the Ski-Gram model learns to predict context words frrom target. The CBOW model predicts the target word according to its context represented as a bag of the words contained in a fixed size window around the target word. The learing is called **self-supervised learning** since the traning labels are automatically created from the given dataset.  Words embedding is a by-product of these fake problems

Can train word embeddings using own text data with **fastText**, install as in
https://fasttext.cc/docs/en/support.html or train the model with nltk train data ither on cml % ./fasttext skipgram -input /Users/elliemcintosh/nltk_data/corpora/treebank/train.txt -output model or with python, which create a binary representation of the trained model in model.bin file.
fastText breaks words down to n-grams, learns and adds the character n-grams embeddings assosiaced with the target word: 
do, dog, ogg, ggy, gy for "doggy" with n=3. 
With character n-grams seen in the training data, word vectors can be assigned to words that did not appear in the train set,OutOfVocabulary or OOV words.

Or download pre-trained published model GloVe - https://nlp.stanford.edu/projects/glove which provides multiple word-embedding files trained using different datasets and vector size; the dimensionality of the emedding is suffix of the file name.

In [None]:
import fasttext

help(fasttext.FastText)

Help on module fasttext.FastText in fasttext:

NAME
    fasttext.FastText

DESCRIPTION
    # Copyright (c) 2017-present, Facebook, Inc.
    # All rights reserved.
    #
    # This source code is licensed under the MIT license found in the
    # LICENSE file in the root directory of this source tree.

FUNCTIONS
    cbow(*kargs, **kwargs)
    
    load_model(path)
        Load a model given a filepath and return a model object.
    
    read_args(arg_list, arg_dict, arg_names, default_values)
    
    skipgram(*kargs, **kwargs)
    
    supervised(*kargs, **kwargs)
    
    tokenize(text)
        Given a string of text, tokenize it and return a list of tokens
    
    train_supervised(*kargs, **kwargs)
        Train a supervised model and return a model object.
        
        input must be a filepath. The input text does not need to be tokenized
        as per the tokenize function, but it must be preprocessed and encoded
        as UTF-8. You might want to consult standard preprocessi

In [206]:
# train fastText model on the nltk  treebank tain data; fastText provides two models for computing word representations: skipgram and cbow
model = fasttext.train_unsupervised('/Users/elliemcintosh/Documents/Python/ManningBooks_LiveProjects/NLP/data/nltk_treebank/train.txt', 
                                    model='skipgram')
# the train  model returns all words in  the vocabulary  sorted  by decreasing  frequency
model.words

Read 0M words
Number of words:  3379
Number of labels: 0
Progress: 100.0% words/sec/thread:  189305 lr:  0.000000 avg.loss:  2.558970 ETA:   0h 0m 0s


['(2',
 '(3',
 '(1',
 '(4',
 '</s>',
 '(0',
 'the)',
 'a)',
 'of)',
 'and))',
 ',))',
 '.)))',
 ',)',
 'to)',
 'is)',
 '.))',
 'that)',
 'in)',
 'it)',
 "'s)",
 'The)',
 'as)',
 '.)))))',
 'with)',
 'for)',
 'its)',
 'but))',
 'A)',
 '.))))',
 'an)',
 'this)',
 "n't))",
 'It)',
 'you)',
 'be)',
 'on)',
 "'s)))",
 "'s))",
 'by)',
 'are)',
 'has)',
 'about)',
 'from)',
 'film))',
 'at)',
 'have)',
 'than)',
 'his)',
 'like)',
 'I)',
 'more)',
 'movie))',
 '-LRB-)',
 'all)',
 'and)',
 'or))',
 'so)',
 'not))',
 'who)',
 'one)',
 'into)',
 '...)',
 '--))',
 'does)',
 '...))',
 'too)',
 'will)',
 'This)',
 'can)',
 'most)',
 'if)',
 'their)',
 'no)',
 'some)',
 '``)',
 'up))',
 'it))',
 'what)',
 'out))',
 'film)))',
 'film)',
 'An)',
 '`)',
 '.))))))',
 'he)',
 'not)',
 'your)',
 'make)',
 'would)',
 'good)',
 'been)',
 'very)',
 'movie)))',
 'may)',
 'makes)',
 'but)',
 'they)',
 'any)',
 'do)',
 'even)',
 'was)',
 'there)',
 'just)',
 'could)',
 'If)',
 'little)',
 '-RRB-)))',
 'much)',


In [209]:
# a quiry word 100  dim  vector embbeding
model.get_word_vector('dog')

array([ 0.16253328,  0.0819099 ,  0.01472814,  0.03313632,  0.10125743,
       -0.00929637,  0.05601443,  0.03101848,  0.05921847,  0.01711445,
       -0.04059543,  0.04245573, -0.03731231,  0.02569779,  0.0145978 ,
        0.10325693,  0.02197906, -0.02770103,  0.08043672, -0.0395107 ,
       -0.10118322,  0.09178766,  0.23023367, -0.02440606, -0.05144547,
       -0.15267608, -0.01294647, -0.10742956,  0.03991385,  0.21038455,
       -0.01549202,  0.08480956, -0.0686678 ,  0.0701733 , -0.07046919,
        0.04888634,  0.04524221,  0.02878515, -0.04175093,  0.00584948,
       -0.05867366,  0.04517206,  0.00658897,  0.11184171, -0.00776079,
       -0.08310948, -0.0567942 ,  0.1035456 ,  0.03385323,  0.05159619,
       -0.044962  ,  0.03537765, -0.01541902, -0.04078512, -0.00461746,
       -0.06302381,  0.01774449,  0.00406195,  0.00242029,  0.02014366,
        0.01594901,  0.11802742, -0.04625532,  0.06356614,  0.0351117 ,
       -0.06681772, -0.0548759 ,  0.07126195,  0.02140171,  0.05

In [223]:
# use fastText train model to find the 5  closest words to query word
print(model.get_nearest_neighbors('dog', k=5))
print(model.get_nearest_neighbors('doggy', k=5))

[(0.9188043475151062, 'do)'), (0.9081246256828308, 'much))'), (0.9012392163276672, 'much)'), (0.8770462274551392, 'much)))'), (0.8681175112724304, 'happens)')]
[(0.9171121120452881, 'do)'), (0.9080454111099243, 'much))'), (0.8988326191902161, 'much)'), (0.876613438129425, 'much)))'), (0.8642932772636414, 'happens)')]


### GloVe 50 dim word embedings

In [122]:
# download  pre-trained CloVe  word  embeddings in /data/glove
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2024-01-11 14:13:34--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-01-11 14:13:34--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-01-11 14:13:34--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [124]:
# display first 5 lines of the 50 dimensional words embedding in glove.6B.50d.txt
!head -n 5 glove/glove.6B.50d.txt

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581
, 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709 -0.42852 -0.55641 -0.364 -0.23938 0.13001 -0.063734 -0.39575 -0.48162 0.23291 0.090201 -0.13324 0.078639 -0.41634 -0.15428 0.10068 0.48891 0.31226 -0.1252 -0.037512 -1.5179 0.12612 -0.02442 -0.042961 -0.28351 3.5416 -0.11956 -0.014533 -0.1499 0.21864 -0.33412 -0.13872 0.31806 0.70358 0.44858 -0.080262 0.63003 0.32111 -0.46765 0.22786 0.36034 -0.37818 -0.56657 0.044691 0.30392
. 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973 -0.43478 -0.31086 -0.44999 -0.29486 0.16608 0.11963 -0.41328 -0.42353

In [None]:
def find_neighbors(query_word, vocabulary, embeddings, n_neighbors=5):
    """ For a query word, find the n_neighbors closest vocabulary words based on the embeddings."""
    # find the index of the query word in the vocabulary
    query_index = vocabulary.index(query_word)
    
    # get the embedding of the query word
    query_embedding = embeddings[query_index]
    
    # calculate the distance between the query embedding and all other embeddings
    distances = []
    for embedding in embeddings:
        distance = np.linalg.norm(query_embedding - embedding)
        distances.append(distance)
    
    # sort the distances and return the indices of the n_neighbors closest words
    sorted_distances = np.argsort(distances)
    neighbors = sorted_distances[1:n_neighbors+1]
    
    # return the neighbors
    return neighbors


In [4]:
# load the 50 dimensional GloVe embeddings
import pandas as pd
import csv

words = pd.read_table("../data/glove/glove.6B.50d.txt", 
                      sep=" ", 
                      index_col=0, 
                      header=None, 
                      quoting=csv.QUOTE_NONE)

In [5]:
words.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,41,42,43,44,45,46,47,48,49,50
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the,0.418,0.24968,-0.41242,0.1217,0.34527,-0.044457,-0.49688,-0.17862,-0.00066,-0.6566,...,-0.29871,-0.15749,-0.34758,-0.045637,-0.44251,0.18785,0.002785,-0.18411,-0.11514,-0.78581
",",0.013441,0.23682,-0.16899,0.40951,0.63812,0.47709,-0.42852,-0.55641,-0.364,-0.23938,...,-0.080262,0.63003,0.32111,-0.46765,0.22786,0.36034,-0.37818,-0.56657,0.044691,0.30392
.,0.15164,0.30177,-0.16763,0.17684,0.31719,0.33973,-0.43478,-0.31086,-0.44999,-0.29486,...,-6.4e-05,0.068987,0.087939,-0.10285,-0.13931,0.22314,-0.080803,-0.35652,0.016413,0.10216
of,0.70853,0.57088,-0.4716,0.18048,0.54449,0.72603,0.18157,-0.52393,0.10381,-0.17566,...,-0.34727,0.28483,0.075693,-0.062178,-0.38988,0.22902,-0.21617,-0.22562,-0.093918,-0.80375
to,0.68047,-0.039263,0.30186,-0.17792,0.42962,0.032246,-0.41376,0.13228,-0.29847,-0.085253,...,-0.094375,0.018324,0.21048,-0.03088,-0.19722,0.082279,-0.09434,-0.073297,-0.064699,-0.26044


In [192]:
def vec(w):
    """Get the word embedding vector for word w as a numpy array."""
    return words.loc[w].values.reshape(1, -1)

def find_n_closest_words(v, words, n):
    """Find the n closest words to query word vector v,
    calculating the  Euclidean distance between their word 
    vectr embeddings. """
  
    # calculate the element-wise difference between each word vector and the query word vector
    diff = words - v
    # square the difference  and sum up the along the rows
    # no sqrt needed for computing the max distances since monotonic
    delta = np.sum(diff * diff, axis=1)
    # sort the deltas in  ascending  order, and get the index  values of the first n elements
    closest_words_ids  = np.argsort(delta)[ 1 : n + 1 ]
    # turn the nearest n vectors indeces into words
    closest_words = words.iloc[closest_words_ids].index.values
    
    return closest_words


In [197]:
print(f"Five of Glove 50 dim words embedings\n: {words.iloc[:5, :5]}\n")
print(f"Embedding of the word 'the'\n: {vec('the')}")

Five of Glove 50 dim words embedings
:             1         2        3        4        5
0                                                 
the  0.418000  0.249680 -0.41242  0.12170  0.34527
,    0.013441  0.236820 -0.16899  0.40951  0.63812
.    0.151640  0.301770 -0.16763  0.17684  0.31719
of   0.708530  0.570880 -0.47160  0.18048  0.54449
to   0.680470 -0.039263  0.30186 -0.17792  0.42962

Embedding of the word 'the'
: [[ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
  -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
  -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
  -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
  -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
   4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
   1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
  -3.4758e-01 -4.5637e-02 -4.4251e-01  1.8785e-01  2.78

In [224]:
v1 = vec("dog")
find_n_closest_words(v1, words, 5)

0
,       5450
.       3876
of     22454
to     12425
and     6658
dtype: int64


array(['cat', 'dogs', 'puppy', 'rabbit', 'pet'], dtype=object)

In [225]:
v2 = vec("doggy")
find_n_closest_words(v2, words, 5)

0
,       25789
.       27894
of     129247
to      29937
and     42641
dtype: int64


array(['snoop', 'dogg', 'kurupt', 'dre', 'scarecrow'], dtype=object)

### Visualize GloVe embeddings

In [14]:
from sklearn.manifold import TSNE

# Visualize the first 1000 GloVe words embbbedings in tsne 2d space
tsne = TSNE(n_components=2, init='pca', random_state=0)
words_embeddings_2d = tsne.fit_transform(words.head(1000))

In [22]:
w2d_df = pd.DataFrame(words_embeddings_2d, index=words.index.values[:1000])
w2d_df.columns = ["x", "y"]
w2d_df["word"] = w2d_df.index.values

# use plotly to plot the 2d embeddings dataframe
fig = px.scatter(w2d_df, x="x", y="y", 
                 color = "word", 
                 title = "TSNE 2d Embeddings of the first 1000 GloVe words")
fig.update_traces(mode="markers", text="word", marker={'size':4})
fig.show()

## Building LSTM Recurrent Neural Network

**RRN**s uses sequential data and apply to ordinal problems. Their outpit depends on the prior elements of the sequence, as the output of a given step is provided alongside the input of the next step. RNNs do become less effective learning with increase of input information, the so called **vanishing  gradient** during back propagation. Long Short Memory network, **LSTM** provide solution to this long term dependency problem, by addition of internal state, called  **cell** which allowes a relevent prior information to be kept. The "memory" cell consists of *forget gate*, *input gate* and *output gate*. It output numbers bw 0 = gate closed, i.e. "forget everything" and 1 = gate wide open, i.e. "output all memory to the input  state". 

LSTMs, sequence prediction with long term deependency data that goes alongside it are used in in machine translation, Q&A Chatbots

Gate Recurrent Units **GRU**s take the internal/ memory/ hidden state and the input and *relevence gate* and *update gate* whose value are between 0 and 1, determining how much of the previous information will be overwrritten

## Sentiment analysis with LSTM

In [21]:
class SentimentRNN(nn.Module):
    """ A Long Short Term Memory RNN model for sentiment analysis with embedding layer and hidden layer."""

    def __init__(self, input_size, embedding_dim, hidden_dim, num_layers, num_classes, drop_prob=0.5):
        """Initialize the model by setting up the layers."""
        
        #  call constructor of the parent class
        super(SentimentRNN, self).__init__()

        self.input_size = input_size

        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes
        self.num_layers = num_layers

        # create embedding layer 
        self.embedding = nn.Embedding(num_embeddings = input_size,
                                      embedding_dim = embedding_dim)    
        # create  LSTM layer
        self.lstm = nn.LSTM(embedding_dim,  
                           hidden_dim, 
                           num_layers, 
                           dropout = drop_prob,
                           batch_first=True)
        
        # initialize the weights of the LSTM layer
        for name, param in self.lstm.named_parameters():
            if 'weight_ih' in name:
                torch.nn.init.xavier_uniform_(param.data)
            elif 'weight_hh' in name:
                torch.nn.init.orthogonal_(param.data)
            elif 'bias' in name:
                param.data.fill_(0)

        # drpout layer
        self.dropout = nn.Dropout(0.3)
        # linear layer that maps the lstm ouput size of (doubling if bidrectional), to output_dim
        self.fc = nn.Linear(self.hidden_dim, self.num_classes)

        # softmax layer turns the output alog the dim specified into a probability distribution 
        self.softmax  = nn.Softmax(dim=1)

    def forward(self, x):
        """ Function that takes a batch of input sentences, passes it through the 
        embedding, lstm and linear layers to output logits."""
        
        batch_size = x.size(0)
        
        # initialize the hidden and cell states 2 timese the number of layers if bidirectional
        hidden_state = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        cell_state = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)

        hidden = (hidden_state, cell_state)
        # pass the input tensor through the embedding layer, 
        embbedings = self.embedding(x)

        # pass the embeddings through the LSTM layer
        lstm_out, hidden = self.lstm(embbedings, hidden)
        # the lstm output is a tensor with shape (batch_size, sequence_length, hidden_dim),  
        # slice the  last time step of each batch sequence as a tensor of shape (batch_size, hidden_dim)
        lstm_out = lstm_out[:, -1, :]
        out = self.fc(lstm_out)
        out = self.softmax(out)
        # return each  class final probabilities
        return out
    


In [22]:
# instantiate the model with the hyperparameters
input_size = len(vocab) + 1
embedding_dim = 128
hidden_dim = 128
num_layers = 2
num_classes = 5

model =  SentimentRNN(input_size, 
                      embedding_dim, 
                      hidden_dim, 
                      num_layers, 
                      num_classes, 
                      drop_prob = 0.5).to(device)

print(model)

SentimentRNN(
  (embedding): Embedding(16126, 128)
  (lstm): LSTM(128, 128, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=128, out_features=5, bias=True)
  (softmax): Softmax(dim=1)
)


## Training

In [23]:
# set up learning rate, loss and optimization functions
lr = 0.001

# define the loss function as cross entropy loss
criterion = nn.CrossEntropyLoss()
# define the optimization function as Adam, less  sensitive to the learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# set gradient clipping to avoid exploding gradients; the max allowed norm of the gradients is a hyper parameter
clip = 5
epochs = 10

In [24]:
def train(data_loader, model, optimizer, device, criterion, clip):
    """ Train the model on the training set. """

    # set model to training mode
    model.train()
    # go through the batches of data in data_loader:
    for data in data_loader:
        
        sentences = data[0]
        sentiment = data[1]
        # move the data to the device
        sentences = sentences.to(device) 
        sentiment = sentiment.to(device)  
        # clear the gradients
        optimizer.zero_grad()
        # Forward pass
        predictions = model(sentences)
        # get the the class with the highest probability
        #predicted = torch.argmax(predictions, dim=1)
        loss = criterion(predictions, sentiment)
        loss.backward()

        # gradient clipping to prevent exploding gradients
        nn.utils.clip_grad_norm_(model.parameters(),  clip)
        
        optimizer.step()

def evaluate(data_loader, model, device):

    final_predictions = []
    final_targets = []
    
    model.eval()

    for data in data_loader:
        sentences = data[0]
        sentiment = data[1]

        sentences = sentences.to(device)
        sentiment = sentiment.to(device)

        outputs = model(sentences)
        # get a 1D tesor of the indices of the classes with maximum probabilities
        predictions =torch.argmax(outputs, dim=1)
        
        # use detatch() to separatet the tensor from the computational graph
        predictions = predictions.detach().cpu().numpy().tolist()
        sentiment = sentiment.detach().cpu().numpy().tolist()

        final_predictions.extend(predictions)
        final_targets.extend(sentiment)     

        return  final_predictions, final_targets   

In [25]:
best_accuracy = 0
for epoch in range(1, epochs + 1):
        train(train_loader, model, optimizer, device, criterion, clip)
        outputs, targets = evaluate(test_loader, model, device)
        accuracy = metrics.accuracy_score(targets, outputs)
        
        print(f"Epoch:{epoch}, Accuracy Score: {accuracy}")

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            
print(f"Best accuracy is {best_accuracy}")

Epoch:1, Accuracy Score: 0.34375
Epoch:2, Accuracy Score: 0.3125
Epoch:3, Accuracy Score: 0.25
Epoch:4, Accuracy Score: 0.3125
Epoch:5, Accuracy Score: 0.375
Epoch:6, Accuracy Score: 0.21875
Epoch:7, Accuracy Score: 0.28125
Epoch:8, Accuracy Score: 0.40625
Epoch:9, Accuracy Score: 0.34375
Epoch:10, Accuracy Score: 0.3125
Best accuracy is 0.40625
