# Special Topics: Natural Language Processing
## Assignment #2: Word2Vec Implementation
## Clayton Haley

In [1]:
import pandas as pd
import random
import nltk
from nltk.stem import WordNetLemmatizer
from math import log2
from sklearn.metrics.pairwise import cosine_similarity

import tensorflow as tf
from keras import optimizers
from tensorflow.keras import backend as K
from tensorflow.python.ops import math_ops
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding

import numpy as np
from scipy import spatial
import string
from nltk.corpus import stopwords 

import spacy

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
reviews_df = pd.DataFrame(pd.read_csv("Review_word2vec.csv"))
reviews_df.head()

Unnamed: 0.1,Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1395,1396,B00068PCTU,A1WLOJBYGZ49HL,"Brian F., Gloucester, MA",0,0,5,1333843200,The best I've ever had!,I am so happy I discovered this product. I had...
1,343222,343223,B000WFEN74,A14PLPD8AWSF3R,"A. isa ""sevendaymagic""",3,5,2,1340928000,Most of the Cans Were Dented,I've been ordering a lot of these. Many of the...
2,163165,163166,B00015UC8O,AR6KPQ78PW5WG,jturney,4,4,5,1285027200,baking aide,"I'm learning to make sour dough bread, and the..."
3,98713,98714,B001EO5Q1E,A2FPN31EGMHSIB,LinLou,0,0,5,1293753600,pleased,"I cannot find this product in any store,so I w..."
4,550051,550052,B000CQIDJM,A12E78L9515IF6,Husam J. Alnajjar,0,0,4,1209686400,only a hint of flavour,"Been a fan of Stash teas, but I never ventured..."


In [3]:
reviews_text = list(reviews_df.Text)
reviews_text[3]

'I cannot find this product in any store,so I was very pleased to be able to get it at Amazon.'

## Implementation of Word2Vec

In [4]:
class Word2Vec:
    
    """
    Initializes global constants:
        corpus
        embedding_size
        training epochs
        window size
        vocab size
    """
    def __init__(self, corpus, window_size):
        self.corpus = corpus 
        self.embedding_size = 300
        self.epochs = 10
        self.window_size = window_size
        self.vocab_size = None
    
    """
    Removes stop words and puncuation from all reviews
    
    return: clean_docs
    """
    def _remove_stop(self):
        stop_words = set(stopwords.words('english'))
        punctuation = set(string.punctuation)
        
        clean_docs = []
        
        for doc in self.corpus:
            # Remove stops
            no_stops = ' '.join([word for word in doc.lower().split() if word not in stop_words])
            
            # Remove punctuation
            punc_free = ''.join(symbol for symbol in no_stops if symbol not in punctuation)
            clean_docs.append(punc_free)
                    
        return clean_docs
    
    """
    The preprocessing function removes stop words, lemmatizes, and reduces
    the corpus size based on if a review contains 'coffee', 'pasta', 'tuna',
    or 'cookies'. A subset of these reviews are used for the final corpus.
    This function also performs the sliding window function and creates one hot vectors.
    
    return: input_words
            target_words
            one_hot_encodings_x
            one_hot_encodings_y
            vocab_size
    """
    def _preprocess(self):
        
        print('Removing Stop Words...', '\n')
        
        no_stops = self._remove_stop()

        print('Lemmatizing...', '\n')
        
        nlp = spacy.load("en_core_web_sm")
        parts_of_speech = ['ADJ','NOUN','VERB']
        
        #temp_corpus = []
        #for review in no_stops:
            #doc = nlp(review)
            #sentences = list(doc.sents)
        
        lemmatizer = WordNetLemmatizer()
        temp_corpus = []
        
        for review in no_stops:
            doc = nlp(review)
            lemmatized = " ".join(lemmatizer.lemmatize(word) for word in doc.split() 
                                  if word.pos_ in parts_of_speech)
            lemmatized = [value for value in lemmatized.split() if not value.isdigit()]
            temp_corpus.append(lemmatized)
        
        final_corpus = []
        
        for review in temp_corpus:
            if 'coffee' in review:
                final_corpus.append(review)
            elif 'pasta' in review:
                final_corpus.append(review)
            elif 'tuna' in review:
                final_corpus.append(review)
            elif 'cookies' in review:
                final_corpus.append(review)
            else:
                continue

        final_corpus = random.sample(final_corpus, int((len(final_corpus)/5)))
        set_of_words = set([word for sentence in final_corpus for word in sentence])
    
        vocab_size = len(set_of_words)
            
        print('Utilizing sliding window to find word groupings...', '\n')
        neighbors = self._window_slider(final_corpus)
        
        input_words = [i[0] for i in neighbors]
        target_words = [i[1] for i in neighbors]
        
        print('Creating one hot encodings...', '\n')

        one_hot_encodings_x = self._one_hot(input_words, vocab_size)
        one_hot_encodings_y = self._one_hot(target_words, vocab_size)
        
        print('Preprocessing Finished!', '\n')
        
        return input_words, target_words, one_hot_encodings_x, one_hot_encodings_y, vocab_size
    
    """
    Performs a sliding window mechanism that which selects words within a specified length.
    The window moves over one index each iteration.
    
    return: neighbors - pairs of words
    """
    def _window_slider(self, corpus):
        neighbors = []
        for sentence in corpus:
            # Specify window
            window = [sentence[x:x+self.window_size] for x in range(len(sentence) - self.window_size + 1 )]
            
            # Get context and center words
            for triplet in window:
                center_index = int((len(triplet) - 1) / 2)
                context = sum([triplet[0:center_index],triplet[center_index+1:len(triplet)]], [])
                
                # Create pairs
                for context_word in context:
                    pair = [triplet[center_index], context_word]
                    neighbors.append(pair)

        return neighbors
    
    """
    Creates one hot encodings for input and target words. Using Tensorflow's 
    one hot encoding function so that the model can understand the inputs.
    
    The one hot encoding array returns indices for each word. Each index represents
    the location of "1"
    
    return: one_hot_encodings
    """
    def _one_hot(self, words, vocab_size):
        one_hot_encodings = [one_hot(word, vocab_size) for word in words]
        return one_hot_encodings
    
    """
    Run the tensorflow sequential model with custom backend loss function.
    
    return: embedding layer weights
    """
    def _run_model(self, x, y, vocab_size):
        
        print('Run Model...', '\n')
        
        model = Sequential()
        
        # Create embedding layer: vocab_size x self.embedding_size
        model.add(Embedding(vocab_size, self.embedding_size, 
                            input_length=1, name="embedding"))
        model.add(Flatten())
        
        # Output layer with softmax activation
        model.add(Dense(1, activation='softmax'))
        
        # Subgradient Descent optimizer
        optimizer = tf.keras.optimizers.SGD()
        
        # Use custom loss function instead of built-in function
        model.compile(optimizer=optimizer, loss=self._cross_entropy_loss, metrics=['accuracy'])
        
        # Fit the model
        model.fit(x, y, epochs=10, verbose=0)
        
        print('Training Finished!')
        
        weights = model.get_layer('embedding').get_weights()[0]
        
        
        return weights
    
    """
    Computes Cross Entropy Loss for Skip-gram network
    
    return: loss
    """
    def _cross_entropy_loss(self, actual, pred):
        actual = tf.cast(actual, tf.float32)
        pred = tf.cast(pred, tf.float32)
        error = actual*pred
        loss = -K.sum(error)
        return loss
    
    """
    Returns the cosine similarity between two vectors
    """
    def _cosine_similarity(self, word_vec_1, word_vec_2):
        return (1 - spatial.distance.cosine(word_vec_1, word_vec_2))
    
    """
    Solves Analogies given 3 input words, and the word_vectors
    Example: Spain is to Spanish as Germany is to German
    return: missing_word - word with highest cosine similarity
    """
    def _solve_analogy(self, word_1, word_2, word_3, word_vectors):
        # Initialize as -1 at beginning of iteration 
        # because values will be higher than this one
        best_similarity = -1

        missing_word = None
        
        # Word vectors for input words
        vec_1,vec_2,vec_3 = np.array(word_vectors[f'{word_1}']).astype(float), \
                               np.array(word_vectors[f'{word_2}']).astype(float), \
                               np.array(word_vectors[f'{word_3}']).astype(float)
        
        # Only compute cosine similarity for all word vecs
        # excluding the input vectos
        for i in word_vectors.columns:
            if i in [word_1,word_2,word_3]:
                continue
            
            # Test vector
            try_vector = np.array(word_vectors[f'{i}']).astype(float)
            
            # Compute cosine similarity
            # Logic is as follows: Spanish - Spain (Country) = try_vector - German
            similarity = cosine_similarity([vec_2-vec_1], [try_vector-vec_3])
            
            if similarity > best_similarity:
                best_similarity = similarity
                missing_word = i     

        return missing_word

In [5]:
model = Word2Vec(reviews_text, window_size = 5)
input_words, target_words, x, y, vocab_size = model._preprocess()

Removing Stop Words... 

Lemmatizing... 



AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'split'

In [None]:
embeddings = model._run_model(x, y, vocab_size)

## 2. Find Similar Words

In [None]:
words = ['coffee', 'pasta', 'tuna', 'cookie']

for word in words:
    sims = []
    indices = []
    for index, embedding in enumerate(embeddings):

        if x[input_words.index(word)][0] == index:
            continue
        
        # Compute cosine similarity
        similarity = model._cosine_similarity(embeddings[x[input_words.index(word)][0]].reshape(-1, 1), 
                                              embeddings[index].reshape(-1, 1))
        sims.append(similarity)
        indices.append(index)

    sims = np.array(sims)
    indices = np.array(indices)
    
    # Get top 10 words
    top_words = [target_words[i] for i in list(sims.argsort()[-10:])]
    print(f'The top ten words for {word} are: {top_words}')

## 3. Word Analogies 

In [None]:
glove_list = []
with open('new_glove.txt') as file:
    for line in file:
        inner_list = [word.rstrip() for word in line.split(' ')]
        glove_list.append(inner_list)

In [None]:
glove_dict = {}

for embedding in glove_list:
    glove_dict[f'{embedding[0]}'] = embedding[1:len(embedding)]
    
glove_df = pd.DataFrame(glove_dict)
glove_df.head()

In [None]:
analogy_1 = model._solve_analogy('spain', 'spanish', 'germany', glove_df)
analogy_2 = model._solve_analogy('japan', 'tokyo', 'france', glove_df)
analogy_3 = model._solve_analogy('woman', 'man', 'queen', glove_df)
analogy_4 = model._solve_analogy('australia', 'hotdog', 'italy', glove_df)

In [None]:
print(f'Spain is to Spanish as Germany is to {str(analogy_1)}')
print(f'Japan is to Tokyo as France is to {str(analogy_2)}')
print(f'Woman is to Man as Queen is to {str(analogy_3)}')
print(f'Australia is to Hotdog as Italy is to {str(analogy_4)}')