# GloVe Word Embeddings

In [1]:
# Import necessary libraries
import scipy.sparse
import os
import pandas as pd
import numpy as np
from numpy import asarray
import pickle

import gensim.downloader as api
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from joblib import dump

from afinn import Afinn
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn

In [2]:
# Set paths
# abspath() resolves the relative notebook name against the current working
# directory, so script_dir is the directory the kernel is running in
script_dir = os.path.dirname(os.path.abspath('processor.ipynb'))
data_path = os.path.join(script_dir, 'Thesis_Jupyter_Final/src/')
print(data_path)

# Raw inputs and the processed splits both live below data_path
input_folder_path = os.path.join(data_path, 'input')
processed_folder_path = os.path.join(data_path, 'input/processed/normal')

/home2/s3985113/Thesis_Jupyter_Final/src/


In [3]:
# Set constants and other variables
# Number of label classes; sets the width of the one-hot encoded label arrays
NUM_of_CLASSES = 3
# Length of each word vector; has to match the dimensionality of the pretrained model below
EMBEDDING_DIM = 100

# Name of the pretrained model fetched through gensim's downloader; also used as the local file name
glove_model = "glove-twitter-100"

In [4]:
def load_data(file_path):
    """
    Read a csv file and split it into its review and label columns.

    Args:
        file_path (str): Location of the csv file; it must contain the columns 'x' and 'y'.

    Returns:
        x (Series): The 'x' column (the reviews).
        y (Series): The 'y' column (the labels).
    """

    frame = pd.read_csv(file_path)
    return frame['x'], frame['y']


def load_vocab(file_path):
    """
    Read a pickled vocabulary from disk and report how many entries it holds.

    Args:
        file_path (str): Location of the pickle file.

    Returns:
        vocab (list): The unpickled vocabulary.
        vocab_size (int): Number of entries in the vocabulary.
    """

    # pickle can execute arbitrary code while loading; only open files produced by this project
    with open(file_path, 'rb') as handle:
        vocab = pickle.load(handle)

    return vocab, len(vocab)
    

# Load the train / validation / test splits from the processed folder
x_train, y_train = load_data(os.path.join(processed_folder_path, "train.csv"))
x_val, y_val = load_data(os.path.join(processed_folder_path, "val.csv"))
x_test, y_test = load_data(os.path.join(processed_folder_path, "test.csv"))
# Quick look at the first few training reviews
print(x_train[:5])
print()

# Load the pickled vocabulary stored next to the processed splits
vocab_data_filename = "vocab.pkl"
vocab, vocab_size = load_vocab(os.path.join(processed_folder_path, vocab_data_filename))
print("Vocab size: ", vocab_size)

0    bad superficial speaks fast continually stop k...
1                      let grade purchase disappointed
2    horrible test sense element described generall...
3    least favorite ere far style plot setting deta...
4    guess level look easier broader last crowdsour...
Name: x, dtype: object

Vocab size:  11905


The code provided below draws inspiration from the article on Machine Learning Mastery titled "Develop a Word Embedding Model for Predicting Movie Review Sentiment" available at the following link: [Machine Learning Mastery - Word Embeddings](https://machinelearningmastery.com/develop-word-embedding-model-predicting-movie-review-sentiment/)

# Encode Data

In [5]:
def find_max_seq_len(data):
    """
    Report the length, in whitespace-separated tokens, of the longest review.

    Args:
        data (list): The reviews, one string per entry.

    Returns:
        max_seq_length (int): Token count of the longest review.
    """

    # One token count per review; splitting on whitespace defines what a token is here
    token_counts = [len(review.split()) for review in data]
    max_seq_length = max(token_counts)
    print(f'Maximum review length: {max_seq_length}')

    return max_seq_length

# The padding length is derived from the training reviews only
# NOTE(review): validation/test reviews longer than this are presumably truncated when padded — confirm
max_seq_length = find_max_seq_len(x_train)

Maximum review length: 449


In [6]:
def fit_tokenizer(data):
    """
    Build a tokenizer whose word index is learned from the given reviews.

    The text is taken as-is: no characters are filtered out and casing is kept.

    Args:
        data (array): Review texts used to fit the tokenizer.

    Returns:
        tokenizer (Tokenizer): The fitted tokenizer.
    """

    # filters="" disables the default character filtering (including punctuation);
    # lower=False disables lowercase conversion
    tokenizer = Tokenizer(filters="", lower=False)
    tokenizer.fit_on_texts(data)

    return tokenizer


def encode_text(lines, tokenizer, max_length, filename):
    """
    Turn reviews into padded integer sequences and pickle the result.

    Args:
        lines (array): Review texts to encode.
        tokenizer (Tokenizer): A tokenizer that has already been fitted.
        max_length (int): Length every sequence is padded to.
        filename (str): Base name (without extension) of the pickle file written
            to the processed folder.

    Returns:
        padded (numpy.ndarray): The encoded sequences, padded at the end.
    """

    padded = pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=max_length,
        padding='post',
    )

    # Persist the encoded set so it can be reloaded without re-encoding
    output_path = os.path.join(processed_folder_path, filename + '.pkl')
    with open(output_path, 'wb') as out_file:
        pickle.dump(padded, out_file)

    return padded
    
    
# Fit the tokenizer on the training reviews only, so the word index is built from training data
tokenizer = fit_tokenizer(x_train)

# Encode Data
# All three splits are encoded with the same tokenizer and padded to the same length
x_train_encoded = encode_text(x_train, tokenizer, max_seq_length, "x_train_encoded")
x_val_encoded = encode_text(x_val, tokenizer, max_seq_length, "x_val_encoded")
x_test_encoded = encode_text(x_test, tokenizer, max_seq_length, "x_test_encoded")

print("\nEncoded Data Shape (doc, max_len):\n* train: {}\n* validation: {}\n* test: {}\n".format(x_train_encoded.shape, x_val_encoded.shape, x_test_encoded.shape))
# NOTE(review): the label below says "x_train_tfidf", but the array printed is the padded integer-encoded training set
print("x_train_tfidf:\n{}".format(x_train_encoded))


Encoded Data Shape (doc, max_len):
* train: (41000, 449)
* validation: (11529, 449)
* test: (11899, 449)

x_train_tfidf:
[[  96  549  929 ...    0    0    0]
 [ 453  240 1125 ...    0    0    0]
 [1260   67  312 ...    0    0    0]
 ...
 [   6  380   88 ...    0    0    0]
 [ 125 2234  195 ...    0    0    0]
 [ 761  583  442 ...    0    0    0]]


### Encode y

In [7]:
def one_hot_encode(y):
    """
    Turn integer labels into one-hot rows.

    Label k sets column k - 1, so labels are expected to be 1-based
    (1..NUM_of_CLASSES). A label of 0 would index column -1, i.e. the last column.

    Args:
        y (array): Integer labels.

    Returns:
        y_encoded (array): Array of shape (len(y), NUM_of_CLASSES) with a single 1 per row.
    """

    y_encoded = np.zeros(shape=(len(y), NUM_of_CLASSES))
    # Each row is a view into y_encoded, so writing to it fills the matrix in place
    for row, label in zip(y_encoded, y):
        row[label - 1] = 1

    return y_encoded


# Convert sentiment labels to one-hot encoding
y_train_encoded = one_hot_encode(y_train)
y_val_encoded = one_hot_encode(y_val)
y_test_encoded = one_hot_encode(y_test)

# Save y-encoded sets
# One pickle per split, written to the processed folder
with open(os.path.join(processed_folder_path, "y_train_encoded.pkl"), "wb") as f:
    pickle.dump(y_train_encoded, f)
with open(os.path.join(processed_folder_path, "y_val_encoded.pkl"), "wb") as f:
    pickle.dump(y_val_encoded, f)
with open(os.path.join(processed_folder_path, "y_test_encoded.pkl"), "wb") as f:
    pickle.dump(y_test_encoded, f)

# Sanity check: each split should have one row per review and NUM_of_CLASSES columns
print("\ny-Encoded Data Shape:\n* train: {}\n* validation: {}\n* test: {}\n".format(y_train_encoded.shape, y_val_encoded.shape, y_test_encoded.shape))


y-Encoded Data Shape:
* train: (41000, 3)
* validation: (11529, 3)
* test: (11899, 3)



# GloVe Embedding Matrix (word2vec text format)

In [16]:
# Number of embedding rows: one per tokenizer word, plus row 0, which the
# tokenizer never assigns to a word (0 is the value the sequences are padded with)
embedding_vocab_size = len(tokenizer.word_index) + 1
print("embedding_vocab_size: ", embedding_vocab_size)

# Compare the saved vocabulary with the words the tokenizer learned from the training set
tokenizer_vocab = set(tokenizer.word_index.keys())
vocab_set = set(vocab)

# Words that are in the saved vocab but were never seen by the tokenizer
vocab_only_words = vocab_set.difference(tokenizer_vocab)
# Alias kept so that any later cell referencing the old name keeps working;
# despite its name it holds the vocab-only words computed above
tokenizer_only_words = vocab_only_words
print("Words in vocab but not in tokenizer: ", len(vocab_only_words))
print("Words: ", vocab_only_words)

embedding_vocab_size:  11395
Words in tokenizer but not in vocab:  511
Words:  {'cleanly', 'stride', 'knowledg', 'mullins', 'unforgettable', 'worship', 'fundmentals', 'feiler', 'nearby', 'quicky', 'grear', 'fem', 'finely', 'speculative', 'opics', 'asignments', 'esay', 'complementing', 'schopenhauer', 'negotiated', 'methylation', 'domestication', 'garcia', 'differente', 'moocer', 'comlex', 'gr8', 'expend', 'webdevelopment', 'nicest', 'contend', 'imformative', 'geometrically', 'fi', 'denis', 'jerry', 'glove', 'devotes', 'matured', 'sang', 'workaround', 'lung', 'velichkovski', 'psychoteraphy', 'wealthy', 'persevered', 'jorney', 'everythis', 'einstellung', 'aizuri', 'ag', 'swiss', 'devising', 'recruiter', 'developping', 'florida', 'designated', 'rbm', 'marketeers', 'burdensome', 'potent', 'soviet', 'chucky', 'servance', 'improvising', 'swm', 'leart', 'excelentes', 'adversely', 'fearing', 'pediatric', 'frederickson', 'keying', 'dicussion', 'lookup', 'veru', 'epitome', 'whirlwind', 'sor', 's

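The 511 words listed above are present in the saved vocabulary but absent from the tokenizer's word index (11,905 − 11,394 = 511). Under-sampling may have removed every training-set occurrence of these words, which would reduce the vocabulary that the tokenizer, fitted on the training set only, picks up.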

In [9]:
def load_embedding():
    """
    Load the pre-trained GloVe word embeddings and convert them to a dictionary.

    The vectors are read from a word2vec-format text file in the processed
    folder. If that file does not exist yet, the model is first downloaded
    through gensim and exported to that file.

    Returns:
        pretrained_embeddings (dict): A dictionary mapping words to their corresponding embedding vectors.
    """

    glove_model_filename = str(glove_model) + ".txt"
    glove_file_path = os.path.join(processed_folder_path, glove_model_filename)
    if not os.path.exists(glove_file_path):
        # No local copy yet: fetch the model through gensim and export it as text
        print("GloVe model doesn't exist...")
        model = api.load(glove_model)
        model.save_word2vec_format(glove_file_path, binary=False)
        # 5186/12465 (41.60%) are not defined with twitter-glove

    print("Loading w2v model...")
    pretrained_embeddings = dict()
    # Stream the file line by line inside a context manager so the handle is
    # always closed and the whole file is never held in memory at once
    with open(glove_file_path, 'r', encoding='utf8') as file:
        # The first line of the exported file is a header, not a word vector
        next(file, None)
        for line in file:
            parts = line.split()
            if not parts:
                # Skip blank lines instead of failing on parts[0]
                continue
            # Set key as string word, value as numpy array for vector
            pretrained_embeddings[parts[0]] = asarray(parts[1:], dtype='float32')

    return pretrained_embeddings


def get_embedding_matrix(loaded_embedding, tokenizer, embedding_dim):
    """
    Create an embedding matrix from the loaded/pretrained word embeddings.

    Row i of the matrix holds the pretrained vector of the word whose tokenizer
    index is i. Words without a pretrained vector keep an all-zero row and are
    listed in "out_of_glove_words.txt". The matrix is also saved to the
    processed folder as text ("embedding_matrix.txt") and as a pickle
    ("embedding_matrix.pkl").

    Args:
        loaded_embedding (dict): A dictionary containing the pre-trained word embeddings.
        tokenizer (Tokenizer): The Tokenizer object used to tokenize the text data.
        embedding_dim (int): The dimension of the word embeddings.

    Returns:
        embedding_matrix (ndarray): The embedding matrix for the Embedding layer,
            of shape (len(tokenizer.word_index) + 1, embedding_dim).
    """

    # One row per tokenizer word plus one extra row, all initialised to 0
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))

    zero_vector_words = []
    for word, i in tokenizer.word_index.items():
        # Single lookup: None means the word has no pretrained vector
        vector = loaded_embedding.get(word)
        if vector is not None:
            embedding_matrix[i] = vector
        else:
            # Some terms such as emojis or neg-tagged words are not found in the loaded model, hence they keep vectors with all 0
            zero_vector_words.append(word)

    count_all_words = len(tokenizer.word_index)
    count_na_words = len(zero_vector_words)
    # Guard against an empty word index, which would otherwise divide by zero
    na_percentage = (count_na_words / count_all_words) * 100 if count_all_words else 0.0
    print(f'{count_na_words}/{count_all_words} ({na_percentage:.2f}%) are not defined in the pretrained W2V model and will receive vectors with all 0.')
    print(f"W2V Embedding Matrix shape: {embedding_matrix.shape}")
    print(f"Embedding Matrix:\n{embedding_matrix[:5]}")

    # Save unrecognized words that are not present in the GloVe model.
    # Explicit UTF-8 because these words may be non-ASCII (e.g. emojis)
    file_path = os.path.join(processed_folder_path, "out_of_glove_words.txt")
    with open(file_path, 'w', encoding='utf8') as file:
        file.write('\n'.join(zero_vector_words))

    # Save embeddings as text (for visualization); written once, in fixed-point format
    np.savetxt(os.path.join(processed_folder_path, "embedding_matrix.txt"), embedding_matrix, fmt='%f')

    # Save embeddings as a pickle (for saving memory space)
    with open(os.path.join(processed_folder_path, 'embedding_matrix.pkl'), 'wb') as file:
        pickle.dump(embedding_matrix, file)

    return embedding_matrix

# Load the pretrained vectors, then build (and save) the embedding matrix aligned with the tokenizer's word index
pretrained_embeddings = load_embedding()
embedding_vectors = get_embedding_matrix(pretrained_embeddings, tokenizer, EMBEDDING_DIM)


Loading w2v model...


831/11394 (7.29%) are not defined in the pretrained W2V model and will receive vectors with all 0.
W2V Embedding Matrix shape: (11395, 100)
Embedding Matrix:
[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00 