# Vectorizers

In [45]:
import scipy.sparse
import os
import pandas as pd
import numpy as np
from numpy import asarray
import pickle

import gensim.downloader as api
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from joblib import dump

from afinn import Afinn
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn

In [46]:
# Resolve project folders relative to the kernel's working directory.
# abspath() of a bare filename is anchored at the current working directory,
# so script_dir is simply that directory (the notebook file itself is not read).
script_dir = os.path.dirname(os.path.abspath('processor.ipynb'))
data_path = os.path.join(script_dir, 'Thesis_Jupyter_Final/src/')
print(data_path)

# Raw inputs and the preprocessed ("normal") artefacts read/written below
input_folder_path = os.path.join(data_path, 'input')
processed_folder_path = os.path.join(data_path, 'input/processed/normal')

# Name of the pretrained gensim-downloader model to load
glove_model = "glove-twitter-100"
#glove_model = "glove-wiki-gigaword-100"

/home2/s3985113/Thesis_Jupyter_Final/src/


In [47]:
# Number of sentiment classes; sets the width of the one-hot label matrix
NUM_of_CLASSES = 3
# Length of each embedding vector; presumably must match the dimensionality of
# the selected glove_model ("...-100") -- TODO confirm when switching models
EMBEDDING_DIM = 100

In [48]:
def load_data(file_path):
    """Read one CSV split and return its text and label columns.

    Args:
        file_path: Path of a CSV file that contains the columns 'x' and 'y'.

    Returns:
        Tuple ``(x, y)`` holding the 'x' column and the 'y' column.
    """
    frame = pd.read_csv(file_path)
    return frame['x'], frame['y']

def load_vocab(file_path):
    """Load a pickled vocabulary and report its size.

    Args:
        file_path: Path of the pickle file holding the vocabulary object.

    Returns:
        Tuple ``(vocab, vocab_size)`` where ``vocab_size`` is ``len(vocab)``.
    """
    # NOTE: unpickling can execute arbitrary code -- only load files produced
    # by this project's own preprocessing step.
    with open(file_path, 'rb') as f:
        vocab = pickle.load(f)

    return vocab, len(vocab)
    

# Load the three preprocessed splits in a single pass (train, validation, test)
(x_train, y_train), (x_val, y_val), (x_test, y_test) = (
    load_data(os.path.join(processed_folder_path, split_file))
    for split_file in ("train.csv", "val.csv", "test.csv")
)
print(x_train[:5])
print()


# Vocabulary produced by the preprocessing step
vocab_data_filename = "vocab.pkl"
vocab_file_path = os.path.join(processed_folder_path, vocab_data_filename)
vocab, vocab_size = load_vocab(vocab_file_path)
print("Vocab size: ", vocab_size)

0    bad superficial speaks fast continually stop k...
1                      let grade purchase disappointed
2    horrible test sense element described generall...
3    least favorite ere far style plot setting deta...
4    guess level look easier broader last crowdsour...
Name: x, dtype: object

Vocab size:  11905


# Encode Data

In [49]:
def find_max_seq_len(data):
    """Return the largest number of whitespace-separated tokens in any text.

    Args:
        data: Iterable of strings (one document per item).

    Returns:
        The maximum token count, or 0 when ``data`` is empty.
    """
    # default=0 keeps an empty collection from raising ValueError
    max_seq_length = max((len(line.split()) for line in data), default=0)
    print(f'Maximum review length: {max_seq_length}')

    return max_seq_length

# Padding length is taken from the training split only; it is reused for every split below
max_seq_length = find_max_seq_len(x_train)

Maximum review length: 430


In [50]:
def fit_tokenizer(data):
    """Fit a Keras ``Tokenizer`` on the given texts (intended: training data).

    Args:
        data: Iterable of strings used to build the word index.

    Returns:
        The fitted tokenizer.
    """
    # filters="" removes the default character filters (incl. punctuation);
    # lower=False disables lowercase conversion
    tokenizer = Tokenizer(filters="", lower=False)
    tokenizer.fit_on_texts(data)
    return tokenizer

def encode_text(lines, tokenizer, max_length, filename):
    """Integer-encode texts, pad them to a fixed length and pickle the result.

    Args:
        lines: Iterable of strings to encode.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        max_length: Length every sequence is padded to (padding is appended).
        filename: Base name (without extension) of the pickle written to
            ``processed_folder_path``.

    Returns:
        The padded array returned by ``pad_sequences``.
    """
    # NOTE(review): no `truncating` argument is passed, so sequences longer
    # than max_length are cut using pad_sequences' default side -- confirm
    # that this is intended for validation/test texts.
    sequences = tokenizer.texts_to_sequences(lines)
    padded = pad_sequences(sequences, maxlen=max_length, padding='post')

    # Persist the encoded split
    target_path = os.path.join(processed_folder_path, filename + '.pkl')
    with open(target_path, 'wb') as file:
        pickle.dump(padded, file)

    return padded
    
    
# The tokenizer is fitted on the training texts only
tokenizer = fit_tokenizer(x_train)

# Encode every split with that tokenizer and the train-derived sequence length
x_train_encoded = encode_text(x_train, tokenizer, max_seq_length, "x_train_encoded")
x_val_encoded = encode_text(x_val, tokenizer, max_seq_length, "x_val_encoded")
x_test_encoded = encode_text(x_test, tokenizer, max_seq_length, "x_test_encoded")

# Labels fixed: these are padded integer sequences of shape (doc, max_seq_length),
# not TF-IDF vectors of shape (doc, vocab_size)
print("\nEncoded Data Shape (doc, max_seq_length):\n* train: {}\n* validation: {}\n* test: {}\n".format(x_train_encoded.shape, x_val_encoded.shape, x_test_encoded.shape))
print("x_train_encoded:\n{}".format(x_train_encoded))


Encoded Data Shape (doc, vocab_size):
* train: (41000, 430)
* validation: (11529, 430)
* test: (11899, 430)

x_train_tfidf:
[[   93   566   866 ...     0     0     0]
 [  464   263  1110 ...     0     0     0]
 [ 1356    74   288 ...     0     0     0]
 ...
 [  899    42  8753 ...     0     0     0]
 [   50    66    65 ...     0     0     0]
 [ 3674  1738 11382 ...     0     0     0]]


### Encode y

In [51]:
# TODO: can't remember if this is used somewhere else, if not save data inside function
def one_hot_encode(y):
    """Turn a sequence of integer labels into a one-hot matrix.

    Column ``label - 1`` of each row is set to 1, so the labels are presumably
    1-based (1..NUM_of_CLASSES) -- TODO confirm against the data; a label of 0
    would index column -1, i.e. the last column.

    Args:
        y: Sequence of integer labels.

    Returns:
        Array of shape ``(len(y), NUM_of_CLASSES)`` containing zeros and ones.
    """
    y_encoded = np.zeros((len(y), NUM_of_CLASSES))
    # Each `row` is a view into y_encoded, so the assignment writes through
    for row, label in zip(y_encoded, y):
        row[label - 1] = 1
    return y_encoded

# One-hot encode the sentiment labels of every split
y_train_encoded = one_hot_encode(y_train)
y_val_encoded = one_hot_encode(y_val)
y_test_encoded = one_hot_encode(y_test)

# Write one .npy file per encoded split
for _npy_name, _y_split in (
    ("y_train_encoded.npy", y_train_encoded),
    ("y_val_encoded.npy", y_val_encoded),
    ("y_test_encoded.npy", y_test_encoded),
):
    np.save(os.path.join(processed_folder_path, _npy_name), np.array(_y_split))

_y_shapes = (y_train_encoded.shape, y_val_encoded.shape, y_test_encoded.shape)
print("\ny-Encoded Data Shape:\n* train: {}\n* validation: {}\n* test: {}\n".format(*_y_shapes))


y-Encoded Data Shape:
* train: (41000, 3)
* validation: (11529, 3)
* test: (11899, 3)



# Pretrained GloVe Embeddings (stored in word2vec text format)

In [52]:
# Size of the embedding vocabulary: number of tokenizer words + 1, because the
# tokenizer's word indices start at 1 and index 0 is the value the padded
# sequences are filled with (see the trailing zeros in the encoded output above)
embedding_vocab_size = len(tokenizer.word_index) + 1
print("embedding_vocab_size: ", embedding_vocab_size)

# Sanity check: count the words the tokenizer indexed that are missing from the
# vocabulary loaded from vocab.pkl
tokenizer_vocab = set(tokenizer.word_index.keys())
vocab_set = set(vocab)
tokenizer_only_words = tokenizer_vocab.difference(vocab_set)
print("Words in tokenizer but not in vocab: ", len(tokenizer_only_words))

embedding_vocab_size:  11383
Words in tokenizer but not in vocab:  0


In [53]:
def load_embedding():
    """Load the pretrained vectors of ``glove_model`` as a word -> vector dict.

    The vectors are read from ``<glove_model>.txt`` inside
    ``processed_folder_path``. When that file does not exist yet, the model is
    fetched through the gensim downloader and saved there in word2vec text
    format first.

    Returns:
        dict mapping each word (str) to a float32 numpy vector.
    """
    glove_model_filename = str(glove_model) + ".txt"
    glove_file_path = os.path.join(processed_folder_path, glove_model_filename)
    if not os.path.exists(glove_file_path):
        # No cached text export yet: download the model and write it as text
        print("GloVe model doesn't exist...")
        model = api.load(glove_model)
        model.save_word2vec_format(glove_file_path, binary=False)
        # 5186/12465 (41.60%) are not defined with twitter-glove
        # 5177/12465 (41.53%) are not defined with wiki

    print("Loading w2v model...")
    pretrained_embeddings = dict()
    # Stream the file line by line instead of materialising it with readlines();
    # the context manager closes the handle even if parsing fails.
    with open(glove_file_path, 'r', encoding='utf8') as file:
        # First line of the word2vec text format is a header, not a vector
        next(file, None)
        for line in file:
            parts = line.split()
            if not parts:
                # Ignore blank lines (e.g. a trailing newline at end of file)
                continue
            # Key is the word, value is its vector as a numpy array
            pretrained_embeddings[parts[0]] = asarray(parts[1:], dtype='float32')

    return pretrained_embeddings

def get_embedding_matrix(loaded_embedding, tokenizer, embedding_dim):
    """Build the Embedding-layer weight matrix from pretrained vectors.

    Row ``i`` holds the pretrained vector of the word whose tokenizer index is
    ``i``. Words without a pretrained vector keep an all-zero row, as does
    row 0, which no word index maps to.

    Side effects (all inside ``processed_folder_path``):
        * ``out_of_glove_words.txt`` -- words without a pretrained vector
        * ``embedding_matrix.txt``   -- the matrix as text (``%f`` format)
        * ``embedding_matrix.pkl``   -- the matrix as a pickle

    Args:
        loaded_embedding: dict mapping word -> vector of length ``embedding_dim``.
        tokenizer: Fitted tokenizer exposing ``word_index`` (word -> index).
        embedding_dim: Length of each embedding vector.

    Returns:
        Array of shape ``(len(tokenizer.word_index) + 1, embedding_dim)``.
    """
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))

    zero_vector_words = []
    for word, i in tokenizer.word_index.items():
        # Single dict lookup instead of a membership test followed by get()
        vector = loaded_embedding.get(word)
        if vector is not None:
            embedding_matrix[i] = vector
        else:
            # Some terms such as emojis or neg-tagged words are not found in the
            # loaded model, hence they keep vectors with all 0
            zero_vector_words.append(word)

    count_all_words = len(tokenizer.word_index)
    count_na_words = len(zero_vector_words)
    # Guard against an empty word index (would otherwise divide by zero)
    na_percentage = (count_na_words / count_all_words) * 100 if count_all_words else 0.0
    print(f'{count_na_words}/{count_all_words} ({na_percentage:.2f}%) are not defined in the pretrained W2V model and will receive vectors with all 0.')
    print(f"W2V Embedding Matrix shape: {embedding_matrix.shape}")
    print(f"Embedding Matrix:\n{embedding_matrix[:5]}")

    # Save unrecognized words that are not present in the GloVe model.
    # Explicit UTF-8 so non-ASCII words (e.g. emojis) can always be written.
    file_path = os.path.join(processed_folder_path, "out_of_glove_words.txt")
    with open(file_path, 'w', encoding='utf8') as file:
        file.write('\n'.join(zero_vector_words))

    # Save embeddings as text
    # TODO: delete
    # (the earlier manual write of this same file was immediately overwritten
    # by np.savetxt, so only the np.savetxt export is kept)
    np.savetxt(os.path.join(processed_folder_path, "embedding_matrix.txt"), embedding_matrix, fmt='%f')

    # TODO: keep
    with open(os.path.join(processed_folder_path, 'embedding_matrix.pkl'), 'wb') as file:
        pickle.dump(embedding_matrix, file)

    return embedding_matrix

# Load the pretrained word vectors, then build (and save) the weight matrix
# aligned with the tokenizer's word indices
pretrained_embeddings = load_embedding()
embedding_vectors = get_embedding_matrix(pretrained_embeddings, tokenizer, EMBEDDING_DIM)


Loading w2v model...


825/11382 (7.25%) are not defined in the pretrained W2V model and will receive vectors with all 0.
W2V Embedding Matrix shape: (11383, 100)
Embedding Matrix:
[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00 

# AFINN (worse results; these vectors are not saved)

In [54]:
# English AFINN lexicon used by compute_scores below
afinn = Afinn(language='en')

# NOTE: a function with this same name is defined again in the SentiWordNet
# section further down; whichever cell ran last wins.
def compute_scores(text):
    """Return the AFINN score of every whitespace-separated word in ``text``.

    Args:
        text: A single document as a string.

    Returns:
        List with one AFINN score per word, in order.
    """
    return [afinn.score(token) for token in text.split()]

# Per-word AFINN scores for every document of each split
x_train_scores = list(map(compute_scores, x_train))
x_val_scores = list(map(compute_scores, x_val))
x_test_scores = list(map(compute_scores, x_test))
print(x_train_scores[:5])

# Zero-pad (at the end) to the train-derived maximum length.
# pad_sequences is called without `dtype`, so its default dtype applies -- the
# float scores appear as integers in the printed array.
_afinn_pad_options = dict(maxlen=max_seq_length, padding='post')
x_train_scores_padded = pad_sequences(x_train_scores, **_afinn_pad_options)
x_val_scores_padded = pad_sequences(x_val_scores, **_afinn_pad_options)
x_test_scores_padded = pad_sequences(x_test_scores, **_afinn_pad_options)

print(x_train_scores_padded[:5])
print(x_train_scores_padded.shape)

[[-3.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, -2.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, -2.0], [-3.0, 0.0, 0.0, 0.0, 0.0, 0.0, -2.0], [0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -2.0, 0.0, 0.0, 0.0, 0.0, 0.0, -2.0, 0.0, 0.0, 0.0, 0.0, -3.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]]
[[-3  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [-3  0  0 ...  0  0  0]
 [ 0  2  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]]
(41000, 430)


In [55]:
def calculate_zero_percentage(score_list):
    """Return the percentage of scores that are exactly zero.

    Args:
        score_list: List of per-document score lists (unpadded).

    Returns:
        Share of zero scores across all documents, in percent; 0.0 when there
        are no scores at all (instead of raising ZeroDivisionError).
    """
    total_scores = sum(len(scores) for scores in score_list)
    if total_scores == 0:
        return 0.0
    total_zeros = sum(scores.count(0) for scores in score_list)
    return total_zeros / total_scores * 100

# Share of exact-zero AFINN scores per split (computed on the unpadded lists)
train_zero_percentage = calculate_zero_percentage(x_train_scores)
val_zero_percentage = calculate_zero_percentage(x_val_scores)
test_zero_percentage = calculate_zero_percentage(x_test_scores)

for _split_label, _zero_pct in (
    ("Training", train_zero_percentage),
    ("Validation", val_zero_percentage),
    ("Test", test_zero_percentage),
):
    print(f'{_split_label} data zero percentage: {_zero_pct:.2f}%')

Training data zero percentage: 87.03%
Validation data zero percentage: 86.77%
Test data zero percentage: 86.68%


# SentiWordNet

In [56]:
def get_sentiment(word):
    """Return a SentiWordNet polarity for ``word`` based on its first synset.

    Args:
        word: A single token.

    Returns:
        ``pos_score - neg_score`` of the first WordNet synset of the word, or
        0 when WordNet has no synset for it.
    """
    senses = wn.synsets(word)
    if senses:
        # Only the first listed sense is used
        first_sense = swn.senti_synset(senses[0].name())
        return first_sense.pos_score() - first_sense.neg_score()
    # Word unknown to WordNet
    return 0

def compute_scores(text):
    """Return the SentiWordNet polarity of every whitespace-separated word.

    Args:
        text: A single document as a string.

    Returns:
        List with one ``get_sentiment`` value per word, in order.
    """
    return [get_sentiment(token) for token in text.split()]

def pad_scores(score_list, max_len):
    """Bring every score list to exactly ``max_len`` entries.

    Shorter lists are padded with trailing zeros. Longer lists are truncated to
    their last ``max_len`` entries so that all rows share one length and
    ``np.array`` on the result is rectangular. Previously, over-long lists
    were returned unchanged, which produced ragged rows.

    Keeping the tail mirrors the documented default (``truncating='pre'``) of
    the ``pad_sequences`` call used for the token encodings, so scores stay
    aligned with the encoded tokens.

    Args:
        score_list: List of per-document score lists.
        max_len: Target length of every returned list.

    Returns:
        New list of lists, each of length ``max_len``.
    """
    padded = []
    for scores in score_list:
        overflow = len(scores) - max_len
        if overflow > 0:
            # Drop the leading `overflow` entries (safe for max_len == 0 too)
            padded.append(scores[overflow:])
        else:
            padded.append(scores + [0] * (-overflow))
    return padded

# Per-word SentiWordNet polarity for every document of each split
# (these names replace the AFINN results computed in the previous section)
x_train_scores = list(map(compute_scores, x_train))
x_val_scores = list(map(compute_scores, x_val))
x_test_scores = list(map(compute_scores, x_test))

# Pad to the train-derived maximum length and convert to arrays
x_train_scores_padded = np.array(pad_scores(x_train_scores, max_seq_length))
x_val_scores_padded = np.array(pad_scores(x_val_scores, max_seq_length))
x_test_scores_padded = np.array(pad_scores(x_test_scores, max_seq_length))

print(x_train_scores_padded[:5])
print(x_train_scores_padded.shape)

[[-0.875 -0.5    0.    ...  0.     0.     0.   ]
 [ 0.     0.     0.    ...  0.     0.     0.   ]
 [-0.625  0.125  0.375 ...  0.     0.     0.   ]
 [ 0.     0.25   0.    ...  0.     0.     0.   ]
 [-0.125 -0.25   0.    ...  0.     0.     0.   ]]
(41000, 430)


In [57]:
def zero_percentage(score_list):
    """Return the percentage of scores that are exactly zero.

    Args:
        score_list: List of per-document score lists (unpadded).

    Returns:
        Share of zero scores across all documents, in percent; 0.0 when there
        are no scores at all (instead of raising ZeroDivisionError).
    """
    total_scores = sum(len(scores) for scores in score_list)
    if total_scores == 0:
        return 0.0
    total_zeros = sum(scores.count(0) for scores in score_list)
    return total_zeros / total_scores * 100

# Share of exact-zero SentiWordNet scores per split (computed on the unpadded lists)
train_zero_percentage = zero_percentage(x_train_scores)
val_zero_percentage = zero_percentage(x_val_scores)
test_zero_percentage = zero_percentage(x_test_scores)

for _split_label, _zero_pct in (
    ("Training", train_zero_percentage),
    ("Validation", val_zero_percentage),
    ("Test", test_zero_percentage),
):
    print(f'{_split_label} data zero percentage: {_zero_pct:.2f}%')

Training data zero percentage: 72.06%
Validation data zero percentage: 72.24%
Test data zero percentage: 72.17%


In [58]:
# Pickle the padded score arrays, one file per split
for _pkl_name, _padded_scores in (
    ('x_train_scores_padded.pkl', x_train_scores_padded),
    ('x_val_scores_padded.pkl', x_val_scores_padded),
    ('x_test_scores_padded.pkl', x_test_scores_padded),
):
    with open(os.path.join(processed_folder_path, _pkl_name), 'wb') as file:
        pickle.dump(_padded_scores, file)

In [59]:
# Also export the padded training scores as plain text ('%f' formatting per value)
np.savetxt(os.path.join(processed_folder_path, 'x_train_scores_padded.txt'), x_train_scores_padded, fmt='%f')