# Vectorizers

In [1]:
import scipy.sparse
import os
import pandas as pd
import numpy as np
from numpy import asarray
import matplotlib.pyplot as plt
import multiprocessing
import pickle

import gensim.downloader as api
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.utils import pad_sequences
from joblib import dump

from afinn import Afinn
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn

# Number of target classes (width of the one-hot label vectors).
NUM_of_CLASSES = 3

# gensim-downloader identifier of the pretrained embedding to use;
# the commented line below is an alternative model.
w2v_pretrained_model = "glove-twitter-100"
#w2v_pretrained_model = "glove-wiki-gigaword-100"

# File name of the local word2vec-format text copy of the model above.
w2v_pretrained_model_filename = f"{w2v_pretrained_model}-word2vec.txt"

In [2]:
# Resolve the project folders relative to the notebook's location.
# NOTE: abspath() resolves 'vectorizer.ipynb' against the current working
# directory, so script_dir is simply the directory the kernel was started in.
script_dir = os.path.dirname(os.path.abspath('vectorizer.ipynb'))
data_path = os.path.join(script_dir, 'Thesis_Jupyter_Final/src/')
print(data_path)

input_folder_path = os.path.join(data_path, 'input')
processed_folder_path = os.path.join(data_path, 'input/processed')
results_folder_path = "results"

# Create the results folder if it doesn't exist.
# exist_ok=True replaces the separate exists() check and cannot fail if the
# folder appears between the check and the creation.
os.makedirs(results_folder_path, exist_ok=True)

/home2/s3985113/Thesis_Jupyter_Final/src/


In [3]:
def load_data(filename, process=True):
    """Read one CSV split from the input folder and return its two columns.

    Args:
        filename: Name of the CSV file inside ``input_folder_path``.
        process: Accepted for call compatibility; this function never reads it.

    Returns:
        Tuple ``(x, y)`` with the ``'x'`` and ``'y'`` columns of the file.
    """
    frame = pd.read_csv(os.path.join(input_folder_path, filename))
    return frame['x'], frame['y']

def load_vocab():
    """Load the pickled vocabulary from the processed-data folder.

    Returns:
        Tuple ``(vocab, vocab_size)``: the unpickled object stored in
        'vocab.pkl' and its ``len()``.
    """
    file_path = os.path.join(processed_folder_path, "vocab.pkl")
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # vocabulary files that were produced by this project.
    with open(file_path, 'rb') as f:
        vocab = pickle.load(f)

    return vocab, len(vocab)
    

# Load the three dataset splits (load_data ignores the `process` flag).
(x_train, y_train), (x_val, y_val), (x_test, y_test) = (
    load_data(split_file, process=False)
    for split_file in ("train.csv", "val.csv", "test.csv")
)

# Load the vocabulary together with its size.
vocab, vocab_size = load_vocab()

In [4]:
def save_to_npz(matrix, file_path):
    """Write a SciPy sparse matrix to `file_path` in .npz format."""
    scipy.sparse.save_npz(file=file_path, matrix=matrix)

def save_to_npy(arr, file_path):
    """Convert `arr` to a NumPy array and write it to `file_path` with np.save."""
    as_array = np.array(arr)
    np.save(file_path, as_array)

## TF-IDF

In [45]:
def get_tfidf_vectorizer(vocab, max_features, min_df, max_df):
    """Create a TF-IDF vectorizer bound to a fixed vocabulary and dump it to disk.

    Args:
        vocab: Iterable of terms; every term is assigned its enumeration
            index as its feature column.
        max_features: Forwarded to TfidfVectorizer as ``max_features``.
        min_df: Forwarded to TfidfVectorizer as ``min_df``.
        max_df: Forwarded to TfidfVectorizer as ``max_df``.

    Returns:
        The configured TfidfVectorizer. No fit call happens in this function.

    NOTE(review): scikit-learn documents ``max_features``, ``min_df`` and
    ``max_df`` as ignored when ``vocabulary`` is supplied -- confirm whether
    frequency-based pruning is actually intended.
    NOTE: the object written to 'tfidf_vectorizer.joblib' here has not been
    fitted yet.
    """
    # The vectorizer takes its vocabulary as a term -> column-index mapping
    term_to_index = dict((term, index) for index, term in enumerate(vocab))

    settings = dict(
        max_features=max_features,  # cap on the number of features
        vocabulary=term_to_index,
        lowercase=False,
        ngram_range=(1, 3),  # unigrams up to trigrams
        max_df=max_df,  # upper document-frequency threshold
        min_df=min_df,  # lower document-frequency threshold
        use_idf=True,  # enable IDF weighting
        smooth_idf=True,  # smooth IDF weights
        sublinear_tf=True,  # sublinear scaling of term frequencies
    )
    tfidf_vectorizer = TfidfVectorizer(**settings)

    # Save tfidf vectorizer
    dump(tfidf_vectorizer, os.path.join(processed_folder_path, 'tfidf_vectorizer.joblib'))

    return tfidf_vectorizer

def transform_to_tfidf(x_train, x_val, x_test, vectorizer=None):
    """Fit TF-IDF on the training split, transform all splits and save them.

    Args:
        x_train: Training documents; the vectorizer is fitted on these only.
        x_val: Validation documents (transformed with the fitted vectorizer).
        x_test: Test documents (transformed with the fitted vectorizer).
        vectorizer: Vectorizer to fit and use. Defaults to the module-level
            ``tfidf_vectorizer`` so existing three-argument calls keep working.

    Returns:
        Tuple ``(x_train_tfidf, x_val_tfidf, x_test_tfidf)`` of sparse matrices.

    Side effects:
        Writes 'train_tfidf.npz', 'val_tfidf.npz' and 'test_tfidf.npz' to
        ``processed_folder_path`` and overwrites 'tfidf_vectorizer.joblib'
        with the fitted vectorizer.
    """
    if vectorizer is None:
        vectorizer = tfidf_vectorizer

    # Fit and transform the training set
    x_train_tfidf = vectorizer.fit_transform(x_train)

    # Transform the validation and testing set
    x_val_tfidf = vectorizer.transform(x_val)
    x_test_tfidf = vectorizer.transform(x_test)

    # get_tfidf_vectorizer dumps the vectorizer before it is fitted, so
    # persist it again here; otherwise the file on disk holds no IDF weights.
    dump(vectorizer, os.path.join(processed_folder_path, 'tfidf_vectorizer.joblib'))

    # Save data
    save_to_npz(x_train_tfidf, os.path.join(processed_folder_path, "train_tfidf.npz"))
    save_to_npz(x_val_tfidf, os.path.join(processed_folder_path, "val_tfidf.npz"))
    save_to_npz(x_test_tfidf, os.path.join(processed_folder_path, "test_tfidf.npz"))

    return x_train_tfidf, x_val_tfidf, x_test_tfidf


# TF-IDF settings passed to get_tfidf_vectorizer.
# NOTE(review): the recorded output has 11905 columns although max_features
# is 10000; scikit-learn documents these three options as ignored when a
# fixed vocabulary is supplied -- confirm they are still wanted.
max_features = 10000
max_df = 0.95
min_df = 5

# Build the vectorizer, then fit on train and transform all three splits
# (transform_to_tfidf also writes the matrices to disk).
tfidf_vectorizer = get_tfidf_vectorizer(vocab, max_features, min_df, max_df)
x_train_tfidf, x_val_tfidf, x_test_tfidf = transform_to_tfidf(x_train, x_val, x_test)

# Report the matrix shapes and the sparse training matrix
print("\nData Shape (doc, vocab_size):\n* train: {}\n* validation: {}\n* test: {}\n".format(x_train_tfidf.shape, x_val_tfidf.shape, x_test_tfidf.shape))
print("x_train_tfidf:\n{}".format(x_train_tfidf))


Data Shape (doc, vocab_size):
* train: (41000, 11905)
* validation: (11529, 11905)
* test: (11899, 11905)

x_train_tfidf:
  (0, 7674)	0.41401673971112746
  (0, 3005)	0.34153855159235935
  (0, 1288)	0.2701998993071273
  (0, 1188)	0.28300945321786775
  (0, 1101)	0.2506496977415895
  (0, 1049)	0.27107612519882957
  (0, 991)	0.2660827031141531
  (0, 415)	0.22753195491195433
  (0, 290)	0.1974816461099669
  (0, 277)	0.21180699919929327
  (0, 251)	0.19520146633712748
  (0, 228)	0.21589497141966651
  (0, 216)	0.22556793582011625
  (0, 139)	0.21066367388175217
  (0, 93)	0.1900667120183632
  (1, 1588)	0.5935472288937115
  (1, 586)	0.44068854266770097
  (1, 475)	0.49109821654100483
  (1, 450)	0.4607795971440773
  (2, 2300)	0.438158529659526
  (2, 992)	0.4115649106107287
  (2, 880)	0.39367520419393415
  (2, 681)	0.40500932727012673
  (2, 471)	0.33518988777153336
  (2, 307)	0.3415349809089223
  :	:
  (40997, 28)	0.32303288191390067
  (40998, 1271)	0.3193570093028152
  (40998, 745)	0.56901338091961

In [40]:
#TODO: delete
def save_tfidf_data(data, filename, feature_names):
    """Write a TF-IDF matrix to a CSV file in the processed folder.

    Args:
        data: Matrix exposing ``.toarray()``; it is densified before writing.
        filename: Name of the CSV file created inside ``processed_folder_path``.
        feature_names: Column labels for the dense matrix.
    """
    # Dense DataFrame with one labelled column per feature
    dense_frame = pd.DataFrame(data.toarray(), columns=feature_names)
    # TODO: if this isn't working, note that you added sep=','
    dense_frame.to_csv(os.path.join(processed_folder_path, filename), sep=',', index=False)


# Get feature names (the column labels reported by the TF-IDF vectorizer)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Save vectorized data
# (CSV export is disabled; save_tfidf_data densifies the whole matrix first)
#save_tfidf_data(x_train_tfidf, "train_tfidf.csv", feature_names)
#save_tfidf_data(x_val_tfidf, "val_tfidf.csv", feature_names)

# Encode Data

In [5]:
def find_max_seq_len(data):
    """Return the largest whitespace-token count among the documents in `data`.

    Args:
        data: Iterable of strings.

    Returns:
        The maximum number of whitespace-separated tokens in any document,
        or 0 when `data` is empty (instead of raising ValueError).
    """
    # Generator avoids building a throw-away list; default covers empty input
    max_seq_length = max((len(line.split()) for line in data), default=0)
    print(f'Maximum review length: {max_seq_length}')

    return max_seq_length

# Longest training document in whitespace tokens; reused below as the padding
# length for the train, validation and test splits alike.
max_seq_length = find_max_seq_len(x_train)

Maximum review length: 430


In [6]:
def fit_tokenizer(data):
    """Fit a Keras Tokenizer on `data` with filtering and lowercasing turned off.

    Args:
        data: Texts to fit on (the training split in this notebook).

    Returns:
        The fitted Tokenizer.
    """
    # filters="" drops the default character filters (including punctuation);
    # lower=False disables lowercase conversion.
    tokenizer = Tokenizer(filters="", lower=False)
    tokenizer.fit_on_texts(data)
    return tokenizer

def encode_text(lines, tokenizer, max_length, filename):
    """Integer-encode texts, pad them to a fixed length and save the result.

    Args:
        lines: Texts to encode.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        max_length: Target length passed to ``pad_sequences`` as ``maxlen``.
        filename: Name of the .npy file written to ``processed_folder_path``.

    Returns:
        The padded array of encoded sequences.
    """
    sequences = tokenizer.texts_to_sequences(lines)
    # Zeros are appended after each sequence (padding='post')
    padded = pad_sequences(sequences, maxlen=max_length, padding='post')

    target_path = os.path.join(processed_folder_path, filename)
    save_to_npy(padded, target_path)

    return padded
    
    
# Fit the tokenizer on the training split only
tokenizer = fit_tokenizer(x_train)

# Encode Data (each call also writes the padded array to disk)
x_train_encoded = encode_text(x_train, tokenizer, max_seq_length, "x_train_encoded.npy")
x_val_encoded = encode_text(x_val, tokenizer, max_seq_length, "x_val_encoded.npy")
x_test_encoded = encode_text(x_test, tokenizer, max_seq_length, "x_test_encoded.npy")

# The second axis is the padded sequence length, and the array printed is the
# encoded training split -- the labels previously said vocab_size / x_train_tfidf.
print("\nEncoded Data Shape (doc, max_seq_length):\n* train: {}\n* validation: {}\n* test: {}\n".format(x_train_encoded.shape, x_val_encoded.shape, x_test_encoded.shape))
print("x_train_encoded:\n{}".format(x_train_encoded))


Encoded Data Shape (doc, vocab_size):
* train: (41000, 430)
* validation: (11529, 430)
* test: (11899, 430)

x_train_tfidf:
[[  96  561  851 ...    0    0    0]
 [ 423  240 1166 ...    0    0    0]
 [1205   62  299 ...    0    0    0]
 ...
 [1733  162 1602 ...    0    0    0]
 [2164  108   97 ...    0    0    0]
 [1358  118  663 ...    0    0    0]]


### Encode y

In [7]:
# TODO: can't remember if this is used somewhere else, if not save data inside function
def one_hot_encode(y, num_classes=None):
    """One-hot encode 1-based integer class labels.

    Args:
        y: Sequence of integer labels; label ``k`` sets column ``k - 1``.
        num_classes: Number of columns in the result. Defaults to the
            module-level ``NUM_of_CLASSES``.

    Returns:
        Float array of shape ``(len(y), num_classes)`` with a single 1 per row.

    NOTE(review): as before, a label of 0 maps to index -1 and therefore
    silently selects the last column -- confirm the labels really start at 1.
    """
    if num_classes is None:
        num_classes = NUM_of_CLASSES

    labels = np.asarray(y)
    y_encoded = np.zeros((len(labels), num_classes))
    if len(labels):
        # One vectorised assignment instead of a Python loop over rows
        y_encoded[np.arange(len(labels)), labels - 1] = 1

    return y_encoded

# Convert sentiment labels to one-hot encoding
y_train_encoded, y_val_encoded, y_test_encoded = (
    one_hot_encode(labels) for labels in (y_train, y_val, y_test)
)

# Save y-encoded sets
for encoded_labels, out_name in (
    (y_train_encoded, "y_train_encoded.npy"),
    (y_val_encoded, "y_val_encoded.npy"),
    (y_test_encoded, "y_test_encoded.npy"),
):
    save_to_npy(encoded_labels, os.path.join(processed_folder_path, out_name))

print("\ny-Encoded Data Shape:\n* train: {}\n* validation: {}\n* test: {}\n".format(y_train_encoded.shape, y_val_encoded.shape, y_test_encoded.shape))


y-Encoded Data Shape:
* train: (41000, 3)
* validation: (11529, 3)
* test: (11899, 3)



# Word2Vec

In [8]:
# Rows needed in the embedding matrix: one per tokenizer word plus one extra
# row for index 0, the value the sequences are padded with.
embedding_vocab_size = 1 + len(tokenizer.word_index)
print("embedding_vocab_size: ", embedding_vocab_size)

# Count the words the tokenizer found that are missing from vocab
tokenizer_vocab = set(tokenizer.word_index)
vocab_set = set(vocab)
tokenizer_only_words = tokenizer_vocab - vocab_set
print("Words in tokenizer but not in vocab: ", len(tokenizer_only_words))

embedding_vocab_size:  11354
Words in tokenizer but not in vocab:  0


In [9]:
def load_embedding():
    """Load the pretrained embedding into a ``word -> vector`` dictionary.

    If the word2vec-format text file is not yet present in
    ``processed_folder_path``, the model named by ``w2v_pretrained_model`` is
    loaded through gensim's downloader and exported to that file first.

    Returns:
        dict mapping each word (str) to a float32 NumPy vector.
    """
    w2v_pretrained_file_path = os.path.join(processed_folder_path, w2v_pretrained_model_filename)
    if not os.path.exists(w2v_pretrained_file_path):
        # Check if the pre-trained Word2Vec model is already downloaded. If not, download it.
        print("\nW2v model doesn't exist")
        model = api.load(w2v_pretrained_model)
        model.save_word2vec_format(w2v_pretrained_file_path, binary=False)
        # 5186/12465 (41.60%) are not defined with twitter-glove
        # 5177/12465 (41.53%) are not defined with wiki

    print("Loading w2v model...")
    # Create a map of words to vectors
    embedding = dict()
    # `with` closes the file even if parsing fails; iterating the handle
    # streams the file instead of holding every line in memory at once.
    with open(w2v_pretrained_file_path, 'r', encoding='utf8') as file:
        next(file, None)  # skip the first (header) line
        for line in file:
            parts = line.split()
            if not parts:
                continue  # blank line: nothing to parse
            # Set key as string word, value as numpy array for vector
            embedding[parts[0]] = asarray(parts[1:], dtype='float32')

    return embedding

def get_emb_matrix(loaded_embedding, tokenizer, embedding_dim):
    """Build the Embedding-layer weight matrix from a pretrained embedding.

    Args:
        loaded_embedding: dict mapping words to vectors of length `embedding_dim`.
        tokenizer: Fitted tokenizer; ``word_index`` supplies each word's row.
        embedding_dim: Number of columns of the matrix.

    Returns:
        Array of shape ``(len(tokenizer.word_index) + 1, embedding_dim)``.
        Rows of words missing from `loaded_embedding` stay all zero.

    Side effects:
        Writes 'out_of_glove_words.txt', 'embedding_matrix.txt' and
        'embedding_matrix.npy' to ``processed_folder_path`` and prints a
        coverage summary.
    """
    # One row per tokenizer index plus the extra row 0, initialised to zeros
    emb_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))

    zero_vector_words = []
    for word, i in tokenizer.word_index.items():
        # Single lookup instead of a membership test followed by .get()
        vector = loaded_embedding.get(word)
        if vector is not None:
            emb_matrix[i] = vector
        else:
            # Some terms such as emojis or neg-tagged words are not found in
            # the loaded w2v model, hence they keep vectors with all 0
            zero_vector_words.append(word)

    count_all_words = len(tokenizer.word_index)
    count_na_words = len(zero_vector_words)
    # Guard against an empty word index (previously a ZeroDivisionError)
    na_percentage = (count_na_words / count_all_words) * 100 if count_all_words else 0.0
    print(f'{count_na_words}/{count_all_words} ({na_percentage:.2f}%) are not defined in the pretrained W2V model and will receive vectors with all 0.')
    print(f"W2V Embedding Matrix shape: {emb_matrix.shape}")
    print(f"Embedding Matrix:\n{emb_matrix[:10, :6]}")

    # Save unrecognized words that are not present in the GloVe model.
    # Explicit UTF-8: the words can contain emojis, which the platform's
    # default encoding may not be able to write.
    file_path = os.path.join(processed_folder_path, "out_of_glove_words.txt")
    with open(file_path, 'w', encoding='utf8') as file:
        file.write('\n'.join(zero_vector_words))

    # Save embeddings
    # TODO: delete
    file_path = os.path.join(processed_folder_path, "embedding_matrix.txt")
    with open(file_path, 'w', encoding='utf8') as file:
        file.write('\n'.join(' '.join(str(x) for x in row) for row in emb_matrix))

    # TODO: keep
    save_to_npy(emb_matrix, (os.path.join(processed_folder_path, "embedding_matrix.npy")))

    return emb_matrix

# Load the pretrained vectors (exports the model to disk first if the file is missing)
pretrained_embedding = load_embedding()
# Must equal the vector length of the loaded model: get_emb_matrix assigns
# each pretrained vector directly into a row of this width.
embedding_dim = 100
w2v_embedding_vectors = get_emb_matrix(pretrained_embedding, tokenizer, embedding_dim)

Loading w2v model...
821/11353 (7.23%) are not defined in the pretrained W2V model and will receive vectors with all 0.
W2V Embedding Matrix shape: (11354, 100)
Embedding Matrix:
[[ 0.          0.          0.          0.          0.          0.        ]
 [ 1.19679999 -0.028458   -0.29611     0.49471     0.15605     0.53438997]
 [ 0.66938001 -0.1402      0.080513    0.10082    -0.56133002  0.67628998]
 [-0.18062     0.28406999 -0.16242    -0.0034944  -0.41459     0.80851001]
 [-0.059789    0.076035   -0.0072208  -0.044774   -0.60459    -0.41768   ]
 [ 0.11942    -0.18155999 -0.041091    0.047532   -0.14318     0.64025003]
 [-0.25685    -0.23058    -0.017128    0.46162999 -0.35681999  0.47191   ]
 [-0.57674998 -0.42304999  0.27188    -0.31986001  0.18842     0.71320999]
 [-0.22439    -0.25909001 -0.29517999 -0.56308001 -0.10016    -0.32510999]
 [ 0.23627    -0.12958001  0.087473   -0.018755    0.33734     0.66074997]]


# AFINN

In [65]:
afinn = Afinn(language='en')

def compute_scores(text):
    """Return the AFINN score of every whitespace-separated token in `text`.

    NOTE: the SentiWordNet section further down defines another
    `compute_scores`; once that cell has run, this definition is shadowed.
    """
    return list(map(afinn.score, text.split()))

# Compute AFINN scores (one list of per-token scores for each document)
# NOTE: the SentiWordNet section below assigns to these same variable names.
x_train_scores = [compute_scores(text) for text in x_train]
x_val_scores = [compute_scores(text) for text in x_val]
x_test_scores = [compute_scores(text) for text in x_test]
print(x_train_scores[:5])

# Pad the sequences with zeros
# NOTE: the padded arrays print as integers in the recorded output although
# the raw scores are floats, i.e. pad_sequences casts them here.
x_train_scores_padded = pad_sequences(x_train_scores, maxlen=max_seq_length, padding='post')
x_val_scores_padded = pad_sequences(x_val_scores, maxlen=max_seq_length, padding='post')
x_test_scores_padded = pad_sequences(x_test_scores, maxlen=max_seq_length, padding='post')

print(x_train_scores_padded[:5])
print(x_train_scores_padded.shape)

[[-3.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, -2.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, -2.0], [-3.0, 0.0, 0.0, 0.0, 0.0, 0.0, -2.0], [0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -2.0, 0.0, 0.0, 0.0, 0.0, 0.0, -2.0, 0.0, 0.0, 0.0, 0.0, -3.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]]
[[-3  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [-3  0  0 ...  0  0  0]
 [ 0  2  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]]
(41000, 430)


In [66]:
def calculate_zero_percentage(score_list):
    """Return the percentage of scores that are exactly zero.

    Args:
        score_list: List of per-document score lists (unpadded).

    Returns:
        Share of zero-valued scores over all documents, in percent.
        Returns 0.0 when there are no scores at all (instead of dividing by zero).
    """
    total_scores = sum(len(scores) for scores in score_list)
    if total_scores == 0:
        return 0.0
    total_zeros = sum(scores.count(0) for scores in score_list)
    return total_zeros / total_scores * 100

# Calculate the percentage of exact zeros
# Share of zero scores per split, computed on the unpadded score lists
# (padding zeros are therefore not counted).
train_zero_percentage = calculate_zero_percentage(x_train_scores)
val_zero_percentage = calculate_zero_percentage(x_val_scores)
test_zero_percentage = calculate_zero_percentage(x_test_scores)

print(f'Training data zero percentage: {train_zero_percentage:.2f}%')
print(f'Validation data zero percentage: {val_zero_percentage:.2f}%')
print(f'Test data zero percentage: {test_zero_percentage:.2f}%')

Training data zero percentage: 87.03%
Validation data zero percentage: 86.83%
Test data zero percentage: 86.53%


# SentiWordNet

In [72]:
def get_sentiment(word):
    """Return the SentiWordNet polarity (positive minus negative) of `word`.

    Only the first WordNet synset of the word is used. Words without any
    synset get a polarity of 0.
    """
    synsets = wn.synsets(word)  # all synsets WordNet knows for this word
    if synsets:
        # Look up the sentiment entry of the first synset by its name
        first_sense = swn.senti_synset(synsets[0].name())
        return first_sense.pos_score() - first_sense.neg_score()
    return 0  # word is not in WordNet

def compute_scores(text):
    """Return the SentiWordNet polarity of every whitespace-separated token.

    NOTE: this reuses the name of the AFINN scorer defined earlier in the
    notebook and replaces it once this cell has run.
    """
    return list(map(get_sentiment, text.split()))

def pad_scores(score_list, max_len):
    """Bring every score sequence to exactly `max_len` entries.

    Shorter sequences are padded with trailing zeros. Longer sequences are
    cut from the front so that the last `max_len` scores remain, which is
    what Keras' pad_sequences does by default for the encoded text. Without
    the cut, a validation/test document longer than the training maximum
    would produce a ragged result that cannot be turned into a 2-D array.

    Args:
        score_list: Iterable of per-document score sequences.
        max_len: Target length of every returned row.

    Returns:
        List of lists, each of length `max_len`.
    """
    padded = []
    for scores in score_list:
        row = list(scores)
        overflow = len(row) - max_len
        if overflow > 0:
            row = row[overflow:]  # keep the last max_len scores
        padded.append(row + [0] * (max_len - len(row)))
    return padded

# Compute sentiment scores (SentiWordNet polarity per token)
# NOTE: these assignments replace the AFINN results that used the same names.
x_train_scores = [compute_scores(text) for text in x_train]
x_val_scores = [compute_scores(text) for text in x_val]
x_test_scores = [compute_scores(text) for text in x_test]

# Pad the scores
# max_seq_length was derived from the training split only.
x_train_scores_padded = np.array(pad_scores(x_train_scores, max_seq_length))
x_val_scores_padded = np.array(pad_scores(x_val_scores, max_seq_length))
x_test_scores_padded = np.array(pad_scores(x_test_scores, max_seq_length))

print(x_train_scores_padded[:5])
print(x_train_scores_padded.shape)

[[-0.875 -0.5    0.    ...  0.     0.     0.   ]
 [ 0.     0.     0.    ...  0.     0.     0.   ]
 [-0.625  0.125  0.375 ...  0.     0.     0.   ]
 [ 0.     0.25   0.    ...  0.     0.     0.   ]
 [-0.125 -0.25   0.    ...  0.     0.     0.   ]]
(41000, 430)


In [73]:
def zero_percentage(score_list):
    """Return the percentage of scores that are exactly zero.

    Same computation as `calculate_zero_percentage` from the AFINN section.

    Args:
        score_list: List of per-document score lists (unpadded).

    Returns:
        Share of zero-valued scores over all documents, in percent.
        Returns 0.0 when there are no scores at all (instead of dividing by zero).
    """
    total_scores = sum(len(scores) for scores in score_list)
    if total_scores == 0:
        return 0.0
    total_zeros = sum(scores.count(0) for scores in score_list)
    return total_zeros / total_scores * 100

# Calculate the percentage of exact zeros
# (computed on the unpadded SentiWordNet score lists from the cell above)
train_zero_percentage = zero_percentage(x_train_scores)
val_zero_percentage = zero_percentage(x_val_scores)
test_zero_percentage = zero_percentage(x_test_scores)

print(f'Training data zero percentage: {train_zero_percentage:.2f}%')
print(f'Validation data zero percentage: {val_zero_percentage:.2f}%')
print(f'Test data zero percentage: {test_zero_percentage:.2f}%')

Training data zero percentage: 72.14%
Validation data zero percentage: 72.26%
Test data zero percentage: 72.25%


In [74]:
# Save the padded scores
# NOTE: run top to bottom, these arrays hold the SentiWordNet scores; the
# AFINN arrays of the same name were overwritten and are never saved.
save_to_npy(x_train_scores_padded, os.path.join(processed_folder_path, 'x_train_scores_padded.npy'))
save_to_npy(x_val_scores_padded, os.path.join(processed_folder_path, 'x_val_scores_padded.npy'))
save_to_npy(x_test_scores_padded, os.path.join(processed_folder_path, 'x_test_scores_padded.npy'))

In [75]:
def save_to_txt(scores, file_path):
    """Write `scores` to `file_path` as text, formatting every value with '%f'."""
    np.savetxt(fname=file_path, X=scores, fmt='%f')

# Save padded scores to text files
# (only the training split is exported as text here)
save_to_txt(x_train_scores_padded, os.path.join(processed_folder_path, 'x_train_scores_padded.txt'))