# Vectorizers

In [926]:
import os
import pandas as pd
import numpy as np
from numpy import asarray
import matplotlib.pyplot as plt
import multiprocessing
import pickle

import gensim.downloader as api
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.utils import pad_sequences

DATASET_COLUMNS = ['Id', 'Review', 'Sentiment']
senti_labels = {1: 'Negative', 2: 'Neutral', 3: 'Positive'}
senti_categories = list(senti_labels.values())
NUM_of_CLASSES = 3

input_folder_path = "./pls/Thesis_Jupyter_Final/input/"
processed_folder_path = "./pls/Thesis_Jupyter_Final/processed"
data_filename = "reviews_data.csv"
cleaned_data_filename = "cleaned_data.csv"
vocab_filename = 'vocab.csv'
w2v_pretrained_model = "glove-twitter-100"
#w2v_pretrained_model = "glove-wiki-gigaword-100"
w2v_pretrained_model_filename = str(w2v_pretrained_model) + "-word2vec.txt" 
w2v_filename =  'embedding_w2v_matrix.npy'

In [927]:
train = pd.read_csv(os.path.join(input_folder_path, "train.csv"))
val = pd.read_csv(os.path.join(input_folder_path, "val.csv"))
test = pd.read_csv(os.path.join(input_folder_path, "test.csv"))

x_train = train['x']
y_train = train['y']
x_val = val['x']
y_val = val['y']
x_test = test['x']
y_test = test['y']

# TF-IDF
vocab = [line.strip() for line in open(os.path.join(processed_folder_path, vocab_filename))]
vocab_size = len(vocab)

MAX_FEATURES = 10000
MAX_DF = 0.95
MIN_DF = 5

# Word2Vec
EMBEDDING_DIM = 100
%store EMBEDDING_DIM

Stored 'EMBEDDING_DIM' (int)


## TF-IDF

In [928]:
# Convert vocab to a dict in order to use it in TF-IDF vectorizer
vocab_dict = {} 
for i, word in enumerate(vocab):
    vocab_dict[word] = i

tfidf_vectorizer = TfidfVectorizer(
    max_features=MAX_FEATURES, # maximum number of features to keep, check unique vocabs and determine based on that, high causes saprse metrics and low value causes loss in important words/vocab
    vocabulary=vocab_dict,
    lowercase=False,
    ngram_range=(1, 1),  # range of n-grams, only unigrams now
    max_df=MAX_DF,  # ignore terms that have a document frequency strictly higher than the threshold
    min_df=MIN_DF,  # ignore terms that have a document frequency strictly lower than the threshold.
    use_idf=True,  # enable IDF weighting
    smooth_idf=True,  # smooth IDF weights --> provides stability, reduces run time errors
    sublinear_tf=True  # apply sublinear scaling to term frequencies
)

# Fit and transform the training set
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)

# Transform the validation and testing set
x_val_tfidf = tfidf_vectorizer.transform(x_val)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

In [929]:
print("Given vocabulary-size : {},".format(vocab_size))
print("\nData Shape:\n* train: {}\n* validation: {}\n* test: {}\n".format(x_train_tfidf.shape, x_val_tfidf.shape, x_test_tfidf.shape))
print("x_train_tfidf:\n{}".format(x_train_tfidf))

Given vocabulary-size : 724,

Data Shape:
* train: (60000, 724)
* validation: (16415, 724)
* test: (20455, 724)

x_train_tfidf:
  (0, 77)	0.19263659253476834
  (0, 76)	0.28696050322135436
  (0, 75)	0.2229334021359909
  (0, 74)	0.2889169330599206
  (0, 73)	0.30399912719052186
  (0, 72)	0.2227351534650395
  (0, 71)	0.4403350495738444
  (0, 70)	0.22454778967963013
  (0, 69)	0.2531499063439343
  (0, 68)	0.24034547539000561
  (0, 67)	0.3214112004445366
  (0, 66)	0.22406315671956353
  (0, 65)	0.21851260223109373
  (0, 55)	0.19473416554224124
  (1, 89)	0.8225521850236588
  (1, 29)	0.5686896367200696
  (2, 184)	0.2651315590110469
  (2, 183)	0.2603707923918581
  (2, 182)	0.24232894199347244
  (2, 181)	0.2654114840276176
  (2, 180)	0.3137009892852252
  (2, 179)	0.3042143374430658
  (2, 178)	0.2680480867642309
  (2, 177)	0.3010270291978346
  (2, 176)	0.3165430134269181
  :	:
  (59997, 445)	0.271214241978095
  (59997, 420)	0.2331923807353413
  (59997, 344)	0.2626530459353287
  (59997, 223)	0.27940

In [930]:
print(f'\nData Types:\nx_train_tfidf - type: {type(x_train_tfidf)}\nx_val_tfidf - type: {type(x_val_tfidf)}\ny-train - type: {type(y_train)}')


Data Types:
x_train_tfidf - type: <class 'scipy.sparse._csr.csr_matrix'>
x_val_tfidf - type: <class 'scipy.sparse._csr.csr_matrix'>
y-train - type: <class 'pandas.core.series.Series'>


In [931]:
def save_tfidf_data(data, filename, feature_names):
    # Save the matrix with feature names as a DataFrame
    data = pd.DataFrame(data.toarray(), columns=feature_names)
    file_path = os.path.join(processed_folder_path, filename)
    data.to_csv(file_path, sep=',', index=False) # TODO: if this isn't working, note that you added sep=','


# Get feature names
feature_names = tfidf_vectorizer.get_feature_names_out()

# Save vectorized data
save_tfidf_data(x_train_tfidf, "train_tfidf.csv", feature_names)
save_tfidf_data(x_val_tfidf, "val_tfidf.csv", feature_names)
save_tfidf_data(x_test_tfidf, "test_tfidf.csv", feature_names)

# Encode Data

In [932]:
# Find maximum sequence length
max_seq_length = max([len(review.split()) for review in x_train])
%store max_seq_length
print(f'Maximum review length: {max_seq_length}')

Stored 'max_seq_length' (int)
Maximum review length: 294


In [933]:
# Fit tokenizer (on training data)
tokenizer = Tokenizer()
# Remove default filters, including punctuation
tokenizer.filters = ""  
# Disable lowercase conversion
tokenizer.lower = False  
tokenizer.fit_on_texts(x_train) 

In [934]:
def encode_text(lines, tokenizer, max_length):
    # Integer encode
    encoded_seq = tokenizer.texts_to_sequences(lines)
    # Pad the encoded sequences
    padded = pad_sequences(encoded_seq, maxlen=max_length, padding='post')

    return padded

In [935]:
# Encode Data
x_train_encoded = encode_text(x_train, tokenizer, max_seq_length)
x_val_encoded = encode_text(x_val, tokenizer, max_seq_length)
x_test_encoded = encode_text(x_test, tokenizer, max_seq_length)

print("Encoded-data shapes:\n* train: {}\n* validation: {}\n* test: {}\n".format(x_train_encoded.shape, x_val_encoded.shape, x_test_encoded.shape))
print(f"x_train_encoded[:3]:\n{x_val_encoded[:3]}")

Encoded-data shapes:
* train: (60000, 294)
* validation: (16415, 294)
* test: (20455, 294)

x_train_encoded[:3]:
[[  6 162 648 587   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0

In [936]:
# Restructure labels
y_train = y_train.values
y_val = y_val.values
y_test = y_test.values
print("target-data shapes:\n* train: {}\n* validation: {}\n* test: {}\n".format(y_train.shape, y_val.shape, y_test.shape))

target-data shapes:
* train: (60000,)
* validation: (16415,)
* test: (20455,)



In [937]:
def save_encoded_data(data, filename):
    # Save the encoded arrays
    file_path = os.path.join(processed_folder_path, filename)
    np.save(file_path, np.array(data))

save_encoded_data(x_train_encoded, "train_encoded_x")
save_encoded_data(x_val_encoded, "val_encoded_x")
save_encoded_data(x_test_encoded, "test_encoded_x")

# Word2Vec

In [938]:
# Total vocabulary size plus 0 for unknown words
embedding_vocab_size = len(tokenizer.word_index) + 1
%store embedding_vocab_size
print("embedding_vocab_size: ", embedding_vocab_size)

Stored 'embedding_vocab_size' (int)
embedding_vocab_size:  724


In [939]:
# Check if there are any words identified via the tokenizer that are not in vocab
tokenizer_vocab = set(tokenizer.word_index.keys())
vocab_set = set(vocab)
tokenizer_only_words = tokenizer_vocab.difference(vocab_set)
print("Words in tokenizer but not in vocab: ", len(tokenizer_only_words))

Words in tokenizer but not in vocab:  0


In [940]:
# TODO: gigaword or twitter?
def load_embedding():
    # Check if the pre-trained Word2Vec model is already downloaded
    w2v_pretrained_file_path = os.path.join(processed_folder_path, w2v_pretrained_model_filename)
    if not os.path.exists(w2v_pretrained_file_path):
        print("\nw2v model doesn't exist")
        # If the model does not exist, download it
        model = api.load(w2v_pretrained_model)
        # Save the word2vec embeddings in the appropriate format
        model.save_word2vec_format(w2v_pretrained_file_path, binary=False)

    # load embedding into memory, skip first line
    print("Loading w2v model...")
    file = open(w2v_pretrained_file_path, 'r', encoding='utf8')
    lines = file.readlines()[1:]
    file.close()
    # create a map of words to vectors
    embedding = dict()
    for line in lines:
        parts = line.split()
        # key is string word, value is numpy array for vector
        embedding[parts[0]] = asarray(parts[1:], dtype='float32')
    return embedding

In [941]:
raw_embedding = load_embedding()

Loading w2v model...


In [942]:
def get_weight_matrix(embedding, tokenizer):
    # create a weight matrix for the Embedding layer from a loaded embedding

    # define weight matrix dimensions with all 0
    weight_matrix = np.zeros((embedding_vocab_size, EMBEDDING_DIM))
    # step vocab, store vectors using the Tokenizer's integer mapping
    count_all = 0
    count_na = 0
    for word, i in tokenizer.word_index.items():
        # TODO: important note, pretrained word2vec model removes all neg_ and emojis (also other words) that are
        #  not defined in the model it These values should prob? also be removed from the vocab (and update vocab size) to avoid mismatch in the embedding layer
        if word in embedding.keys():
            # print(embedding.get(word)[:3])
            weight_matrix[i] = embedding.get(word)
        else:
            #print(word)
            count_na += 1
        count_all += 1
    print(f'count_na/count_all: {str(count_na)}/{count_all}')
    print(f"embedding matrix shape: {weight_matrix.shape}")

    # save model in ASCII (word2vec) format
    file_path = os.path.join(processed_folder_path, w2v_filename)
    '''
    with open(file_path, 'w') as file:
        file.write('\n'.join(' '.join(str(x) for x in row) for row in weight_matrix))
    '''

    np.save(file_path, weight_matrix)
    
    return weight_matrix

In [943]:
w2v_embedding_vectors = get_weight_matrix(raw_embedding, tokenizer)

count_na/count_all: 192/723
embedding matrix shape: (724, 100)


### Encode y

In [944]:
def one_hot_encode(y):
    y_encoded = np.zeros((len(y), NUM_of_CLASSES))
    for i, label in enumerate(y):
        y_encoded[i, label - 1] = 1

    return y_encoded

In [945]:
# Convert sentiment labels to one-hot encoding
y_train_encoded = one_hot_encode(y_train)
y_val_encoded = one_hot_encode(y_val)
y_test_encoded = one_hot_encode(y_test)
   
print("\ny-encoded Data Shape:\n* train: {}\n* validation: {}\n* test: {}\n".format(y_train_encoded.shape, y_val_encoded.shape, y_test_encoded.shape))


y-encoded Data Shape:
* train: (60000, 3)
* validation: (16415, 3)
* test: (20455, 3)



In [946]:
save_encoded_data(y_train_encoded, "train_encoded_y")
save_encoded_data(y_val_encoded, "val_encoded_y")
save_encoded_data(y_test_encoded, "test_encoded_y")