In [None]:
import os
import pandas as pd
import numpy as np
from numpy import asarray
import matplotlib.pyplot as plt
import multiprocessing
import pickle

import gensim.downloader as api
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.utils import pad_sequences

# --- Dataset schema and sentiment labels ---
# Column names of the review data.
DATASET_COLUMNS = ["Id", "Review", "Sentiment"]
# Integer sentiment code -> class name.
senti_labels = {
    1: "Negative",
    2: "Neutral",
    3: "Positive",
}
senti_categories = [*senti_labels.values()]
NUM_of_CLASSES = 3

# --- File locations ---
# Folder the train/val/test CSV splits are read from.
input_folder_path = "./pls/Thesis_Jupyter_Final/input/"
# Folder the vocabulary is read from and the generated artefacts are written to.
processed_folder_path = "./pls/Thesis_Jupyter_Final/processed"
data_filename = "reviews_data.csv"
cleaned_data_filename = "cleaned_data.csv"
vocab_filename = "vocab.txt"
# Plain-text dump of the embedding weight matrix.
w2v_filename = "weight_matrix_word2vec.txt"

In [None]:
# Load the pre-split review data.
train = pd.read_csv(os.path.join(input_folder_path, "train.csv"))
val = pd.read_csv(os.path.join(input_folder_path, "val.csv"))
test = pd.read_csv(os.path.join(input_folder_path, "test.csv"))

# Review text (features) and sentiment code (target) for each split.
x_train = train['Review']
y_train = train['Sentiment']
x_val = val['Review']
y_val = val['Sentiment']
x_test = test['Review']
y_test = test['Sentiment']

# Load the vocabulary as a plain list of string tokens.
# Keeping it as a DataFrame is a trap: iterating a DataFrame yields its column
# names, so `TfidfVectorizer(vocabulary=vocab)` and `set(vocab)` would see the
# header instead of the words.
# NOTE(review): read_csv treats the first line of the file as a header; if
# vocab.txt has no header row, pass header=None so the first token is kept.
# Assumes the tokens live in the first column - TODO confirm.
vocab_frame = pd.read_csv(
    os.path.join(processed_folder_path, vocab_filename),
    dtype=str,        # keep numeric-looking tokens as text
    na_filter=False,  # keep tokens such as "null"/"nan" instead of parsing them as NaN
)
vocab = vocab_frame.iloc[:, 0].tolist()
vocab_size = len(vocab)

In [None]:
# TF-IDF vectoriser settings
MAX_FEATURES = 10_000  # passed to TfidfVectorizer as max_features
MAX_DF = 0.95          # passed to TfidfVectorizer as max_df
MIN_DF = 5             # passed to TfidfVectorizer as min_df

# Word2Vec settings
EMBEDDING_DIM = 100    # number of columns of the embedding weight matrix

## TF-IDF

In [None]:
# Configure the TF-IDF vectoriser around the pre-built vocabulary.
# NOTE(review): per the scikit-learn documentation, max_features / max_df /
# min_df are ignored when an explicit `vocabulary` is supplied - confirm they
# are still wanted here.
tfidf_options = dict(
    vocabulary=vocab,           # fixed term -> column mapping
    max_features=MAX_FEATURES,  # cap on vocabulary size: too high gives very sparse matrices, too low drops important words
    max_df=MAX_DF,              # drop terms whose document frequency is strictly above this threshold
    min_df=MIN_DF,              # drop terms whose document frequency is strictly below this threshold
    lowercase=False,            # keep the original casing
    ngram_range=(1, 1),         # unigrams only
    use_idf=True,               # enable IDF weighting
    smooth_idf=True,            # smoothed IDF weights, for numerical stability
    sublinear_tf=True,          # sublinear scaling of term frequencies
)
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the IDF weights on the training reviews and vectorise them in one step
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)

# Vectorise validation and test reviews with the weights learned on train
x_val_tfidf, x_test_tfidf = (
    tfidf_vectorizer.transform(reviews) for reviews in (x_val, x_test)
)

In [None]:
# Report the vocabulary size, the shape of each vectorised split, and the
# sparse training matrix itself.
print(f"Given vocabulary-size : {vocab_size},")
print(
    "\nData Shape:\n"
    f"* train: {x_train_tfidf.shape}\n"
    f"* validation: {x_val_tfidf.shape}\n"
    f"* test: {x_test_tfidf.shape}\n"
)
print(f"x_train_tfidf:\n{x_train_tfidf}")

Given vocabulary-size : 1541,

Data Shape:
* train: (6400, 1541)
* validation: (1600, 1541)
* test: (2000, 1541)

x_train_tfidf:
  (0, 15)	0.21488294804730057
  (0, 14)	0.3258705571787054
  (0, 13)	0.2754694528444137
  (0, 12)	0.2274723012657554
  (0, 11)	0.18810094400150487
  (0, 10)	0.28388400052650353
  (0, 9)	0.36035657104380786
  (0, 8)	0.2748752517308079
  (0, 7)	0.2875212896711316
  (0, 6)	0.1775564539318653
  (0, 5)	0.199508163741704
  (0, 3)	0.3097449723672214
  (0, 2)	0.26822893504871026
  (0, 1)	0.22079750360155437
  (0, 0)	0.16728041506367827
  (1, 18)	0.545190792811988
  (1, 17)	0.4024854525010473
  (1, 16)	0.6169315424330162
  (1, 6)	0.40020985983516544
  (2, 933)	0.2663099426984769
  (2, 34)	0.18229784732070112
  (2, 33)	0.26345337034491717
  (2, 32)	0.3049939672889147
  (2, 31)	0.28948020585724704
  (2, 30)	0.15548796677139815
  :	:
  (6399, 942)	0.2297989266474506
  (6399, 888)	0.18581248936403078
  (6399, 835)	0.22246469755266957
  (6399, 644)	0.2114056082225976
  (63

In [None]:
# Show the container types of the vectorised features and the raw targets.
print("\n".join([
    "\nData Types:",
    "x_train_tfidf - type: {}".format(type(x_train_tfidf)),
    "x_val_tfidf - type: {}".format(type(x_val_tfidf)),
    "y-train - type: {}".format(type(y_train)),
]))


Data Types:
x_train_tfidf - type: <class 'scipy.sparse._csr.csr_matrix'>
x_val_tfidf - type: <class 'scipy.sparse._csr.csr_matrix'>
y-train - type: <class 'pandas.core.series.Series'>


In [None]:
def save_tfidf_data(data, filename, feature_names):
    """Write a TF-IDF matrix to a CSV file inside ``processed_folder_path``.

    The matrix is densified with ``toarray()`` and wrapped in a DataFrame whose
    columns are ``feature_names``; the row index is not written.

    Args:
        data: matrix exposing ``toarray()`` (a scipy sparse matrix in this notebook).
        filename: name of the CSV file to create.
        feature_names: column labels, one per matrix column.
    """
    target_path = os.path.join(processed_folder_path, filename)
    dense_frame = pd.DataFrame(data.toarray(), columns=feature_names)
    # sep=',' was added explicitly; revisit it first if the saved file misbehaves.
    dense_frame.to_csv(target_path, sep=',', index=False)


# Column labels for the saved matrices, taken from the fitted vectoriser
feature_names = tfidf_vectorizer.get_feature_names_out()

# Persist every vectorised split under its own file name
for _matrix, _csv_name in (
    (x_train_tfidf, "train_tfidf.csv"),
    (x_val_tfidf, "val_tfidf.csv"),
    (x_test_tfidf, "test_tfidf.csv"),
):
    save_tfidf_data(_matrix, _csv_name, feature_names)

# Encode Data

In [None]:
# Longest training review, counted in whitespace-separated tokens
max_seq_length = max(len(review.split()) for review in x_train)
print("Maximum review length: {}".format(max_seq_length))

Maximum review length: 238


In [None]:
# Build the tokenizer with the default character filters (incl. punctuation)
# removed and lowercase conversion disabled, then fit it on the training
# reviews only.
tokenizer = Tokenizer(filters="", lower=False)
tokenizer.fit_on_texts(x_train)

In [None]:
def encode_text(lines, tokenizer, max_length):
    """Turn texts into fixed-width integer sequences.

    Args:
        lines: iterable of texts to encode.
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        max_length: width of every returned row.

    Returns:
        The result of ``pad_sequences``: one row per text, padded after the
        tokens (``padding='post'``) up to ``max_length``.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),  # integer-encode each text
        maxlen=max_length,
        padding='post',
    )

In [None]:
# Encode Data: integer-encode and pad every split to the same width
x_train_encoded = encode_text(x_train, tokenizer, max_seq_length)
x_val_encoded = encode_text(x_val, tokenizer, max_seq_length)
x_test_encoded = encode_text(x_test, tokenizer, max_seq_length)

print("Encoded-data shapes:\n* train: {}\n* validation: {}\n* test: {}\n".format(x_train_encoded.shape, x_val_encoded.shape, x_test_encoded.shape))
# Print the training rows so the data shown matches the label
# (the validation rows were printed under this label before).
print(f"x_train_encoded[:3]:\n{x_train_encoded[:3]}")

Encoded-data shapes:
* train: (6400, 238)
* validation: (1600, 238)
* test: (2000, 238)

x_train_encoded[:3]:
[[ 130  167    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0

In [None]:
# Restructure labels: convert the targets to NumPy arrays.
# np.asarray is used instead of `.values` so the cell can be re-run safely:
# on a second run the names already hold ndarrays, which have no `.values`.
y_train = np.asarray(y_train)
y_val = np.asarray(y_val)
y_test = np.asarray(y_test)
print("target-data shapes:\n* train: {}\n* validation: {}\n* test: {}\n".format(y_train.shape, y_val.shape, y_test.shape))

target-data shapes:
* train: (6400,)
* validation: (1600,)
* test: (2000,)



In [None]:
def save_encoded_data(data, filename):
    """Store an encoded array with ``np.save`` inside ``processed_folder_path``.

    Args:
        data: array-like to store; it is converted with ``np.array`` first.
        filename: target file name, joined onto ``processed_folder_path``.

    NOTE(review): ``np.save`` writes NumPy's binary format and appends
    ``.npy`` to names lacking that suffix, so the ``*.csv`` names passed by the
    callers presumably end up as ``*.csv.npy`` - confirm against the loaders.
    """
    np.save(os.path.join(processed_folder_path, filename), np.array(data))

# Persist the padded integer sequences of every split
for _encoded, _target_name in (
    (x_train_encoded, "train_encoded_x.csv"),
    (x_val_encoded, "val_encoded_x.csv"),
    (x_test_encoded, "test_encoded_x.csv"),
):
    save_encoded_data(_encoded, _target_name)

# Word2Vec

In [None]:
# Number of tokenizer words plus one extra row for index 0, the value the
# encoded sequences are padded with
embedding_vocab_size = 1 + len(tokenizer.word_index)
print(f"embedding_vocab_size:  {embedding_vocab_size}")

embedding_vocab_size:  1543


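Checking why the embedding vocabulary size is 2 greater than the original vocabulary size: one extra slot is the reserved index 0, the other is the `<empty>` token, which is in the tokenizer but not in the vocab.
    #TODO: remove the `<empty>` rows?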

In [None]:
# Words known to the fitted tokenizer
tokenizer_words = set(tokenizer.word_index)

# Entries of the manually built vocabulary
# NOTE(review): set(vocab) collects whatever iterating `vocab` yields -
# confirm that this is the words themselves.
vocab_set = set(vocab)

# Tokenizer words that the vocabulary lacks
tokenizer_only_words = tokenizer_words - vocab_set

print("Words in tokenizer but not in vocab:", tokenizer_only_words, sep="\n")

Words in tokenizer but not in vocab:
{'<empty>'}


In [None]:
# TODO: gigaword or twitter?
def load_embedding(model_name="glove-wiki-gigaword-100"):
    """Load pretrained word vectors into a ``{word: vector}`` dictionary.

    The vectors are cached as a word2vec-format text file named
    ``<model_name>-word2vec.txt`` in the current working directory. When that
    file is missing, the model is fetched with ``gensim.downloader`` and saved
    first.

    Args:
        model_name: gensim-downloader model to use, e.g.
            ``"glove-wiki-gigaword-100"`` or ``"glove-twitter-100"``.

    Returns:
        dict mapping each word (str) to a float32 NumPy vector.
    """
    w2v_pretrained_model_filename = str(model_name) + "-word2vec.txt"
    if not os.path.exists(w2v_pretrained_model_filename):
        print("\nw2v model doesn't exist")
        # Download the model the cache file is named after. The Twitter model
        # used to be downloaded here regardless of the configured name, so a
        # cache file created by the old code may hold the wrong vectors -
        # delete it to force a fresh download.
        model = api.load(model_name)
        # Save the embeddings in word2vec text format
        model.save_word2vec_format(w2v_pretrained_model_filename, binary=False)

    print("Loading w2v model...")
    embedding = dict()
    # Stream the file line by line instead of holding every line in memory.
    with open(w2v_pretrained_model_filename, 'r', encoding='utf8') as file:
        next(file, None)  # skip the first line of the word2vec text file
        for line in file:
            parts = line.split()
            if not parts:
                # tolerate blank lines (e.g. a trailing newline)
                continue
            # key is the word, value is its vector as a float32 array
            embedding[parts[0]] = asarray(parts[1:], dtype='float32')
    return embedding

In [None]:
# Load the pretrained vectors as a {word: vector} dictionary
raw_embedding = load_embedding()

Loading w2v model...


In [None]:
def get_weight_matrix(embedding, tokenizer):
    """Build the Embedding-layer weight matrix from pretrained vectors.

    Row ``i`` receives the pretrained vector of the word that the tokenizer
    maps to index ``i``; rows of words missing from ``embedding`` stay zero.
    The matrix is also written to ``w2v_filename`` inside
    ``processed_folder_path``, one space-separated row per line.

    Args:
        embedding: dict mapping word -> vector of length ``EMBEDDING_DIM``.
        tokenizer: fitted tokenizer exposing ``word_index`` (word -> int).

    Returns:
        np.ndarray of shape ``(len(tokenizer.word_index) + 1, EMBEDDING_DIM)``.
    """
    # Size the matrix from the tokenizer that was passed in, not from a global
    # computed in another cell, so it can never be stale. The +1 adds the row
    # for index 0, which no word is stored in by this loop.
    n_rows = len(tokenizer.word_index) + 1
    weight_matrix = np.zeros((n_rows, EMBEDDING_DIM))

    # step vocab, store vectors using the Tokenizer's integer mapping
    count_all = 0
    count_na = 0
    for word, i in tokenizer.word_index.items():
        # TODO: important note, the pretrained model does not define all neg_ tokens and emojis
        #  (also other words). These should probably also be removed from the vocab
        #  (and the vocab size updated) to avoid a mismatch in the embedding layer.
        vector = embedding.get(word)  # single lookup; None when the word is unknown
        if vector is None:
            count_na += 1
        else:
            weight_matrix[i] = vector
        count_all += 1
    print(f'count_na/count_all: {str(count_na)}/{count_all}')
    print(f"embedding matrix shape: {weight_matrix.shape}")

    # save the matrix as plain text, one row per line
    file_path = os.path.join(processed_folder_path, w2v_filename)
    with open(file_path, 'w') as file:
        file.write('\n'.join(' '.join(str(x) for x in row) for row in weight_matrix))

    return weight_matrix

In [None]:
# Build the embedding weight matrix (also written to disk by get_weight_matrix)
w2v_embedding_vectors = get_weight_matrix(raw_embedding, tokenizer)

count_na/count_all: 450/1542
embedding matrix shape: (1543, 100)


### Encode y

In [None]:
def one_hot_encode(y, num_classes=None):
    """One-hot encode 1-based integer class labels.

    Label ``k`` sets column ``k - 1`` of its row to 1.

    Args:
        y: sequence of integer labels in ``1..num_classes``.
        num_classes: number of classes / output columns. Defaults to the
            notebook-level ``NUM_of_CLASSES``.

    Returns:
        Float array of shape ``(len(y), num_classes)`` with one 1 per row.

    Raises:
        ValueError: if a label lies outside ``1..num_classes``. Without this
            check a label of 0 would wrap around through negative indexing
            and silently mark the last class.
    """
    if num_classes is None:
        num_classes = NUM_of_CLASSES

    labels = np.asarray(y, dtype=int).reshape(-1)
    out_of_range = (labels < 1) | (labels > num_classes)
    if out_of_range.any():
        bad = sorted(set(labels[out_of_range].tolist()))
        raise ValueError(f"labels must be in 1..{num_classes}, got {bad}")

    y_encoded = np.zeros((labels.size, num_classes))
    # one vectorised assignment: row r gets a 1 in column labels[r] - 1
    y_encoded[np.arange(labels.size), labels - 1] = 1

    return y_encoded

In [None]:
# One-hot encode the sentiment labels of every split
y_train_encoded, y_val_encoded, y_test_encoded = (
    one_hot_encode(labels) for labels in (y_train, y_val, y_test)
)

# Sanity checks: first rows, shapes and container types
print(f"Check one-hot encoding:\n {y_train_encoded[:3]}")
print(
    "\ny-encoded Data Shape:\n"
    f"* train: {y_train_encoded.shape}\n"
    f"* validation: {y_val_encoded.shape}\n"
    f"* test: {y_test_encoded.shape}\n"
)
print(f"\nx_train_encoded - type: {type(x_train_encoded)}")
print(f"y_train_encoded - type: {type(y_train_encoded)}")

In [None]:
# Persist the one-hot encoded targets of every split
for _targets, _target_name in (
    (y_train_encoded, "train_encoded_y.csv"),
    (y_val_encoded, "val_encoded_y.csv"),
    (y_test_encoded, "test_encoded_y.csv"),
):
    save_encoded_data(_targets, _target_name)