# Vectorizers

In [59]:
import scipy.sparse
import os
import pandas as pd
import numpy as np
from numpy import asarray
import matplotlib.pyplot as plt
import multiprocessing
import pickle

import gensim.downloader as api
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.utils import pad_sequences

input_folder_path = "./pls/Thesis_Jupyter_Final/src/input/"
processed_folder_path = "./pls/Thesis_Jupyter_Final/src/input/processed"
w2v_pretrained_model = "glove-twitter-100"
#w2v_pretrained_model = "glove-wiki-gigaword-100"
w2v_pretrained_model_filename = str(w2v_pretrained_model) + "-word2vec.txt" 

In [60]:
# TODO: load_data obtained from processor.ipynb, delete or import after
def load_data(filename, process=True):
    # Load data
    data_file_path = os.path.join(input_folder_path, filename)
    df = pd.read_csv(data_file_path)

    if process:
        # Set ID as index
        df.set_index('Id', inplace=True, drop=True)
        df = drop_missing(df)

        print(df.info())
        print()
        print(f'Dataset shape: {df.shape}\n')

    return df

df_train = load_data("train.csv", process=False)
df_val = load_data("val.csv", process=False)
df_test = load_data("test.csv", process=False)

x_train = df_train['x']
y_train = df_train['y']
x_val = df_val['x']
y_val = df_val['y']
x_test = df_test['x']
y_test = df_test['y']

with open('vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)
    vocab_size = len(vocab)

# TODO: delete this part
#vocab = dict(sorted(vocab.items(), key=lambda x: x[1], reverse=True))
first_20 = list(vocab.items())[:10]
print("20 Most Common Words:")
for word, freq in first_20:
    print(f"\t{word}: {freq}")
print(type(vocab))
print(vocab_size)

20 Most Common Words:
	awesome: 2011
	took: 568
	longer: 147
	hoped: 27
	complete: 845
	satisfying: 31
	finish: 239
	dr: 1651
	chuck: 857
	appreciate: 367
<class 'dict'>
4094


In [61]:
def save_to_npz(matrix, file_path):
    scipy.sparse.save_npz(file_path, matrix)

def save_to_npy(arr, file_path):
    np.save(file_path, np.array(arr))

## TF-IDF

In [62]:
 def get_tfidf_vectorizer(vocab, max_features, min_df, max_df):
    # Convert vocab to a dict in order to use it in TF-IDF vectorizer
    vocab_dict = {word: i for i, word in enumerate(vocab)}

    tfidf_vectorizer = TfidfVectorizer(
        max_features=max_features, # maximum number of features to keep, check unique vocabs and determine based on that, high causes saprse metrics and low value causes loss in important words/vocab
        vocabulary=vocab_dict,
        lowercase=False,
        ngram_range=(1, 1),  # range of n-grams, only unigrams now
        max_df=max_df,  # ignore terms that have a document frequency strictly higher than the threshold
        min_df=min_df,  # ignore terms that have a document frequency strictly lower than the threshold.
        use_idf=True,  # enable IDF weighting
        smooth_idf=True,  # smooth IDF weights --> provides stability, reduces run time errors
        sublinear_tf=True  # apply sublinear scaling to term frequencies
    )

    return tfidf_vectorizer


def transform_to_tfidf(x_train, x_val, x_test):
    # Fit and transform the training set
    x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)

    # Transform the validation and testing set
    x_val_tfidf = tfidf_vectorizer.transform(x_val)
    x_test_tfidf = tfidf_vectorizer.transform(x_test)

    # Save data
    save_to_npz(x_train_tfidf, os.path.join(processed_folder_path, "train_tfidf.npz"))
    save_to_npz(x_train_tfidf, os.path.join(processed_folder_path, "val_tfidf.npz"))
    save_to_npz(x_train_tfidf, os.path.join(processed_folder_path, "test_tfidf.npz"))

    return x_train_tfidf, x_val_tfidf, x_test_tfidf


max_features = 10000
max_df = 0.95
min_df = 5

tfidf_vectorizer = get_tfidf_vectorizer(vocab_dict, max_features, min_df, max_df)
x_train_tfidf, x_val_tfidf, x_test_tfidf = transform_to_tfidf(x_train, x_val, x_test)
%store tfidf_vectorizer

print("\nData Shape (doc, vocab_size):\n* train: {}\n* validation: {}\n* test: {}\n".format(x_train_tfidf.shape, x_val_tfidf.shape, x_test_tfidf.shape))
print("x_train_tfidf:\n{}".format(x_train_tfidf))

Stored 'tfidf_vectorizer' (TfidfVectorizer)

Data Shape (doc, vocab_size):
* train: (35000, 4094)
* validation: (15683, 4094)
* test: (19615, 4094)

x_train_tfidf:
  (0, 253)	0.20319997741435275
  (0, 252)	0.2484868510317415
  (0, 251)	0.23054254932250434
  (0, 250)	0.2808887522041542
  (0, 249)	0.2911297304736712
  (0, 248)	0.2282591189234916
  (0, 247)	0.22570959882362757
  (0, 246)	0.2970633359921816
  (0, 245)	0.21806125543746474
  (0, 244)	0.2872029173143731
  (0, 243)	0.27895172088188636
  (0, 242)	0.25238923368819194
  (0, 241)	0.21557586195904763
  (0, 111)	0.21032416490798655
  (0, 47)	0.3140440550441993
  (0, 37)	0.1672782476507953
  (1, 289)	0.3760537882123812
  (1, 288)	0.32711892958014527
  (1, 287)	0.49476400180820523
  (1, 286)	0.27335421072881083
  (1, 285)	0.413545407051769
  (1, 284)	0.43067563181712404
  (1, 71)	0.2748845371029889
  (2, 361)	0.3264121823619089
  (2, 360)	0.3895994185253099
  :	:
  (34998, 536)	0.5573385910101979
  (34998, 295)	0.5189532784738773
  (3

In [63]:
#TODO: delete
def save_tfidf_data(data, filename, feature_names):
    # Save the matrix with feature names as a DataFrame
    data = pd.DataFrame(data.toarray(), columns=feature_names)
    file_path = os.path.join(processed_folder_path, filename)
    data.to_csv(file_path, sep=',', index=False) # TODO: if this isn't working, note that you added sep=','


# Get feature names
feature_names = tfidf_vectorizer.get_feature_names_out()

# Save vectorized data
save_tfidf_data(x_train_tfidf, "train_tfidf.csv", feature_names)
save_tfidf_data(x_val_tfidf, "val_tfidf.csv", feature_names)

# Encode Data

In [64]:
def find_max_seq_len(data):
    # Find maximum sequence length
    max_seq_length = max([len(line.split()) for line in data])
    print(f'Maximum review length: {max_seq_length}')

    return max_seq_length

max_seq_length = find_max_seq_len(x_train)
%store max_seq_length

Maximum review length: 472
Stored 'max_seq_length' (int)


In [65]:
def fit_tokenizer(data):
    # Fit tokenizer (on training data)
    tokenizer = Tokenizer()
    # Remove default filters, including punctuation
    tokenizer.filters = ""  
    # Disable lowercase conversion
    tokenizer.lower = False  
    tokenizer.fit_on_texts(data) 

    return tokenizer

tokenizer = fit_tokenizer(x_train)

In [66]:
def encode_text(lines, tokenizer, max_length, filename):
    # Integer encode
    encoded_seq = tokenizer.texts_to_sequences(lines)
    # Pad the encoded sequences
    padded = pad_sequences(encoded_seq, maxlen=max_length, padding='post')

    # Save to np.array
    save_to_npy(padded, os.path.join(processed_folder_path, filename))

    return padded
    

# Encode Data
x_train_encoded = encode_text(x_train, tokenizer, max_seq_length, "train_encoded_x.npy")
x_val_encoded = encode_text(x_val, tokenizer, max_seq_length, "val_encoded_x.npy")
x_test_encoded = encode_text(x_test, tokenizer, max_seq_length, "test_encoded_x.npy")

print("\nData Shape (doc, vocab_size):\n* train: {}\n* validation: {}\n* test: {}\n".format(x_train_encoded.shape, x_val_encoded.shape, x_test_encoded.shape))
print("x_train_tfidf:\n{}".format(x_train_encoded))


Data Shape (doc, vocab_size):
* train: (35000, 472)
* validation: (15683, 472)
* test: (19615, 472)

x_train_tfidf:
[[ 309  892   86 ...    0    0    0]
 [  26 1000  761 ...    0    0    0]
 [1791  231   37 ...    0    0    0]
 ...
 [ 306  516  790 ...    0    0    0]
 [ 211  299 1072 ...    0    0    0]
 [1936    5 2408 ...    0    0    0]]


### Encode y

In [67]:
# TODO: necessary?
def get_y_values(y):
    return y.values


y_train = y_train.get_y_values(y_train)
y_val = y_val.get_y_values(y_val)
y_test = y_test.get_y_values(y_test)

print("target-data shapes:\n* train: {}\n* validation: {}\n* test: {}\n".format(y_train.shape, y_val.shape, y_test.shape))

AttributeError: 'Series' object has no attribute 'get_y_values'

In [68]:
# TODO: can't remember if this is used somewhere else, if not save data inside function
def one_hot_encode(y):
    y_encoded = np.zeros((len(y), NUM_of_CLASSES))
    for i, label in enumerate(y):
        y_encoded[i, label - 1] = 1

    return y_encoded

# Convert sentiment labels to one-hot encoding
y_train_encoded = one_hot_encode(y_train)
y_val_encoded = one_hot_encode(y_val)
y_test_encoded = one_hot_encode(y_test)


save_to_npy(y_train_encoded, os.path.join(processed_folder_path, "train_encoded_y.npy"))
save_to_npy(y_val_encoded, os.path.join(processed_folder_path, "val_encoded_y.npy"))
save_to_npy(y_test_encoded, os.path.join(processed_folder_path, "test_encoded_y.npy"))
   
print("\ny-encoded Data Shape:\n* train: {}\n* validation: {}\n* test: {}\n".format(y_train_encoded.shape, y_val_encoded.shape, y_test_encoded.shape))


y-encoded Data Shape:
* train: (35000, 3)
* validation: (15683, 3)
* test: (19615, 3)



# Word2Vec

In [72]:
# TODO: may delete?
# Total vocabulary size plus 0 for unknown words
embedding_vocab_size = len(tokenizer.word_index) + 1
%store embedding_vocab_size
print("embedding_vocab_size: ", embedding_vocab_size)

Stored 'embedding_vocab_size' (int)
embedding_vocab_size:  4095


In [73]:
# TODO: may delete?
# Check if there are any words identified via the tokenizer that are not in vocab
tokenizer_vocab = set(tokenizer.word_index.keys())
vocab_set = set(vocab)
tokenizer_only_words = tokenizer_vocab.difference(vocab_set)
print("Words in tokenizer but not in vocab: ", len(tokenizer_only_words))

Words in tokenizer but not in vocab:  0


In [79]:
def load_embedding():
    w2v_pretrained_file_path = os.path.join(processed_folder_path, w2v_pretrained_model_filename)
    if not os.path.exists(w2v_pretrained_file_path):
        # Check if the pre-trained Word2Vec model is already downloaded. If not, download it.
        print("\nW2v model doesn't exist")
        model = api.load(w2v_pretrained_model)
        model.save_word2vec_format(w2v_pretrained_file_path, binary=False)

    # Load embedding into memory, skip first line
    print("Loading w2v model...")
    file = open(w2v_pretrained_file_path, 'r', encoding='utf8')
    lines = file.readlines()[1:]
    file.close()

    # Create a map of words to vectors
    embedding = dict()
    for line in lines:
        parts = line.split()
        # Set key as string word, value as numpy array for vector
        embedding[parts[0]] = asarray(parts[1:], dtype='float32')

    return embedding

def get_emb_matrix(loaded_embedding, tokenizer, embedding_dim):
    # Create a weight matrix for the Embedding layer from a loaded/pretrained embedding

    # Define weight matrix dimensions (vocab_size + 1 for unknown words) with all 0 
    emb_matrix = np.zeros((len(tokenizer.word_index) + 1, EMBEDDING_DIM))

    count_all_words = 0
    count_na_words = 0
    for word, i in tokenizer.word_index.items():
        # Map loaded vectors to terms in vocab
        if word in loaded_embedding.keys():
            emb_matrix[i] = loaded_embedding.get(word)
        else:
            # Some terms such as emojis or neg-tagged words are not found in the loaded w2v model, hence they will have vectors with all 0
            count_na_words += 1
        count_all_words += 1
    print(f'{count_na_words}/{count_all_words} ({((count_na_words/count_all_words)*100):.2f}%) are not defined in the pretrained W2V model and will receive vectors with all 0.')
    print(f"W2V Embedding Matrix shape: {emb_matrix.shape}")
    print(f"Embedding Matrix:\n{emb_matrix[:10, :6]}")

    # Save W2V model
    # TODO: delete
    file_path = os.path.join(processed_folder_path, "embedding_matrix.txt")
    with open(file_path, 'w') as file:
        file.write('\n'.join(' '.join(str(x) for x in row) for row in emb_matrix))

    # TODO: keep
    save_to_npy(emb_matrix, (os.path.join(processed_folder_path, "embedding_matrix.npy")))
    
    return emb_matrix

pretrained_embedding = load_embedding()
embedding_dim = 100
w2v_embedding_vectors = get_emb_matrix(pretrained_embedding, tokenizer, embedding_dim)
%store w2v_embedding_vectors

Loading w2v model...
1442/4094 (35.22%) are not defined in the pretrained W2V model and will receive vectors with all 0.
W2V Embedding Matrix shape: (4095, 100)
Embedding Matrix: [[ 0.          0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.        ]
 [ 0.13095    -0.17110001  0.21895    -0.53894001 -0.13356     0.21934   ]
 [ 0.          0.          0.          0.          0.          0.        ]
 [ 0.33802     0.78784001  0.23646    -0.059737   -0.14753    -0.15067001]
 [ 0.          0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.        ]]
Stored 'w2v_embedding_vectors' (ndarray)
