# TF-IDF Vectorization

In [23]:
import scipy.sparse
import os
import pandas as pd
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump

In [24]:
# Resolve the data folders relative to the kernel's working directory.
# abspath() of a bare file name is resolved against the current working
# directory, so script_dir is simply the directory the notebook runs from.
script_dir = os.path.split(os.path.abspath('processor.ipynb'))[0]
data_path = os.path.join(script_dir, 'Thesis_Jupyter_Final/src/')
print(data_path)

# Raw inputs live in <data_path>/input; the negation-tagged splits and the
# artefacts written by this notebook live in its processed/neg_tagged subfolder.
input_folder_path = os.path.join(data_path, 'input')
processed_folder_path = os.path.join(data_path, 'input/processed/neg_tagged')

/home2/s3985113/Thesis_Jupyter_Final/src/


In [25]:
def load_data(file_path):
    """Read a CSV file and return its ``x`` and ``y`` columns.

    Args:
        file_path: Path to a CSV file that contains columns named ``x`` and ``y``.

    Returns:
        Tuple ``(x, y)`` of pandas Series taken from the columns of the same name.

    Raises:
        KeyError: If either column is missing from the file.
    """
    frame = pd.read_csv(file_path)
    return frame['x'], frame['y']

def load_vocab(file_path):
    """Load a pickled vocabulary and report how many entries it has.

    Args:
        file_path: Path to a pickle file holding a sized collection of terms.

    Returns:
        Tuple ``(vocab, vocab_size)`` where ``vocab_size`` is ``len(vocab)``.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserialising.
    # Only open vocabulary files that were produced by this project.
    with open(file_path, 'rb') as f:
        vocab = pickle.load(f)

    return vocab, len(vocab)
    

# Load the train / validation / test splits (in that order) from the
# processed folder; each split yields its x and y columns.
(x_train, y_train), (x_val, y_val), (x_test, y_test) = (
    load_data(os.path.join(processed_folder_path, split_file))
    for split_file in ("train.csv", "val.csv", "test.csv")
)

# Quick look at the first few training documents.
print(x_train.head(5))
print()


# Load the pickled vocabulary stored next to the splits.
vocab_data_filename = "vocab.pkl"
vocab, vocab_size = load_vocab(os.path.join(processed_folder_path, vocab_data_filename))
print("Vocab size: ", vocab_size)

0    bad superficial explanation speaks fast need s...
1    first let grade quiz purchase highly disappointed
2    horrible test little sense use element describ...
3    least favorite informative_NEG far_NEG style_N...
4    guess thing explanation level_NEG assignment_N...
Name: x, dtype: object

Vocab size:  10573


## TF-IDF

In [26]:
# Pruning settings handed to get_tfidf_vectorizer().
# NOTE: scikit-learn ignores max_features, max_df and min_df whenever a fixed
# vocabulary is supplied, so these values only take effect when vocab is None.
max_features = 7000
max_df = 0.95
min_df = 5

def get_tfidf_vectorizer(vocab, max_features, min_df, max_df):
    """Create a TF-IDF vectorizer and save it (still unfitted) to disk.

    Args:
        vocab: Iterable of terms. Each term's position in the iteration becomes
            its column index in the TF-IDF matrix. If None, no fixed vocabulary
            is used and the vectorizer learns one from the data during fitting.
        max_features: Upper bound on the number of features to keep.
        min_df: Terms with a document frequency strictly lower than this are
            dropped.
        max_df: Terms with a document frequency strictly higher than this are
            dropped.

    Note:
        scikit-learn ignores ``max_features``, ``min_df`` and ``max_df`` when
        ``vocabulary`` is not None. With a fixed ``vocab`` the matrix therefore
        keeps one column per vocab term regardless of these three settings;
        they only apply when ``vocab`` is None.

    Returns:
        The configured, unfitted ``TfidfVectorizer``.
    """
    # TfidfVectorizer accepts a {term: column_index} mapping as a fixed vocabulary.
    if vocab is None:
        vocab_dict = None
    else:
        vocab_dict = {word: i for i, word in enumerate(vocab)}

    tfidf_vectorizer = TfidfVectorizer(
        # Maximum number of features to keep: too high gives very sparse
        # matrices, too low drops important words. Ignored with a fixed vocabulary.
        max_features=max_features,
        vocabulary=vocab_dict,
        lowercase=False,
        # Unigrams and bigrams are generated; with a fixed vocabulary only the
        # n-grams that appear in that vocabulary are counted.
        ngram_range=(1, 2),
        max_df=max_df,  # ignored with a fixed vocabulary
        min_df=min_df,  # ignored with a fixed vocabulary
        use_idf=True,  # enable IDF weighting
        smooth_idf=True,  # smooth IDF weights (avoids zero divisions)
        sublinear_tf=True  # apply sublinear scaling to term frequencies
    )

    # Save tfidf vectorizer.
    # NOTE: the object has not been fitted at this point, so the saved file
    # holds the configuration only, not learned IDF weights.
    file_path = os.path.join(processed_folder_path, 'tfidf_vectorizer.joblib')
    dump(tfidf_vectorizer, file_path)

    return tfidf_vectorizer

def transform_to_tfidf(x_train, x_val, x_test, vectorizer=None):
    """Fit the vectorizer on the training documents and vectorize all splits.

    Args:
        x_train: Training documents; the vectorizer is fitted on these.
        x_val: Validation documents; transformed only.
        x_test: Test documents; transformed only.
        vectorizer: Object exposing ``fit_transform`` and ``transform``.
            Defaults to the module-level ``tfidf_vectorizer`` so existing
            three-argument calls keep working.

    Returns:
        Tuple ``(x_train_tfidf, x_val_tfidf, x_test_tfidf)``.
    """
    if vectorizer is None:
        # Backward-compatible fallback to the notebook-level vectorizer.
        vectorizer = tfidf_vectorizer

    # Fit on the training set only, so nothing is learned from val/test.
    x_train_tfidf = vectorizer.fit_transform(x_train)

    # Re-use the fitted vectorizer for the validation and test sets.
    x_val_tfidf = vectorizer.transform(x_val)
    x_test_tfidf = vectorizer.transform(x_test)

    return x_train_tfidf, x_val_tfidf, x_test_tfidf

tfidf_vectorizer = get_tfidf_vectorizer(vocab, max_features, min_df, max_df)
x_train_tfidf, x_val_tfidf, x_test_tfidf = transform_to_tfidf(x_train, x_val, x_test)

# Persist the vectorizer again now that it has been fitted.
# get_tfidf_vectorizer() dumps it before fit_transform() runs, so without this
# step the file on disk would hold no learned IDF weights.
dump(tfidf_vectorizer, os.path.join(processed_folder_path, 'tfidf_vectorizer.joblib'))

# Save the sparse TF-IDF matrices next to the input splits.
scipy.sparse.save_npz(os.path.join(processed_folder_path, "train_tfidf.npz"), x_train_tfidf)
scipy.sparse.save_npz(os.path.join(processed_folder_path, "val_tfidf.npz"), x_val_tfidf)
scipy.sparse.save_npz(os.path.join(processed_folder_path, "test_tfidf.npz"), x_test_tfidf)

print("\nData Shape (doc, vocab_size):\n* train: {}\n* validation: {}\n* test: {}\n".format(x_train_tfidf.shape, x_val_tfidf.shape, x_test_tfidf.shape))
print("x_train_tfidf:\n{}".format(x_train_tfidf))


Data Shape (doc, vocab_size):
* train: (41000, 10573)
* validation: (11540, 10573)
* test: (11921, 10573)

x_train_tfidf:
  (0, 4944)	0.333611549281506
  (0, 2133)	0.2961339731660829
  (0, 1962)	0.2790813018609294
  (0, 1829)	0.2845491982794516
  (0, 1738)	0.2471451738558906
  (0, 1627)	0.27254171122483684
  (0, 784)	0.24895427348562024
  (0, 581)	0.2096952161674913
  (0, 438)	0.21087359433125996
  (0, 400)	0.21511850812086866
  (0, 337)	0.22498142205493177
  (0, 307)	0.22522954040809726
  (0, 223)	0.21748277299714208
  (0, 207)	0.2005277841281215
  (0, 41)	0.16507539015093728
  (0, 37)	0.3065855262224691
  (1, 3132)	0.48745999184528344
  (1, 1117)	0.3794568593358208
  (1, 1065)	0.41963408220421816
  (1, 811)	0.43121465736774295
  (1, 38)	0.2770028355754138
  (1, 22)	0.27822837453512
  (1, 20)	0.3197105022731662
  (2, 3942)	0.3961231024356585
  (2, 2937)	0.3742032094232427
  :	:
  (40997, 8925)	0.4335846690233216
  (40997, 4246)	0.3889950728579958
  (40997, 2499)	0.36388494758151557
 

In [27]:
#TODO: delete
def save_tfidf_data(data, filename, feature_names):
    """Write a TF-IDF matrix to a CSV file with one named column per feature.

    Args:
        data: Matrix exposing ``toarray()``; it is converted to a dense array
            before writing, which can be very large for big matrices.
        filename: Name of the CSV file, created inside ``processed_folder_path``.
        feature_names: Column labels, one per matrix column.
    """
    target_path = os.path.join(processed_folder_path, filename)

    # Densify and label the columns so the CSV header carries the feature names.
    dense_frame = pd.DataFrame(data.toarray(), columns=feature_names)

    # sep=',' is the pandas default; it is spelled out explicitly on purpose.
    dense_frame.to_csv(target_path, sep=',', index=False)


# Feature names of the vectorizer, used as column labels for the CSV export.
feature_names = tfidf_vectorizer.get_feature_names_out()

# Optional dense CSV export of the vectorized data (currently disabled;
# save_tfidf_data converts the whole matrix to a dense array before writing).
# save_tfidf_data(x_train_tfidf, "train_tfidf.csv", feature_names)
# save_tfidf_data(x_val_tfidf, "val_tfidf.csv", feature_names)