## To save time, we run TF-IDF vectorization on the train, validation, and test sets in a single pass.
#### Note: To run inference on single samples, you will need the fitted vectorizer that is saved here.

In [None]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import save_npz, load_npz
import dill
import json
import time

# Seed value; not referenced by any cell visible here — presumably kept for
# reproducibility in other cells (TODO confirm, or remove if unused).
seed = 1

In [None]:
# Read the cleaned training, validation and test spreadsheets (in that order)
# from the shared directory into df_train, df_val and df_test.
load_path = "../data/clean/"
df_train, df_val, df_test = (
    pd.read_excel(load_path + filename)
    for filename in (
        "training_data_cleaned.xlsx",
        "validation_data_cleaned.xlsx",
        "test_data_cleaned.xlsx",
    )
)

In [None]:
# Build and persist a TF-IDF vectorizer, plus the train/val/test matrices,
# for every vocabulary listed below in one go.

# Each code identifies a vocabulary file (vocab_<code>.json) and is reused as
# the suffix of every artifact written for that vocabulary.
# NOTE(review): the names look like "<category>_<threshold>_<vocab size>" —
# confirm against the notebook that generates the vocab files.
codes = [
    "350k_1percent_422words",
    "authorization_1percent_401words",
    "order related and payments_1percent_439words",
    "product queries_1percent_351words",
    "queries regarding website_1percent_274words",
    "warranty_1percent_433words",
    "full_5352words",
]

for code in codes:
    # Input vocabulary and output artifact locations for this code.
    vocab_path = "../data/interim/vocabs/vocab_" + code + ".json"
    vectorizer_path = "../data/interim/vectorizers/vec_" + code
    tfidf_train_path = "../data/interim/tfidfs/tfidf_" + code + "_train.npz"
    tfidf_val_path = "../data/interim/tfidfs/tfidf_" + code + "_val.npz"
    tfidf_test_path = "../data/interim/tfidfs/tfidf_" + code + "_test.npz"

    # JSON text is UTF-8 by specification. Pass the encoding explicitly so
    # that decoding of non-ASCII vocabulary terms does not depend on the
    # platform's locale default (e.g. cp1252 on Windows).
    with open(vocab_path, encoding="utf-8") as f:
        vocab = json.load(f)

    # The feature set is fixed by the preloaded vocabulary; n-grams of length
    # 1 to 3 are extracted from the text.
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), vocabulary=vocab)

    # Fit on the training split only; validation and test are transformed
    # with the already-fitted vectorizer.
    tfidf_train = vectorizer.fit_transform(df_train["text"])
    tfidf_val = vectorizer.transform(df_val["text"])
    tfidf_test = vectorizer.transform(df_test["text"])

    # Persist the fitted vectorizer (needed later for single-sample
    # inference) and the three sparse matrices.
    with open(vectorizer_path, "wb") as f:
        dill.dump(vectorizer, f, protocol=3)
    save_npz(tfidf_train_path, tfidf_train)
    save_npz(tfidf_val_path, tfidf_val)
    save_npz(tfidf_test_path, tfidf_test)