In [1]:
import json
import pickle

import numpy as np
import pandas as pd

from spacy.lang.en import English
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from utils.preprocessing import spacy_tokenize, dummy_fn

In [2]:
# Load the raw dataset. The encoding is pinned to UTF-8 (the standard JSON
# encoding) so the read does not depend on the platform's locale default.
with open('data/dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical across re-runs.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [3]:
# Number of samples in each split, as (train, test).
tuple(len(split) for split in (train, test))

(8452, 2113)

In [4]:
# Only the tokenizer of the blank English pipeline is used here.
tokenizer = English().tokenizer

# Walk every training sample, take each utterance stored under
# 'previous_text', and tokenize it; the result is one entry per utterance.
texts = [
    spacy_tokenize(utterance, tokenizer)
    for sample in train
    for utterance in sample['previous_text']
]

# The documents are tokenized before they reach the vectorizer, so its own
# preprocessing and tokenizing hooks are replaced with `dummy_fn`
# (presumably a pass-through helper from utils.preprocessing -- TODO confirm).
# `token_pattern=None` is set because a custom tokenizer is supplied.
tfidf_params = dict(
    analyzer='word',
    lowercase=False,
    preprocessor=dummy_fn,
    tokenizer=dummy_fn,
    token_pattern=None,
    min_df=3,          # ignore terms that appear in fewer than 3 documents
    max_df=0.8,        # ignore terms that appear in more than 80% of documents
    max_features=300,  # keep at most 300 terms in the vocabulary
)
tfidf = TfidfVectorizer(**tfidf_params)

In [5]:
# Fit the vectorizer on the tokenized training utterances.
# `fit` returns the estimator itself, so the cell displays its configuration.
tfidf.fit(texts)

TfidfVectorizer(lowercase=False, max_df=0.8, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x0000020DF76C6700>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x0000020DF76C6700>)

In [6]:
# Sanity check: vectorize one tokenized utterance (a one-element slice keeps
# the input a list of documents) and inspect the shape of the dense result.
sample_vec = tfidf.transform(texts[34:35])
sample_vec.todense().shape

(1, 300)

In [7]:
# Persist the fitted vectorizer. The context manager guarantees the file is
# flushed and closed once the dump finishes; an inline `open(...)` leaves the
# handle open until it happens to be garbage-collected.
with open("models/tfidf_3_08_300.pkl", "wb") as f:
    pickle.dump(tfidf, f)

In [8]:
# Reload the vectorizer to verify that it round-trips through pickle.
# The file is opened in a context manager so the handle is closed promptly.
# NOTE: unpickling can execute arbitrary code -- only load files you trust
# (this one is written by this notebook). Functions are pickled by reference,
# so `dummy_fn` must be importable from its module wherever this file is loaded.
with open("models/tfidf_3_08_300.pkl", 'rb') as f:
    tf_loaded = pickle.load(f)
tf_loaded

TfidfVectorizer(lowercase=False, max_df=0.8, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x0000020DF76C6700>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x0000020DF76C6700>)

In [9]:
# Confirm the reloaded vectorizer yields exactly the same features as the
# in-memory one for the same single-utterance sample.
sample = texts[34:35]
original_vec = tfidf.transform(sample).todense()
reloaded_vec = tf_loaded.transform(sample).todense()
np.all(original_vec == reloaded_vec)

True