In [1]:
import json
import pickle

import numpy as np
import pandas as pd

from spacy.lang.en import English
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from utils.preprocessing import spacy_tokenize, dummy_fn

## Daily Dialogue

In [2]:
# Read the Daily Dialogue samples from disk, then hold out 20% of them as a
# test split (fixed random_state so the split is reproducible).
with open('data/daily_dataset_v3.json', mode='r', encoding="utf8") as f:
    daily = json.loads(f.read())

daily_train, daily_test = train_test_split(
    daily,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Display the number of samples in the train and test splits as a
# (train, test) pair.
len(daily_train), len(daily_test)

(48869, 12218)

In [3]:
# spaCy English tokenizer, passed to the project tokenization helper below.
tokenizer = English().tokenizer

# Build one space-joined string per entry of every training sample's
# 'previous_text' field.
# NOTE(review): " ".join(ut) assumes each entry is a sequence of strings; if
# an entry were a plain string this would insert a space between every
# character -- confirm against the dataset schema.
texts = [" ".join(ut) for sample in daily_train for ut in sample['previous_text']]
# Tokenize each joined string with the project helper.
texts = [spacy_tokenize(ut, tokenizer) for ut in texts]

# The vectorizer has to be created in this cell: the following cells call
# tfidf.fit / tfidf.transform, and with the definition commented out a fresh
# "Restart Kernel -> Run All" fails with NameError.
# The documents are tokenized above, so the same project helper `dummy_fn` is
# supplied as both tokenizer and preprocessor (presumably a pass-through --
# confirm in utils.preprocessing) and the regex token_pattern is disabled.
tfidf = TfidfVectorizer(
    lowercase=False,
    analyzer='word',
    tokenizer=dummy_fn,
    preprocessor=dummy_fn,
    token_pattern=None,
    min_df=3,
    max_df=0.7,
    max_features=300,
)

In [21]:
# Fit the vectorizer on the tokenized training texts; the fitted estimator's
# repr is shown as the cell output.
tfidf.fit(texts)

TfidfVectorizer(lowercase=False, max_df=0.7, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x0000022238E4E5E0>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x0000022238E4E5E0>)

In [22]:
# Sanity check on a single document: expect one row with one column per
# vocabulary feature. The sparse result already exposes .shape, so it is not
# densified just to read its dimensions.
tfidf.transform(texts[34:35]).shape

(1, 300)

In [23]:
# Persist the fitted vectorizer. The context manager closes the file handle
# (and flushes the data) even if pickling raises; a bare open() inside the
# call leaves the handle open until garbage collection.
# NOTE(review): the "3_08_300" suffix presumably encodes
# min_df / max_df / max_features, but the fitted vectorizer reports
# max_df=0.7 -- confirm whether the file name or the parameter is intended.
with open("models/daily_tfidf_3_08_300.pkl", "wb") as model_file:
    pickle.dump(tfidf, model_file)

In [4]:
# Reload the persisted vectorizer, closing the file handle deterministically.
# CAUTION: pickle.load can execute arbitrary code -- only load files produced
# by this project. Pickle stores functions by reference, so the `dummy_fn`
# used by the vectorizer must be importable when unpickling.
with open("models/daily_tfidf_3_08_300.pkl", 'rb') as model_file:
    tf_loaded = pickle.load(model_file)
tf_loaded

TfidfVectorizer(lowercase=False, max_df=0.7, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x000001BDA9C47EE0>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x000001BDA9C47EE0>)

In [25]:
# Round-trip check: the reloaded vectorizer must produce exactly the same
# features as the in-memory one for a sample document.
expected_vec = tfidf.transform(texts[34:35]).todense()
restored_vec = tf_loaded.transform(texts[34:35]).todense()
np.all(expected_vec == restored_vec)

True

In [7]:
# Show the features the reloaded vectorizer produces for one document, as a
# dense matrix.
# NOTE(review): the saved output for this cell is a sparse-matrix repr, which
# an expression ending in .todense() would not display -- the output looks
# stale; re-run the cell to refresh it.
tf_loaded.transform(texts[34:35]).todense()

<1x300 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [26]:
# Number of terms in the fitted vocabulary (the vectorizer is configured
# with max_features=300).
len(tfidf.vocabulary_)

300

## Topical Dataset

In [27]:
# Read the Topical dataset samples from disk, then hold out 20% of them as a
# test split (fixed random_state so the split is reproducible).
with open('data/topical_dataset_v3.json', mode='r', encoding="utf8") as f:
    topical = json.loads(f.read())

topical_train, topical_test = train_test_split(
    topical,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Display the number of samples in the train and test splits as a
# (train, test) pair.
len(topical_train), len(topical_test)

(129995, 32499)

In [29]:
# spaCy English tokenizer, passed to the project tokenization helper below.
tokenizer = English().tokenizer

# For every entry of each training sample's 'previous_text', join its parts
# with spaces and tokenize the result in a single pass.
texts = [
    spacy_tokenize(" ".join(ut), tokenizer)
    for sample in topical_train
    for ut in sample['previous_text']
]

# Vectorizer for the tokenized documents: the project helper `dummy_fn` is
# supplied as both tokenizer and preprocessor, and the regex token_pattern is
# disabled.
tfidf = TfidfVectorizer(
    analyzer='word',
    lowercase=False,
    preprocessor=dummy_fn,
    tokenizer=dummy_fn,
    token_pattern=None,
    min_df=3,
    max_df=0.7,
    max_features=300,
)

In [30]:
# Fit the vectorizer on the tokenized training texts; the fitted estimator's
# repr is shown as the cell output.
tfidf.fit(texts)

TfidfVectorizer(lowercase=False, max_df=0.7, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x0000022238E4E5E0>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x0000022238E4E5E0>)

In [31]:
# Sanity check on a single document: expect one row with one column per
# vocabulary feature. The sparse result already exposes .shape, so it is not
# densified just to read its dimensions.
tfidf.transform(texts[34:35]).shape

(1, 300)

In [32]:
# Persist the fitted vectorizer. The context manager closes the file handle
# (and flushes the data) even if pickling raises; a bare open() inside the
# call leaves the handle open until garbage collection.
# NOTE(review): the "3_08_300" suffix presumably encodes
# min_df / max_df / max_features, but the fitted vectorizer reports
# max_df=0.7 -- confirm whether the file name or the parameter is intended.
with open("models/topical_tfidf_3_08_300.pkl", "wb") as model_file:
    pickle.dump(tfidf, model_file)

In [33]:
# Reload the persisted vectorizer, closing the file handle deterministically.
# CAUTION: pickle.load can execute arbitrary code -- only load files produced
# by this project. Pickle stores functions by reference, so the `dummy_fn`
# used by the vectorizer must be importable when unpickling.
with open("models/topical_tfidf_3_08_300.pkl", 'rb') as model_file:
    tf_loaded = pickle.load(model_file)
tf_loaded

TfidfVectorizer(lowercase=False, max_df=0.7, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x0000022238E4E5E0>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x0000022238E4E5E0>)

In [34]:
# Round-trip check: the reloaded vectorizer must produce exactly the same
# features as the in-memory one for a sample document.
expected_vec = tfidf.transform(texts[34:35]).todense()
restored_vec = tf_loaded.transform(texts[34:35]).todense()
np.all(expected_vec == restored_vec)

True

In [38]:
# Number of terms in the fitted vocabulary (the vectorizer is configured
# with max_features=300).
len(tfidf.vocabulary_)

300