In [1]:
import json
import pickle

import numpy as np
import pandas as pd

from spacy.lang.en import English
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from utils.preprocessing import spacy_tokenize, dummy_fn

## Daily Dialogue

In [2]:
# Read the daily dataset from JSON, then hold out 20% of the samples as a test
# split; the fixed random_state keeps the partition reproducible across runs.
with open('data/daily_dataset.json', 'r', encoding="utf8") as dataset_file:
    daily = json.load(dataset_file)

daily_train, daily_test = train_test_split(
    daily,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Display the number of samples in the (train, test) splits as a sanity check.
len(daily_train), len(daily_test)

(1881, 471)

In [4]:
# Tokenizer taken from a blank spaCy English pipeline.
tokenizer = English().tokenizer

# One document per entry of `previous_text` in every training sample: the
# entry's pieces are space-joined and then passed through `spacy_tokenize`.
texts = [
    spacy_tokenize(" ".join(parts), tokenizer)
    for record in daily_train
    for parts in record['previous_text']
]

# `dummy_fn` is plugged in as both preprocessor and tokenizer, and sklearn's own
# lowercasing / regex token pattern are switched off, so the documents built
# above are consumed as-is (presumably `dummy_fn` is an identity pass-through —
# confirm in utils.preprocessing).
tfidf = TfidfVectorizer(
    analyzer='word',
    preprocessor=dummy_fn,
    tokenizer=dummy_fn,
    token_pattern=None,
    lowercase=False,
    min_df=3,          # ignore terms found in fewer than 3 documents
    max_df=0.7,        # ignore terms found in more than 70% of documents
    max_features=300,  # cap the vocabulary at the 300 most frequent terms
)

In [5]:
# Learn the vocabulary and IDF weights from `texts` (built from the training split only).
tfidf.fit(texts)

TfidfVectorizer(lowercase=False, max_df=0.7, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x000002EA4F9473A0>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x000002EA4F9473A0>)

In [6]:
# Smoke test: transforming a single document should yield one row with one
# column per vocabulary entry. The sparse result already exposes `.shape`, so
# there is no need to materialise a dense matrix just to read it.
tfidf.transform(texts[34:35]).shape

(1, 300)

In [7]:
# Persist the fitted vectorizer. The context manager guarantees the handle is
# flushed and closed even if pickling raises; a bare open() leaked it.
# NOTE(review): the "3_08_300" suffix reads like min_df / max_df / max_features,
# but this notebook builds the vectorizer with max_df=0.7 — confirm the file name.
with open("models/daily_tfidf_3_08_300.pkl", "wb") as model_file:
    pickle.dump(tfidf, model_file)

In [8]:
# Reload the vectorizer that was just written, to verify the round trip.
# The handle is closed deterministically via the context manager.
# NOTE: only unpickle files you produced yourself — pickle can execute
# arbitrary code while loading.
with open("models/daily_tfidf_3_08_300.pkl", 'rb') as model_file:
    tf_loaded = pickle.load(model_file)
tf_loaded

TfidfVectorizer(lowercase=False, max_df=0.7, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x000002EA4F9473A0>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x000002EA4F9473A0>)

In [9]:
# Round-trip check: the in-memory and reloaded vectorizers must produce the
# same features for the same probe document.
probe_docs = texts[34:35]
features_in_memory = tfidf.transform(probe_docs).todense()
features_reloaded = tf_loaded.transform(probe_docs).todense()
np.all(features_in_memory == features_reloaded)

True

In [10]:
# Show the fitted term -> feature-index mapping.
tfidf.vocabulary_

{'i': 121,
 'think': 249,
 'to': 255,
 'send': 214,
 'it': 127,
 'in': 124,
 '!': 0,
 'you': 296,
 'need': 167,
 'be': 38,
 'very': 268,
 'that': 239,
 'every': 79,
 'two': 262,
 'if': 123,
 'want': 270,
 'get': 93,
 '.': 11,
 'can': 54,
 'now': 175,
 '?': 13,
 'hello': 107,
 ',': 9,
 'this': 250,
 'is': 126,
 'from': 91,
 'do': 71,
 "n't": 165,
 'know': 131,
 "'ll": 4,
 'me': 149,
 'but': 49,
 'we': 273,
 'last': 132,
 'week': 274,
 'at': 35,
 'the': 240,
 'of': 177,
 'oh': 181,
 'yes': 294,
 'hi': 111,
 'how': 120,
 'are': 31,
 'great': 100,
 'have': 104,
 'a': 14,
 'few': 83,
 'minutes': 153,
 'right': 206,
 'for': 89,
 'tell': 234,
 'little': 137,
 'more': 157,
 'about': 15,
 'our': 191,
 'company': 61,
 'has': 103,
 'wo': 287,
 'take': 232,
 'long': 140,
 'sure': 230,
 'look': 141,
 'here': 110,
 'see': 212,
 'where': 280,
 "'s": 7,
 'like': 136,
 'so': 221,
 'far': 82,
 'am': 25,
 'really': 205,
 'excuse': 81,
 'your': 297,
 'and': 27,
 'please': 198,
 'will': 285,
 'hotel': 116,

## Topical Dataset

In [2]:
# Read the topical dataset from JSON, then hold out 20% of the samples as a test
# split; the fixed random_state keeps the partition reproducible across runs.
with open('data/topical_dataset.json', 'r', encoding="utf8") as dataset_file:
    topical = json.load(dataset_file)

topical_train, topical_test = train_test_split(
    topical,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Display the number of samples in the (train, test) splits as a sanity check.
len(topical_train), len(topical_test)

(8093, 2024)

In [4]:
# Tokenizer taken from a blank spaCy English pipeline.
tokenizer = English().tokenizer

# One document per entry of `previous_text` in every training sample: the
# entry's pieces are space-joined and then passed through `spacy_tokenize`.
texts = [
    spacy_tokenize(" ".join(parts), tokenizer)
    for record in topical_train
    for parts in record['previous_text']
]

# `dummy_fn` is plugged in as both preprocessor and tokenizer, and sklearn's own
# lowercasing / regex token pattern are switched off, so the documents built
# above are consumed as-is (presumably `dummy_fn` is an identity pass-through —
# confirm in utils.preprocessing).
tfidf = TfidfVectorizer(
    analyzer='word',
    preprocessor=dummy_fn,
    tokenizer=dummy_fn,
    token_pattern=None,
    lowercase=False,
    min_df=3,          # ignore terms found in fewer than 3 documents
    max_df=0.7,        # ignore terms found in more than 70% of documents
    max_features=300,  # cap the vocabulary at the 300 most frequent terms
)

In [5]:
# Learn the vocabulary and IDF weights from `texts` (built from the training split only).
tfidf.fit(texts)

TfidfVectorizer(lowercase=False, max_df=0.7, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x000001ED96A18310>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x000001ED96A18310>)

In [6]:
# Smoke test: transforming a single document should yield one row with one
# column per vocabulary entry. The sparse result already exposes `.shape`, so
# there is no need to materialise a dense matrix just to read it.
tfidf.transform(texts[34:35]).shape

(1, 300)

In [7]:
# Persist the fitted vectorizer. The context manager guarantees the handle is
# flushed and closed even if pickling raises; a bare open() leaked it.
# NOTE(review): the "3_08_300" suffix reads like min_df / max_df / max_features,
# but this notebook builds the vectorizer with max_df=0.7 — confirm the file name.
with open("models/topical_tfidf_3_08_300.pkl", "wb") as model_file:
    pickle.dump(tfidf, model_file)

In [8]:
# Reload the vectorizer that was just written, to verify the round trip.
# The handle is closed deterministically via the context manager.
# NOTE: only unpickle files you produced yourself — pickle can execute
# arbitrary code while loading.
with open("models/topical_tfidf_3_08_300.pkl", 'rb') as model_file:
    tf_loaded = pickle.load(model_file)
tf_loaded

TfidfVectorizer(lowercase=False, max_df=0.7, max_features=300, min_df=3,
                preprocessor=<function dummy_fn at 0x000001ED96A18310>,
                token_pattern=None,
                tokenizer=<function dummy_fn at 0x000001ED96A18310>)

In [9]:
# Round-trip check: the in-memory and reloaded vectorizers must produce the
# same features for the same probe document.
probe_docs = texts[34:35]
features_in_memory = tfidf.transform(probe_docs).todense()
features_reloaded = tf_loaded.transform(probe_docs).todense()
np.all(features_in_memory == features_reloaded)

True

In [10]:
# Show the fitted term -> feature-index mapping.
tfidf.vocabulary_

{'yeah': 293,
 'i': 119,
 'think': 247,
 'they': 244,
 'have': 107,
 'their': 239,
 'own': 190,
 ' ': 0,
 'basketball': 39,
 'players': 197,
 'are': 31,
 'some': 226,
 'the': 238,
 'paid': 191,
 'people': 193,
 'in': 123,
 'country': 63,
 'even': 78,
 'wow': 291,
 'did': 68,
 'not': 175,
 'know': 134,
 'that': 237,
 'thought': 251,
 'football': 87,
 'were': 275,
 'as': 33,
 'well': 274,
 'of': 178,
 'was': 268,
 'actually': 17,
 'a': 14,
 'for': 88,
 'you': 297,
 'game': 92,
 'we': 273,
 'play': 194,
 '3': 12,
 'video': 266,
 'games': 93,
 '!': 1,
 '?': 13,
 'believe': 45,
 'it': 128,
 'because': 41,
 'my': 163,
 'can': 57,
 'need': 167,
 'to': 253,
 'so': 224,
 "n't": 164,
 'since': 223,
 'new': 170,
 'house': 117,
 'do': 70,
 'never': 169,
 'used': 264,
 'lol': 141,
 'hear': 109,
 'about': 15,
 'guy': 102,
 'who': 282,
 'played': 195,
 'then': 241,
 'does': 71,
 'me': 152,
 'there': 242,
 'always': 23,
 'fun': 90,
 'out': 188,
 'fan': 82,
 'nt': 177,
 'watched': 270,
 'any': 29,
 'bu