In [14]:
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_distances
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
with open('./dataframes/full_data_clean_df_pickle4.pkl', 'rb') as f:
    df = pickle.load(f)

In [6]:
df_subset = df.sample(frac=0.203, replace=False)

In [22]:
df_subset.to_pickle('./dataframes/new_df_pickle_4.pkl', protocol=4)

In [7]:
docs_cleaned = df_subset['cleaned_bow']

In [8]:
docs_train, docs_test = train_test_split(docs_cleaned, stratify=df_subset['site'])

In [9]:
vec = CountVectorizer(max_df=0.85, 
                      min_df=10,
                      ngram_range=(1,3),
                      max_features=500)

tf = vec.fit_transform(docs_train)

In [10]:
joblib.dump(vec, './models/new_vec.joblib')

['./models/new_vec.joblib']

In [11]:
joblib.dump(vec, './models/new_vec_pickle_4.joblib', protocol=4)

['./models/new_vec_pickle_4.joblib']

In [16]:
num_topics = 80

lda = LatentDirichletAllocation(n_components=num_topics,
                                learning_method='online',
                                learning_offset=50,
                                doc_topic_prior=0.9,
                                topic_word_prior=0.9,
                                batch_size=32,
                                n_jobs=-1)
lda.fit(tf)

LatentDirichletAllocation(batch_size=32, doc_topic_prior=0.9,
                          learning_method='online', learning_offset=50,
                          n_components=80, n_jobs=-1, topic_word_prior=0.9)

In [17]:
tf_test = vec.fit_transform(docs_test)

In [18]:
lda.score(tf_test)

-33041988.79651101

In [19]:
lda.perplexity(tf_test)

374.9879409180741

In [20]:
joblib.dump(lda, './models/new_lda_pickle_4.joblib',protocol=4)

['./models/new_lda_pickle_4.joblib']