In [23]:
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_distances
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [24]:
with open('./dataframes/full_data_clean_df_pickle4.pkl', 'rb') as f:
    df = pickle.load(f)

In [25]:
docs_train, docs_test = train_test_split(df, stratify=df['site'], train_size=250000)

In [26]:
docs_train.head()

Unnamed: 0.1,Unnamed: 0,title,NER,site,urls,bag_of_words,cleaned_bow
566922,566922,Condensed Tomato Soup,"[tomatoes, chicken broth, heavy cream]",www.cookbooks.com,http://www.cookbooks.com/Recipe-Details.aspx?i...,Condensed Tomato Soup 1 large can of crushed t...,condens tomato soup 1 larg crush tomato 4 chic...
240073,240073,Vegetable Clam Chowder(Low-Fat),"[clams, bay leaves, oregano, onion, low-fat ch...",www.cookbooks.com,http://www.cookbooks.com/Recipe-Details.aspx?i...,Vegetable Clam Chowder(Low-Fat) 2 cans clams...,veget clam chowder low-fat 2 can clam 2 bay le...
599686,599686,Party Potatoes,"[white potatoes, salt, sour cream, Cheddar che...",www.cookbooks.com,http://www.cookbooks.com/Recipe-Details.aspx?i...,"Party Potatoes 4 to 5 large white potatoes, bo...",parti potato 4 5 larg white potato boil jacket...
122349,122349,Dirty Rice,"[ground chuck, rice, onion soup, cream of chic...",www.cookbooks.com,http://www.cookbooks.com/Recipe-Details.aspx?i...,Dirty Rice 1 lb. ground chuck 1 c. uncooked ri...,dirti rice 1 ground chuck 1 uncook rice 1 onio...
782954,782954,"""Herman"" Starter","[flour, sugar, yeast, salt, warm water]",www.cookbooks.com,http://www.cookbooks.com/Recipe-Details.aspx?i...,"""Herman"" Starter 2 c. flour 3 Tbsp. sugar 1 en...",`` herman '' starter 2 flour 3 sugar 1 envelop...


In [None]:
from model_comparisions import get_counts, counts_horizontal_bar

In [None]:
website_counts = get_counts(df, 'site')
website_counts

In [None]:
fig, ax = plt.subplots(figsize=(18,10))
ax.set_xlabel('Recipe Count', fontsize = 24)
ax.set_title('Counts of Recipes by Website', fontsize = 24)

counts_horizontal_bar(website_counts, 'counts', 'site', 0, 900000, 60000, ax)

In [27]:
docs_train.to_pickle('./dataframes/250k_new_df_pickle_4.pkl', protocol=4)

In [28]:
docs_cleaned = docs_train['cleaned_bow']

In [29]:
vec = CountVectorizer(max_df=0.85, 
                      min_df=10,
                      ngram_range=(1,3),
                      max_features=1000)

tf = vec.fit_transform(docs_cleaned)

In [30]:
joblib.dump(vec, './models/250k_new_vec_pickle_4.joblib', protocol=4)

['./models/250k_new_vec_pickle_4.joblib']

In [31]:
num_topics = 100

lda = LatentDirichletAllocation(n_components=num_topics,
                                learning_method='online',
                                learning_offset=50,
                                doc_topic_prior=0.9,
                                topic_word_prior=0.9,
                                batch_size=32,
                                n_jobs=-1)
lda.fit(tf)

LatentDirichletAllocation(batch_size=32, doc_topic_prior=0.9,
                          learning_method='online', learning_offset=50,
                          n_components=100, n_jobs=-1, topic_word_prior=0.9)

In [32]:
joblib.dump(lda, './models/250k_new_lda_pickle_4.joblib',protocol=4)

['./models/250k_new_lda_pickle_4.joblib']

In [33]:
lda.score(tf)

-119074268.38571844

In [34]:
lda.perplexity(tf)

402.44675421065614