In [4]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

from ast import literal_eval

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_distances
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV

import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer as PS

import joblib

from pprint import pprint


[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
data = pd.read_csv('../full_dataset.csv')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2231142 entries, 0 to 2231141
Data columns (total 7 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   Unnamed: 0   int64 
 1   title        object
 2   ingredients  object
 3   directions   object
 4   link         object
 5   source       object
 6   NER          object
dtypes: int64(1), object(6)
memory usage: 119.2+ MB


In [7]:
data = data.loc[data['source'] == 'Gathered']

In [8]:
data.drop(columns='source', inplace=True)

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1643098 entries, 0 to 1643097
Data columns (total 6 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   Unnamed: 0   1643098 non-null  int64 
 1   title        1643098 non-null  object
 2   ingredients  1643098 non-null  object
 3   directions   1643098 non-null  object
 4   link         1643098 non-null  object
 5   NER          1643098 non-null  object
dtypes: int64(1), object(5)
memory usage: 87.8+ MB


In [10]:
def literal_return(val):
    try:
        return literal_eval(val)
    except (ValueError, SyntaxError) as e:
        return val

def df_str_to_literal(df, column_name):
    df[column_name] = df[column_name].apply(literal_return)
    return df[column_name]

In [11]:
def clean_df(df, columns_list):
    for col in columns_list:
        df[col] = df_str_to_literal(df, col)
    return None

## CLEANING DF

In [12]:
clean_df(data, ['ingredients', 'directions', 'NER'])

In [13]:
data['bag_of_words'] = ''
columns_list = ['title', 'ingredients', 'directions']
for col in columns_list:
    if col == 'title':
        data['bag_of_words'] += data[col] + ' '
    if col == 'ingredients':
        data['bag_of_words'] += data[col].apply(' '.join) + ' '
    if col == 'directions':
        data['bag_of_words'] += data[col].apply(' '.join)
data.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,NER,bag_of_words
0,0,No-Bake Nut Cookies,"[1 c. firmly packed brown sugar, 1/2 c. evapor...","[In a heavy 2-quart saucepan, mix brown sugar,...",www.cookbooks.com/Recipe-Details.aspx?id=44874,"[brown sugar, milk, vanilla, nuts, butter, bit...",No-Bake Nut Cookies 1 c. firmly packed brown s...
1,1,Jewell Ball'S Chicken,"[1 small jar chipped beef, cut up, 4 boned chi...","[Place chipped beef on bottom of baking dish.,...",www.cookbooks.com/Recipe-Details.aspx?id=699419,"[beef, chicken breasts, cream of mushroom soup...",Jewell Ball'S Chicken 1 small jar chipped beef...
2,2,Creamy Corn,"[2 (16 oz.) pkg. frozen corn, 1 (8 oz.) pkg. c...","[In a slow cooker, combine all ingredients. Co...",www.cookbooks.com/Recipe-Details.aspx?id=10570,"[frozen corn, cream cheese, butter, garlic pow...",Creamy Corn 2 (16 oz.) pkg. frozen corn 1 (8 o...
3,3,Chicken Funny,"[1 large whole chicken, 2 (10 1/2 oz.) cans ch...","[Boil and debone chicken., Put bite size piece...",www.cookbooks.com/Recipe-Details.aspx?id=897570,"[chicken, chicken gravy, cream of mushroom sou...",Chicken Funny 1 large whole chicken 2 (10 1/2 ...
4,4,Reeses Cups(Candy),"[1 c. peanut butter, 3/4 c. graham cracker cru...",[Combine first four ingredients and press in 1...,www.cookbooks.com/Recipe-Details.aspx?id=659239,"[peanut butter, graham cracker crumbs, butter,...",Reeses Cups(Candy) 1 c. peanut butter 3/4 c....


In [14]:
bow_data = data[['title','bag_of_words']]
bow_data.head()

Unnamed: 0,title,bag_of_words
0,No-Bake Nut Cookies,No-Bake Nut Cookies 1 c. firmly packed brown s...
1,Jewell Ball'S Chicken,Jewell Ball'S Chicken 1 small jar chipped beef...
2,Creamy Corn,Creamy Corn 2 (16 oz.) pkg. frozen corn 1 (8 o...
3,Chicken Funny,Chicken Funny 1 large whole chicken 2 (10 1/2 ...
4,Reeses Cups(Candy),Reeses Cups(Candy) 1 c. peanut butter 3/4 c....


In [15]:
documents = bow_data.bag_of_words
documents

0          No-Bake Nut Cookies 1 c. firmly packed brown s...
1          Jewell Ball'S Chicken 1 small jar chipped beef...
2          Creamy Corn 2 (16 oz.) pkg. frozen corn 1 (8 o...
3          Chicken Funny 1 large whole chicken 2 (10 1/2 ...
4          Reeses Cups(Candy)   1 c. peanut butter 3/4 c....
                                 ...                        
1643093    Tuna 'N Egg Salad In Pitas 6 ounces tuna drain...
1643094    Croque Monsieur Panini 2 tablespoons unsalted ...
1643095    Croque Monsieur With Cucumber Salad 1/4 cup wh...
1643096    Baked Pork Chops 1 egg whites 1 cup evaporated...
1643097    Date Filled Oatmeal Cookies 8 ounces dates cut...
Name: bag_of_words, Length: 1643098, dtype: object

In [16]:
def clean_document(document):
    '''
    Takes in a string.
    Returns cleaned string.
    '''
    # lowercase the strings
    doc_lower = document.lower() 

    #tokenize
    tokens = word_tokenize(doc_lower) 
    
    # remove punctuation
    punc = set(string.punctuation)
    tokens_no_punc = [word for word in tokens if word not in punc]
   
    # remove stopwords
    s_words = set(stopwords.words('english'))
    s_words_list = ['tablespoon', 'tbsp', 'teaspoon', 'tsp', 'cup', 'oz', 'lb', 'c.']
    for word in s_words_list:
        s_words.add(word)
    tokens_no_sw = [word for word in tokens_no_punc if word not in s_words]
    
    # stem the words to get rid of multiple forms of the same word
    porter = PS()
    tokens_stemmed = [porter.stem(word) for word in tokens_no_sw]
    
    # join all words into one string
    cleaned_doc = ' '.join(tokens_stemmed)
    
    return cleaned_doc

## CLEANING DATA

In [17]:
bow_data['cleaned_bow'] = bow_data['bag_of_words'].apply(clean_document)
bow_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bow_data['cleaned_bow'] = bow_data['bag_of_words'].apply(clean_document)


Unnamed: 0,title,bag_of_words,cleaned_bow
0,No-Bake Nut Cookies,No-Bake Nut Cookies 1 c. firmly packed brown s...,no-bak nut cooki 1 firmli pack brown sugar 1/2...
1,Jewell Ball'S Chicken,Jewell Ball'S Chicken 1 small jar chipped beef...,jewel ball 's chicken 1 small jar chip beef cu...
2,Creamy Corn,Creamy Corn 2 (16 oz.) pkg. frozen corn 1 (8 o...,creami corn 2 16 pkg frozen corn 1 8 pkg cream...
3,Chicken Funny,Chicken Funny 1 large whole chicken 2 (10 1/2 ...,chicken funni 1 larg whole chicken 2 10 1/2 ca...
4,Reeses Cups(Candy),Reeses Cups(Candy) 1 c. peanut butter 3/4 c....,rees cup candi 1 peanut butter 3/4 graham crac...
...,...,...,...
1643093,Tuna 'N Egg Salad In Pitas,Tuna 'N Egg Salad In Pitas 6 ounces tuna drain...,tuna n egg salad pita 6 ounc tuna drain 1 cook...
1643094,Croque Monsieur Panini,Croque Monsieur Panini 2 tablespoons unsalted ...,croqu monsieur panini 2 tablespoon unsalt butt...
1643095,Croque Monsieur With Cucumber Salad,Croque Monsieur With Cucumber Salad 1/4 cup wh...,croqu monsieur cucumb salad 1/4 white wine vin...
1643096,Baked Pork Chops,Baked Pork Chops 1 egg whites 1 cup evaporated...,bake pork chop 1 egg white 1 evapor skim milk ...


In [19]:
data.to_pickle('./models/full_data_df_pickle_4.pkl', protocol=4)

In [20]:
data.to_pickle('./models/full_data_df_pickle_5.pkl', protocol=5)

In [21]:
bow_data.to_pickle('./models/training_df_pickle_4.pkl', protocol=4)

In [22]:
bow_data.to_pickle('./models/training_df_pickle_5.pkl', protocol=5)

In [23]:
docs_cleaned = bow_data['cleaned_bow']

## TRAINING

In [24]:
num_features = 1000
ngram_range=(1,3)

vec = CountVectorizer(max_df=0.85, 
                      min_df=10,
                      ngram_range=ngram_range,
                      max_features=num_features)

tf = vec.fit_transform(docs_cleaned)

In [25]:
# tf_feature_names = vec.get_feature_names()
# tf_feature_names[:30]

In [26]:
num_topics = [100]
learning_method=['online']
learning_offset = [10, 50, 90]
doc_topic_prior = [None, 0.1, 0.9]
topic_word_prior = [None, 0.1, 0.9]
learning_decay = [0.5, 0.7, 0.9]
batch_size = [64, 128]
n_jobs= [-1]

In [27]:
random_grid = {'n_components': num_topics,
               'learning_method':learning_method,
               'learning_offset': learning_offset,
               'doc_topic_prior': doc_topic_prior,
               'topic_word_prior': topic_word_prior,
               'learning_decay': learning_decay,
               'batch_size': batch_size,
               'n_jobs': n_jobs}

pprint(random_grid)

{'batch_size': [64, 128],
 'doc_topic_prior': [None, 0.1, 0.9],
 'learning_decay': [0.5, 0.7, 0.9],
 'learning_method': ['online'],
 'learning_offset': [10, 50, 90],
 'n_components': [100],
 'n_jobs': [-1],
 'topic_word_prior': [None, 0.1, 0.9]}


In [28]:
tf_train, tf_test = train_test_split(tf, test_size=0.25)
lda = LatentDirichletAllocation()
lda_grid = RandomizedSearchCV(estimator=lda, 
                              param_distributions=random_grid,
                              cv=5,
                              n_iter=10,
                              n_jobs=-1, 
                              verbose=1)

In [None]:
lda_grid.fit(tf_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


## Disk space ran out... all 1000 GB...

In [None]:
print(pd.DataFrame.from_dict(lda_grid.cv_results_))
print('Test Score:', lda_grid.score(tf_test))
print('Perplexity:', lda_grid.perplexity(tf_test))

In [None]:
best_lda_model = lda_grid.best_estimator_

In [None]:
joblib.dump(best_lda_model, './models/lda_model_full_tid_pickle4.joblib', protocol=4)

In [None]:
joblib.dump(best_lda_model, './models/lda_model_full_tid_pickle5.joblib', protocol=5)

In [None]:
joblib.dump(vec, './models/vec_full_tid_pickle4.joblib', protocol=4)

In [None]:
joblib.dump(vec, './models/vec_full_tid_pickle5.joblib', protocol=5)

In [None]:
print("Best Model's Params: ", lda_grid.best_params_)

In [None]:
print("Best Model's Log Likelihood Score: ", lda_grid.best_score_)

In [None]:
print("Best Model's Perplexity: ", best_lda_model.perplexity(tf))

In [None]:
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]]))

num_top_words = 10
display_topics(lda, tf_feature_names, num_top_words)

In [None]:
joblib.dump(best_lda_model, './models/lda_model_full_tid.joblib')
joblib.dump(vec, './models/vec_full_tid.joblib')
lda = joblib.load('./models/lda_model_full_tid.joblib')
tf_vectorizer = joblib.load('./models/vec_full_tid.joblib')

In [None]:
probs = lda.transform(tf)

In [None]:
def closest_recipes(keyword, recipes, probs, n_recipes=10):
    idx_arr = np.array(recipes.index)
    keyword_recipes = recipes[recipes.str.contains(keyword, case=False, regex=False)]
    keyword_samples = np.random.choice(keyword_recipes.index, size=50, replace=True)
    keyword_idxs = []
    for sample_idx in keyword_samples:
        keyword_idx = int(np.where(idx_arr == sample_idx)[0])
        keyword_idxs.append(keyword_idx)

    d={}
    for idx in keyword_idxs:
        sims = cosine_distances(probs[idx].reshape(1, -1), probs).argsort()[0]
        for sim in sims[1:n_recipes+1]:
            if sim not in d:
                d[sim] = 1
            else:
                d[sim] += 1
                
    d_sorted = [k for k, v in sorted(d.items(), key=lambda item: item[1])][:-n_recipes:-1]
    print(f'Top {n_recipes} recipes most closely related to {keyword}')
    return np.array(recipes)[d_sorted]

In [None]:
recipes = bow_toy_data.title

In [None]:
closest_recipes('bean', recipes, probs)

In [None]:
closest_recipes('vegan', recipes, probs)

In [None]:
closest_recipes('pepperoni pizza', recipes, probs)

In [None]:
closest_recipes('spaghetti', recipes, probs)

In [None]:
closest_recipes('chicken soup', recipes, probs)

Need to experiment with different parameters with the model, as well as the bag of words contents. First, let's see if we can identify the "best" number of topics using KMeans clustering.

In [None]:
closest_recipes('lentil', recipes, probs)

In [None]:
closest_recipes('cheese pizza', recipes, probs)

## KMEANS

In [None]:
vectorizer = TfidfVectorizer(max_features=1000)
docs_vec = vectorizer.fit_transform(docs_cleaned)
features = vectorizer.get_feature_names()

In [None]:
k=100
kmeans = KMeans(n_clusters=k)
kmeans.fit(docs_vec)
score = silhouette_score(docs_vec, kmeans.labels_)
print(f'k = {k}, silhouette score = {score}')

In [None]:
joblib.dump(kmeans, './models/kmeans_full.joblib')
joblib.dump(vectorizer, './models/tf_vec_full.joblib')
kmeans = joblib.load('./models/kmeans_full.joblib')
tf_vec = joblib.load('./models/tf_vec_full.joblib')

In [None]:
# Find the top 10 features for each cluster.
n_features = 10
top_centroids = kmeans.cluster_centers_.argsort()[:,-1:-(n_features+1):-1]
print("top features (words) for each cluster:")
for num, centroid in enumerate(top_centroids):
    print(f"{num}, {', '.join(features[i] for i in centroid)}")

In [None]:
print("Random sample of texts in each cluster \n")
assigned_cluster = kmeans.transform(docs_vec).argmin(axis=1)
for i in range(kmeans.n_clusters):
    cluster = np.arange(0, docs_vec.shape[0])[assigned_cluster==i]
    sample_recipes = np.random.choice(cluster, 10, replace=False)
    
    print(f'\n cluster {i}:')
    for idx in sample_recipes:
        print(f'{recipes.iloc[idx]}')

In [None]:
recipe_count = []
for i in range(kmeans.n_clusters):
    cluster = np.arange(0, docs_vec.shape[0])[assigned_cluster==i]
    recipe_count.append(len(cluster))
    print(f"Cluster {i}: {len(cluster)} recipes")

In [144]:
top_words = []
n_features = 1
top_centroids = kmeans.cluster_centers_.argsort()[:,-1:-(n_features+1):-1]
for num, centroid in enumerate(top_centroids):
    top_words.append(', '.join(features[i] for i in centroid))

In [None]:
fig, ax = plt.subplots(figsize=(24, 8))
ax.set_title('Recipe Count By Cluster', fontsize = 24)
ax.set_ylabel('Count', fontsize = 24)
plt.yticks(fontsize = 20)
ax.set_xticklabels(top_words, fontsize = 18, rotation = 90)
ax.bar(top_words, recipe_count);