In [12]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

from ast import literal_eval

from urllib.parse import urlparse

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.metrics import jaccard_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV

import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer as PS

import joblib

import pprint

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
# data = pd.read_csv('dataset/full_dataset.csv')
data = pd.read_csv('../work/dataset/full_dataset.csv')

In [16]:
toy_data = data.sample(frac=0.1, replace=False)
toy_data.head(10)

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
929910,929910,Hilda'S Icebox Cookies,"[""2 cups margarine"", ""1 cup packed brown sugar...","[""Cream the margarine and the sugars together....",www.allrecipes.com/recipe/11222/hildas-icebox-...,Gathered,"[""margarine"", ""brown sugar"", ""white sugar"", ""e..."
570299,570299,Potato Skins,"[""potatoes (how many depends on number serving...","[""Boil or microwave washed whole potatoes unti...",www.cookbooks.com/Recipe-Details.aspx?id=408480,Gathered,"[""potatoes"", ""bacon"", ""green onions"", ""cheese""..."
1490809,1490809,Sweet ‘N’ Sour Appetizer Meatballs,"[""1 egg"", ""1/2 cup quick-cooking oats"", ""1 env...","[""In a large bowl, combine the egg, oats and s...",www.tasteofhome.com/recipes/sweet-n-sour-appet...,Gathered,"[""egg"", ""oats"", ""onion soup"", ""ground beef"", ""..."
2070153,2070153,Quick Wine and Cheese Puff,"[""2 eggs"", ""2/3 cup all-purpose flour"", ""1/2 c...","[""Preheat oven to 425 degrees F (220 degrees C...",allrecipes.com/recipe/quick-wine-and-cheese-puff/,Recipes1M,"[""eggs"", ""all-purpose"", ""white wine"", ""milk"", ..."
1933048,1933048,Chicken Salad with Almonds,"[""1 tablespoon soy sauce"", ""8 cups torn lettuc...","[""Whisk together dressing and soy sauce in a s...",www.food.com/recipe/chicken-salad-with-almonds...,Recipes1M,"[""soy sauce"", ""torn lettuce"", ""chicken"", ""gree..."
2121461,2121461,Sauces - Apricot Lemon Glaze/ Lime Daiquiri/ P...,"[""12 cup apricot preserves, good quality"", ""2 ...","[""Apricot Lemon Glaze Heat ingredients togethe...",www.food.com/recipe/sauces-apricot-lemon-glaze...,Recipes1M,"[""apricot preserves"", ""lemons"", ""daiquiri mix""..."
972753,972753,Jubilee Ham,"[""1 7/8 kg ham (gammon joint)"", ""1 onion"", ""1/...","[""Put the gammon joint in a large pan."", ""Cove...",www.food.com/recipe/jubilee-ham-479506,Gathered,"[""ham"", ""onion"", ""peppercorn"", ""anise"", ""bay l..."
1492244,1492244,Butter Pecan Layer Cake,"[""2-2/3 cups chopped pecans"", ""1-1/4 cups butt...","[""Place pecans and 1/4 cup butter in a baking ...",www.tasteofhome.com/recipes/butter-pecan-layer...,Gathered,"[""pecans"", ""butter"", ""eggs"", ""vanilla"", ""all-p..."
654069,654069,Fried Macaroni,"[""1 box macaroni"", ""2 (8 oz.) cans tomato sauc...","[""Melt Crisco in skillet; pour macaroni and on...",www.cookbooks.com/Recipe-Details.aspx?id=1029647,Gathered,"[""macaroni"", ""tomato sauce"", ""onions"", ""Crisco..."
685236,685236,Oatmeal Brownies,"[""1/2 c. butter, melted"", ""1 c. sugar"", ""2 egg...","[""Combine all ingredients in the order given.""...",www.cookbooks.com/Recipe-Details.aspx?id=821550,Gathered,"[""butter"", ""sugar"", ""eggs"", ""vanilla"", ""flour""..."


In [17]:
toy_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 223114 entries, 929910 to 1664393
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Unnamed: 0   223114 non-null  int64 
 1   title        223114 non-null  object
 2   ingredients  223114 non-null  object
 3   directions   223114 non-null  object
 4   link         223114 non-null  object
 5   source       223114 non-null  object
 6   NER          223114 non-null  object
dtypes: int64(1), object(6)
memory usage: 13.6+ MB


In [18]:
def literal_return(val):
    try:
        return literal_eval(val)
    except (ValueError, SyntaxError) as e:
        return val

def df_str_to_literal(df, column_name):
    df[column_name] = df[column_name].apply(literal_return)
    return df[column_name]

In [19]:
def clean_df(df, columns_list):
    for col in columns_list:
        df[col] = df_str_to_literal(df, col)
    return None

In [20]:
clean_df(toy_data, ['ingredients', 'directions', 'NER'])

In [21]:
toy_data.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
929910,929910,Hilda'S Icebox Cookies,"[2 cups margarine, 1 cup packed brown sugar, 1...",[Cream the margarine and the sugars together. ...,www.allrecipes.com/recipe/11222/hildas-icebox-...,Gathered,"[margarine, brown sugar, white sugar, eggs, fl..."
570299,570299,Potato Skins,[potatoes (how many depends on number serving)...,[Boil or microwave washed whole potatoes until...,www.cookbooks.com/Recipe-Details.aspx?id=408480,Gathered,"[potatoes, bacon, green onions, cheese, sour c..."
1490809,1490809,Sweet ‘N’ Sour Appetizer Meatballs,"[1 egg, 1/2 cup quick-cooking oats, 1 envelope...","[In a large bowl, combine the egg, oats and so...",www.tasteofhome.com/recipes/sweet-n-sour-appet...,Gathered,"[egg, oats, onion soup, ground beef, brown sug..."
2070153,2070153,Quick Wine and Cheese Puff,"[2 eggs, 2/3 cup all-purpose flour, 1/2 cup wh...",[Preheat oven to 425 degrees F (220 degrees C)...,allrecipes.com/recipe/quick-wine-and-cheese-puff/,Recipes1M,"[eggs, all-purpose, white wine, milk, salt, gr..."
1933048,1933048,Chicken Salad with Almonds,"[1 tablespoon soy sauce, 8 cups torn lettuce, ...",[Whisk together dressing and soy sauce in a sm...,www.food.com/recipe/chicken-salad-with-almonds...,Recipes1M,"[soy sauce, torn lettuce, chicken, green onion..."


In [22]:
toy_data['bag_of_words'] = ''
columns_list = ['title', 'ingredients', 'directions']
for col in columns_list:
    if col == 'title':
        toy_data['bag_of_words'] += toy_data[col] + ' '
    if col == 'ingredients':
        toy_data['bag_of_words'] += toy_data[col].apply(' '.join) + ' '
    if col == 'directions':
        toy_data['bag_of_words'] += toy_data[col].apply(' '.join)
toy_data.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER,bag_of_words
929910,929910,Hilda'S Icebox Cookies,"[2 cups margarine, 1 cup packed brown sugar, 1...",[Cream the margarine and the sugars together. ...,www.allrecipes.com/recipe/11222/hildas-icebox-...,Gathered,"[margarine, brown sugar, white sugar, eggs, fl...",Hilda'S Icebox Cookies 2 cups margarine 1 cup ...
570299,570299,Potato Skins,[potatoes (how many depends on number serving)...,[Boil or microwave washed whole potatoes until...,www.cookbooks.com/Recipe-Details.aspx?id=408480,Gathered,"[potatoes, bacon, green onions, cheese, sour c...",Potato Skins potatoes (how many depends on num...
1490809,1490809,Sweet ‘N’ Sour Appetizer Meatballs,"[1 egg, 1/2 cup quick-cooking oats, 1 envelope...","[In a large bowl, combine the egg, oats and so...",www.tasteofhome.com/recipes/sweet-n-sour-appet...,Gathered,"[egg, oats, onion soup, ground beef, brown sug...",Sweet ‘N’ Sour Appetizer Meatballs 1 egg 1/2 c...
2070153,2070153,Quick Wine and Cheese Puff,"[2 eggs, 2/3 cup all-purpose flour, 1/2 cup wh...",[Preheat oven to 425 degrees F (220 degrees C)...,allrecipes.com/recipe/quick-wine-and-cheese-puff/,Recipes1M,"[eggs, all-purpose, white wine, milk, salt, gr...",Quick Wine and Cheese Puff 2 eggs 2/3 cup all-...
1933048,1933048,Chicken Salad with Almonds,"[1 tablespoon soy sauce, 8 cups torn lettuce, ...",[Whisk together dressing and soy sauce in a sm...,www.food.com/recipe/chicken-salad-with-almonds...,Recipes1M,"[soy sauce, torn lettuce, chicken, green onion...",Chicken Salad with Almonds 1 tablespoon soy sa...


In [23]:
bow_toy_data = toy_data[['title','bag_of_words']]
bow_toy_data.head()

Unnamed: 0,title,bag_of_words
929910,Hilda'S Icebox Cookies,Hilda'S Icebox Cookies 2 cups margarine 1 cup ...
570299,Potato Skins,Potato Skins potatoes (how many depends on num...
1490809,Sweet ‘N’ Sour Appetizer Meatballs,Sweet ‘N’ Sour Appetizer Meatballs 1 egg 1/2 c...
2070153,Quick Wine and Cheese Puff,Quick Wine and Cheese Puff 2 eggs 2/3 cup all-...
1933048,Chicken Salad with Almonds,Chicken Salad with Almonds 1 tablespoon soy sa...


In [24]:
documents = bow_toy_data.bag_of_words

In [25]:
def clean_document(document):
    '''
    Takes in a string.
    Returns cleaned string.
    '''
    # lowercase the strings
    doc_lower = document.lower() 

    #tokenize
    tokens = word_tokenize(doc_lower) 
    
    # remove punctuation
    punc = set(string.punctuation)
    tokens_no_punc = [word for word in tokens if word not in punc]
   
    # remove stopwords
    s_words = set(stopwords.words('english'))
    s_words_list = ['tablespoon', 'tbsp', 'teaspoon', 'tsp', 'cup', 'oz', 'lb', 'c.']
    for word in s_words_list:
        s_words.add(word)
    tokens_no_sw = [word for word in tokens_no_punc if word not in s_words]
    
    # stem the words to get rid of multiple forms of the same word
    porter = PS()
    tokens_stemmed = [porter.stem(word) for word in tokens_no_sw]
    
    # join all words into one string
    cleaned_doc = ' '.join(tokens_stemmed)
    
    return cleaned_doc

In [26]:
bow_toy_data['cleaned_bow'] = toy_data['bag_of_words'].apply(clean_document)
bow_toy_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bow_toy_data['cleaned_bow'] = toy_data['bag_of_words'].apply(clean_document)


Unnamed: 0,title,bag_of_words,cleaned_bow
929910,Hilda'S Icebox Cookies,Hilda'S Icebox Cookies 2 cups margarine 1 cup ...,hilda 's icebox cooki 2 cup margarin 1 pack br...
570299,Potato Skins,Potato Skins potatoes (how many depends on num...,potato skin potato mani depend number serv bac...
1490809,Sweet ‘N’ Sour Appetizer Meatballs,Sweet ‘N’ Sour Appetizer Meatballs 1 egg 1/2 c...,sweet ‘ n ’ sour appet meatbal 1 egg 1/2 quick...
2070153,Quick Wine and Cheese Puff,Quick Wine and Cheese Puff 2 eggs 2/3 cup all-...,quick wine chees puff 2 egg 2/3 all-purpos flo...
1933048,Chicken Salad with Almonds,Chicken Salad with Almonds 1 tablespoon soy sa...,chicken salad almond 1 soy sauc 8 cup torn let...
...,...,...,...
696044,Orange Julius (Dallas Fair),Orange Julius (Dallas Fair) 1 (6 oz.) can froz...,orang juliu dalla fair 1 6 frozen orang juic 1...
1970706,Holiday Cookie Pops,Holiday Cookie Pops 1 (20 ounce) package refri...,holiday cooki pop 1 20 ounc packag refriger su...
425889,Grape Jelly,Grape Jelly 5 c. grape juice (in large pan) 1 ...,grape jelli 5 grape juic larg pan 1 box sure-j...
1869068,7 Layer Sombrero Dip,7 Layer Sombrero Dip 1 (1 ounce) packethidden ...,7 layer sombrero dip 1 1 ounc packethidden val...


In [27]:
docs_cleaned = bow_toy_data['cleaned_bow']

In [9]:
lda = joblib.load('lda_model_6_tid.joblib')
tf_vectorizer = joblib.load('vec_6_tid.joblib')

Can do .predict/.score for model and .transform for vectorizer with new data on the previous models/count vectorizer

In [28]:
tf = tf_vectorizer.fit_transform(docs_cleaned)

In [29]:
lda.perplexity(tf)

536.1663685571037

In [30]:
probs = lda.transform(tf)

In [77]:
def get_keyword_idxs(keyword, recipes):
    idx_arr = np.array(recipes.index)
    keyword_recipes = recipes[recipes.str.contains(keyword, case=False, regex=False)]
    keyword_samples = np.random.choice(keyword_recipes.index, size=min(len(keyword_recipes), 50), replace=False)
    keyword_idxs = []
    for sample_idx in keyword_samples:
        keyword_idx = int(np.where(idx_arr == sample_idx)[0])
        keyword_idxs.append(keyword_idx)
    return keyword_idxs

In [78]:
def get_sample_recipes(keyword, recipes):
    keyword_idxs = get_keyword_idxs(keyword, recipes)
    print(f'Recipe samples for {keyword}')
    return np.array(recipes)[keyword_idxs]

In [79]:
def closest_recipes(keyword, recipes, probs, n_recipes=10):
    keyword_idxs = get_keyword_idxs(keyword, recipes)
    
    d={}
    for idx in keyword_idxs:
        sims = cosine_distances(probs[idx].reshape(1, -1), probs).argsort()[0]
        for sim in sims[1:n_recipes+1]:
            if sim not in d:
                d[sim] = 1
            else:
                d[sim] += 1
                
    d_sorted = [k for k, v in sorted(d.items(), key=lambda item: item[1])][:-n_recipes:-1]
    
    return np.array(recipes)[d_sorted], np.array(recipes)[keyword_idxs]

In [80]:
recipes = bow_toy_data.title

In [81]:
keyword = 'pizza'
n_recipes = 10
recipe_recs, reference_recipes = closest_recipes(keyword, recipes, probs, n_recipes=n_recipes)
print(f'Top {n_recipes} recipes for "{keyword}":')
print(recipe_recs)
print('')
print(f'Based on recipe samples:')
print(reference_recipes)

Top 10 recipes for "pizza":
['Miracle Lasagna' 'Easy Pizza' 'Pizza Dough' 'Bubble Pizza'
 "Marlean'S Easy Vegetable Squares" 'Pizza Casserole'
 'Honey Pizza Dough  (Awesome)' 'Quick Pizza Snack' 'Basic Pizza Dough']

Based on recipe samples:
['Pizza Casserole' 'Pizza Salad' 'Fruit Pizza' 'Veggie Pizza'
 'Pizza Casserole' 'Roasted Potato Pizza' 'Easy Pizza' 'Upside Down Pizza'
 'BLUE PIZZA' "Leah's Favorite Pizza"
 'Wonderful Chicken Pizza With Fresh Basil'
 'Pizza di Ricotta Ricotta Pie with Potato Crust' 'Breakfast Pizza'
 'Pizza Meatballs' 'Smoked Salmon, Tomato And Feta Pizza'
 'Pizza Dough With Yeast' 'Low Carb Pizza Soup'
 'Delicious Chicken Garlic Pizza' 'Chicago Style Deep Dish Pizza Crust'
 'Pizza Dough' 'Chocolate Pizza' 'Roasted Vegetable 12 Grain Bread Pizza'
 'Pizza Casserole' "California Pizza Kitchen's Tiramisu Recipe"
 'Micro Snack Pizzas' 'Pizza Dough ' 'Pizza Breadsticks' 'Pizza Dough'
 'Bisquick Pizza Pull Apart Bread' 'Black Bean Spinach Southwestern Pizza'
 'Pizza P

In [82]:
bow_toy_data['link'] = toy_data['link']
bow_toy_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bow_toy_data['link'] = toy_data['link']


Unnamed: 0,title,bag_of_words,cleaned_bow,link
929910,Hilda'S Icebox Cookies,Hilda'S Icebox Cookies 2 cups margarine 1 cup ...,hilda 's icebox cooki 2 cup margarin 1 pack br...,www.allrecipes.com/recipe/11222/hildas-icebox-...
570299,Potato Skins,Potato Skins potatoes (how many depends on num...,potato skin potato mani depend number serv bac...,www.cookbooks.com/Recipe-Details.aspx?id=408480
1490809,Sweet ‘N’ Sour Appetizer Meatballs,Sweet ‘N’ Sour Appetizer Meatballs 1 egg 1/2 c...,sweet ‘ n ’ sour appet meatbal 1 egg 1/2 quick...,www.tasteofhome.com/recipes/sweet-n-sour-appet...
2070153,Quick Wine and Cheese Puff,Quick Wine and Cheese Puff 2 eggs 2/3 cup all-...,quick wine chees puff 2 egg 2/3 all-purpos flo...,allrecipes.com/recipe/quick-wine-and-cheese-puff/
1933048,Chicken Salad with Almonds,Chicken Salad with Almonds 1 tablespoon soy sa...,chicken salad almond 1 soy sauc 8 cup torn let...,www.food.com/recipe/chicken-salad-with-almonds...


In [83]:
pd.to_pickle(bow_toy_data, 'df.pkl')