In [5]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

from ast import literal_eval

from urllib.parse import urlparse

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.metrics import jaccard_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV

import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer as PS

import joblib

import pprint

import pickle

[nltk_data] Downloading package stopwords to /Users/coxem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/coxem/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# data = pd.read_csv('dataset/full_dataset.csv')
# data = pd.read_csv('../dataset/full_dataset.csv')

In [7]:
# toy_data = data.sample(frac=0.1, replace=False)
# toy_data.head(10)

In [8]:
# toy_data.info()

In [9]:
# def literal_return(val):
#     try:
#         return literal_eval(val)
#     except (ValueError, SyntaxError) as e:
#         return val

# def df_str_to_literal(df, column_name):
#     df[column_name] = df[column_name].apply(literal_return)
#     return df[column_name]

In [10]:
# def clean_df(df, columns_list):
#     for col in columns_list:
#         df[col] = df_str_to_literal(df, col)
#     return None

In [11]:
# clean_df(toy_data, ['ingredients', 'directions', 'NER'])

In [12]:
# toy_data.head()

In [13]:
# toy_data['bag_of_words'] = ''
# columns_list = ['title', 'ingredients', 'directions']
# for col in columns_list:
#     if col == 'title':
#         toy_data['bag_of_words'] += toy_data[col] + ' '
#     if col == 'ingredients':
#         toy_data['bag_of_words'] += toy_data[col].apply(' '.join) + ' '
#     if col == 'directions':
#         toy_data['bag_of_words'] += toy_data[col].apply(' '.join)
# toy_data.head()

In [14]:
# bow_toy_data = toy_data[['title','bag_of_words']]
# bow_toy_data.head()

In [15]:
# documents = bow_toy_data.bag_of_words

In [16]:
# def clean_document(document):
#     '''
#     Takes in a string.
#     Returns cleaned string.
#     '''
#     # lowercase the strings
#     doc_lower = document.lower() 

#     #tokenize
#     tokens = word_tokenize(doc_lower) 
    
#     # remove punctuation
#     punc = set(string.punctuation)
#     tokens_no_punc = [word for word in tokens if word not in punc]
   
#     # remove stopwords
#     s_words = set(stopwords.words('english'))
#     s_words_list = ['tablespoon', 'tbsp', 'teaspoon', 'tsp', 'cup', 'oz', 'lb', 'c.']
#     for word in s_words_list:
#         s_words.add(word)
#     tokens_no_sw = [word for word in tokens_no_punc if word not in s_words]
    
#     # stem the words to get rid of multiple forms of the same word
#     porter = PS()
#     tokens_stemmed = [porter.stem(word) for word in tokens_no_sw]
    
#     # join all words into one string
#     cleaned_doc = ' '.join(tokens_stemmed)
    
#     return cleaned_doc

In [17]:
# bow_toy_data['cleaned_bow'] = toy_data['bag_of_words'].apply(clean_document)
# bow_toy_data

In [18]:
# docs_cleaned = bow_toy_data['cleaned_bow']

In [19]:
lda = joblib.load('./models/lda_model_6_tid.joblib')
tf_vectorizer = joblib.load('./models/vec_6_tid.joblib')

In [20]:
with open('./flask_app/df.pkl', 'rb') as f:
    df = pickle.load(f)

In [54]:
joblib.dump(lda, './models/lda_model_6_tid_pickle4.joblib', protocol=4)
joblib.dump(tf_vectorizer, './models/vec_6_tid_pickle4.joblib', protocol=4)

['./models/vec_6_tid_pickle4.joblib']

In [21]:
docs = df['cleaned_bow']

In [22]:
tf = tf_vectorizer.fit_transform(docs)

In [23]:
# lda.perplexity(tf)

In [24]:
probs = lda.transform(tf)

In [25]:
def get_keyword_idxs(keyword, recipes):
    idx_arr = np.array(recipes.index)
    keyword_recipes = recipes[recipes.str.contains(keyword, case=False, regex=False)]
    keyword_samples = np.random.choice(keyword_recipes.index, size=min(len(keyword_recipes), 50), replace=False)
    keyword_idxs = []
    for sample_idx in keyword_samples:
        keyword_idx = int(np.where(idx_arr == sample_idx)[0])
        keyword_idxs.append(keyword_idx)
    return keyword_idxs

In [26]:
def get_sample_recipes(keyword, recipes):
    keyword_idxs = get_keyword_idxs(keyword, recipes)
    print(f'Recipe samples for {keyword}')
    return np.array(recipes)[keyword_idxs]

In [44]:
def closest_recipes(keyword, recipes, probs, n_recipes=10):
    keyword_idxs = get_keyword_idxs(keyword, recipes)
    
    d={}
    for idx in keyword_idxs:
        sims = cosine_distances(probs[idx].reshape(1, -1), probs).argsort()[0]
        for sim in sims[1:n_recipes+1]:
            if sim not in d:
                d[sim] = 1
            else:
                d[sim] += 1
                
    rec_idxs = [k for k, v in sorted(d.items(), key=lambda item: item[1])][:-n_recipes:-1]
    
    recipe_recs = np.array(recipes)[rec_idxs]
    
    reference_recipes = np.array(recipes)[keyword_idxs]
    
    return recipe_recs, rec_idxs, reference_recipes, keyword_idxs

In [40]:
recipes = df.title

In [45]:
keyword = 'pizza'
n_recipes = 10
recipe_recs, rec_idxs, reference_recipes, keyword_idxs = closest_recipes(keyword, recipes, probs, n_recipes=n_recipes)
print(f'Top {n_recipes} recipes for "{keyword}":')
print(recipe_recs)
print('')
print(f'Based on recipe samples:')
print(reference_recipes)

Top 10 recipes for "pizza":
['Barbecue Oysters' 'Beef And Kidney Pie' 'Pizzicate Pizzas' 'Easy Pizza'
 "Lee'S Spaghetti Sauce(Makes 4 To 6 Servings)  " 'Pizza Crackers'
 'Strudel Dough' 'Fast pizza' 'Puffed Pastry Pizza']

Based on recipe samples:
['Elbow Mac Pepperoni Pizza' 'Pizza puff' 'Taco Pizza Dip'
 'Pizza Casserole' "Prissy'S Pizza Dust"
 'Tuna, Roast Pepper And Pesto Pizzas' 'Pizza Dough' 'Pizza Hot Dish'
 'Pizza Pinwheels' 'Pepperoni Pizza Nachos' 'Vegetable Pizza'
 'Fruit Pizza' 'Ny Style Pizza With Arugula, Sausage And Cheese #Ragu'
 'Vegetable And Goat Cheese Pizza' 'Pizza Casserole' 'Mini Pizzas'
 'Blt Pizza' 'Deep Dish Taco Pizza' 'Fruit Pizza' 'Caramel Brownie Pizza'
 'Mexican Pizzas' 'Grilled Bbq Cheddar Chicken Pizza'
 'Zucchini And Roquefort Pizza' 'Veal Chops Pizzaiolo'
 'Double Crust Pizza Casserole' 'Spinach And Mushroom Pizza' 'Pizza Bites'
 'Lava Pizza Dip' 'Breakfast Pizza'
 'Gorgonzola and Arugula Pizza with Caramelized Onions'
 "Pyro's Bar & Grill french brea

In [32]:
# bow_toy_data['link'] = toy_data['link']
# bow_toy_data.head()

In [33]:
# pd.to_pickle(bow_toy_data, 'df.pkl')

In [46]:
rec_idxs

[70123, 150710, 157480, 95384, 116032, 54604, 56900, 151716, 126860]

In [35]:
links = df.link
links

929910     www.allrecipes.com/recipe/11222/hildas-icebox-...
570299       www.cookbooks.com/Recipe-Details.aspx?id=408480
1490809    www.tasteofhome.com/recipes/sweet-n-sour-appet...
2070153    allrecipes.com/recipe/quick-wine-and-cheese-puff/
1933048    www.food.com/recipe/chicken-salad-with-almonds...
                                 ...                        
696044       www.cookbooks.com/Recipe-Details.aspx?id=343227
1970706       www.food.com/recipe/holiday-cookie-pops-132710
425889       www.cookbooks.com/Recipe-Details.aspx?id=921975
1869068      www.food.com/recipe/7-layer-sombrero-dip-248426
1664393        www.food.com/recipe/red-beans-and-rice-166700
Name: link, Length: 223114, dtype: object

In [48]:
rec_links = np.array(links)[rec_idxs]
rec_links

array(['www.cookbooks.com/Recipe-Details.aspx?id=718200',
       'www.food.com/recipe/beef-and-kidney-pie-3073',
       'www.cookbooks.com/Recipe-Details.aspx?id=629025',
       'www.cookbooks.com/Recipe-Details.aspx?id=989946',
       'www.cookbooks.com/Recipe-Details.aspx?id=97149',
       'www.cookbooks.com/Recipe-Details.aspx?id=295519',
       'www.epicurious.com/recipes/food/views/strudel-dough-390234',
       'cookpad.com/us/recipes/345337-fast-pizza',
       'www.yummly.com/recipe/Puffed-Pastry-Pizza-631970'], dtype=object)

In [53]:
results = {'Recipe': recipe_recs, 'Link': rec_links}
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Recipe,Link
0,Barbecue Oysters,www.cookbooks.com/Recipe-Details.aspx?id=718200
1,Beef And Kidney Pie,www.food.com/recipe/beef-and-kidney-pie-3073
2,Pizzicate Pizzas,www.cookbooks.com/Recipe-Details.aspx?id=629025
3,Easy Pizza,www.cookbooks.com/Recipe-Details.aspx?id=989946
4,Lee'S Spaghetti Sauce(Makes 4 To 6 Servings),www.cookbooks.com/Recipe-Details.aspx?id=97149
5,Pizza Crackers,www.cookbooks.com/Recipe-Details.aspx?id=295519
6,Strudel Dough,www.epicurious.com/recipes/food/views/strudel-...
7,Fast pizza,cookpad.com/us/recipes/345337-fast-pizza
8,Puffed Pastry Pizza,www.yummly.com/recipe/Puffed-Pastry-Pizza-631970
