In [200]:
import pandas as pd

import pickle
from datetime import datetime
import re
import string
from IPython.display import Image, display

from pymongo import MongoClient
from pprint import pprint

from nltk.corpus import stopwords

# tokenize
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import TreebankWordTokenizer

# stem & lemmatize
from nltk.stem import LancasterStemmer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import spacy

### Pull from Mongo and into a dataframe

In [2]:
client = MongoClient()

In [3]:
rec_db = client.recipes
rec_db.list_collection_names()

['descriptions']

In [4]:
descr_col = rec_db.descriptions

In [5]:
descr_col.find_one({},{ '_id': 0})

{'recipe_url': 'https://cooking.nytimes.com/recipes/1020205-ash-reshteh-persian-greens-bean-and-noodle-soup',
 'image_url': 'https://static01.nyt.com/images/2019/05/15/dining/14Iranianrex2/merlin_154113918_721ff786-e3ef-453f-b3a0-deec1d8f8e02-threeByTwoMediumAt2X.jpg',
 'recipe_title': 'Ash Reshteh (Persian Greens, Bean and Noodle Soup)',
 'recipe_description': 'Ash reshteh’s flavor is defined by two uniquely Persian ingredients: reshteh and kashk. The soup, served during the festivities leading up to Nowruz, the Persian New Year, wouldn’t be the same without the soup noodles called reshteh, which are saltier and starchier than Italian noodles — though you could substitute linguine in a pinch. Kashk, a form of drained yogurt or whey, is saltier and more sour than Greek yogurt or sour cream. More like feta than yogurt, liquid kashk gives ash its distinct, satisfying flavor. If you can’t find liquid kashk, buy it powdered and hydrate it with warm water to the consistency of sour cream. L

In [96]:
# select if has a recipe description and a publication date
descr_df = pd.DataFrame(list(descr_col.find({'recipe_description': {'$ne':None}, 'pub_date': {'$ne':None}},
                                            {'_id':0,'pub_date':1, 'recipe_author':1, 'recipe_description':1, 'recipe_title':1, 'image_url':1, 'recipe_url':1})))

In [139]:
# remove any duplicate rows
descr_df=descr_df.drop_duplicates('recipe_title')

In [98]:
descr_df.head()

Unnamed: 0,image_url,pub_date,recipe_author,recipe_description,recipe_title,recipe_url
0,https://static01.nyt.com/images/2019/05/15/din...,2019-05-14,Samin Nosrat,Ash reshteh’s flavor is defined by two uniquel...,"Ash Reshteh (Persian Greens, Bean and Noodle S...",https://cooking.nytimes.com/recipes/1020205-as...
1,https://static01.nyt.com/images/2019/05/14/din...,2019-05-14,Samin Nosrat,"Named for the city in southwestern Iran, salad...","Salad-e Shirazi (Persian Cucumber, Tomato and ...",https://cooking.nytimes.com/recipes/1020212-sa...
2,https://static01.nyt.com/images/2019/05/15/din...,2019-05-14,Samin Nosrat,Fesenjoon hails from the verdant northern Iran...,Khoresh-e Fesenjoon (Persian Chicken Stew With...,https://cooking.nytimes.com/recipes/1020224-kh...
3,https://static01.nyt.com/images/2019/05/15/din...,2019-05-14,Samin Nosrat,"Yogurt, both plain and with cucumbers, is ever...",Mast-o Khiar (Persian Cucumber and Herb Yogurt),https://cooking.nytimes.com/recipes/1020213-ma...
4,https://static01.nyt.com/images/2019/05/15/din...,2019-05-09,Rebekah Peppler,This simple recipe takes the 3-2-1 spritz form...,Amaro Spritz,https://cooking.nytimes.com/recipes/1020201-am...


### Functions to do all the cleaning

In [8]:
def clean_text_stem(word_stemmer, data_series):
    
    # removes numbers, punctuation, lowercases, and 2 special cases
    alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
    punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())
    extra_punc = lambda x: x.replace('’', '').replace('—', ' ')

    initial_clean = data_series.map(alphanumeric).map(punc_lower).map(extra_punc)
    
    # word tokenize
    tokenized = initial_clean.apply(word_tokenize)
    
    # stem using the passed in stemmer
    stemmer = word_stemmer()
    stemmed = tokenized.apply(lambda x: [stemmer.stem(word) for word in x])
    
    # return the cleaned text back as a series of strings
    return stemmed.apply(lambda x: ' '.join(x))

def clean_text_lemmatize(word_lemmatizer, data_series):
    
    # removes numbers, punctuation, lowercases, and 2 special cases
    alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
    punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())
    extra_punc = lambda x: x.replace('’', '').replace('—', ' ')

    initial_clean = data_series.map(alphanumeric).map(punc_lower).map(extra_punc)
    
    # single word tokenize
    tokenized = initial_clean.apply(word_tokenize)
    
    # lemmatize using the passed in lemmatizer
    lemmatizer = word_lemmatizer()
    lemmatized = tokenized.apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
    
    # return the cleaned text back as a series of strings
    return lemmatized.apply(lambda x: ' '.join(x))

def clean_text_spacy_lemmatizer(data_series):
    #only works with spacy lemmatizer
    
    # removes numbers, punctuation, lowercases, and 2 special cases
    alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
    punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())
    extra_punc = lambda x: x.replace('’', '').replace('—', ' ')

    initial_clean = data_series.map(alphanumeric).map(punc_lower).map(extra_punc)

    # lemmatize
    lemmatizer = spacy.load('en')
    lemmatized = lambda x: ' '.join([token.lemma_.strip() for token in lemmatizer(x) if token.lemma_ != '-PRON-'])

    # return the cleaned text back as a series of strings
    return initial_clean.apply(lemmatized)

### Topic Model

In [9]:
# from sklearn.pipeline import Pipeline
# from sklearn.pipeline import FeatureUnion
# from sklearn.preprocessing import StandardScaler

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.decomposition import TruncatedSVD # LSA
from sklearn.decomposition import NMF # NMF
from sklearn.metrics.pairwise import cosine_similarity # for LSA and NMF
from gensim import corpora, models, similarities, matutils # LDA
# logging for gensim (set to INFO)
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [10]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]for i in topic.argsort()[:-no_top_words - 1:-1]]))

### the formulas to do all the topic modeling

In [70]:
def lsa_topic_model(vectorizer, text, num_topics, len_topics=5):
    # vectorize
    docs_vect = vectorizer.fit_transform(text)
    print(f"document array shape is: {docs_vect.shape}", '\n')
    
    # model
    model = TruncatedSVD(num_topics)
    doc_topic = model.fit_transform(docs_vect)
    
    # print stats
    print(f"explained variance ratio: {model.explained_variance_ratio_}")

    ind=['component_'+str(num) for num in range(1,num_topics+1)]
    # relating features to latent topics
    topic_word = pd.DataFrame(model.components_.round(3),
                 index = ind,
                 columns = vectorizer.get_feature_names())
#     print(topic_word)
    display_topics(model, vectorizer.get_feature_names(), len_topics)
    
    # the Vt matrix shows us the documents we started with, and how each document is made up of the resulting topics
#     Vt_mat = pd.DataFrame(doc_topic.round(3),
#                  index = descr_df['recipe_title'],
#                  columns = ind)
#     print(Vt_mat.head())
    
    # prints the bag of words matrix as a dataframe
    # print(pd.DataFrame(docs_vect.toarray(), index=descr_df['recipe_title'], columns=vectorizer.get_feature_names()).head(10))

In [67]:
def nmf_topic_model(vectorizer, text, num_topics, len_topics=5):
    # vectorize
    docs_vect = vectorizer.fit_transform(text)
    print(f"document array shape is: {docs_vect.shape}", '\n')
    
    # model
    model = NMF(num_topics)
    doc_topic = model.fit_transform(docs_vect)

    ind=['component_'+str(num) for num in range(1,num_topics+1)]
    # relating features to latent topics
    topic_word = pd.DataFrame(model.components_.round(3),
                 index = ind,
                 columns = vectorizer.get_feature_names())
#     print(topic_word)
    display_topics(model, vectorizer.get_feature_names(), len_topics)
    print(model.components_[:5])
    
    # the Vt matrix shows us the documents we started with, and how each document is made up of the resulting topics (how it maps into that space)
#     Vt_mat = pd.DataFrame(doc_topic.round(3),
#                  index = descr_df['recipe_title'],
#                  columns = ind)
#     print(Vt_mat.head())
    
    # prints the bag of words matrix as a dataframe
    # print(pd.DataFrame(docs_vect.toarray(), index=descr_df['recipe_title'], columns=vectorizer.get_feature_names()).head(10))

In [14]:
# using gensim LSA model (called LSI)- try SKLearn version
# from sklearn.decomposition import LatentDirichletAllocation

def lda_topic_model(vectorizer, text, num_topics, len_topics=5):
    # vectorize
    docs_vectorizer = vectorizer.fit(text)
    
    # Create the term-document matrix. Transpose it so the terms are the rows
    doc_word = docs_vectorizer.transform(text).transpose()
    
    pd.DataFrame(doc_word.toarray(), docs_vectorizer.get_feature_names()).head()
    print(f"document array shape is: {doc_word.shape}", '\n')
    
    # Convert sparse scipy matrix of counts to a gensim corpus
    corpus = matutils.Sparse2Corpus(doc_word)
    
    # map matrix rows to words. We need to save a mapping (dict) of row id to word (token) for later use by gensim
    id2word = dict((v, k) for k, v in docs_vectorizer.vocabulary_.items())
    
    # Create lda model (equivalent to "fit" in sklearn)
    # requires our corpus of word counts, mapping of row ids to words, and the number of topics
    lda = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word, passes=5)
    
    # the 5 most important words for each of the topics we found
    topics = lda.print_topics(len_topics)
    for ix, topic in enumerate(topics):
        print(f"topic {ix}: {topic}", "\n")
    
    # to map our documents to the topic space we need to actually use the LdaModel transformer that we created above
    # Transform the docs from the word space to the topic space (like "transform" in sklearn)
    lda_corpus = lda[corpus]
    # Store the documents' topic vectors in a list so we can take a peak
    lda_docs = [doc for doc in lda_corpus]
    
    # take a look at the document vectors in the topic space, which are measures of the component of each document along each topic
    # Check out the document vectors in the topic space for the first 5 documents
    for ix, doc in enumerate(lda_docs[0:5]):
        print(f'document {ix}: {doc}', '\n')

In [15]:
# set stop words, adding new ones I come across, and stem if I used a stemmer
def get_stop_words(word_stemmer=None):
    stopwds = stopwords.words('english')
    newStopWords = ['bake', 'kitchen', 'ingredient', 'dish', 'recipe', 'time', 'new', 'york',
                   'make', 'use', 'like', 'one', 'add', 'made', 'list', 'step', 'flavor', 'also', 'stir',
                   'without', 'invite', 'good', 'inch', 'serve']
    stopwds.extend(newStopWords)

    if word_stemmer:
        # stem using the passed in stemmer
        stemmer = word_stemmer()
        return [stemmer.stem(word) for word in stopwds]
    else:
        return stopwds

## Stemming

In [46]:
# if you stem the text also stem the stop words
cleaned_text = clean_text_stem(PorterStemmer, descr_df['recipe_description'])

#### LSA

In [None]:
# count vectorizer
lsa_topic_model(CountVectorizer(stop_words=get_stop_words(PorterStemmer), ngram_range=(1,2), min_df=3, max_df=.9), 
                cleaned_text, 
                10, 10)

In [None]:
# TfidfVectorizer
lsa_topic_model(TfidfVectorizer(stop_words=get_stop_words(PorterStemmer), ngram_range=(1,2), min_df=3, max_df=.9), 
           cleaned_text,
           10, 10)

#### NMF

In [None]:
# count vectorizer
nmf_topic_model(CountVectorizer(stop_words=get_stop_words(PorterStemmer), ngram_range=(1,2), min_df=3, max_df=.9), 
                cleaned_text, 
                10, 10)

In [None]:
# Really good
# TfidfVectorizer
nmf_topic_model(TfidfVectorizer(stop_words=get_stop_words(PorterStemmer), ngram_range=(1,2), min_df=3, max_df=.9), 
                cleaned_text, 
                10, 10)

#### LDA

In [None]:
# count vectorizer
lda_topic_model(CountVectorizer(stop_words=get_stop_words(PorterStemmer), ngram_range=(1,2), min_df=3, max_df=.9), 
                cleaned_text, 
                10, 10)

In [None]:
# TfidfVectorizer
lda_topic_model(TfidfVectorizer(stop_words=get_stop_words(PorterStemmer), ngram_range=(1,2), min_df=3, max_df=.9), 
                cleaned_text, 
                10, 10)

## Spacy lemmatizing

In [53]:
cleaned_text = clean_text_spacy_lemmatizer(descr_df['recipe_description'])

#### LSA

In [None]:
# count vectorizer
lsa_topic_model(CountVectorizer(stop_words=get_stop_words(), ngram_range=(1,2), min_df=3, max_df=.9), 
                cleaned_text, 
                10, 10)

In [None]:
# TfidfVectorizer
lsa_topic_model(TfidfVectorizer(stop_words=get_stop_words(), ngram_range=(1,2), min_df=3, max_df=.9), 
           cleaned_text,
           10, 10)

#### NMF

In [None]:
# count vectorizer
nmf_topic_model(CountVectorizer(stop_words=get_stop_words(), ngram_range=(1,2), min_df=3, max_df=.9), 
                cleaned_text, 
                10, 10)

In [None]:
# TfidfVectorizer
nmf_topic_model(TfidfVectorizer(stop_words=get_stop_words(), ngram_range=(1,2), min_df=3, max_df=.9), 
                cleaned_text, 
                10, 10)

#### LDA

In [None]:
# count vectorizer
lda_topic_model(CountVectorizer(stop_words=get_stop_words(), ngram_range=(1,2), min_df=3, max_df=.9), 
                cleaned_text, 
                10, 10)

In [None]:
# TfidfVectorizer
lda_topic_model(TfidfVectorizer(stop_words=get_stop_words(), ngram_range=(1,2), min_df=3, max_df=.9), 
                cleaned_text, 
                10, 10)

### NLTK lemmatizing

In [60]:
cleaned_text = clean_text_lemmatize(WordNetLemmatizer, descr_df['recipe_description'])

#### LSA

In [None]:
# kept only because it has the printout I want
# count vectorizer
lsa_topic_model(CountVectorizer(stop_words=get_stop_words(), ngram_range=(1,2), min_df=3, max_df=.9), 
                cleaned_text, 
                10, 10)

In [None]:
# TfidfVectorizer
lsa_topic_model(TfidfVectorizer(stop_words=get_stop_words(), ngram_range=(1,2), min_df=3, max_df=.9), 
           cleaned_text,
           10, 10)

#### NMF

In [None]:
# count vectorizer
nmf_topic_model(CountVectorizer(stop_words=get_stop_words(), ngram_range=(1,2), min_df=3, max_df=.9), 
                cleaned_text, 
                10, 10)

In [None]:
# TfidfVectorizer
nmf_topic_model(TfidfVectorizer(stop_words=get_stop_words(), ngram_range=(1,2), min_df=3, max_df=.9), 
                cleaned_text, 
                10, 10)

#### LDA

In [None]:
# count vectorizer
lda_topic_model(CountVectorizer(stop_words=get_stop_words(), ngram_range=(1,2), min_df=3, max_df=.9), 
                cleaned_text, 
                10, 10)

In [None]:
# TfidfVectorizer
lda_topic_model(TfidfVectorizer(stop_words=get_stop_words(), ngram_range=(1,2), min_df=3, max_df=.9), 
                cleaned_text, 
                10, 10)

## T-sne

In [None]:
from sklearn.manifold import TSNE

model = TSNE(n_components=2, random_state=0,verbose=0)
low_data = model.fit_transform(docs_vect.toarray())

In [None]:
target_names = topics

colors = cycle(['r','g','b','c','m','y','orange','k','aqua','yellow'])
target_ids = range(len(target_names))
plt.figure(dpi=150)
for i, c, label in zip(target_ids, colors, target_names):
    plt.scatter(low_data[target == i, 0], low_data[target == i, 1], c=c, label=label, s=15, alpha=1)
plt.legend(fontsize=10, loc='upper left', frameon=True, facecolor='#FFFFFF', edgecolor='#333333')
plt.xlim(-100,100);
plt.title("Digit Clusters with TSNE", fontsize=12)
plt.ylabel("Junk TSNE Axis 2", fontsize=12)
plt.xlabel("Junk TSNE Axis 1", fontsize=12);
plt.xticks(fontsize=10)
plt.yticks(fontsize=10);

## Recommender

In [141]:
# champion model is nmf with nltk lemmatizer

#clean text
cleaned_text = clean_text_lemmatize(WordNetLemmatizer, descr_df['recipe_description'])

# vectorize
vectorizer = TfidfVectorizer(stop_words=get_stop_words(), ngram_range=(1,2), min_df=3, max_df=.9)
docs_vect = vectorizer.fit_transform(cleaned_text)
print(f"document array shape is: {docs_vect.shape}", '\n')

document array shape is: (6937, 17749) 



In [221]:
# model
model = NMF(10) #number of topics
doc_topic = model.fit_transform(docs_vect)

# print topics
display_topics(model, vectorizer.get_feature_names(), 10)

# print weighting of each feature in the topics
# print('printing model components', model.components_[:5])

# the Vt matrix shows us the documents we started with, and how each document is made up of the resulting topics (how it maps into that space)
ind=['Topic '+str(num) for num in range(0,10)]
Vt_mat = pd.DataFrame(doc_topic.round(3),
             index = descr_df['recipe_title'],
             columns = ind)
print(Vt_mat.loc['Spicy Chorizo and Red Lentil Soup with Kale'])

# prints the bag of words matrix as a dataframe
# print(pd.DataFrame(docs_vect.toarray(), index=descr_df['recipe_title'], columns=vectorizer.get_feature_names()).head(10))


Topic  0
salad, oil, olive, olive oil, green, dressing, garlic, lemon, pepper, herb

Topic  1
improvise, double, double boiler, boiler, diameter, chip, quince, fruit, leftover, smoker

Topic  2
pie, dough, butter, wa, sweet, fruit, sugar, flour, crust, drink

Topic  3
chicken, meat, sauce, fish, breast, pork, pan, cooking, roast, cook

Topic  4
bean, green, soup, green bean, white, black, white bean, black bean, pea, red

Topic  5
tomato, sauce, pasta, tomato sauce, summer, fresh, cheese, bread, eggplant, fresh tomato

Topic  6
potato, mashed, sweet potato, mashed potato, sweet, cheese, soup, gratin, sour cream, butter

Topic  7
cake, chocolate, frosting, layer, batter, pan, dessert, chocolate cake, moist, day

Topic  8
cream, ice, ice cream, chocolate, milk, coconut, dessert, vanilla, whipped, custard

Topic  9
rice, grain, vegetable, noodle, soup, cooked, brown rice, risotto, egg, meal
Topic 0    0.015
Topic 1    0.000
Topic 2    0.021
Topic 3    0.002
Topic 4    0.019
Topic 5    0.

In [222]:
def get_similar_recipes(selected_recipe, num_recipes=3):
    cos_sims=[]
    labels=[]

    for similar_recipe in descr_df[descr_df['recipe_title']!=selected_recipe]['recipe_title']:
        cos_sims.append(cosine_similarity((Vt_mat.loc[selected_recipe], Vt_mat.loc[similar_recipe]))[0,1])
        labels.append(similar_recipe)
    df = pd.DataFrame(cos_sims, index=labels)

    df = df.sort_values(by=0, ascending=False).head(num_recipes)
    
    for recipe in df.index:
        details = descr_df[descr_df['recipe_title']==recipe]
        print(recipe)
        print(str(details['recipe_url']))
        print(str(details['recipe_description']))
        try: 
            display(Image(url=str(details['image_url']), height=500,width=500))
        except:
            print("This recipe does not have an image!")

In [223]:
get_similar_recipes('Spicy Chorizo and Red Lentil Soup with Kale')

Risotto With Spring Carrots and Leeks
5350    https://cooking.nytimes.com/recipes/1012504-ri...
Name: recipe_url, dtype: object
5350    You can get carrots and leeks year ‘round in t...
Name: recipe_description, dtype: object


Rye and Cornmeal Muffins With Caraway
4611    https://cooking.nytimes.com/recipes/1013495-ry...
Name: recipe_url, dtype: object
4611    I like to serve these savory muffins, whose fl...
Name: recipe_description, dtype: object


Lettuce Soup With Cucumber Croutons
5951    https://cooking.nytimes.com/recipes/11000-lett...
Name: recipe_url, dtype: object
5951    Soup is the most versatile of dishes. When it ...
Name: recipe_description, dtype: object
