In [4]:
import os
import datetime
import pandas as pd
from pandas import json_normalize
import numpy as np
import re
import json
import requests
import time
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import random
import pprint
from string import punctuation
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation


from nltk.corpus import stopwords
sw = stopwords.words("english")
sw.append('recipes')
sw.append('recipe')

In [9]:
# define functions

punctuation = set(punctuation) # speeds up comparison
tw_punct = punctuation


def descriptive_stats(tokens, verbose=True) :
    """
        Given a list of tokens, print number of tokens, number of unique tokens,
        number of characters, lexical diversity, and num_tokens most common
        tokens. Return a list of
    """

    
    num_tokens=len(tokens)
    num_unique_tokens = len(set(tokens))
    lexical_diversity = num_unique_tokens/num_tokens
    num_characters = sum(len(token) for token in tokens)

    if verbose :
        print(f"There are {num_tokens} tokens in the data.")
        print(f"There are {num_unique_tokens} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")

        # print the five most common tokens
        counter = Counter(tokens)
        top_5_tokens = counter.most_common(5)
        print("Top 5 most common tokens:")
        for token, count in top_5_tokens:
            print(f"{token}: {count} occurrences")

    return([num_tokens, num_unique_tokens,
            lexical_diversity,
            num_characters])


def remove_stopwords(tokens) :
    return [token for token in tokens if token not in sw]
    return(tokens)

def remove_punctuation(text, punct_set=tw_punct) :
    """
        Function takes two arguments: (1) text, which is the input string, and (2) the punctuation set, which is set to the tw_punct value set.
        Returns all characters not found in the punctuation set and concatenates them back into a string using the .join() method with an empty
        string "" as the separator.
    """
    return("".join([ch for ch in text if ch not in punct_set]))

def tokenize(text) :
    """
        Splitting on whitespace rather than the book's tokenize function. That
        function will drop tokens like '#hashtag' or '2A', which we need for Twitter.
    """
    tokens = text.split()
    return(tokens)

def prepare(text, pipeline) :
    tokens = str(text)

    for transform in pipeline :
        tokens = transform(tokens)

    return(tokens)
def display_topics(model, features, no_top_words=5):
    for topic, words in enumerate(model.components_):
        total = words.sum()
        largest = words.argsort()[::-1] # invert sort order
        print("\nTopic %02d" % topic)
        for i in range(0, no_top_words):
            print("  %s (%2.2f)" % (features[largest[i]], abs(words[largest[i]]*100.0/total)))

In [10]:
from matplotlib import pyplot as plt

def wordcloud(word_freq, title, max_words=200, stopwords=sw):

    wc = WordCloud(width=800, height=400,
                   background_color= "black", colormap="Paired",
                   max_font_size=150, max_words=max_words)

    # convert data frame into dict
    if type(word_freq) == pd.Series:
        counter = Counter(word_freq.fillna(0).to_dict())
    else:
        counter = word_freq

    # filter stop words in frequency counter
    if stopwords is not None:
        counter = {token:freq for (token, freq) in counter.items()
                              if token not in stopwords}
    wc.generate_from_frequencies(counter)

    plt.title(title)

    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")


def count_words(df, column='titles_tokens', preprocess=None, min_freq=2):

    # process tokens and update counter
    def update(doc):
        tokens = doc if preprocess is None else preprocess(doc)
        counter.update(tokens)

    # create counter and run through all data
    counter = Counter()
    df[column].map(update)

    # transform counter into data frame
    freq_df = pd.DataFrame.from_dict(counter, orient='index', columns=['freq'])
    freq_df = freq_df.query('freq >= @min_freq')
    freq_df.index.name = 'token'

    return freq_df.sort_values('freq', ascending=False)

In [12]:
# data import
recipes = pd.read_csv("recipes.csv")
recipes = pd.DataFrame(recipes)
#recipes['title']=recipes['title'].astype(str)
#recipes['ingredients']=recipes['ingredients'].astype(str)
#recipes['step']=recipes['step'].astype(str)
#recipes.dtypes

recipes.head(5)

Unnamed: 0,title,ingredients,step
0,Cannellini Bean and Asparagus Salad with Mushr...,"cannellini beans, water, water, curry leaves, ...",Rinse the cannellini beans and soak for 8 hour...
1,Berry Banana Breakfast Smoothie,"yogurt, graham cracker crumbs, soymilk, berrie...",Take some yogurt in your favorite flavor and a...
2,Red Lentil Soup with Chicken and Turnips,"olive oil, soup, carrot, celery, onion, salt a...","To a large dutch oven or soup pot, heat the ol..."
3,Asparagus and Pea Soup: Real Convenience Food,"garlic, onion, garlic, onion, extra virgin oli...",Chop the garlic and onions. Saute the onions i...
4,Garlicky Kale,"olive oil, kale, vinegar, garlic",Heat the olive oil in a large pot over medium ...


In [16]:
# store tokens in new dataframe 'df'
df=pd.DataFrame()

# fold to lowercase
df['Recipe_tokens']=recipes['title'].str.lower()
df['Ingredients_tokens']=recipes['ingredients'].str.lower()
#df['URL_tokens']=recipes['URL'].str.lower()

# remove punctuation
df['Ingredients_tokens']=df['Ingredients_tokens'].apply(remove_punctuation)
#df['URL_tokens']=df['URL_tokens'].apply(remove_punctuation)
df['Recipe_tokens']=df['Recipe_tokens'].apply(remove_punctuation)

# tokenize
df['Recipe_tokens']=tokenize(df['Recipe_tokens'].str)
df['Ingredients_tokens']=tokenize(df['Ingredients_tokens'].str)
#df['URL_tokens']=tokenize(df['URL_tokens'].str)

# remove stopwords
df['Ingredients_tokens']=df['Ingredients_tokens'].apply(remove_stopwords)
#df['URL_tokens']=df['URL_tokens'].apply(remove_stopwords)
df['Recipe_tokens']=df['Recipe_tokens'].apply(remove_stopwords)

df

Unnamed: 0,Recipe_tokens,Ingredients_tokens
0,"[cannellini, bean, asparagus, salad, mushrooms]","[cannellini, beans, water, water, curry, leave..."
1,"[berry, banana, breakfast, smoothie]","[yogurt, graham, cracker, crumbs, soymilk, ber..."
2,"[red, lentil, soup, chicken, turnips]","[olive, oil, soup, carrot, celery, onion, salt..."
3,"[asparagus, pea, soup, real, convenience, food]","[garlic, onion, garlic, onion, extra, virgin, ..."
4,"[garlicky, kale]","[olive, oil, kale, vinegar, garlic]"
5,"[slow, cooker, beef, stew]","[cream, mushroom, soup, beef, broth, seasoning..."
6,"[red, kidney, bean, jambalaya]","[kidney, beans, brown, rice, water, kidney, be..."
7,"[chicken, fajita, stuffed, bell, pepper]","[chili, powder, cilantro, pepper, quinoa, cumi..."
8,"[cauliflower, brown, rice, vegetable, fried, r...","[cauliflower, florets, cauliflower, rice, caul..."
9,"[hummus, zaatar]","[chickpeas, water, water, canned, chickpeas, p..."


In [18]:
Recipe_combined_tokens = [token for sublist in df['Recipe_tokens'] for token in sublist]
#descriptive_stats(sza_combined_tokens)
#tokens_without_stopwords = remove_stop(Recipe_combined_tokens)

#tokens_with_punctuation = remove_punctuation(tokens_without_stopwords)
tokens_without_stopwords = Recipe_combined_tokens
descriptive_stats(tokens_without_stopwords)

There are 43 tokens in the data.
There are 37 unique tokens in the data.
There are 259 characters in the data.
The lexical diversity is 0.860 in the data.
Top 5 most common tokens:
bean: 2 occurrences
asparagus: 2 occurrences
red: 2 occurrences
soup: 2 occurrences
chicken: 2 occurrences


[43, 37, 0.8604651162790697, 259]

In [20]:
tokens_without_stopwords

['cannellini',
 'bean',
 'asparagus',
 'salad',
 'mushrooms',
 'berry',
 'banana',
 'breakfast',
 'smoothie',
 'red',
 'lentil',
 'soup',
 'chicken',
 'turnips',
 'asparagus',
 'pea',
 'soup',
 'real',
 'convenience',
 'food',
 'garlicky',
 'kale',
 'slow',
 'cooker',
 'beef',
 'stew',
 'red',
 'kidney',
 'bean',
 'jambalaya',
 'chicken',
 'fajita',
 'stuffed',
 'bell',
 'pepper',
 'cauliflower',
 'brown',
 'rice',
 'vegetable',
 'fried',
 'rice',
 'hummus',
 'zaatar']

In [23]:
# Count Vectorizer
count_ingredient_vectorizer = CountVectorizer(stop_words='english', min_df=10, max_df=0.5)
count_ingredient_vectors = count_ingredient_vectorizer.fit_transform(tokens_without_stopwords)
count_ingredient_vectors.shape


ValueError: After pruning, no terms remain. Try a lower min_df or a higher max_df.

In [24]:
# TF-IDF Vectorizer
tfidf_ingredient_vectorizer = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.7)
tfidf_ingredient_vectors = tfidf_ingredient_vectorizer.fit_transform(tokens_without_stopwords)
tfidf_ingredient_vectors.shape

ValueError: After pruning, no terms remain. Try a lower min_df or a higher max_df.

In [36]:
# NFM Model
nmf_ingredient_model = NMF(n_components=5, random_state=314)
W_ingredient_matrix = nmf_ingredient_model.fit_transform(tfidf_ingredient_vectors)
H_ingredient_matrix = nmf_ingredient_model.components_

# Display NMF Model
display_topics(nmf_ingredient_model, tfidf_ingredient_vectorizer.get_feature_names_out())



Topic 00
  chicken (100.00)
  red (0.00)
  rice (0.00)
  grilled (0.00)
  cauliflower (0.00)

Topic 01
  roasted (100.00)
  soup (0.00)
  salad (0.00)
  garlic (0.00)
  lemon (0.00)

Topic 02
  beans (100.00)
  rice (0.00)
  cauliflower (0.00)
  grilled (0.00)
  cream (0.00)

Topic 03
  oil (99.98)
  soup (0.01)
  red (0.00)
  garlic (0.00)
  cauliflower (0.00)

Topic 04
  pepper (99.93)
  red (0.03)
  soup (0.01)
  cauliflower (0.01)
  rice (0.01)


In [37]:
# Fitting LDA Model
lda_ingredient_model = LatentDirichletAllocation(n_components=5, random_state=314)
W_lda_ingredient_matrix = lda_ingredient_model.fit_transform(count_ingredient_vectors)
H_lda_ingredient_matrix = lda_ingredient_model.components_

# Display LDA Model
display_topics(lda_ingredient_model, count_ingredient_vectorizer.get_feature_names_out())


Topic 00
  beans (10.90)
  roasted (10.90)
  rice (8.13)
  cream (6.43)
  kidney (4.30)

Topic 01
  oil (10.66)
  salt (5.89)
  sauce (5.68)
  chickpeas (5.24)
  seeds (5.03)

Topic 02
  chicken (14.02)
  olive (7.93)
  beef (7.42)
  green (7.42)
  butter (6.40)

Topic 03
  pepper (9.57)
  red (7.71)
  grilled (7.29)
  lemon (6.46)
  yogurt (5.43)

Topic 04
  soup (7.07)
  cauliflower (6.53)
  salad (5.81)
  garlic (5.81)
  vegetable (4.36)


In [38]:
# Fitting LSA Model
svd_ingredient_model = TruncatedSVD(n_components=5, random_state=314)
W_svd_ingredient_matrix = svd_ingredient_model.fit_transform(tfidf_ingredient_vectors)
H_svd_ingredient_matrix = svd_ingredient_model.components_
# Display LSA Model
display_topics(svd_ingredient_model, tfidf_ingredient_vectorizer.get_feature_names_out())


Topic 00
  chicken (101.68)
  olive (1.08)
  oil (0.95)
  red (0.78)
  green (0.60)

Topic 01
  roasted (91.75)
  beans (7.97)
  oil (1.42)
  olive (0.54)
  beef (0.44)

Topic 02
  beans (111.25)
  cream (1.94)
  lemon (0.84)
  garlic (0.76)
  sauce (0.53)

Topic 03
  oil (98.72)
  cream (5.60)
  garlic (3.62)
  pepper (1.75)
  beans (1.55)

Topic 04
  pepper (101.26)
  red (3.35)
  green (3.22)
  olive (2.24)
  cauliflower (1.85)
