In [1]:
# Load csv into dataframe
import pandas as pd
import numpy as np
import nltk
import string
import ast
import re
import unidecode
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter

df = pd.read_csv('./recipes.csv', index_col='RecipeId')

# Drop columns of irrelevant data
df.drop(columns=['CookTime','PrepTime','TotalTime','RecipeCategory','Keywords','RecipeIngredientQuantities','RecipeServings'], inplace=True)

# Drop recipes who have a rating of 4 or less OR 3 or less total reviews
df = df[df['AggregatedRating'] > 4.0]
df = df[df['ReviewCount'] > 3]

# Renumbered recipe ID's to match index
new_index_values = range(0, len(df))
df.index = new_index_values
df.index.name = 'RecipeID'

# Cleaned up ingredients and instructions column
df['RecipeIngredientParts'] = df['RecipeIngredientParts'].apply(lambda x: x.replace('c(','').replace(')',''))
df['RecipeInstructions'] = df['RecipeInstructions'].apply(lambda x: x.replace('c(','').replace(')',''))




ModuleNotFoundError: No module named 'pandas'

In [None]:
# Visualize the top 200 most common ingredients to remove common household ingredients (e.g. salt, pepper, oil, etc)
vocab = nltk.FreqDist()

for ingredients in df['RecipeIngredientParts']:
    ingredients = ingredients.split()
    vocab.update(ingredients)
    
for word, frequency in vocab.most_common(200):
    print(f'{word};{frequency}')


In [2]:
df.head()

Unnamed: 0_level_0,Name,RecipeIngredientParts,AggregatedRating,ReviewCount,RecipeInstructions
RecipeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Low-Fat Berry Blue Frozen Dessert,"""blueberries"", ""granulated sugar"", ""vanilla yo...",4.5,4.0,"""Toss 2 cups berries with sugar."", ""Let stand ..."
1,Best Lemonade,"""sugar"", ""lemons, rind of"", ""lemon, zest of"", ...",4.5,10.0,"""Into a 1 quart Jar with tight fitting lid, pu..."
2,Cabbage Soup,"""plain tomato juice"", ""cabbage"", ""onion"", ""car...",4.5,11.0,"""Mix everything together and bring to a boil.""..."
3,Warm Chicken A La King,"""chicken"", ""butter"", ""flour"", ""milk"", ""celery""...",5.0,23.0,"""Melt 1 1/2 ozs butter, add the flour and cook..."
4,Chicken Breasts Lombardi,"""fresh mushrooms"", ""butter"", ""boneless skinles...",5.0,21.0,"""Cook mushrooms in 2 tbsp butter in a large s..."


In [4]:
print(df['RecipeIngredientParts'].iloc[0])

"blueberries", "granulated sugar", "vanilla yogurt", "lemon juice"


In [3]:
def ingredient_cleanup(ingredientList):
    
    terms_to_remove = ['salt', 'pepper', 'oil','of', 'zest','rind','fresh', 'butter', 'sugar', 'water', 'cloves', 'all','purpose', 'black', 'pepper', 'garlic', 'powder', 'cinnamon', 'juice', 'paprika','clove', 'chili','ground','extra','virgin','granulated','cumin','dried','oregano', 'kosher', 'boneless','skinless','canola', 'basil', 'white','thyme','cilantro','vinegar','powder','confectioners','seasoning','lean','curry','light','crushed','dry','boiling','sea']
    
    # turn ingredients from objects into a list
    if isinstance(ingredientList, list):
        ingredients = ingredientList
    else:
        ingredients = ast.literal_eval(ingredientList)
    
    # initialize lemmatizer and ingredient list to be returned
    lemmatizer = WordNetLemmatizer()
    cleaned_ingredients = []
    
    for i in ingredients:
        # Split words according to spaces and hyphens
        items = re.split(' |-', i)
        
        # Remove any non-alphabetic terms, set it lower-case and remove any accents
        items = [word for word in items if word.isalpha()]
        items = [word.lower() for word in items]
        items = [unidecode.unidecode(word) for word in items]
    
        # Lemmatize each word for analysis and comparison
        items =[lemmatizer.lemmatize(word) for word in items]
        # remove common terms ingredients listed in terms_to_remove
        items = [word for word in items if word not in terms_to_remove]
        
        if items:
            cleaned_ingredients.append(' '.join(items))
    cleaned_ingredients_str = ' '.join(cleaned_ingredients)
    return cleaned_ingredients_str

In [7]:
ingredient_cleanup(df['RecipeIngredientParts'].iloc[0])

'blueberry vanilla yogurt lemon'

In [9]:
df['RecipeIngredientParts'] = df['RecipeIngredientParts'].apply(lambda x: ingredient_cleanup(x))
df.head()

SyntaxError: '(' was never closed (<unknown>, line 1)