### Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import string
from collections import defaultdict
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import TreebankWordTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

from common_word_counts import find_common_words
from tokenization_lemmatization import tokenize_lemmatize

# Turn off pandas' chained-assignment warning: later cells assign columns on
# frames that were produced by filtering another frame.
pd.options.mode.chained_assignment = None

### Loading and Cleaning Review Data

In [2]:
# One-off conversion: parse the raw interactions CSV and cache it as a pickle
# so later runs can load it without re-parsing the CSV.
pd.read_csv('RAW_interactions.csv').to_pickle('full_reviews_df.pkl')

In [2]:
# Load the cached reviews frame written by the CSV-to-pickle cell.
reviews = pd.read_pickle('full_reviews_df.pkl')

In [3]:
# Discard the reviewer id and review date columns; errors='ignore' keeps the
# cell safe to re-run once they are already gone.
reviews.drop(columns=['user_id', 'date'], errors='ignore', inplace=True)
reviews

Unnamed: 0,recipe_id,rating,review
0,40893,4,Great with a salad. Cooked on top of stove for...
1,40893,5,"So simple, so delicious! Great for chilly fall..."
2,44394,4,This worked very well and is EASY. I used not...
3,85009,5,I made the Mexican topping and took it to bunk...
4,85009,5,"Made the cheddar bacon topping, adding a sprin..."
...,...,...,...
1132362,72730,0,Another approach is to start making sauce with...
1132363,386618,5,These were so delicious! My husband and I tru...
1132364,78003,5,WOW! Sometimes I don't take the time to rate ...
1132365,78003,4,Very good! I used regular port as well. The ...


#### Remove null reviews

In [5]:
# Tally how many values of each Python type appear in the review column.
counter = defaultdict(int)

for review_type in map(type, reviews['review']):
    counter[review_type] += 1

counter

In [4]:
# Remove rows whose review is a float rather than text (the null reviews this
# section targets). Build one boolean mask and drop every matching row in a
# single call; dropping row by row inside iterrows() rebuilds the frame on
# every hit and is extremely slow on a frame of this size.
null_review_mask = reviews['review'].map(lambda review: isinstance(review, float))
reviews.drop(index=reviews.index[null_review_mask.to_numpy()], inplace=True)

reviews.shape

(1132198, 3)

#### Determine how many reviews a recipe must have to be considered

In [7]:
# Number of recipes that have at least 50 reviews. value_counts() is computed
# once and the comparison is summed with the vectorised Series.sum().
(reviews['recipe_id'].value_counts() >= 50).sum()

In [5]:
# Map each recipe_id to the number of reviews it received.
ids_to_count_map = reviews['recipe_id'].value_counts().to_dict()

In [6]:
# Two-column frame (column 0 = recipe id, column 1 = review count), one row
# per recipe; the columns get their real names in the next cell.
id_to_count_merging_map = pd.DataFrame(list(ids_to_count_map.items()))

In [7]:
# Give the positional columns the names used as merge keys further down.
column_names = {0: 'recipe_id', 1: 'num_reviews'}
id_to_count_merging_map.rename(columns=column_names, inplace=True)

In [8]:
# Attach each recipe's total review count (num_reviews) to every review row.
reviews = reviews.merge(id_to_count_merging_map, on='recipe_id', how='inner')

In [9]:
# Keep only the reviews of recipes that have 50 or more reviews.
has_enough_reviews = reviews['num_reviews'] >= 50
popular_reviews = reviews.loc[has_enough_reviews].reset_index(drop=True)

In [13]:
# Sanity check: (rows, columns) of the popular-recipe review set.
popular_reviews.shape

In [14]:
# Display the filtered reviews for a visual check.
popular_reviews

#### Build a test set for the recommender system from recipes with a similar number of reviews (45–49)

In [11]:
# Reviews of recipes just under the popularity cut-off: 45 to 49 reviews.
review_counts = reviews['num_reviews']
in_test_band = (review_counts >= 45) & (review_counts < 50)
test_reviews = reviews.loc[in_test_band].reset_index(drop=True)

In [14]:
# Text-cleaning helpers. Each takes a string and returns a string, so they can
# be chained with Series.map. Patterns are compiled once here rather than on
# every call, and all of them are raw strings so no escape is left to chance.
_DIGITS = re.compile(r'\d+')
_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))
# The upper bound is 'Z'; an 'A-z' range would also admit [ \ ] ^ _ and `.
_NON_ALPHA = re.compile(r'[^a-zA-Z ]+')
_EXCITED_WORD = re.compile(r'\w*((\w)\2{2,})\w*')
_DOUBLE_A_WORD = re.compile(r'\b\w*[a][a]\w+\b')
_SPACE_RUN = re.compile(r'[ ]{2,}')


def alpha_only(x):
    """Return ``x`` with every digit removed."""
    return _DIGITS.sub('', x)


def punc_lower(x):
    """Lower-case ``x`` and replace each punctuation character with a space."""
    return _PUNCTUATION.sub(' ', x.lower())


def alpha_only_r2(x):
    """Return ``x`` keeping only ASCII letters and spaces."""
    return _NON_ALPHA.sub('', x)


def excited_word_cleaner(x):
    """Replace any word containing a character repeated 3+ times with a space.

    Targets elongated words such as "sooo" or "yummmm".
    """
    return _EXCITED_WORD.sub(' ', x)


def double_aa_cleaner(x):
    """Replace any word containing "aa" followed by more characters with a space."""
    return _DOUBLE_A_WORD.sub(' ', x)


def strip_whitespace(x):
    """Collapse every run of two or more spaces into a single space."""
    return _SPACE_RUN.sub(' ', x)


def add_space(x):
    """Prefix ``x`` with a space so concatenated texts stay word-separated."""
    return ' ' + x

In [11]:
# Run every review through the cleaning helpers, in this exact order.
review_cleaners = (
    alpha_only,
    punc_lower,
    alpha_only_r2,
    excited_word_cleaner,
    double_aa_cleaner,
    strip_whitespace,
    add_space,
)

cleaned_reviews = popular_reviews['review']
for cleaner in review_cleaners:
    cleaned_reviews = cleaned_reviews.map(cleaner)

popular_reviews['review'] = cleaned_reviews

In [15]:
# Apply the same ordered cleaning steps to the test-set reviews.
test_review_cleaners = (
    alpha_only,
    punc_lower,
    alpha_only_r2,
    excited_word_cleaner,
    double_aa_cleaner,
    strip_whitespace,
    add_space,
)

cleaned_test_reviews = test_reviews['review']
for cleaner in test_review_cleaners:
    cleaned_test_reviews = cleaned_test_reviews.map(cleaner)

test_reviews['review'] = cleaned_test_reviews

#### Combine all reviews for each recipe into a single string

In [12]:
# Concatenate every cleaned review of a recipe into one string, indexed by
# recipe_id. str.join builds each result in a single pass instead of repeated
# string addition; add_space already prefixed every review with a space, so
# words from adjacent reviews stay separated.
groupby_id_reviews = popular_reviews.groupby('recipe_id')['review'].agg(''.join)

In [13]:
# Turn the recipe_id-indexed Series into a two-column frame ('id', 'reviews').
# reset_index keeps the ids in their original integer dtype; stacking ids and
# strings into one frame and transposing it would make both columns object.
grouped_reviews = groupby_id_reviews.rename_axis('id').reset_index(name='reviews')

In [19]:
# Preview the first rows of the per-recipe combined reviews.
grouped_reviews.head()

In [16]:
# Concatenate every cleaned review of a test-set recipe into one string,
# indexed by recipe_id. str.join builds each result in a single pass instead
# of repeated string addition; add_space already prefixed every review with a
# space, so words from adjacent reviews stay separated.
groupby_id_test_reviews = test_reviews.groupby('recipe_id')['review'].agg(''.join)

In [17]:
# Turn the recipe_id-indexed Series into a two-column frame ('id', 'reviews').
# reset_index keeps the ids in their original integer dtype; stacking ids and
# strings into one frame and transposing it would make both columns object.
grouped_test_reviews = groupby_id_test_reviews.rename_axis('id').reset_index(name='reviews')

In [18]:
# Preview the first rows of the per-recipe combined test reviews.
grouped_test_reviews.head()

Unnamed: 0,id,reviews
0,153,took about minutes to bake i use applesauce ...
1,198,loved it very good recipe i enjoyed the spic...
2,432,this is the easiest pie to bake it really sou...
3,519,this time i cooked the hash browns in a toast...
4,647,i finally made this recipe been sitting on it...


### Loading and Cleaning Recipe Data

In [20]:
# One-off conversion: parse the raw recipes CSV and cache it as a pickle so
# later runs can load it without re-parsing the CSV.
pd.read_csv('RAW_recipes.csv').to_pickle('full_recipes_df.pkl')

In [19]:
# Load the cached recipes frame and preview its first rows.
recipes = pd.read_pickle('full_recipes_df.pkl')
recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [20]:
# Attach each recipe's review count; the recipe key is 'id' on the recipe
# side and 'recipe_id' on the count side.
recipes = recipes.merge(
    id_to_count_merging_map,
    how='inner',
    left_on='id',
    right_on='recipe_id',
)

In [16]:
# Recipes with 50 or more reviews.
is_popular_recipe = recipes['num_reviews'] >= 50
popular_recipes = recipes.loc[is_popular_recipe].reset_index(drop=True)

In [23]:
# Recipes just under the popularity cut-off: 45 to 49 reviews.
recipe_review_counts = recipes['num_reviews']
is_test_recipe = (recipe_review_counts >= 45) & (recipe_review_counts < 50)
test_recipes = recipes.loc[is_test_recipe].reset_index(drop=True)

In [24]:
# Word boundaries restrict the match to the standalone word "and"; a bare
# 'and' pattern would also cut it out of words such as "candy" or "sandwich".
_AND_WORD = re.compile(r'\band\b')


def and_remover(x):
    """Return ``x`` with every standalone word "and" removed."""
    return _AND_WORD.sub('', x)


def separate_calories(x):
    """Return the first number of a stringified list such as "[51.5, 0.0]".

    Both brackets are stripped, so a one-element list ("[51.5]") parses too.

    Raises:
        ValueError: if the first entry is not a valid float.
    """
    return float(x.strip().strip('[]').split(',')[0])

In [18]:
# Normalise the free-text recipe columns. Every column goes through the same
# four base cleaners; 'steps' and 'ingredients' also get a leading space, and
# 'ingredients' has "and" removed first.
base_cleaners = (alpha_only, punc_lower, alpha_only_r2, strip_whitespace)
column_cleaners = {
    'steps': base_cleaners + (add_space,),
    'tags': base_cleaners,
    'ingredients': (and_remover,) + base_cleaners + (add_space,),
    'name': base_cleaners,
}

for column, cleaners in column_cleaners.items():
    cleaned_column = popular_recipes[column]
    for cleaner in cleaners:
        cleaned_column = cleaned_column.map(cleaner)
    popular_recipes[column] = cleaned_column

# New 'calories' column: the first entry of each nutrition string, as a float.
popular_recipes['calories'] = popular_recipes['nutrition'].map(separate_calories)

In [25]:
# Normalise the free-text columns of the test recipes. Every column goes
# through the same four base cleaners; 'steps' and 'ingredients' also get a
# leading space, and 'ingredients' has "and" removed first.
test_base_cleaners = (alpha_only, punc_lower, alpha_only_r2, strip_whitespace)
test_column_cleaners = {
    'steps': test_base_cleaners + (add_space,),
    'tags': test_base_cleaners,
    'ingredients': (and_remover,) + test_base_cleaners + (add_space,),
    'name': test_base_cleaners,
}

for column, cleaners in test_column_cleaners.items():
    cleaned_column = test_recipes[column]
    for cleaner in cleaners:
        cleaned_column = cleaned_column.map(cleaner)
    test_recipes[column] = cleaned_column

# New 'calories' column: the first entry of each nutrition string, as a float.
test_recipes['calories'] = test_recipes['nutrition'].map(separate_calories)

In [23]:
# Drop the columns that are no longer needed, then cache the cleaned recipes.
# errors='ignore' keeps the cell re-runnable after the columns are gone.
unused_recipe_columns = [
    'contributor_id',
    'submitted',
    'nutrition',
    'description',
    'recipe_id',
    'num_reviews',
]
popular_recipes.drop(columns=unused_recipe_columns, errors='ignore', inplace=True)
popular_recipes.to_pickle('popular_recipes.pkl')

In [26]:
# Drop the columns that are no longer needed, then cache the cleaned test
# recipes. errors='ignore' keeps the cell re-runnable after the columns are gone.
unused_test_recipe_columns = [
    'contributor_id',
    'submitted',
    'nutrition',
    'description',
    'recipe_id',
    'num_reviews',
]
test_recipes.drop(columns=unused_test_recipe_columns, errors='ignore', inplace=True)
test_recipes.to_pickle('test_recipes.pkl')

### Tokenization and Lemmatization

#### Main Recipes

In [32]:
# Run the project's tokenize_lemmatize helper over each recipe's combined
# review text, then cache the result so this step need not be repeated.
# NOTE(review): the helper's return type is not visible here; the later `+`
# concatenation suggests it returns a string -- confirm.
grouped_reviews['reviews'] = grouped_reviews['reviews'].map(tokenize_lemmatize)
grouped_reviews.to_pickle('cleaned_reviews.pkl')

In [27]:
# Run the project's tokenize_lemmatize helper over each test recipe's combined
# review text, then cache the result so this step need not be repeated.
# NOTE(review): the helper's return type is not visible here; the later `+`
# concatenation suggests it returns a string -- confirm.
grouped_test_reviews['reviews'] = grouped_test_reviews['reviews'].map(tokenize_lemmatize)
grouped_test_reviews.to_pickle('cleaned_test_reviews.pkl')

In [None]:
# Reload the cached, tokenized reviews (lets a fresh session skip the
# tokenization cell).
grouped_reviews = pd.read_pickle('cleaned_reviews.pkl')

In [24]:
# Reload the cached, cleaned popular recipes.
popular_recipes = pd.read_pickle('popular_recipes.pkl')

In [26]:
# Join each popular recipe with its combined review text on the recipe id.
popular_rr = popular_recipes.merge(grouped_reviews, on='id', how='inner')

In [27]:
# Tokenize and lemmatize the tag and step text of every recipe.
for text_column in ('tags', 'steps'):
    popular_rr[text_column] = popular_rr[text_column].map(tokenize_lemmatize)

In [29]:
# Build one text field per recipe: reviews, then steps, then ingredients.
reviews_and_steps = popular_rr['reviews'] + popular_rr['steps']
popular_rr['rev_steps_ingr'] = reviews_and_steps + popular_rr['ingredients']

In [31]:
# The separate text columns are now redundant with 'rev_steps_ingr'.
merged_away_columns = ['steps', 'ingredients', 'num_reviews', 'reviews']
popular_rr.drop(columns=merged_away_columns, errors='ignore', inplace=True)

In [32]:
# Persist the final recipe + review frame for the popular recipes.
popular_rr.to_pickle('popular_rr.pkl')

#### Test Recipes

In [29]:
# Reload the cached, cleaned test recipes.
test_recipes = pd.read_pickle('test_recipes.pkl')

In [30]:
# Reload the cached, tokenized test reviews.
grouped_test_reviews = pd.read_pickle('cleaned_test_reviews.pkl')

In [31]:
# Join each test recipe with its combined review text on the recipe id.
test_rr = test_recipes.merge(grouped_test_reviews, on='id', how='inner')

In [33]:
# Tokenize and lemmatize the tag and step text of every test recipe.
for text_column in ('tags', 'steps'):
    test_rr[text_column] = test_rr[text_column].map(tokenize_lemmatize)

In [34]:
# Build one text field per test recipe: reviews, then steps, then ingredients.
test_reviews_and_steps = test_rr['reviews'] + test_rr['steps']
test_rr['rev_steps_ingr'] = test_reviews_and_steps + test_rr['ingredients']

In [35]:
# The separate text columns are now redundant with 'rev_steps_ingr'.
test_merged_away_columns = ['steps', 'ingredients', 'num_reviews', 'reviews']
test_rr.drop(columns=test_merged_away_columns, errors='ignore', inplace=True)

In [36]:
# Persist the final recipe + review frame for the test recipes.
test_rr.to_pickle('test_rr.pkl')