# Re-Fridgerate: The Recipe Recommender

### [Web Scraping](Webscraping)

### [EDA, Cleaning & Wrangling](EDA)

### [Sentiment Analysis](Sentiment)

### [Minimum Viable Product (MVP)](Minimum)

### 1. Web Scraping

In [None]:
# Importing libraries

import requests
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import random

In [None]:
# Testing one url first

url = "https://www.allrecipes.com/recipe/22831/alfredo-sauce/"

# Downloading the html with a request.
# The timeout keeps the cell from hanging forever if the server never answers

response = requests.get(url, timeout=10)

# Response code of the request (the scraper below treats 200 as success)

response.status_code

In [None]:
# Building the 'soup' from the downloaded page with the built-in html parser

soup = BeautifulSoup(markup=response.content, features='html.parser')

In [None]:
# Checking the html code looks as expected.
# prettify has to be called (and printed) to get the indented html;
# the bare attribute only shows the bound method's repr

print(soup.prettify())

In [None]:
# Scrape the recipe title: the browser's "inspect" tool shows it lives inside
# the element with class "recipe-main-header"

rawTitle = soup.find(class_="recipe-main-header")

In [None]:
# From rawTitle we can get the actual title text from the nested element with class "headline".
# The last line displays the result as the cell output

parsedTitle = rawTitle.find(class_="headline").get_text()
parsedTitle

In [None]:
# Scrape categories:

# Inspecting the website, all categories and subcategories sit in elements with class "breadcrumbs__title".
# find_all returns every matching element; the last line displays them

rawCategs = soup.find_all(class_="breadcrumbs__title")
rawCategs

In [None]:
# Collect the text of every breadcrumb element into a list.
# Only the items in the 3rd and 4th position matter: they will be our Category and Subcategory

categs = [crumb.get_text() for crumb in rawCategs]

categs

In [None]:
# Category is the 3rd breadcrumb and Subcategory the 4th

category, subcategory = categs[2], categs[3]

In [None]:
# Scraping ingredients:

# Inspecting the website, each ingredient is nested individually in an element with class "ingredients-item".
# We find all of them first and extract the names in the next cell

rawIngredients = soup.find_all(class_="ingredients-item")

In [None]:
# For every "ingredients-item" element, look up its nested "ingredients-item-name"
# element and keep that element's text

parsedIngredients = [
    item.find(class_="ingredients-item-name").get_text()
    for item in rawIngredients
]

parsedIngredients

In [None]:
# Scraping reviews:

# Inspecting the website, every review is stored under 'recipe-review-body--truncated', so we use find_all

rawReviews = soup.find_all(class_='recipe-review-body--truncated')

# Keep only the text of each review element

parsedReviews = [review.get_text() for review in rawReviews]

parsedReviews

In [None]:
# Empty main dataframe: one column per scraped field, rows are added by the scraper below

df = pd.DataFrame(
    columns=[
        'Title',
        'Category',
        'Subcategory',
        'Ingredients',
        'Reviews',
        'URL',
    ],
)

In [None]:
# Defining a function that performs all the scraping tasks explored above and stores the result in our df.
# A loop further down applies this function across different urls

def fetch_data(url, df):
    """Scrape one recipe page and return df with that recipe added as a new row.

    Parameters:
        url: address of the recipe page to download.
        df: dataframe collecting the scraped recipes.

    Returns:
        A new dataframe holding the rows of df plus one row with 'Title',
        'Category', 'Subcategory', 'Ingredients', 'Reviews' and 'URL'.
        df itself is returned unchanged when the page cannot be downloaded,
        answers with a status other than 200, or has no recognisable title.
    """

    # Print message with url for trackability in case there is an error
    print("Fetching " + url)

    # Getting the query. The timeout and the except clause make sure one
    # unreachable page cannot hang or abort a long scraping run
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as err:
        print("Error fetching page (Bummer!) " + str(err))
        return df

    # If the response code is different to 200, print an error message and keep scraping.
    # Not all pages are found when incrementing 'cur' in the scraping loop
    if response.status_code != 200:
        print("Error fetching page (Bummer!)")
        return df

    # Create the soup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find title.
    # Some pages have a different layout than the one originally scraped: find()
    # then returns None and the attribute access raises AttributeError.
    # The url is printed to identify those pages
    try:
        rawTitle = soup.find(class_="recipe-main-header")
        parsedTitle = rawTitle.find(class_="headline").get_text()
    except AttributeError:
        print("AttributeError " + url)
        return df

    # Find categories: only the 3rd and 4th breadcrumb are kept (Category and Subcategory).
    # Pages with fewer breadcrumbs get blanks so both fields always exist
    categs = [c.get_text() for c in soup.find_all(class_="breadcrumbs__title")][2:4]
    categs += [""] * (2 - len(categs))

    # Find and parse ingredients, skipping items that carry no name element
    parsedIngredients = []
    for item in soup.find_all(class_="ingredients-item"):
        name = item.find(class_="ingredients-item-name")
        if name is not None:
            parsedIngredients.append(name.get_text())

    # Find and parse reviews
    parsedReviews = [
        r.get_text() for r in soup.find_all(class_='recipe-review-body--truncated')
    ]

    # One-row record for this recipe
    row = {
        'Title': parsedTitle,
        'Category': categs[0],
        'Subcategory': categs[1],
        'Ingredients': parsedIngredients,
        'Reviews': parsedReviews,
        'URL': url,
    }

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
    return pd.concat([df, pd.DataFrame([row])], ignore_index=True)

In [None]:
# Settings for the scraping loop in the next cell.
# Incrementing the numeric section of the url by one takes us to a new page with a new recipe,
# so we can scrape multiple pages by counting upwards.

# baseUrl: fixed part of the url, the recipe number is appended to it by the loop
# startAt: first recipe number to try
# total: number of iterations (1500); with more time and processing power we could run more

baseUrl = "https://www.allrecipes.com/recipe/"
startAt = 22121
total = 1500

In [None]:
# Scraping loop:
# Visit every recipe number from the starting position up to (but not including)
# the starting position plus the total number of iterations

for cur in range(startAt, startAt + total):

    # Construct the url from our baseUrl and the current number converted to string
    url = baseUrl + str(cur)

    # Scrape the page; fetch_data hands back the dataframe to keep using
    df = fetch_data(url, df)

    # Pause for a random float between 0.0 and 1.0 seconds before the next request
    sleep(random.random())

In [None]:
# Export final dataframe into a csv file.
# Written to the working directory, which is where the EDA section reads
# 'final_project.csv' from (an absolute user-specific path only works on one machine).
# The index is written too; it comes back as the 'Unnamed: 0' column on re-import

df.to_csv('final_project.csv')

### 2. EDA, Cleaning & Wrangling

In [None]:
# Import libraries

import textblob
from textblob import TextBlob
import nltk

# Download only the NLTK resources this notebook relies on: the tokenizer models
# used by word_tokenize and the stopword lists. Calling nltk.download() with no
# arguments opens the interactive downloader, which blocks unattended runs.
# NOTE(review): 'punkt_tab' is presumably only known to newer NLTK releases; confirm for your version

for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
# Import the csv file (looked up in the working directory) and store it in a dataframe

df = pd.read_csv('final_project.csv')

In [None]:
# Check for null values, data types, total rows and columns

df.info()

# We have a few nulls in Category and Subcategory, but we will keep them for now.
# Also, all columns are objects - we will need to change Ingredients to a list

In [None]:
# Quick look at the first rows of the df to see its structure;
# we will deal with the necessary columns from left to right

df.head()

In [None]:
# Make the whole df lower case.
# Every string cell is lower-cased, anything else (numbers, nulls) is left untouched.
# Done column by column with Series.map because DataFrame.applymap is deprecated in recent pandas

df = df.apply(lambda col: col.map(lambda s: s.lower() if isinstance(s, str) else s))

In [None]:
# Drop the 'Unnamed: 0' column from the dataframe

df = df.drop(columns=['Unnamed: 0'])

In [None]:
# 'Ingredients' was read from the csv as text, so we convert each cell back to a list.
# ast.literal_eval only accepts Python literals, so unlike eval it cannot run
# arbitrary code that might be sitting in the file

import ast

df['listIngredients'] = df['Ingredients'].apply(ast.literal_eval)

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
# Helper used to tokenize 'listIngredients'

def tokenize (text):
    """Return the word tokens of text.

    text is converted with str() first, so a non-string value (such as a list)
    is tokenized through its printed form.
    """
    return word_tokenize(str(text))

In [None]:
# Tokenize every ingredient list and keep the tokens in 'tokIngredients'

df['tokIngredients'] = df['listIngredients'].apply(tokenize)

In [None]:
# Keep only the purely alphabetic tokens (drops numbers, punctuation and quotes)
# and store them in 'alphaIngredients'

df['alphaIngredients'] = df['tokIngredients'].apply(
    lambda tokens: list(filter(str.isalpha, tokens))
)

In [None]:
# Creating a list of reserved words to remove from 'alphaIngredients':
# unit measures, food "state" words, basic ingredients and others.
# Every entry needs its trailing comma: Python joins adjacent string literals,
# so a missing comma silently merges two words into one (e.g. 'extra' 'large' -> 'extralarge')

reserved_words = [
    # Unit measures and packaging
    'quarts',
    'cup',
    'cups',
    'ounce',
    'ounces',
    'pound',
    'pounds',
    'teaspoon',
    'tablespoon',
    'tablespoons',
    'teaspoons',
    'package',
    'packages',
    # Food "state" and descriptors
    'all',
    'purpose',
    'lean',
    'minced',
    'ground',
    'cubed',
    'raw',
    'fresh',
    'frozen',
    'extra',
    'large',
    'whole',
    'small',
    'lightly',
    'battered',
    'skinless',
    'boneless',
    # Basic ingredients
    'salt',
    'pepper',
    'vinegar',
    'olive',
    'vegetable',
    'butter',
    'water',
    'onion',
    'onions',
    'garlic',
    'potatoes',
    'potato',
    'flour',
    'white',
    'sugar',
    'baking',
    'soda',
    'powder',
    # Others
    'peeled',
    'chopped',
    'sliced',
    'can',
    'dried',
    'ketchup',
    'sauce',
]

In [None]:
# Drop every reserved word from 'alphaIngredients' and store what is left in 'cleanIngredients'.
# The reserved words are put in a set once so each membership check is a direct lookup

reservedLookup = set(reserved_words)
df['cleanIngredients'] = df['alphaIngredients'].apply(
    lambda tokens: [tok for tok in tokens if tok not in reservedLookup]
)

# Quick check

df['cleanIngredients']

In [None]:
# Checking an example of the 'Reviews' column (first row): one of the reviews is duplicated

df['Reviews'][0]

In [None]:
# Removing duplicate values from 'Reviews'.
# Each cell is expected to hold the text form of a list of reviews (as written to the csv).
# Splitting that text on ',' would cut reviews apart at every comma inside them,
# so the list is parsed instead and whole reviews are compared

import ast

def _dedupe_reviews(raw):
    """Return raw with repeated reviews removed, keeping first occurrences in order.

    raw is parsed as a Python list literal; the result is the text form of the
    de-duplicated list, so the column stays a string column. Values that do not
    parse to a list are returned unchanged.
    """
    try:
        reviews = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        return raw
    if not isinstance(reviews, list):
        return raw
    # dict.fromkeys keeps one entry per distinct review, in first-seen order
    return str(list(dict.fromkeys(reviews)))

df['Reviews'] = df['Reviews'].apply(_dedupe_reviews)

# Quick check

df['Reviews'][0]

In [None]:
# Check the number of records per Category (most frequent first).
# This could influence the scope of our MVP

df['Category'].value_counts()

In [None]:
# Visual check: bar chart of the number of records per Category
# (the trailing semicolon hides the plot object's text output in the notebook)

df['Category'].value_counts().plot(kind='bar');

In [None]:
# Check the number of records per Subcategory (most frequent first)

df['Subcategory'].value_counts()

### 3. Sentiment Analysis

In [None]:
# Sample check: pull the 'Reviews' value of the row at index position 3 and display it

text = df['Reviews'].iloc[3]
text

In [None]:
# Wrapping the sample review text in a TextBlob object, stored as 'testimonial'

testimonial = TextBlob(text)

# Running sentiment analysis on the selected record

testimonial.sentiment

In [None]:
# Using TextBlob to break the review down into sentences to make it easier to read,
# as a "sense check" of the polarity obtained above

testimonial.sentences

In [None]:
# Remove punctuation from 'Reviews' using RegEx and store the result in a new column.
# regex=True is required: without it recent pandas versions treat the pattern as
# literal text and nothing is removed. The raw string keeps \w and \s intact.
# The pattern deletes every character that is neither a word character nor whitespace

df['cleanReviews'] = df['Reviews'].str.replace(r'[^\w\s]', '', regex=True)
df['cleanReviews'][3]

In [None]:
# Import Text and stopwords from the NLTK libraries.
# stop_words holds the English stopword list; it is displayed here so we can
# double check later on that these words were removed properly

from nltk.text import Text
from nltk.corpus import stopwords
stop_words=stopwords.words('english')
stop_words

In [None]:
# Tokenizing 'cleanReviews' to facilitate the removal of stopwords. Storing in column 'tokReviews'.
# word_tokenize is applied to each cleaned review directly

df['tokReviews'] = df['cleanReviews'].apply(word_tokenize)

# Quick check

df['tokReviews']

In [None]:
# Drop every stop word from 'tokReviews' and store the remaining tokens in 'sentReviews'.
# The stop words are put in a set once so each membership check is a direct lookup

stopLookup = set(stop_words)
df['sentReviews'] = df['tokReviews'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stopLookup]
)

# Quick check

df['sentReviews']

In [None]:
# Create new column 'Sentiment' holding the TextBlob polarity score of each row's 'Reviews' text.
# NOTE(review): the score is computed on the 'Reviews' column, not on the
# stop-word-free 'sentReviews' tokens built above - confirm which one is intended

df['Sentiment'] = df['Reviews'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [None]:
# Looking at descriptive statistics for 'Sentiment'

df['Sentiment'].describe()

# Looking at the mean and the different quartiles we see reviews are almost always positive.

In [None]:
# Mean 'Sentiment' per Category, kept as a regular two-column dataframe 'avgCateg'

avgCateg = df.groupby("Category", as_index=False)["Sentiment"].mean()
avgCateg.head(15)

In [None]:
# Horizontal bar chart of the previous result, ordered by sentiment.
# It seems like reviewers are more positive with drinks and fruits & veggies.
# In a future iteration this should be compared with the average number of reviews per category

avgCateg = avgCateg.sort_values('Sentiment')
avgCateg.plot.barh(x='Category', y='Sentiment');

### 4. Minimum Viable Product (MVP)

In [None]:
def recipe_recommender(num_ingredients=3):
    """Ask the user for ingredients and return the recipes that use any of them.

    Reads the notebook-level dataframe df and its 'cleanIngredients' column.

    Parameters:
        num_ingredients: how many ingredients to ask for (3 by default).

    Returns:
        A dataframe with the 'Title', 'Category' and 'URL' of every recipe whose
        'cleanIngredients' contain at least one of the entered ingredients.
    """

    # Print welcome message and instructions

    print('Welcome to Re-Fridgerate!')
    print("Let me know what's in your fridge and I will recommend a few of our best reviewed recipes!")
    print('I will now ask you to enter your food ingredients one by one')

    # Collect the user's ingredients. Surrounding spaces are stripped and the text
    # lower-cased so it can match the lower-case tokens; blank entries are ignored

    userInput = set()
    for _ in range(num_ingredients):
        entry = input('Ingredient: ').strip().lower()
        if entry:
            userInput.add(entry)

    # Flag the recipes that share at least one ingredient with the user's input

    matches = df['cleanIngredients'].apply(
        lambda ingredients: not userInput.isdisjoint(ingredients)
    ).astype(bool)

    # Keep only Title, Category and URL of the matching recipes

    suggestion = df.loc[matches, ['Title', 'Category', 'URL']]
    print('Below our suggestions... Bon Apetit!')
    return suggestion

In [None]:
# Run the recommender; the returned suggestions are displayed as the cell output
recipe_recommender()