In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import json
import folium

### Project variable

In [2]:
# Calendar month names in order; the scraping cells use them to build the
# per-month page URLs and the half-month labels.
months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

# Eat the season
Get the foods in season for each month from the [Eat the Seasons](http://www.eattheseasons.com/seasons.php) website.

In [3]:
# Import libraries
import requests
from bs4 import BeautifulSoup

In [4]:
# Scrape the foods listed for each month on eattheseasons.com.
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call, which made this loop quadratic.
eat_season_rows = []

for month in months:
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status stops an HTTP error page from being parsed as food names.
    r = requests.get('http://www.eattheseasons.com/{0}.php'.format(month), timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    food_on_month = soup.find_all('p')
    for p in food_on_month:
        for elem in p.text.split(", "):
            # Skip entries that are just the month's own name.
            if elem.lower() != month.lower():
                eat_season_rows.append({
                    'month': month,
                    'food': elem.strip(),
                })

eat_season_data = pd.DataFrame(eat_season_rows, columns=['month', 'food'])
eat_season_data.head()

Unnamed: 0,month,food
0,January,broccoli
1,January,broccolini
2,January,brussels sprouts
3,January,butternut squash
4,January,celery root


# seasonal food guide
Get some food per season from [seasonal food guide](https://www.seasonalfoodguide.org/) website.

In [5]:
import re

In [6]:
url = 'https://www.seasonalfoodguide.org'

# Half-month labels in calendar order: 'early-january', 'late-january', ...
bimonthly = [
    '{0}-{1}'.format(half, month.lower())
    for month in months
    for half in ('early', 'late')
]

# All the data on the pages is generated by a script, so fetch one page to
# obtain the script's address.
r = requests.get(url + '/maine/late-january', timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

# NOTE(review): relies on the data script being the second <script> tag of the page.
url_end = soup.find_all('script')[1]['src']
req = requests.get(url + url_end, timeout=30)
req.raise_for_status()
data = req.text

# One entry per eatable: the text between '{name:"' and the next '}}'.
m = re.findall(r'\{name:"(.*?)\}\}', data)
# The first match is discarded (presumably not a food entry -- TODO confirm).
m = m[1:]

# Rows are collected in a list and converted once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
season_guid_rows = []

for elem in m:
    # Season indices (comma separated) of each state for the current eatable.
    seasons = re.findall(r'[A-Z]{2}:\{seasons:\[(.*?)\]', elem)
    # Two-letter keys found in the entry, first one dropped.
    # NOTE(review): states[x] is assumed to line up with seasons[x] -- confirm against the script data.
    states = re.findall(r'([A-Z]{2}):', elem)[1:]
    # Each entry starts right after 'name:"', so the name is everything up to
    # the first quote. The previous pattern ([A-Za-z]+") kept only the last
    # word of multi-word names.
    food = elem.split('"', 1)[0]
    for x, state_seasons in enumerate(seasons):
        for season in state_seasons.split(','):
            season_guid_rows.append({
                # Season numbers are 1-based indices into the half-month labels.
                'month': bimonthly[int(season) - 1],
                'food': food,
                'state': states[x],
            })

season_guid_data = pd.DataFrame(season_guid_rows, columns=['month', 'food', 'state'])

#TODO: Do we need to save this dataset as a csv as it took some times to generate?
season_guid_data.head()

Unnamed: 0,month,food,state
0,late-july,Apples,AL
1,early-august,Apples,AL
2,late-august,Apples,AL
3,early-september,Apples,AL
4,late-september,Apples,AL


# Analyse our recipies dataset
Our dataset contains 2.5 GB of HTML files (110'517 files, according to the number of lines in the log file). The pages are of many different kinds, so we first need to analyse what kind of data we have before analysing the data itself.

In [7]:
import glob

### The log file
We first have a log file containing valuable information. It gives us the name of each file together with the URL it came from. We use it as the index for the rest of the project: we navigate through the log file instead of looking blindly into each file.
**We saw in the log file that some files were downloaded with errors. We keep that in mind and will come back to it later.**

In [8]:
log_path = 'data/recipePages/msg.log'
recipies_path = 'data/recipePages'

# Read the whole log and split it into lines. The context manager closes the
# file handle, which the previous bare open() never did.
with open(log_path, 'r') as f:
    log = f.read().split('\n')
#TODO: Take care of the error line in the log file

In [85]:
def _parse_log_line(line):
    """Extract the domain, URL and file name from one line of the log.

    Returns a dict with the keys 'domain', 'url' and 'file', or None when
    the line does not contain all three parts (for example an error line
    or the empty string left by the final newline).
    """
    # Domain including the scheme and the trailing slash, e.g. 'http://allrecipes.com/'.
    domain_match = re.search(r'http://(.*?)/', line)
    # Full URL: from 'http://' up to (not including) the next tab.
    url_match = re.search(r'http://.*?(?=\t)', line)
    # File name: everything before the first tab of the line.
    file_match = re.search(r'.*?(?=\t)', line)
    if domain_match is None or url_match is None or file_match is None:
        return None
    return {
        'domain': domain_match.group(0),
        'url': url_match.group(0),
        'file': file_match.group(0),
    }


# Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
# copied the whole frame for each of the log lines.
log_records = [record for record in map(_parse_log_line, log) if record is not None]
log_data = pd.DataFrame(log_records, columns=['domain', 'url', 'file'])

log_data.head()

Unnamed: 0,domain,url,file
0,http://www.cooks.com/,"http://www.cooks.com/rec/search/0,1-0,ground_s...",6353d9ac2c6bf20dab72ea9043cc018f.html
1,http://www.cooks.com/,"http://www.cooks.com/rec/search/0,1-0,quick_ea...",3f207c5bffff6a090bf5a8ad9e206260.html
2,http://allrecipes.com/,http://allrecipes.com/recipe/classic-minestrone/,7e0ad7374f08c4a8de3500c065c17180.html
3,http://allrecipes.com/,http://allrecipes.com/Recipe/basil-butter-2/de...,4f9ea44a8519ba9d013264eb55711c9b.html
4,http://www.cdkitchen.com/,http://www.cdkitchen.com/recipes/recs/75/Beer_...,099aebf16685a804035fee84152c4f4f.html


### What is the domain name's propotion?
As we have many different domain names, we want to know how many pages come from each of them, so that we can decide what to do with this data.

In [10]:
# Number of logged pages per domain, most frequent first.
df = log_data.domain.value_counts()
# Number of distinct domains.
print('size: {0}'.format(len(df)))
df.head()

size: 127


http://allrecipes.com/         28354
http://www.food.com/           14661
http://www.foodnetwork.com/    11996
http://www.yummly.com/          6590
http://www.cooks.com/           5546
Name: domain, dtype: int64

We see that:
- allrecipes.com
- www.food.com
- www.foodnetwork.com

These 3 websites cover 49.77% of our dataset. We will therefore first write methods to extract data from these, and if needed, or if we have time, we will then take care of the rest.

## Getting the ingredients of each recipes
We have a second dataset containing the recipe name, URL, domain, ingredients and much other information. We load this dataset and merge it with our `log_data` dataframe so that we can link this information to the corresponding HTML file. We do this because the review information is only available in the HTML files.

In [100]:
recip_info_path = 'data/recipeInfo/recipeInfo_WestWhiteHorvitz_WWW2013.tsv'

# Recipe metadata table: tab separated, read as latin-1.
recip_info = pd.read_csv(recip_info_path, encoding='latin-1', sep='\t')

# Keep only the join key and the two columns used later on.
restricted_recipe_info = recip_info.loc[:, ['url', 'title', 'ingredients_list']]

# Inner join on the URL: only pages present in both the log and the metadata survive.
merged_info = log_data.merge(
    restricted_recipe_info,
    on='url',
    how='inner',
    suffixes=('_info', '_log'),
)
merged_info.head()

Unnamed: 0,domain,url,file,title,ingredients_list
0,http://allrecipes.com/,http://allrecipes.com/recipe/classic-minestrone/,7e0ad7374f08c4a8de3500c065c17180.html,Classic Minestrone Recipe,"3 tablespoons olive oil|1 leek, sliced|2 carro..."
1,http://allrecipes.com/,http://allrecipes.com/Recipe/basil-butter-2/de...,4f9ea44a8519ba9d013264eb55711c9b.html,Basil Butter Recipe,4 cloves garlic|15 leaves fresh basil|1/2 teas...
2,http://www.cdkitchen.com/,http://www.cdkitchen.com/recipes/recs/75/Beer_...,099aebf16685a804035fee84152c4f4f.html,Beer Cheese Recipe #11524,1 pound mild cheddar (shredded)|1 pound extra ...
3,http://www.foodnetwork.com/,http://www.foodnetwork.com/recipes/claire-robi...,10cf272724e823b8038b8190addf04d3.html,Roasted Sweet Potato Fries Recipe : Claire Rob...,"2 large sweet potatoes, peeled|1/4 cup freshly..."
4,http://allrecipes.com/,http://allrecipes.com/recipe/dirty-martini/,856e6ca1d45014b045c1266d406f3ccf.html,Dirty Martini Recipe,6 fluid ounces vodka|1 dash dry vermouth|1 flu...


### Reducing the dataset
Currently, our dataset is huge, and going through it completely takes some time. To simplify our research, and according to our previous observation, we will only keep the 3 main domains.

In [101]:
# The three domains retained for the rest of the analysis.
keeped_domain = pd.DataFrame({
    'domain': [
        'http://allrecipes.com/',
        'http://www.food.com/',
        'http://www.foodnetwork.com/',
    ],
})

# Boolean mask selecting the rows whose domain is one of the retained ones.
new_merged = merged_info.loc[merged_info['domain'].isin(keeped_domain['domain'])]
new_merged

Unnamed: 0,domain,url,file,title,ingredients_list
0,http://allrecipes.com/,http://allrecipes.com/recipe/classic-minestrone/,7e0ad7374f08c4a8de3500c065c17180.html,Classic Minestrone Recipe,"3 tablespoons olive oil|1 leek, sliced|2 carro..."
1,http://allrecipes.com/,http://allrecipes.com/Recipe/basil-butter-2/de...,4f9ea44a8519ba9d013264eb55711c9b.html,Basil Butter Recipe,4 cloves garlic|15 leaves fresh basil|1/2 teas...
3,http://www.foodnetwork.com/,http://www.foodnetwork.com/recipes/claire-robi...,10cf272724e823b8038b8190addf04d3.html,Roasted Sweet Potato Fries Recipe : Claire Rob...,"2 large sweet potatoes, peeled|1/4 cup freshly..."
4,http://allrecipes.com/,http://allrecipes.com/recipe/dirty-martini/,856e6ca1d45014b045c1266d406f3ccf.html,Dirty Martini Recipe,6 fluid ounces vodka|1 dash dry vermouth|1 flu...
5,http://allrecipes.com/,http://allrecipes.com/recipes/seafood/fish/tro...,c6a40a7de4b506a935093b67bccf4aac.html,Trout Recipes,
6,http://allrecipes.com/,http://allrecipes.com/recipe/candied-sweet-pot...,05bd905b46dcd56e9b97268b46f05e11.html,Candied Sweet Potatoes Recipe,"4 pounds sweet potatoes, quartered|1 1/4 cups ..."
7,http://www.foodnetwork.com/,http://www.foodnetwork.com/recipes/emeril-laga...,9f73f17c43827c38f10334e635b9bb45.html,Boudin Sausage Balls Recipe : Emeril Lagasse :...,"1 1/4 pounds pork butt, cut into 1-inch cubes|..."
8,http://allrecipes.com/,http://allrecipes.com/Recipe/Sushi-Roll/detail...,ac4ef97f41a1b718521378eed19fd12c.html,Sushi Roll Recipe,2/3 cup uncooked short-grain white rice|3 tabl...
10,http://www.foodnetwork.com/,http://www.foodnetwork.com/thanksgiving-stuffi...,1e225a1f2bcb521302e80fb044c0f65d.html,Thanksgiving Stuffing & Dressing,
11,http://www.foodnetwork.com/,http://www.foodnetwork.com/recipes/ina-garten/...,ef177a155ace2d8be530895aee520e2c.html,Parker's Split Pea Soup Recipe : Ina Garten : ...,"1 cup chopped yellow onions|2 cloves garlic, m..."


# Getting Review date
We make **the assumption** that people who cook a recipe review it the same day, or at most within the same week. We need to extract the dates of all reviews to know when the recipe was cooked.

#### allrecipes.com
We start with allrecipes.com. An inspection of the HTML elements shows that all review dates are located in:

``<div class="review">``

To find it, we used the inspector feature in Firefox. Testing showed that many of the HTML files we have are malformed. Sometimes a page is just a search on a food name and not a recipe. Other times, there are no reviews. We had to modify the following methods many times to take these errors into account.
As we first go through the entire dataset to extract the useful information, we decided to put null values where the data is malformed. We will also have to take care of the quantities associated with the ingredient names, but we keep that for later.

We were also surprised by how BeautifulSoup searches by class: searching for the class `review` returns every element that has `review` among its classes, not only the elements whose class attribute is exactly `review`. To deal with this problem and only get our class, we changed our usual way of searching with BeautifulSoup and use an anonymous function.

In [48]:
def allRecipesReviewDate(path):
    """Return the review dates found in a saved allrecipes.com page.

    Parameters
    ----------
    path : str
        Path of the HTML file; it is read as latin-1.

    Returns
    -------
    str
        The dates (e.g. 'Dec. 22, 2003') in page order, joined with ' - '.
        Empty string when no review block with a recognisable date is found.
    """
    # The context manager closes the file; the previous bare open() leaked a
    # handle for every page processed.
    with open(path, 'r', encoding='latin-1') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    # The mentioned lambda function for the BeautifulSoup search: keep only
    # <div> tags whose class list is exactly ['review'].
    review_html = soup.find_all(lambda tag: tag.name == 'div' and tag.get('class') == ['review'])
    dates = []
    for rev in review_html:
        text = rev.text.strip().replace('\n', '').replace('\t', '')
        # Abbreviated month, day, four-digit year. The year used to be limited
        # to 2000-2009 and the dot after the month was mandatory.
        # NOTE(review): the optional dot is meant for unabbreviated months such as 'May' -- confirm on real pages.
        match = re.search(r'[A-Z][a-z]{2}\.? [0-9]{1,2}, 20[0-9]{2}', text)
        # A review without a recognisable date is skipped instead of raising
        # AttributeError on None.
        if match is not None:
            dates.append(match.group(0))
    return ' - '.join(dates)

#Example of result with a random file on this domain
allRecipesReviewDate('data/recipePages/7e0ad7374f08c4a8de3500c065c17180.html')

'Dec. 22, 2003 - Dec. 2, 2005 - Sep. 30, 2007 - Jan. 29, 2003 - Dec. 28, 2006 - Oct. 6, 2006 - Dec. 12, 2005 - Jan. 29, 2003 - Feb. 1, 2007 - Jan. 29, 2003'

#### www.food.com
Same principle: we use the inspector in Firefox to identify the review date. This time, there is no class that directly identifies the date. We go up to the first usable parent element and then do a second `find_all` on it. As there are two ``<p>`` elements this time and we are interested in the second one, we only take the second element.

In [57]:
def foodReviewDate(path):
    """Return the review dates found in a saved www.food.com page.

    Parameters
    ----------
    path : str
        Path of the HTML file; it is read as latin-1.

    Returns
    -------
    str
        The text of the second <p> of every 'about-recipe-info' block
        (e.g. 'on September 09, 2004'), joined with ' - '. Empty string when
        the page has no such block.
    """
    # The context manager closes the file; the previous bare open() leaked a
    # handle for every page processed.
    with open(path, 'r', encoding='latin-1') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    review_html = soup.find_all('div', class_="about-recipe-info")
    dates = []
    for rev in review_html:
        paragraphs = rev.find_all('p')
        # The date sits in the second <p>; a block with fewer paragraphs is
        # skipped instead of raising IndexError.
        if len(paragraphs) > 1:
            dates.append(paragraphs[1].text)
    return ' - '.join(dates)
foodReviewDate('data/recipePages/60e9148725c3f64336fc9d83b2c1b521.html')

'on September 09, 2004 - on April 28, 2011 - on February 11, 2010'

#### www.foodnetwork.com
Same procedure.

In [65]:
def foodnetworkReviewDate(path):
    """Return the review dates found in a saved www.foodnetwork.com page.

    Parameters
    ----------
    path : str
        Path of the HTML file; it is read as latin-1.

    Returns
    -------
    str
        The text of the third <p> of every 'about-recipe-info' block
        (e.g. 'on January 07, 2012'), joined with ' - '. Empty string when
        the page has no such block.
    """
    # The context manager closes the file; the previous bare open() leaked a
    # handle for every page processed.
    with open(path, 'r', encoding='latin-1') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    review_html = soup.find_all('div', class_="about-recipe-info")
    dates = []
    for rev in review_html:
        paragraphs = rev.find_all('p')
        # The date sits in the third <p>; a block with fewer paragraphs is
        # skipped instead of raising IndexError.
        if len(paragraphs) > 2:
            dates.append(paragraphs[2].text)
    return ' - '.join(dates)
foodnetworkReviewDate('data/recipePages/10cf272724e823b8038b8190addf04d3.html')

'on January 07, 2012 - on December 22, 2011 - on November 10, 2011'

## Getting the review date of each recipes
Now that we can get all the review dates from an HTML file of our 3 favorite websites, let's create a table with all of them!

In [102]:
#TODO: Using these 3 helper function, compute the review for all file.

# Compare the date of recipe's review with season of its ingredients
Our goal is to figure out whether the foods that grow during precise natural seasons are actually eaten during that time. To do so, we have to compare the review dates with the season information we got from the other websites.

In [None]:
#TODO compare the date with our seasonal information

# Map of the USA's States
Having a visual representation of our work is really helpful for basic validation on our part. It also makes it easier to explain what we did with examples. Currently the map only shows the US states; we will extend it once we have data to inject into it.

In [103]:
usa_geojson_path = os.path.join('GeoJson', 'gz_2010_us_states_500k.json')
# Load the state outlines. The context manager closes the file, which the
# previous json.load(open(...)) never did.
with open(usa_geojson_path) as geojson_file:
    usa_geojson = json.load(geojson_file)

usa_map = folium.Map(location=[48, -102], zoom_start=3)

# Name of every feature in the GeoJSON, in file order.
usa_states = [feature['properties']['NAME'] for feature in usa_geojson['features']]

folium.GeoJson(usa_geojson).add_to(usa_map)

#TODO: Inject usefull data in it.
# NOTE(review): rendering this map produced "IOPub data rate exceeded" in the
# saved output, presumably because of the size of the GeoJSON -- consider a
# lower-resolution file.
usa_map

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


# Final visualisation
We have a lot of data and many relations. It's time to bring all this information to life! The last part will be to represent these relations as concretely as possible.

--------------------------

--------------------------

--------------------------

--------------------------
# Archive
These cells contain elements that took too much time to run, or code that we simplified or no longer use.
We don't want to delete them, as we took time to write them and can re-use parts of them. We will move them away for the delivery.


#### allrecipes.com
We start with allrecipes.com. An inspection of the HTML elements shows that all ingredients are located in:

```<li class="plaincharacterwrap ingredient">text</li>```

To find it, we used the inspector feature in Firefox. Testing showed that many of the HTML files we have are malformed. Sometimes a page is just a search on a food name and not a recipe. Other times, the recipe is not finished, and the ingredient list then contains some blanks. We had to modify the following methods many times to take these errors into account.
As we first go through the entire dataset to extract the useful information, we decided to put null values where the data is malformed. We will also have to take care of the quantities associated with the ingredient names, but we keep that for later.

In [11]:
# Archived cell: the code below is wrapped in a triple-quoted string, so it is
# not executed; the cell only evaluates to that string.
'''def allRecipesIngredients(path):
    f = open(path, 'r', encoding='latin-1')
    soup = BeautifulSoup(f.read(), 'html.parser')
    ingredients_html = soup.find_all('li', class_="plaincharacterwrap ingredient")
    title_html = soup.find_all('h1', class_='plaincharacterwrap fn')
    #The data on allrecipes sometime are not recipes but just a search on a word
    title = ''
    if len(title_html) > 0:
        title = title_html[0].find('span', class_='itemreviewed').text
    ingredients = ''
    for ingr in ingredients_html:
        if ingr is not None:
            if ingredients != '':
                ingredients += ', '
            ingredients += ingr.text.strip()
    return ingredients, title
#Example of result with a random file on this domain
results, title= allRecipesIngredients('data/recipePages/000a3333ad24828769b6be5a5e1bdb4a.html')

#TODO: Format the data to only have the name of the food
print(title)
results'''

Chicken Breast Cutlets with Artichokes and Capers


'1 cup whole wheat or white flour, 1/2 teaspoon salt, 1/8 teaspoon white pepper, or to taste, 1/8 teaspoon black pepper, or to taste, 2 pounds chicken breast tenderloins or strips, 2 tablespoons canola oil, 2 tablespoons extra-virgin olive oil, 2 cups chicken broth, 2 tablespoons fresh lemon juice, 1 (12 ounce) jar quartered marinated artichoke hearts, with liquid, 1/4 cup capers, 2 tablespoons butter, 1/4 cup chopped flat-leaf parsley'

#### www.food.com
Same principle: we use the inspector in Firefox to identify the ingredients. But this time there is a `span` holding just the name, so we do not have to deal with the quantity for now.

In [12]:
# Archived cell: the code below is wrapped in a triple-quoted string, so it is
# not executed; the cell only evaluates to that string.
'''def foodIngredients(path):
    f = open(path, 'r', encoding='latin-1')
    soup = BeautifulSoup(f.read(), 'html.parser')
    ingredients_html = soup.find_all('li', class_="ingredient")
    title_html = soup.find_all('h1', class_='fn')
    title = ''
    if len(title_html) > 0:
        title = title_html[0].text
    ingredients = ''
    for ingr in ingredients_html:
        if ingr.find('span', class_='name') is not None:
            if ingredients != '':
                ingredients += ', '        
            ingredients += ingr.find('span', class_='name').text.strip().replace('\n', '').replace('\t', '')
    return ingredients, title

#Example of result with a random file on this domain
results, title = foodIngredients('data/recipePages/60e9148725c3f64336fc9d83b2c1b521.html')
print(title)
results'''

Crab Quiche


'eggs, sour cream, milk, butter, melted, all-purpose flour, salt, fresh ground pepper, crabmeat, shredded swiss cheese, chopped green onion, 9 inch pie shell, unbaked'

#### www.foodnetwork.com
Same as allrecipes.

In [13]:
# Archived cell: the code below is wrapped in a triple-quoted string, so it is
# not executed; the cell only evaluates to that string.
'''def foodNetworkIngredients(path):
    f = open(path, 'r', encoding='latin-1')
    soup = BeautifulSoup(f.read(), 'html.parser')
    ingredients_html = soup.find_all('li', class_="ingredient")
    title_html = soup.find_all('h1', class_= 'fn')
    title = ''
    if len(title_html) > 0:
        title = title_html[0].text
    ingredients = ''
    for ingr in ingredients_html:
        if ingr is not None:
            if ingredients != '':
                ingredients += ', '
            ingredients += ingr.text.strip()
    return ingredients, title

#Example of result with a random file on this domain
results , title = foodNetworkIngredients('data/recipePages/10cf272724e823b8038b8190addf04d3.html')



#TODO: Format the data to only have the name of the food
print(title)
results'''

Roasted Sweet Potato Fries


'2 large sweet potatoes, peeled, 1/4 cup freshly squeezed orange juice, 1 tablespoon vegetable oil, plus 2 teaspoons, Kosher salt and freshly ground black pepper, 1/2 teaspoon ground ginger, 1/4 teaspoon cayenne pepper, or to taste'

## Getting the ingredients of each recipes
Now that we can get all the ingredients from an HTML file of our 3 favorite websites, let's create a table with all of them!

In [14]:
# Archived cell: the code below is wrapped in a triple-quoted string, so it is
# not executed; the cell only evaluates to that string.
'''html_file_path = 'data/recipePages/'
recipes_ingredient = pd.DataFrame(columns=['recipe_name', 'domain', 'file', 'ingredients'])
index = 0
for line in log_data['domain']:  
    #print(index)
    if line == 'http://allrecipes.com/' or line == 'http://www.food.com/' or line == 'http://www.foodnetwork.com/':
        ingredients = []
        title = ''
        if line == 'http://allrecipes.com/':
            ingredients, title = allRecipesIngredients(html_file_path + log_data['file'][index])
        if line == 'http://www.food.com/':
            ingredients, title = foodIngredients(html_file_path + log_data['file'][index])
        if line == 'http://www.foodnetwork.com/':
            ingredients, title = foodNetworkIngredients(html_file_path + log_data['file'][index])
        recipes_ingredient = recipes_ingredient.append({
                'recipe_name' : title,
                'domain' : line,
                'file' : log_data['file'][index],
                'ingredients' : ingredients
            }, ignore_index=True)
    index += 1
    
#TODO: Save this as a csv as it take age to compute. So we don't have to compute it again all the time.
recipes_ingredient.head()'''

"html_file_path = 'data/recipePages/'\nrecipes_ingredient = pd.DataFrame(columns=['recipe_name', 'domain', 'file', 'ingredients'])\nindex = 0\nfor line in log_data['domain']:  \n    #print(index)\n    if line == 'http://allrecipes.com/' or line == 'http://www.food.com/' or line == 'http://www.foodnetwork.com/':\n        ingredients = []\n        title = ''\n        if line == 'http://allrecipes.com/':\n            ingredients, title = allRecipesIngredients(html_file_path + log_data['file'][index])\n        if line == 'http://www.food.com/':\n            ingredients, title = foodIngredients(html_file_path + log_data['file'][index])\n        if line == 'http://www.foodnetwork.com/':\n            ingredients, title = foodNetworkIngredients(html_file_path + log_data['file'][index])\n        recipes_ingredient = recipes_ingredient.append({\n                'recipe_name' : title,\n                'domain' : line,\n                'file' : log_data['file'][index],\n                'ingredient