In [2]:
import ast
import glob
import json
import os
import re
from pathlib import Path

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Project variable

In [5]:
# Lowercase English month names in calendar order. Used below to build the
# per-month scraping URLs and as the positional index of the month scores.
months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']

# Data from 'Eat the seasons' website
Get food seasons from the [eat the seasons](http://www.eattheseasons.com/seasons.php) website.

In [6]:
# Scrape one page per month and collect (month, food) pairs.
# Rows are gathered in a plain list and converted to a DataFrame once at the
# end: growing a DataFrame row by row copies it on every iteration, and
# DataFrame.append no longer exists in recent pandas versions.
eat_season_records = []

for month in months:
    r = requests.get('http://www.eattheseasons.com/{0}.php'.format(month))
    soup = BeautifulSoup(r.text, 'html.parser')
    food_on_month = soup.find_all('p')
    for p in food_on_month:
        # Each paragraph is treated as a comma-separated list of foods.
        for elem in p.text.split(", "):
            # Skip the entry that is just the month name itself.
            if elem.lower() != month.lower():
                eat_season_records.append({
                    'month': month.lower(),
                    'food': elem.strip(),
                })

eat_season_data = pd.DataFrame(eat_season_records, columns=['month', 'food'])
eat_season_data.head()

Unnamed: 0,month,food
0,january,broccoli
1,january,broccolini
2,january,brussels sprouts
3,january,butternut squash
4,january,celery root


# Data from 'seasonal food guide' website
Get food seasons from the [seasonal food guide](https://www.seasonalfoodguide.org/) website.

In [7]:
def buildSeasonalFoodGuideCSV():
    """Scrape seasonalfoodguide.org and return its seasons as a DataFrame.

    The site's data lives in a JavaScript file. A regular page is fetched to
    find the address of that script (the third <script> tag), the script is
    downloaded, and regular expressions extract one row per
    (month, food, state) combination.

    Relies on the module-level `months` list.

    Returns:
        pandas.DataFrame with columns 'month', 'food' and 'state', with
        duplicate rows removed.
    """
    url = 'https://www.seasonalfoodguide.org'

    # All the data on the pages is generated by a script; fetch one page to
    # obtain the address of that script.
    r = requests.get('https://www.seasonalfoodguide.org/maine/late-january')
    soup = BeautifulSoup(r.text, 'html.parser')

    url_end = soup.find_all('script')[2]['src']
    req = requests.get(url + url_end)
    data = req.text

    # Collect rows in a list and build the DataFrame once: appending to a
    # DataFrame inside the loop copies it on every iteration.
    records = []

    m = re.findall(r'\{name:"(.*?)\}\}', data)
    m = m[1:]

    for elem in m:
        # Season numbers per state for the current food
        seasons = re.findall(r'[A-Z]{2}:{seasons:\[(.*?)\]', elem)
        # States that have some season for the current food
        states = re.findall(r'([A-Z]{2}):', elem)[1:]
        # Name of the current food
        food = re.findall(r'([A-Za-z]+)"', elem)[0]
        for x, season_list in enumerate(seasons):
            for season in season_list.split(','):
                # An empty `seasons:[]` yields '' here, which int() rejects.
                if season.strip() == '':
                    continue
                records.append({
                    # Two consecutive season numbers map to the same month
                    # (presumably early-/late- half months — TODO confirm).
                    'month': months[int((int(season) - 1) / 2)].lower(),
                    'food': food.lower(),
                    'state': states[x],
                })
    season_guide_data = pd.DataFrame(records, columns=['month', 'food', 'state'])
    return season_guide_data.drop_duplicates()

#### Storing the data in a CSV file
The data from this second website, *seasonal food guide*, is very large and computing regular expressions on it takes a long time. We thus decide to store it in a csv file, so that subsequent runs don't have to build it again.

To limit the size of our repository, the whole data folder is listed in the .gitignore. The first time a user runs the project, all the CSV files therefore have to be generated again. The same approach is used for every other CSV file generated below.

In [8]:

# Cache of the scraped seasonal food guide: reuse the CSV when it exists,
# otherwise scrape the website and store the result.
SFG_file = Path("data/seasonalFoodGuide.csv")
if SFG_file.is_file():
    season_guide_data = pd.read_csv(SFG_file)
else:
    season_guide_data = buildSeasonalFoodGuideCSV()
    # The data folder is git-ignored, so it may not exist on a fresh checkout.
    SFG_file.parent.mkdir(parents=True, exist_ok=True)
    season_guide_data.to_csv(SFG_file, index=False)

season_guide_data.head()

Unnamed: 0,month,food,state
0,late-july,Apples,AL
1,early-august,Apples,AL
2,late-august,Apples,AL
3,early-september,Apples,AL
4,late-september,Apples,AL


# Analysis of the recipes dataset
Our dataset contains 2.5 GB of HTML files (110,517 files, according to the number of lines in the log file). The pages are very heterogeneous, so we first need to understand what kind of data we have before analysing the data itself.

### The log file
We first have a log file containing valuable information. It gives us the name of each file together with the URL it comes from. We use it as an index for the rest of the project, navigating through the log file instead of looking blindly into each file.

**We observed in the log file that some files came with some error. We keep that in mind and will come back to it later on.**

In [12]:
log_path = 'data/recipePages/msg.log'
recipies_path = 'data/recipePages'

# Read the whole log and split it into lines; the `with` block closes the
# file handle as soon as the content has been read.
with open(log_path, 'r') as f:
    log = f.read().split('\n')
#TODO: Take care of the error line in the log file

In [13]:
def buildLogData(lines=None):
    """Parse log lines into a DataFrame of (domain, url, file).

    A line is kept only when all three patterns match:
      * domain: 'http://<host>/' (the full match, scheme and slash included)
      * url:    from 'http://' up to the next tab character
      * file:   everything from the start of the line up to the first tab

    Args:
        lines: iterable of log lines. Defaults to the module-level `log`
            list when omitted, which keeps existing calls working.

    Returns:
        pandas.DataFrame with columns 'domain', 'url' and 'file', one row per
        line that matched.
    """
    if lines is None:
        lines = log
    # Collect rows in a list and build the DataFrame once: appending to a
    # DataFrame for each of the ~110k log lines copies it every time.
    records = []
    for line in lines:
        domain = re.search(r'http://(.*?)/', line)
        url = re.search(r'http://.*?(?=\t)', line)
        file_name = re.search(r'.*?(?=\t)', line)
        if domain is None or url is None or file_name is None:
            continue
        records.append({
            'domain': domain.group(0),
            'url': url.group(0),
            'file': file_name.group(0),
        })
    return pd.DataFrame(records, columns=['domain', 'url', 'file'])

#### Storing the data in a CSV file

In [None]:
# Cache of the parsed log: build and store it when the CSV is missing,
# otherwise load the stored version.
my_file = Path("data/recipePages/log_data.csv")
if not my_file.is_file():
    log_data = buildLogData()
    log_data.to_csv('data/recipePages/log_data.csv', index=False)
else:
    log_data = pd.read_csv('data/recipePages/log_data.csv')

log_data.head()

### Proportion of each domain name
Our dataset was scraped from 127 different websites. Because each website has to be processed differently to get the information we need, we will only handle the websites that make up the biggest part of the dataset.

In [12]:
# Number of log entries per domain, most frequent first.
df = log_data['domain'].value_counts()
# The size of the resulting Series is the number of distinct domains.
print('size: {0}'.format(df.size))
df.head()

size: 127


http://allrecipes.com/         28354
http://www.food.com/           14661
http://www.foodnetwork.com/    11996
http://www.yummly.com/          6590
http://www.cooks.com/           5546
Name: domain, dtype: int64

We observe that the following 3 websites cover 49.77% of the dataset:

- allrecipes.com
- www.food.com
- www.foodnetwork.com

We will therefore work with these 3 websites, knowing that we can improve our data by creating methods for other websites if needed.

## Getting the ingredients of each recipe
We have a second dataset containing the recipe name, URL, domain, ingredients and much other information. We extract this dataset and merge it with our log_data dataframe so that we can link this information to the corresponding HTML file. We do this because the review information is only available in the HTML files.

In [13]:
recip_info_path = 'data/recipeInfo/recipeInfo_WestWhiteHorvitz_WWW2013.tsv'

# Tab-separated file, decoded as latin-1.
recip_info = pd.read_csv(recip_info_path, sep='\t', encoding='latin-1')

# Keep only the columns needed to link a recipe to its ingredients.
restricted_recipe_info = recip_info[['url', 'title', 'ingredients_list']]

# Inner join on 'url': only pages present in both the log and the recipe info
# are kept. The suffixes only apply to overlapping non-key column names; the
# displayed result shows none were needed.
merged_info = pd.merge(log_data, restricted_recipe_info, how='inner', on='url', indicator=False, suffixes=('_info', '_log'))
merged_info.head()

Unnamed: 0,domain,url,file,title,ingredients_list
0,http://allrecipes.com/,http://allrecipes.com/recipe/classic-minestrone/,7e0ad7374f08c4a8de3500c065c17180.html,Classic Minestrone Recipe,"3 tablespoons olive oil|1 leek, sliced|2 carro..."
1,http://allrecipes.com/,http://allrecipes.com/Recipe/basil-butter-2/de...,4f9ea44a8519ba9d013264eb55711c9b.html,Basil Butter Recipe,4 cloves garlic|15 leaves fresh basil|1/2 teas...
2,http://www.cdkitchen.com/,http://www.cdkitchen.com/recipes/recs/75/Beer_...,099aebf16685a804035fee84152c4f4f.html,Beer Cheese Recipe #11524,1 pound mild cheddar (shredded)|1 pound extra ...
3,http://www.foodnetwork.com/,http://www.foodnetwork.com/recipes/claire-robi...,10cf272724e823b8038b8190addf04d3.html,Roasted Sweet Potato Fries Recipe : Claire Rob...,"2 large sweet potatoes, peeled|1/4 cup freshly..."
4,http://allrecipes.com/,http://allrecipes.com/recipe/dirty-martini/,856e6ca1d45014b045c1266d406f3ccf.html,Dirty Martini Recipe,6 fluid ounces vodka|1 dash dry vermouth|1 flu...


### Reducing the dataset
Currently, our dataset is huge. Going through it in its entirety takes several hours. To simplify our research, and according to our previous observation, we will only keep the top 3 websites.

In [14]:
# Domains we keep for the rest of the analysis (the three largest ones).
keeped_domain = pd.DataFrame({'domain':['http://allrecipes.com/', 'http://www.food.com/', 'http://www.foodnetwork.com/']})

# Keep only rows from those domains and renumber them 0..n-1.
# reset_index(drop=True) discards the old index directly, which avoids the
# positional `axis` argument of drop() that newer pandas versions reject.
new_merged = merged_info[merged_info.domain.isin(keeped_domain.domain)].reset_index(drop=True)
new_merged.head()

Unnamed: 0,domain,url,file,title,ingredients_list
0,http://allrecipes.com/,http://allrecipes.com/recipe/classic-minestrone/,7e0ad7374f08c4a8de3500c065c17180.html,Classic Minestrone Recipe,"3 tablespoons olive oil|1 leek, sliced|2 carro..."
1,http://allrecipes.com/,http://allrecipes.com/Recipe/basil-butter-2/de...,4f9ea44a8519ba9d013264eb55711c9b.html,Basil Butter Recipe,4 cloves garlic|15 leaves fresh basil|1/2 teas...
2,http://www.foodnetwork.com/,http://www.foodnetwork.com/recipes/claire-robi...,10cf272724e823b8038b8190addf04d3.html,Roasted Sweet Potato Fries Recipe : Claire Rob...,"2 large sweet potatoes, peeled|1/4 cup freshly..."
3,http://allrecipes.com/,http://allrecipes.com/recipe/dirty-martini/,856e6ca1d45014b045c1266d406f3ccf.html,Dirty Martini Recipe,6 fluid ounces vodka|1 dash dry vermouth|1 flu...
4,http://allrecipes.com/,http://allrecipes.com/recipes/seafood/fish/tro...,c6a40a7de4b506a935093b67bccf4aac.html,Trout Recipes,


# Getting Review date
We make **the assumption** that people who cook a recipe review it the same day, or at most within the same week, as they cooked it. We therefore need to extract the dates of all reviews to know when the recipes were cooked.

#### allrecipes.com
Starting with *allrecipes.com*. An inspection of the HTML elements shows that all review dates are located in:

``<div class="review">``

To find it, we used the inspector feature in Firefox. Testing showed that many of the HTML files we have are malformed. Sometimes a page is just a search on a food name and not a recipe. Other times, there is no review. We had to modify the following methods multiple times to take these errors into account.
As we first browse the entire dataset to extract the useful information, we decided to put null values where the data is malformed. We will also have to take care of the quantities associated with the ingredient names, but we keep that for later.

We were also surprised by the way BeautifulSoup matches elements. Searching for the class `review` also returned elements whose class is not exactly `review` (classes containing the word 'review', like 'preview' and many others). To deal with this problem and only get our class, we changed our usual way of searching with BeautifulSoup and used an anonymous function.

In [None]:
def allRecipesReviewDate(path):
    """Extract the review dates from an allrecipes.com HTML page.

    Only <div> tags whose class attribute is exactly ['review'] are inspected.
    The first date of the form 'Mon. D, YYYY' found in each of them is kept.

    Args:
        path: path of the HTML file, read as latin-1.

    Returns:
        str: the dates joined with ' - ', or '' when no date is found.
    """
    # The `with` block closes the file; the original left the handle open,
    # which matters when this is called for tens of thousands of pages.
    with open(path, 'r', encoding='latin-1') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    # The mentioned lambda function for the BeautifulSoup search: it keeps
    # only tags whose class list is exactly ['review'].
    review_html = soup.find_all(lambda tag: tag.name == 'div' and tag.get('class') == ['review'])
    dates = []
    for rev in review_html:
        text = rev.text.strip().replace('\n', '').replace('\t', '')
        # Accept any 20xx year; the previous pattern '200[0-9]' silently
        # dropped every review written from 2010 onward.
        # NOTE(review): the pattern requires a period after the three-letter
        # month, so a date written as 'May 5, 2008' would not match — confirm
        # how the site prints May.
        match = re.search(r'[A-Z][a-z]{2}\. [0-9]*, 20[0-9]{2}', text)
        if match is not None:
            dates.append(match.group(0))
    return ' - '.join(dates)

print('Example of result with a random file from allrecipes.com:')
allRecipesReviewDate('data/recipePages/7e0ad7374f08c4a8de3500c065c17180.html')

#### www.food.com
Same principle: we use the inspector in Firefox to identify the review date. This time, there is no class directly on the date that can be found easily. We go up to the first suitable parent element and then do a second find_all on it. As there are two ``<p>`` elements this time and we are interested in the second one, we take only the second element.

In [16]:
def foodReviewDate(path):
    """Extract the review dates from a food.com HTML page.

    Each <div class="about-recipe-info"> is expected to contain the date in
    its second <p> element; blocks with fewer than two <p> elements are
    skipped.

    Args:
        path: path of the HTML file, read as latin-1.

    Returns:
        str: the date texts joined with ' - ', or '' when none is found.
    """
    # The `with` block closes the file once it has been parsed.
    with open(path, 'r', encoding='latin-1') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    review_html = soup.find_all('div', class_="about-recipe-info")
    reviews = ''
    for rev in review_html:
        rev_p = rev.find_all('p')
        # Malformed pages may lack the second <p>; skipping them avoids an
        # IndexError, as foodnetworkReviewDate already does.
        if len(rev_p) >= 2:
            if reviews != '':
                reviews += ' - '
            reviews += rev_p[1].text
    return reviews

print('Example of result with a random file from food.com:')

foodReviewDate('data/recipePages/60e9148725c3f64336fc9d83b2c1b521.html')

'on September 09, 2004 - on April 28, 2011 - on February 11, 2010'

#### www.foodnetwork.com
Same procedure.

In [17]:
def foodnetworkReviewDate(path):
    """Extract the review dates from a foodnetwork.com HTML page.

    Each <div class="about-recipe-info"> is expected to contain the date in
    its third <p> element; blocks with fewer than three <p> elements are
    skipped.

    Args:
        path: path of the HTML file, read as latin-1.

    Returns:
        str: the date texts joined with ' - ', or '' when none is found.
    """
    # The `with` block closes the file once it has been parsed.
    with open(path, 'r', encoding='latin-1') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    review_html = soup.find_all('div', class_="about-recipe-info")
    reviews = ''
    for rev in review_html:
        rev_p = rev.find_all('p')
        if len(rev_p) >= 3:
            if reviews != '':
                reviews += ' - '
            reviews += rev_p[2].text
    return reviews

print('Example of result with a random file from foodnetwork.com:')

foodnetworkReviewDate('data/recipePages/10cf272724e823b8038b8190addf04d3.html')

'on January 07, 2012 - on December 22, 2011 - on November 10, 2011'

## Getting the review date of each recipes
Now we can get all the review dates of a html file on our 3 favorite websites, let's create a table with all of it!

In [18]:
def build_review_data():
    """Build the table of review dates for every page in `new_merged`.

    For each row of the module-level `new_merged` frame, the extractor that
    matches the row's domain is called on the corresponding HTML file. Rows
    whose domain has no extractor get the string 'NaN' as review dates.
    Progress is printed every 2500 rows.

    Returns:
        pandas.DataFrame with columns 'reviews_dates', 'domain', 'url',
        'file', 'title' and 'ingredients_list', one row per row of
        `new_merged`, in the same order.
    """
    html_file_path = 'data/recipePages/'
    columns = ['reviews_dates', 'domain', 'url', 'file', 'title', 'ingredients_list']
    # One extractor per supported website.
    extractors = {
        'http://allrecipes.com/': allRecipesReviewDate,
        'http://www.food.com/': foodReviewDate,
        'http://www.foodnetwork.com/': foodnetworkReviewDate,
    }
    nbr_elem = new_merged.shape[0]
    # Iterate over the column values positionally, so the function does not
    # depend on new_merged having a 0..n-1 index.
    rows = zip(
        new_merged['domain'],
        new_merged['url'],
        new_merged['file'],
        new_merged['title'],
        new_merged['ingredients_list'],
    )
    # Collect rows in a list and build the DataFrame once at the end.
    records = []
    for index, (domain, url, file_name, title, ingredients_list) in enumerate(rows):
        extractor = extractors.get(domain)
        if extractor is None:
            text = 'NaN'
        else:
            text = extractor(html_file_path + file_name)
        records.append({
            'reviews_dates': text,
            'domain': domain,
            'url': url,
            'file': file_name,
            'title': title,
            'ingredients_list': ingredients_list,
        })
        if (index % 2500) == 0 and index != 0:
            ratio = (index / nbr_elem) * 100
            print('We are curently at ' + str(ratio) + '%')
    return pd.DataFrame(records, columns=columns)

#### Storing in a CSV file
/!\ This method takes hours to run. /!\

In [19]:
# Cache of the review table: reuse the CSV when it exists, otherwise build it
# (this takes hours) and store it.
my_file = Path('data/reviews.csv')
if my_file.is_file():
    # to_csv writes UTF-8, so the file is read back as UTF-8; reading it as
    # latin-1 would garble every non-ASCII character.
    review_data = pd.read_csv(my_file, encoding='utf-8')
else:
    review_data = build_review_data()
    # Drop recipes without ingredients and renumber the rows *before* saving,
    # so that the in-memory frame is identical to what a later run reloads
    # from the CSV (same rows, same 0..n-1 index).
    review_data = review_data[pd.notnull(review_data['ingredients_list'])].reset_index(drop=True)
    review_data.to_csv(my_file, index=False, encoding='utf-8')

review_data.head()

Unnamed: 0,reviews_dates,domain,url,file,title,ingredients_list
0,"Dec. 22, 2003 - Dec. 2, 2005 - Sep. 30, 2007 -...",http://allrecipes.com/,http://allrecipes.com/recipe/classic-minestrone/,7e0ad7374f08c4a8de3500c065c17180.html,Classic Minestrone Recipe,"3 tablespoons olive oil|1 leek, sliced|2 carro..."
1,"Dec. 14, 2007 - Jun. 9, 2006 - Jul. 12, 2006 -...",http://allrecipes.com/,http://allrecipes.com/Recipe/basil-butter-2/de...,4f9ea44a8519ba9d013264eb55711c9b.html,Basil Butter Recipe,4 cloves garlic|15 leaves fresh basil|1/2 teas...
2,"on January 07, 2012 - on December 22, 2011 - o...",http://www.foodnetwork.com/,http://www.foodnetwork.com/recipes/claire-robi...,10cf272724e823b8038b8190addf04d3.html,Roasted Sweet Potato Fries Recipe : Claire Rob...,"2 large sweet potatoes, peeled|1/4 cup freshly..."
3,"Jun. 21, 2004 - Dec. 5, 2007 - Sep. 2, 2007 - ...",http://allrecipes.com/,http://allrecipes.com/recipe/dirty-martini/,856e6ca1d45014b045c1266d406f3ccf.html,Dirty Martini Recipe,6 fluid ounces vodka|1 dash dry vermouth|1 flu...
4,"Nov. 12, 2003 - Nov. 30, 2009 - Aug. 20, 2003 ...",http://allrecipes.com/,http://allrecipes.com/recipe/candied-sweet-pot...,05bd905b46dcd56e9b97268b46f05e11.html,Candied Sweet Potatoes Recipe,"4 pounds sweet potatoes, quartered|1 1/4 cups ..."


### Group the seasonal food dataset per month 
Our initial dataset consists of rows containing a month and a food. From that, we want to get a mapping between ingredients and their respective months.

In [20]:
# Collapse the (month, food) rows into one list of months per food.
eat_season_per_month = eat_season_data.groupby('food')['month'].apply(list)
eat_season_per_month.head()

food
almonds                [september, october, november]
apples                 [september, october, november]
apricots                    [may, june, july, august]
artichoke    [april, may, august, september, october]
arugula                          [june, july, august]
Name: month, dtype: object

In [21]:
# Drop the state dimension so that each (month, food) pair appears only once.
season_guid_no_state = season_guide_data[['month', 'food']].drop_duplicates()

# One list of months per food, in the same shape as eat_season_per_month.
season_guid_per_month = season_guid_no_state.groupby('food')['month'].apply(list)
season_guid_per_month.head()

food
apples        [july, august, september, october, november, d...
apricots      [july, august, may, june, september, january, ...
artichokes    [february, march, april, may, june, september,...
arugula       [march, april, may, june, july, august, septem...
asparagus             [march, april, may, june, february, july]
Name: month, dtype: object

# Compare the date of recipe's review with season of its ingredients
Our goal is to figure out whether the foods that grow during specific natural seasons are actually eaten during that time. To do so, we have to compare the review dates with the season information we got from the other websites.

### Rating the recipes regarding the ingredient list
We want to know which seasons are the best to cook a recipe while respecting local production. To do so, we came up with the following idea. Given the ingredients of a recipe, we only take those that exist in our seasonal dataset. For each of them, we give 6 points to every month that is in season, and one point less per month of distance from the season to every other month. We then add up the scores and divide them by the number of ingredients taken into account.
The first thing to do is to get, for each recipe, the months of every ingredient we recognize in our seasonal dataset, and then to run a rating algorithm on them.

In [22]:
def get_month_for_ingredient():
    """Look up the season months of the ingredients of every recipe.

    Reads the module-level `review_data`, `eat_season_per_month` and
    `season_guid_per_month`. Each recipe's 'ingredients_list' is split on
    '|'. Digits, parentheses, slashes and commas are removed, and every
    single word and every pair of consecutive words is looked up in both
    seasonal indexes. Each hit contributes that food's list of months.

    Returns:
        tuple (months_eat, months_season) of pandas.DataFrame. Each has one
        row per row of `review_data` and a single column
        ('months_eat_season' / 'months_season_guid') holding a list of month
        lists. Recipes without a usable ingredient string get an empty list.
    """
    # Collect rows in lists and build the DataFrames once at the end.
    eat_rows = []
    guide_rows = []

    for ingredients in review_data['ingredients_list']:
        month_eat_season = []
        month_season_guid = []
        # Missing ingredient lists come back from pandas as NaN (a float);
        # they keep their row, with no month, so the output stays aligned.
        if isinstance(ingredients, str):
            for line in ingredients.split('|'):
                words = re.sub(r"[0-9]+|\(|\/|\)|,", '', line).lower().split(' ')
                prev_word = ''
                for word in words:
                    if word == '':
                        continue
                    # Look up the word itself, then the two-word term formed
                    # with the previous word (e.g. 'sweet potatoes').
                    terms = [word]
                    if prev_word != '':
                        terms.append(prev_word + ' ' + word)
                    for term in terms:
                        if term in eat_season_per_month.index:
                            month_eat_season.append(eat_season_per_month[term])
                        if term in season_guid_per_month.index:
                            month_season_guid.append(season_guid_per_month[term])
                    prev_word = word
        #TODO make this generate a ranking and not this table
        eat_rows.append({'months_eat_season': month_eat_season})
        guide_rows.append({'months_season_guid': month_season_guid})

    months_eat = pd.DataFrame(eat_rows, columns=['months_eat_season'])
    months_season = pd.DataFrame(guide_rows, columns=['months_season_guid'])
    return months_eat, months_season

In [23]:
# Cache of the per-recipe month lists. Both CSV files are needed, so both
# must exist before the cache is used; otherwise everything is rebuilt.
my_file = Path('data/months_eat.csv')
months_season_file = Path('data/months_season.csv')
if my_file.is_file() and months_season_file.is_file():
    # Note: after a CSV round trip the list cells are read back as strings.
    months_eat = pd.read_csv('data/months_eat.csv', encoding='latin-1')
    months_season = pd.read_csv('data/months_season.csv', encoding='latin-1')
else:
    months_eat, months_season = get_month_for_ingredient()
    months_eat.to_csv('data/months_eat.csv', index=False)
    months_season.to_csv('data/months_season.csv', index=False)

months_eat.head()

Unnamed: 0,months_eat_season
0,"[['june', 'july', 'august', 'september', 'octo..."
1,"[['july', 'august', 'september', 'october', 'n..."
2,"[['january', 'august', 'september', 'october',..."
3,[]
4,"[['january', 'august', 'september', 'october',..."


"[['june', 'july', 'august', 'september', 'october'], ['may', 'june', 'july', 'august'], ['july', 'august', 'september', 'october']]"

#### Rating algorithm

In [37]:
def initiat_month_score():
    """Return a fresh score table: one [month_name, 0] pair per month.

    The pairs are mutable lists in calendar order (January first), and a new
    table is created on every call so callers can accumulate into it freely.
    """
    month_names = (
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december',
    )
    return [[name, 0] for name in month_names]

#### Distance between month
For our rating algorithm, we calculate the distance between two months on the circular calendar: December and January are at distance 1, and the largest possible distance is 6.

In [47]:
def month_distance(month_one, month_two):
    """Return the circular distance, in months, between two month names.

    The calendar wraps around, so 'december' and 'january' are 1 apart and
    the result is always between 0 and 6. When either name is the empty
    string, the maximum distance 6 is returned.

    Raises:
        KeyError: if a non-empty name is not a lowercase English month name.
    """
    if month_one == '' or month_two == '':
        return 6
    names = (
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december',
    )
    # Month name -> 1-based position in the year.
    position = {name: rank for rank, name in enumerate(names, start=1)}
    gap = abs(position[month_one] - position[month_two])
    # Going the other way around the year is shorter past half a year.
    return 12 - gap if gap > 6 else gap

In [178]:
def rating_algo(data_set):
    """Compute a per-month score for every recipe.

    Each item of `data_set` describes one recipe as a list of month lists
    (one month list per recognized ingredient). For each ingredient, every
    in-season month earns `max_score` (6) points; every other month earns
    `max_score` minus its distance to the nearest in-season month. The
    totals are divided by the number of ingredients in the item. A recipe
    with no ingredient keeps a score of 0 for every month.

    Relies on the module-level `months`, `initiat_month_score` and
    `month_distance`.

    Args:
        data_set: iterable of items. An item is either a list of month lists
            or the string form of one, as obtained when the data is reloaded
            from CSV.

    Returns:
        pandas.DataFrame with a single 'score' column. Each cell holds the
        list of [month_name, score] pairs of one recipe.
    """
    max_score = 6
    # Collect rows in a list and build the DataFrame once at the end.
    rows = []
    for data in data_set:
        # Values reloaded from CSV are strings, while freshly computed values
        # are real lists. Strings are parsed with ast.literal_eval, which only
        # accepts Python literals and, unlike eval, cannot execute code.
        value = ast.literal_eval(data) if isinstance(data, str) else data
        length = len(value)
        month_score = initiat_month_score()
        if length != 0:
            for elem in value:
                in_season = [name for name in months if name in elem]
                for x, name in enumerate(months):
                    if name in elem:
                        month_score[x][1] += max_score
                    elif in_season:
                        # Distance to the *nearest* in-season month. Looking
                        # only at the first and last calendar month of the
                        # season is wrong when the season wraps around the
                        # new year or has gaps.
                        nearest = min(month_distance(name, other) for other in in_season)
                        month_score[x][1] += max_score - nearest
                    # No recognized month for this ingredient: it adds 0.
            for entry in month_score:
                entry[1] = entry[1] / length
        rows.append({'score': month_score})
    return pd.DataFrame(rows, columns=['score'])

In [179]:
months_eat_score = rating_algo(months_eat['months_eat_season'])
months_eat_score.head()

Unnamed: 0,score
0,"[[january, 2.6666666666666665], [february, 2.3..."
1,"[[january, 4.0], [february, 3.0], [march, 2.0]..."
2,"[[january, 6.0], [february, 5.0], [march, 4.0]..."
3,"[[january, 0], [february, 0], [march, 0], [apr..."
4,"[[january, 6.0], [february, 5.0], [march, 4.0]..."


In [180]:
months_season_score = rating_algo(months_season['months_season_guid'])
months_season_score.head()

Unnamed: 0,score
0,"[[january, 6.0], [february, 6.0], [march, 6.0]..."
1,"[[january, 6.0], [february, 6.0], [march, 6.0]..."
2,"[[january, 5.0], [february, 5.0], [march, 5.5]..."
3,"[[january, 4.0], [february, 3.0], [march, 2.0]..."
4,"[[january, 6.0], [february, 6.0], [march, 6.0]..."


### Now we have our month scores, add them to the review data

In [186]:
# Attach both score tables to the review data. Column assignment aligns on
# the index, so this assumes the score frames and review_data share the same
# 0..n-1 index — NOTE(review): confirm review_data has a default index here.
review_data['months_eat_score'] = months_eat_score
review_data['months_season_score'] = months_season_score
review_data.head()

Unnamed: 0,reviews_dates,domain,url,file,title,ingredients_list,months_eat_score,months_season_score
0,"Dec. 22, 2003 - Dec. 2, 2005 - Sep. 30, 2007 -...",http://allrecipes.com/,http://allrecipes.com/recipe/classic-minestrone/,7e0ad7374f08c4a8de3500c065c17180.html,Classic Minestrone Recipe,"3 tablespoons olive oil|1 leek, sliced|2 carro...","[[january, 2.6666666666666665], [february, 2.3...","[[january, 6.0], [february, 6.0], [march, 6.0]..."
1,"Dec. 14, 2007 - Jun. 9, 2006 - Jul. 12, 2006 -...",http://allrecipes.com/,http://allrecipes.com/Recipe/basil-butter-2/de...,4f9ea44a8519ba9d013264eb55711c9b.html,Basil Butter Recipe,4 cloves garlic|15 leaves fresh basil|1/2 teas...,"[[january, 4.0], [february, 3.0], [march, 2.0]...","[[january, 6.0], [february, 6.0], [march, 6.0]..."
2,"on January 07, 2012 - on December 22, 2011 - o...",http://www.foodnetwork.com/,http://www.foodnetwork.com/recipes/claire-robi...,10cf272724e823b8038b8190addf04d3.html,Roasted Sweet Potato Fries Recipe : Claire Rob...,"2 large sweet potatoes, peeled|1/4 cup freshly...","[[january, 6.0], [february, 5.0], [march, 4.0]...","[[january, 5.0], [february, 5.0], [march, 5.5]..."
3,"Jun. 21, 2004 - Dec. 5, 2007 - Sep. 2, 2007 - ...",http://allrecipes.com/,http://allrecipes.com/recipe/dirty-martini/,856e6ca1d45014b045c1266d406f3ccf.html,Dirty Martini Recipe,6 fluid ounces vodka|1 dash dry vermouth|1 flu...,"[[january, 0], [february, 0], [march, 0], [apr...","[[january, 4.0], [february, 3.0], [march, 2.0]..."
4,"Nov. 12, 2003 - Nov. 30, 2009 - Aug. 20, 2003 ...",http://allrecipes.com/,http://allrecipes.com/recipe/candied-sweet-pot...,05bd905b46dcd56e9b97268b46f05e11.html,Candied Sweet Potatoes Recipe,"4 pounds sweet potatoes, quartered|1 1/4 cups ...","[[january, 6.0], [february, 5.0], [march, 4.0]...","[[january, 6.0], [february, 6.0], [march, 6.0]..."


# Map of the United States
Having a visual representation of our work is really helpful for basic validation on our side. It also makes it easier to explain what we did through examples. Currently the map only shows the US states; we will complete it once we have data to inject into it.

In [3]:
usa_geojson_path = os.path.join('GeoJson', 'gz_2010_us_states_500k.json')
# The `with` block closes the file as soon as the GeoJSON has been parsed.
with open(usa_geojson_path) as geojson_file:
    usa_geojson = json.load(geojson_file)

usa_map = folium.Map(location=[48, -102], zoom_start=3)

usa_states = []
#for i in usa_geojson['features']:
#    usa_states.append(i['properties']['NAME'])

# Draw the state outlines on top of the base map.
folium.GeoJson(usa_geojson).add_to(usa_map)

#TODO: Inject usefull data in it.
usa_map

In [7]:
usa_topojson_path = os.path.join('topojson', 'us_states.json')
# The `with` block closes the file as soon as the TopoJSON has been parsed.
with open(usa_topojson_path) as topojson_file:
    usa_topojson = json.load(topojson_file)

m = folium.Map([48,-102], zoom_start=4)

# NOTE(review): Map.choropleth belongs to older folium releases; newer ones
# use folium.Choropleth(...).add_to(m). Kept as is to match the library
# versions this notebook was written for.
m.choropleth(
    geo_data=usa_topojson,
    fill_color='YlGn',
    )
m

# Final visualisation
We have a lot of data and many relations. It's time to bring all this information to life! The last part will be to represent these relations as concretely as possible.

--------------------------

--------------------------

--------------------------

--------------------------
# Archive
These cells contain elements that took too much time to run, or code that we simplified or don't use anymore.
We don't want to delete them, as we took time to write them and can still reuse parts of them. We will move them away for the delivery.


#### allrecipes.com
Starting with allrecipes.com. An inspection of the HTML elements shows that all ingredients are located in:

```<li class="plaincharacterwrap ingredient">text</li>```

To find it, we used the inspector feature in Firefox. Testing showed that many of the HTML files we have are malformed. Sometimes a page is just a search on a food name and not a recipe. Other times, the recipe is not finished, and the ingredient list then contains some blanks. We had to modify the following methods many times to take these errors into account.
As we first go through the entire dataset to extract the useful information, we decided to put null values where the data is malformed. We will also have to take care of the quantities associated with the ingredient names, but we keep that for later.

In [29]:
'''def allRecipesIngredients(path):
    f = open(path, 'r', encoding='latin-1')
    soup = BeautifulSoup(f.read(), 'html.parser')
    ingredients_html = soup.find_all('li', class_="plaincharacterwrap ingredient")
    title_html = soup.find_all('h1', class_='plaincharacterwrap fn')
    #The data on allrecipes sometime are not recipes but just a search on a word
    title = ''
    if len(title_html) > 0:
        title = title_html[0].find('span', class_='itemreviewed').text
    ingredients = ''
    for ingr in ingredients_html:
        if ingr is not None:
            if ingredients != '':
                ingredients += ', '
            ingredients += ingr.text.strip()
    return ingredients, title
#Example of result with a random file on this domain
results, title= allRecipesIngredients('data/recipePages/000a3333ad24828769b6be5a5e1bdb4a.html')

#TODO: Format the data to only have the name of the food
print(title)
results'''

'def allRecipesIngredients(path):\n    f = open(path, \'r\', encoding=\'latin-1\')\n    soup = BeautifulSoup(f.read(), \'html.parser\')\n    ingredients_html = soup.find_all(\'li\', class_="plaincharacterwrap ingredient")\n    title_html = soup.find_all(\'h1\', class_=\'plaincharacterwrap fn\')\n    #The data on allrecipes sometime are not recipes but just a search on a word\n    title = \'\'\n    if len(title_html) > 0:\n        title = title_html[0].find(\'span\', class_=\'itemreviewed\').text\n    ingredients = \'\'\n    for ingr in ingredients_html:\n        if ingr is not None:\n            if ingredients != \'\':\n                ingredients += \', \'\n            ingredients += ingr.text.strip()\n    return ingredients, title\n#Example of result with a random file on this domain\nresults, title= allRecipesIngredients(\'data/recipePages/000a3333ad24828769b6be5a5e1bdb4a.html\')\n\n#TODO: Format the data to only have the name of the food\nprint(title)\nresults'

#### www.food.com
Same principle: we use the inspector in Firefox to identify the ingredients. This time, however, each ingredient has a 'span' holding only its name, so we don't have to take care of the quantity here.

In [30]:
# NOTE: this whole cell is a single triple-quoted string literal, so none of the
# code below runs and `foodIngredients` is not defined by this cell; the cell
# only evaluates to the text itself.
# Because the literal is not a raw string, the '\n' and '\t' escapes inside the
# `.replace(...)` calls become real newline/tab characters in the stored text.
#
# What the disabled code describes:
#   foodIngredients(path) opens `path` as latin-1 text, parses it with
#   BeautifulSoup's 'html.parser' and returns the pair (ingredients, title):
#     - title: text of the first <h1> with class 'fn', or '' when none is found;
#     - ingredients: a single ', '-joined string built from the
#       <span class="name"> of every <li> with class "ingredient" that has one,
#       stripped and with newline/tab characters removed.
#   The example at the end calls it on one file under data/recipePages/ and
#   shows the title and the ingredient string.
# NOTE(review): the handle returned by open() is never closed; wrap it in a
# `with` block if this code is re-enabled.
'''def foodIngredients(path):
    f = open(path, 'r', encoding='latin-1')
    soup = BeautifulSoup(f.read(), 'html.parser')
    ingredients_html = soup.find_all('li', class_="ingredient")
    title_html = soup.find_all('h1', class_='fn')
    title = ''
    if len(title_html) > 0:
        title = title_html[0].text
    ingredients = ''
    for ingr in ingredients_html:
        if ingr.find('span', class_='name') is not None:
            if ingredients != '':
                ingredients += ', '        
            ingredients += ingr.find('span', class_='name').text.strip().replace('\n', '').replace('\t', '')
    return ingredients, title

#Example of result with a random file on this domain
results, title = foodIngredients('data/recipePages/60e9148725c3f64336fc9d83b2c1b521.html')
print(title)
results'''

'def foodIngredients(path):\n    f = open(path, \'r\', encoding=\'latin-1\')\n    soup = BeautifulSoup(f.read(), \'html.parser\')\n    ingredients_html = soup.find_all(\'li\', class_="ingredient")\n    title_html = soup.find_all(\'h1\', class_=\'fn\')\n    title = \'\'\n    if len(title_html) > 0:\n        title = title_html[0].text\n    ingredients = \'\'\n    for ingr in ingredients_html:\n        if ingr.find(\'span\', class_=\'name\') is not None:\n            if ingredients != \'\':\n                ingredients += \', \'        \n            ingredients += ingr.find(\'span\', class_=\'name\').text.strip().replace(\'\n\', \'\').replace(\'\t\', \'\')\n    return ingredients, title\n\n#Example of result with a random file on this domain\nresults, title = foodIngredients(\'data/recipePages/60e9148725c3f64336fc9d83b2c1b521.html\')\nprint(title)\nresults'

#### www.foodnetwork.com
Same as allrecipes.

In [31]:
# NOTE: this whole cell is a single triple-quoted string literal, so none of the
# code below runs and `foodNetworkIngredients` is not defined by this cell; the
# cell only evaluates to the text itself.
#
# What the disabled code describes:
#   foodNetworkIngredients(path) opens `path` as latin-1 text, parses it with
#   BeautifulSoup's 'html.parser' and returns the pair (ingredients, title):
#     - title: text of the first <h1> with class 'fn', or '' when none is found;
#     - ingredients: a single ', '-joined string of the stripped full text of
#       every <li> with class "ingredient".
#   Unlike the food.com variant, the whole <li> text is kept; the TODO inside
#   the string records that it still has to be reduced to the food name.
#   The example at the end calls it on one file under data/recipePages/ and
#   shows the title and the ingredient string.
# NOTE(review): the handle returned by open() is never closed; wrap it in a
# `with` block if this code is re-enabled.
'''def foodNetworkIngredients(path):
    f = open(path, 'r', encoding='latin-1')
    soup = BeautifulSoup(f.read(), 'html.parser')
    ingredients_html = soup.find_all('li', class_="ingredient")
    title_html = soup.find_all('h1', class_= 'fn')
    title = ''
    if len(title_html) > 0:
        title = title_html[0].text
    ingredients = ''
    for ingr in ingredients_html:
        if ingr is not None:
            if ingredients != '':
                ingredients += ', '
            ingredients += ingr.text.strip()
    return ingredients, title

#Example of result with a random file on this domain
results , title = foodNetworkIngredients('data/recipePages/10cf272724e823b8038b8190addf04d3.html')



#TODO: Format the data to only have the name of the food
print(title)
results'''

'def foodNetworkIngredients(path):\n    f = open(path, \'r\', encoding=\'latin-1\')\n    soup = BeautifulSoup(f.read(), \'html.parser\')\n    ingredients_html = soup.find_all(\'li\', class_="ingredient")\n    title_html = soup.find_all(\'h1\', class_= \'fn\')\n    title = \'\'\n    if len(title_html) > 0:\n        title = title_html[0].text\n    ingredients = \'\'\n    for ingr in ingredients_html:\n        if ingr is not None:\n            if ingredients != \'\':\n                ingredients += \', \'\n            ingredients += ingr.text.strip()\n    return ingredients, title\n\n#Example of result with a random file on this domain\nresults , title = foodNetworkIngredients(\'data/recipePages/10cf272724e823b8038b8190addf04d3.html\')\n\n\n\n#TODO: Format the data to only have the name of the food\nprint(title)\nresults'

## Getting the ingredients of each recipe
Now that we can get all the ingredients of an HTML file from our 3 favorite websites, let's create a table with all of them!

In [32]:
# NOTE: this whole cell is a single triple-quoted string literal, so none of the
# code below runs and `recipes_ingredient` is not built by this cell; the cell
# only evaluates to the text itself.
#
# What the disabled code describes:
#   It walks log_data['domain'] while keeping a manual counter `index`. For rows
#   whose domain is one of the three supported sites it calls the matching
#   parser (allRecipesIngredients / foodIngredients / foodNetworkIngredients) on
#   'data/recipePages/' + log_data['file'][index], then appends one row with the
#   columns recipe_name, domain, file and ingredients. Rows from any other
#   domain are skipped. The cell ends by showing recipes_ingredient.head().
#   The `ingredients = []` / `title = ''` defaults are always overwritten,
#   because the outer `if` guarantees that one of the three inner `if`s matches.
# NOTE(review): relies on `log_data` and the three parser functions existing in
# the kernel; none of them is defined by this cell.
# NOTE(review): log_data['file'][index] is a label lookup driven by a counter
# that starts at 0 -- presumably log_data has a default 0..n-1 index; verify.
# NOTE(review): growing the frame with DataFrame.append inside the loop copies
# it on every row, and DataFrame.append was removed in pandas 2.0; collect the
# dicts in a list and build the frame once if this code is re-enabled.
'''html_file_path = 'data/recipePages/'
recipes_ingredient = pd.DataFrame(columns=['recipe_name', 'domain', 'file', 'ingredients'])
index = 0
for line in log_data['domain']:  
    #print(index)
    if line == 'http://allrecipes.com/' or line == 'http://www.food.com/' or line == 'http://www.foodnetwork.com/':
        ingredients = []
        title = ''
        if line == 'http://allrecipes.com/':
            ingredients, title = allRecipesIngredients(html_file_path + log_data['file'][index])
        if line == 'http://www.food.com/':
            ingredients, title = foodIngredients(html_file_path + log_data['file'][index])
        if line == 'http://www.foodnetwork.com/':
            ingredients, title = foodNetworkIngredients(html_file_path + log_data['file'][index])
        recipes_ingredient = recipes_ingredient.append({
                'recipe_name' : title,
                'domain' : line,
                'file' : log_data['file'][index],
                'ingredients' : ingredients
            }, ignore_index=True)
    index += 1
    
#TODO: Save this as a csv as it take age to compute. So we don't have to compute it again all the time.
recipes_ingredient.head()'''



"html_file_path = 'data/recipePages/'\nrecipes_ingredient = pd.DataFrame(columns=['recipe_name', 'domain', 'file', 'ingredients'])\nindex = 0\nfor line in log_data['domain']:  \n    #print(index)\n    if line == 'http://allrecipes.com/' or line == 'http://www.food.com/' or line == 'http://www.foodnetwork.com/':\n        ingredients = []\n        title = ''\n        if line == 'http://allrecipes.com/':\n            ingredients, title = allRecipesIngredients(html_file_path + log_data['file'][index])\n        if line == 'http://www.food.com/':\n            ingredients, title = foodIngredients(html_file_path + log_data['file'][index])\n        if line == 'http://www.foodnetwork.com/':\n            ingredients, title = foodNetworkIngredients(html_file_path + log_data['file'][index])\n        recipes_ingredient = recipes_ingredient.append({\n                'recipe_name' : title,\n                'domain' : line,\n                'file' : log_data['file'][index],\n                'ingredient