In [1]:
%%html 

<table  class = titlepage>

    <tr>
        <th colspan="2"><img class="logo-image" src="https://mydnvglcdn.azureedge.net/cdn/dnvgl-styles/common/images/logo/logo-wide.svg" style="width:100%" /></th>
    </tr>

    <tr>
        <td colspan="2">Web Scraping</td>
    </tr>
  
    <tr>
        <td colspan="2">Jamie Oliver Recipes </td>
    </tr>

    <tr>
        <td colspan="2"><img src="scraper-tool.jpg" alt="CoverImage" style="width: 100%;"/></td>
    </tr>


    <tr>
      <td>Version</td>
      <td>0.1</td> 
    </tr>
    
    <tr>
      <td>Date</td>
      <td>07/06/2018</td> 
    </tr>
    
    <tr>
      <td>Author(s)</td>
      <td>Rachel Hassall</td> 
    </tr>
    
      <tr>
      <td>Approved By</td>
      <td> - </td> 
    </tr>
    
      <tr>
      <td>Contact</td>
      <td>rachel.hassall@dnvgl.com</td> 
    </tr>
  
</table>

Unnamed: 0,Unnamed: 1
Web Scraping,Web Scraping
Jamie Oliver Recipes,Jamie Oliver Recipes
,
Version,0.1
Date,07/06/2018
Author(s),Rachel Hassall
Approved By,-
Contact,rachel.hassall@dnvgl.com


<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Ingredient-Corpus" data-toc-modified-id="Ingredient-Corpus-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Ingredient Corpus</a></span></li><li><span><a href="#Basic-web-scraping" data-toc-modified-id="Basic-web-scraping-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Basic web scraping</a></span></li><li><span><a href="#Extract-data-into-recipe-dictionary" data-toc-modified-id="Extract-data-into-recipe-dictionary-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Extract data into recipe dictionary</a></span></li><li><span><a href="#Data-Wrangling" data-toc-modified-id="Data-Wrangling-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Data Wrangling</a></span></li></ul></div>

# Import Libraries

In [2]:
import json
import re
import time
from urllib.error import HTTPError, URLError
from urllib.parse import quote_plus
from urllib.request import urlopen

from bs4 import BeautifulSoup

## Ingredient Corpus

Import a corpus of ingredient words (based on Kaggle dataset)

In [3]:
def get_json_ingreds(file):
    """Collect the unique ingredient names from a JSON recipe file.

    The file is expected to contain a JSON array of objects, each with an
    "ingredients" key holding a list of ingredient strings.

    Parameters
    ----------
    file : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    list
        Every distinct ingredient found in the file, in arbitrary order.

    Raises
    ------
    KeyError
        If an entry has no "ingredients" key.
    """
    # Read as UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
    # Windows) would mis-decode or reject non-ASCII ingredient names.
    with open(file, encoding="utf-8") as json_data:
        ingred_dict = json.load(json_data)

    # Accumulate straight into a set so duplicates are dropped as we go.
    unique_ingredients = set()
    for item in ingred_dict:
        unique_ingredients.update(item["ingredients"])

    return list(unique_ingredients)

In [4]:
# merge both ingredient files into a single de-duplicated corpus
ingredient_corpus = list(
    set(get_json_ingreds('data/ingredients.json'))
    | set(get_json_ingreds('data/ingredients2.json'))
)

In [5]:
# sanity check: show every corpus entry that mentions coriander
for item in filter(lambda entry: "coriander" in entry, ingredient_corpus):
    print(item)

coriander
Vietnamese coriander
fresh coriander
sazon goya with coriander and annatto
ground coriander
coriander seeds
coriander powder


## Basic web scraping

Scrape recipes based on search results on Jamie Oliver's website.

In [6]:
# get search results for jamie oliver recipes
base_url = 'https://www.jamieoliver.com/search/?s='
search = 'mexican'
# percent-encode the search term so that multi-word or non-ASCII queries
# still produce a valid URL (a single plain word is left unchanged)
url = base_url + quote_plus(search)

# the timeout keeps a stalled connection from hanging the notebook, and the
# context manager closes the HTTP response once the page has been parsed
with urlopen(url, timeout=5) as html_page:
    parsed_page = BeautifulSoup(html_page, 'html.parser')

In [7]:
# pick the recipe URLs out of all links on the search results page:
# keep links into /recipes, but drop the recipe category index pages
all_hrefs = (str(anchor.get('href')) for anchor in parsed_page.find_all('a'))

recipe_url_list = [
    href
    for href in all_hrefs
    if "jamieoliver.com/recipes" in href and 'category' not in href
]

In [8]:
# go through recipe urls and save html to a dictionary, with the key as the recipe name
recipe_name_list = []
recipes={}

for recipe_url in recipe_url_list:

    # scrape the webpage; the exception classes must be given as a tuple for
    # both to be caught. A failed page is reported and skipped so that one bad
    # link does not silently discard all remaining recipes.
    try:
        hmtl_recipe = urlopen(recipe_url, timeout=5)
    except (HTTPError, URLError) as error:
        print("skipping", recipe_url, "-", error)
        continue

    # parse the page, closing the HTTP response as soon as it has been read
    with hmtl_recipe:
        parsed_recipe = BeautifulSoup(hmtl_recipe, 'html.parser')

    # get the name of the recipe; the title block, its <h3> or its <p> may be missing
    simple_details = parsed_recipe.find(attrs={'class':'single-recipe-details-mobile-top'})
    heading = simple_details.find('h3') if simple_details is not None else None
    subheading = simple_details.find('p') if simple_details is not None else None

    # fall back to the URL when no title is found, so the entry never reuses
    # the name left over from the previous recipe
    recipe_name = heading.text if heading is not None else recipe_url
    recipe_name_extra = subheading.text if subheading is not None else ""

    # save to a dictionary, key is the name and value is the parsed page
    name = recipe_name+", " + recipe_name_extra
    recipe_name_list.append(name)
    recipes[name] = {"parsed_html":parsed_recipe}

    # wait a second to ensure that the website isn't overloaded
    time.sleep(1)

## Extract data into recipe dictionary

Extract the instructions, ingredients, duration and portions from each parsed page and store them in the recipe dictionary.

In [9]:
# key ("<name>, <subtitle>") of one scraped recipe, used to spot-check the results
testrecipe = "Mexican breakfast, Spicy tomato stew & eggs"

In [10]:
def getingredients(parsed_recipe):
    """Return the ingredient lines of a parsed recipe page with whitespace normalised.

    The text of the element with class 'ingred-list' is split on blank lines
    ("\\n\\n"); each chunk has its internal whitespace collapsed to single
    spaces and is padded with one leading and one trailing space
    (e.g. ' 2 cloves of garlic ').

    Parameters
    ----------
    parsed_recipe : object
        Parsed page exposing ``find(attrs=...)``; the result must have a
        ``text`` attribute.

    Returns
    -------
    list of str
        One normalised string per ingredient line, or an empty list when the
        page has no ingredient list.
    """
    try:
        ingredient_list_text = parsed_recipe.find(attrs={'class':'ingred-list'}).text
    except AttributeError:
        # find() returned None (or there is no page at all): nothing to extract
        return []

    ingredient_list = []

    for ingredient_line in ingredient_list_text.split(sep="\n\n"):
        # the raw line carries a lot of stray whitespace, so break it into words
        words = ingredient_line.split()

        # skip chunks that are empty or consist only of whitespace
        if not words:
            continue

        # rebuild the line with single spaces, keeping the surrounding padding
        ingredient_list.append(" " + " ".join(words) + " ")

    return ingredient_list

In [11]:
def getinstructions(parsed_recipe):
    """Return the text of the 'recipeSteps' element, or "none" if the page has none."""
    try:
        return parsed_recipe.find(attrs={'class':'recipeSteps'}).text
    except AttributeError:
        # find() gave back None, so there is no instructions block to read
        return "none"

In [12]:
def getduration(parsed_recipe):
    """Return the cooking time of a parsed recipe page as a list of tokens.

    The text of the element with class 'recipe-detail time' is split so that
    every run of digits becomes its own token, e.g. 'Cooks In40 minutes'
    becomes ['Cooks', 'In', '40', 'minutes'].

    Parameters
    ----------
    parsed_recipe : object
        Parsed page exposing ``find(attrs=...)``; the result must have a
        ``text`` attribute.

    Returns
    -------
    list of str
        The whitespace-separated tokens, or ['unknown'] when the page has no
        duration element.
    """
    try:
        duration = parsed_recipe.find(attrs={'class':'recipe-detail time'}).text
    except AttributeError:
        duration = "unknown"

    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    # The capture group makes re.split keep the digit runs in its result.
    pieces = re.split(r'(\d+)', duration)

    # Re-join with spaces so numbers are separated from adjacent text, then
    # split on whitespace to obtain the clean token list.
    return " ".join(pieces).split()

In [13]:
def getportions(parsed_recipe):
    """Return the text of the 'recipe-detail serves' element, or "unknown" if absent."""
    serves_element = None
    try:
        serves_element = parsed_recipe.find(attrs={'class':'recipe-detail serves'})
        return serves_element.text
    except AttributeError:
        # no page, or no serves element on the page
        return "unknown"

In [14]:
#recipes[testrecipe]["parsed_html"]

In [15]:
# enrich every scraped recipe with the fields extracted from its parsed page
for name, recipe_entry in recipes.items():
    parsed_recipe = recipe_entry['parsed_html']

    # the dict literal is evaluated top to bottom, so the extractors run in
    # this order and the keys are inserted in this order
    recipe_entry.update({
        "instructions": getinstructions(parsed_recipe),
        "ingredients": getingredients(parsed_recipe),
        "duration": getduration(parsed_recipe),
        "portions": getportions(parsed_recipe),
    })

## Data Wrangling

Tidy up the extracted recipe data

In [16]:
def simplifyingredients(ingredient_list, corpus=None):
    """Reduce each ingredient line to the corpus words it contains and a quantity.

    Every line is split on whitespace. Words that parse as an integer set the
    quantity (the last such word in the line wins); any other word is kept
    only if it is an exact member of the corpus.

    Parameters
    ----------
    ingredient_list : iterable of str
        Ingredient lines; empty strings are skipped.
    corpus : iterable of str, optional
        Known ingredient words. Defaults to the notebook-level
        ``ingredient_corpus``.

    Returns
    -------
    dict
        Maps each original line to ``[ingredient, quantity]``. ``ingredient``
        is the matched words, each followed by a space ("" if none matched).
        ``quantity`` is an int, or NaN when the line contains no integer.
    """
    if corpus is None:
        corpus = ingredient_corpus

    # build the lookup set once: membership tests on a list scan every entry
    known_words = set(corpus)

    ingredient_dict = {}

    for ingredient_line in ingredient_list:
        if len(ingredient_line) == 0:
            continue

        # initialise summary; float("nan") is the plain-Python NaN, so the
        # function does not need numpy to be imported
        quantity = float("nan")
        ingredient = ""

        # for each word check if it's a number or if it's in the ingredient corpus
        for word in ingredient_line.split():
            try:
                quantity = int(word)
            except ValueError:
                if word in known_words:
                    ingredient += word + " "

        ingredient_dict[ingredient_line] = [ingredient, quantity]

    return ingredient_dict

In [17]:
# display the fields stored for the sample recipe as a spot check
recipes[testrecipe]

{'duration': ['Cooks', 'In', '40', 'minutes'],
 'ingredients': [' 1 onion ',
  ' 2 cloves of garlic ',
  ' 2 red peppers ',
  ' 2 fresh red or orange chillies ',
  ' olive oil ',
  ' 1 large dried chilli ',
  ' 3 fresh bay leaves ',
  ' 2 x 400 g tins of quality plum tomatoes ',
  ' 2 large ripe tomatoes ',
  ' 6 large free-range eggs ',
  ' 6 tortillas ',
  ' Cheddar cheese , to serve '],
 'instructions': 'Peel and finely slice the onions and garlic. Deseed and finely slice the peppers and chillies.Get a large frying pan (make sure you’ve got a lid to go with it) on a high heat and add several good lugs of olive oil. Add the onion, garlic, peppers, fresh and dried chillies, bay leaves and a good pinch of sea salt and black pepper, and cook for 15 minutes, or until to softened and caramelised, stirring regularly.Pour in the tomatoes, using a spoon or potato masher to break them up. Bring to the boil, then turn down to a medium heat and cook for a further 5 minutes to reduce the sauce.W