### install additional packages

We're going to use `requests-html` instead of `requests` because it seems to do a better job of mimicking a web browser and thus not getting booted from the site.

In [None]:
!pip install requests-html

### import statements

In addition to **requests-html** we're using **json** to decode the data stored in the page header, and **pandas** to process some of the data.

In [None]:
import json
import unicodedata
from fractions import Fraction

import pandas as pd
from requests_html import HTMLSession

### Preliminaries

Aspects of the connection to allrecipes that are common to the whole script.

In [None]:
# the fixed prefix shared by every URL we request
base_url = 'https://www.allrecipes.com/recipe/'

# a single session object, reused for all HTTP requests
session = HTMLSession()

### Download a single recipe

Here we test out our methods on a single page. Make sure to run this cell as infrequently as possible, so that the site doesn't get mad about bot-like patterns of requests.

In [None]:

page = '18379/best-green-bean-casserole/'

# requests never times out on its own, so give up after 30 seconds instead of
# letting a stalled connection hang the notebook
response = session.get(base_url + page, timeout=30)

# raises an HTTPError for a 4xx/5xx status and does nothing otherwise, so no
# separate check of response.ok is needed
response.raise_for_status()

html = response.html


## Processing the results

Now that we have the HTTP response, we can take our time parsing it to pull out the information we want.

### Parsing the HTML

A lot of the information we want is right on the page. We just need to be able to navigate the HTML structure in order to find the bits we're interested in.

In [None]:
# title: the text of the first h1 element with the article-heading class
heading = html.find('h1.article-heading', first=True)
title = heading.text
print(title)

In [None]:
# ingredients
def _parse_quantity(text):
  """Convert an ingredient quantity string to a float.

  Handles whole numbers ("2"), decimals ("1.5"), ASCII fractions ("1/2"),
  Unicode vulgar fractions ("½") and mixed numbers ("1 ½" or "1½").
  Returns None when the text is empty or cannot be read as a number.
  """
  tokens = text.split()
  if not tokens:
    return None

  total = Fraction(0)
  for token in tokens:
    # Fraction understands "2", "1.5" and "1/2" directly
    try:
      total += Fraction(token)
      continue
    except (ValueError, ZeroDivisionError):
      pass

    # otherwise look for a trailing vulgar-fraction character, optionally
    # glued to a whole number (e.g. "1½")
    whole, last = token[:-1], token[-1]
    try:
      total += Fraction(unicodedata.numeric(last))
      if whole:
        total += Fraction(whole)
    except (ValueError, ZeroDivisionError):
      return None

  return float(total)


def _span_text(item, attribute):
  """Return the text of the span in `item` flagged with `attribute`, or None."""
  span = item.find(f'span[@{attribute}="true"]', first=True)
  return None if span is None else span.text


# one record per ingredient; any part that is missing on the page is None
ingredient_rows = []
for item in html.find('li.mntl-structured-ingredients__list-item > p'):
  quantity_text = _span_text(item, 'data-ingredient-quantity')
  ingredient_rows.append(dict(
      quantity = None if quantity_text is None else _parse_quantity(quantity_text),
      unit = _span_text(item, 'data-ingredient-unit'),
      name = _span_text(item, 'data-ingredient-name'),
  ))

# fix the columns so the frame has the same shape even with no ingredients
ingredients = pd.DataFrame(ingredient_rows, columns=['quantity', 'unit', 'name'])
display(ingredients)

In [None]:
# nutrition facts summary
# collect the cells of every row in the summary table
summary_rows = (
    row.find('td')
    for row in html.find('tr.mntl-nutrition-facts-summary__table-row')
)

# only rows with exactly two cells are used: the first cell holds the value,
# the second holds its label
facts_summary = {
    label_cell.text: value_cell.text
    for value_cell, label_cell in (
        cells for cells in summary_rows if len(cells) == 2
    )
}

print(facts_summary)

In [None]:
# steps
# each list item is one step; its paragraphs are joined with newlines
steps = [
    '\n'.join(para.text for para in step_item.find('li > p'))
    for step_item in html.find('#recipe__steps-content_1-0 > ol > li')
]

for index, step_text in enumerate(steps):
  print(index, step_text, sep='\t')

In [None]:
### Parsing embedded JSON data

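Some of the things we want, like the comments at the bottom of the page, are easier to get from the JSON data stored in the page header than from the HTML itself.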

In [None]:
# locate the script element holding the JSON data stored in the header,
# decode its text, and keep the first entry of the decoded result
script = html.find('#allrecipes-schema_1-0', first=True)
data = json.loads(script.text)[0]

In [None]:
# both figures come from the 'aggregateRating' entry; a missing entry is
# treated as empty, which leaves both values as None
aggregate = data.get('aggregateRating', {})

# number of ratings
n_ratings = aggregate.get('ratingCount')

# average rating, converted to a float when present
raw_rating = aggregate.get('ratingValue')
rating = None if raw_rating is None else float(raw_rating)

In [None]:
# comments
# the 'review' entry may be absent (or empty), so fall back to an empty list
# rather than raising KeyError
reviews = data.get('review') or []

# one row per review; any field missing from a review becomes None
comments = pd.DataFrame(
    [
        dict(
            rating = review.get('reviewRating', {}).get('ratingValue'),
            name = review.get('author', {}).get('name'),
            text = review.get('reviewBody'),
        )
        for review in reviews
    ],
    # fix the columns so an empty result still has the expected shape
    columns=['rating', 'name', 'text'],
)
display(comments)

In [None]:
# full nutrition info
# skip keys starting with '@' and strip the trailing 'Content' from the
# remaining keys
suffix = 'Content'
nutrition = {
    (key[:-len(suffix)] if key.endswith(suffix) else key): amount
    for key, amount in data['nutrition'].items()
    if not key.startswith('@')
}

print(nutrition)