In [4]:
import requests
from bs4 import BeautifulSoup
import re
import datetime

# Web Crawler of Epicurious

## check how many pages of recipe research result there are
####  https://www.epicurious.com/search?content=recipe
#### range from page 1 to 1969

In [29]:
# first page: https://www.epicurious.com/search?content=recipe&page=1 # not start from 0
# last page: https://www.epicurious.com/search?content=recipe&page=1969
website0 = 'https://www.epicurious.com/search/?content=recipe&page='
websites = [website0+str(i) for i in range(1,1970)]
print (websites[0], websites[-1])

https://www.epicurious.com/search/?content=recipe&page=1 https://www.epicurious.com/search/?content=recipe&page=1969


## check the structure of every search result page

In [30]:
page = requests.get(websites[-1])
soup = BeautifulSoup(page.text,'lxml')

In [31]:
title = soup.select('article.recipe-content-card h4 a')
len(title)

16

In [32]:
title[0]

<a data-reactid="71" href="/recipes/food/views/veiled-farm-girls-235002">Veiled Farm Girls</a>

## wrap crawler of search result page into a function

#### get all the links from the 1st search result page, return a list of (title, url) tuples
page = requests.get("https://www.epicurious.com/search?content=recipe&page=1 ")  
get_recipe_links(page)

In [42]:
def get_url_title(el):
    el = str(el)
    title = re.search(r'(href.+\">)(.+)(</a>)', el).group(2)
    layout = "https://www.epicurious.com"
    url = layout + re.search(r'(href=\")(.+)(\">)', el).group(2)
    return (title, url)

In [43]:
def get_recipe_links(page):
    import re
    recipe_links = []
    soup = BeautifulSoup(page.text,'lxml')
    els = soup.select('article.recipe-content-card h4 a')
    return [get_url_title(el) for el in els]

#### there are 18 recipe links in the 1st search result page, the 1sh of them is 'https://www.epicurious.com/recipes/food/views/grilled-lemon-pepper-chicken'

In [105]:
page = requests.get("https://www.epicurious.com/search?content=recipe&page=1")
links = get_recipe_links(page)
len(links)

18

In [183]:
links[0][1]

'https://www.epicurious.com/recipes/food/views/grilled-lemon-pepper-chicken'

## check the detial in a recipe page

In [224]:
page = requests.get('https://www.epicurious.com/recipes/food/views/grilled-bistecca-porterhouse-steaks-with-herby-fish-sauce')
soup = BeautifulSoup(page.text,'lxml')

In [110]:
main = soup.select('div.main-content')
main

[<div class="main-content" itemscope="" itemtype="https://schema.org/Recipe"> <aside class="site-pushdown-ad-wrap ad-wrapper" itemscope="" itemtype="https://schema.org/WPAdBlock" role="complementary"></aside>
 <div class="recipe-title-sidebar ">
 <div class="recipe-title-wrapper">
 <div class="title-source">
 <div class="ad-wrapper sponsor-slug">
 <aside class="sponsorSlug-ad-wrap ad-wrapper slug" itemscope="" itemtype="https://schema.org/WPAdBlock" role="complementary"></aside>
 </div>
 <h1 itemprop="name">Grilled Bistecca with Herby Fish Sauce </h1>
 <div class="byline-source">
 <cite class="contributors" data-react-checksum="165825731" data-reactid="1" data-reactroot=""><a class="contributor" data-reactid="2" href="/contributors/ignacio-mattos" itemprop="author" rel="author" title="Ignacio Mattos">Ignacio Mattos</a></cite>
 <span class="source" itemprop="publisher"><a href="/source/bon-appetit">Bon Appétit</a></span>
 <span class="pub-date">May 2018</span>
 </div>
 </div> <div class

#### there are some useful infomation in this page:
1. img: img["srcset"]
2. dek (could be none): div.dek
3. serving size
4. INGREDIENTS: div.ingredient
5. PREPARATION: div.instructions
6. NUTRITIONAL INFO (could be none)
7. TAGS: div.tags
8. ratings (could be a float num from 0 to 4)
9. reviews: div.review

In [111]:
img = soup.select('div.main-content img["srcset"]')
img
imgurl = re.search(r'(srcset=\")(.+)(")', str(img)).group(2)
imgurl

'https://assets.epicurious.com/photos/5ad7863b59e81a5d899942ba/6:4/w_274%2Ch_169/grilled-bistecca-steak-with-herby-fish-sauce-recipe-BA-041818.jpg'

In [112]:
dek = soup.select('div.main-content div.dek')
dek
dekcontent = re.search(r'(<p>)(.+)(<\/p>)', str(dek)).group(2)
dekcontent

"Why baste your steak with fish sauce? It's called umami. Get into it."

In [118]:
ingrs = soup.select('div.main-content div.ingredients-info li.ingredient')
ingrs

[<li class="ingredient" itemprop="ingredients">2 (1 1/2"–2"-thick) porterhouse steaks (about 6 lb. total)</li>,
 <li class="ingredient" itemprop="ingredients">1/2 cup extra-virgin olive oil, divided</li>,
 <li class="ingredient" itemprop="ingredients">Kosher salt</li>,
 <li class="ingredient" itemprop="ingredients">3 Tbsp. freshly ground black pepper</li>,
 <li class="ingredient" itemprop="ingredients">2 garlic cloves, finely grated</li>,
 <li class="ingredient" itemprop="ingredients">1/4 cup fish sauce</li>,
 <li class="ingredient" itemprop="ingredients">2 Tbsp. coarsely chopped marjoram, rosemary, and/or thyme</li>]

In [119]:
ingrscontent = []
for ingr in ingrs:
    ingr = str(ingr)
    ingrscontent.append(re.search(r'(<.+>)(.+)(<.+>)', ingr).group(2))
ingrscontent

['2 (1 1/2"–2"-thick) porterhouse steaks (about 6 lb. total)',
 '1/2 cup extra-virgin olive oil, divided',
 'Kosher salt',
 '3 Tbsp. freshly ground black pepper',
 '2 garlic cloves, finely grated',
 '1/4 cup fish sauce',
 '2 Tbsp. coarsely chopped marjoram, rosemary, and/or thyme']

In [248]:
size = soup.select('div.main-content span.per-serving')
size
size = re.search(r'(<.+>)(.*)(<.+>)', str(size)).group(2)
size

[<span class="per-serving">per serving (8 servings)</span>]

In [255]:
el = soup.select('div.main-content span.per-serving')
size = re.search(r'(\">)(.*)(<\/)', str(el)).group(2)
size

'per serving (8 servings)'

In [None]:
    def build_size(self, soup):
        try:
            el = soup.select('div.main-content span.per-serving')
            size = re.search(r'(\">)(.*)(<\/)',str(el)).group(2)
            return size
        except:
            print("Wrong: "+str(el))
            return ""

In [121]:
steps = soup.select('div.main-content div.instructions li.preparation-step')
steps
ss = []
for s in steps:
    s = re.search(r'(.+>\s+)(.+)(\s+.+)', str(s)).group(2)
    ss.append(s)
ss

['Rub steak with 1/4 cup oil; season with salt and sprinkle with pepper (it should nearly cover both sides). Let sit at room temperature 30 minutes.',
 'Mix garlic, fish sauce, herbs, and remaining 1/4 cup oil in a small bowl.',
 'Prepare a grill for medium-high indirect heat (for a gas grill, leave one or two burners off; for a charcoal grill, bank coals on one side of grill). Grill steaks over direct heat until nicely charred, about 2 minutes per side. Move steaks over indirect heat and continue grilling, basting with fish sauce mixture, until browned all over, about 5 minutes per side. An instant-read thermometer inserted into the thickest part of steaks should register 120°F for rare; temperature will rise to 125°F (or medium-rare) as they rest. Transfer to a cutting board and let rest 20 minutes before slicing against the grain.']

In [122]:
tags = soup.select('div.main-content dl.tags a')
tags
ts = []
for t in tags:
    t = re.search(r'(.+\">)(.+)(<\/d)', str(t)).group(2)
    ts.append(t)
ts

['Bon Appétit',
 'Dinner',
 'Steak',
 'Grill/Barbecue',
 'Grill',
 'Summer',
 'Italian',
 'Herb',
 'Garlic',
 'Dairy Free',
 'Wheat/Gluten-Free']

In [123]:
rating = soup.select('div.main-content span.rating')
rating
rating = re.search(r'(\">)(.+)(<\/)', str(rating)).group(2)
rating

'4/4'

In [124]:
reviews = soup.select('div.main-content div.reviews')
reviews

[<div class="reviews" data-react-checksum="-842451787" data-reactid="1" data-reactroot=""><div class="review-header" data-reactid="2"><h2 data-reactid="3">Reviews</h2></div><ul data-reactid="4"><li class="most-recent with-rating" data-reactid="5" id="5b26ba426796412fa6f9cd42"><img class="fork-rating" data-reactid="6" src="/static/img/recipe/ratings/4_forks.png"/><div class="review-text" data-reactid="7"><p data-reactid="8">I did not use porterhouse steaks. I used the sauce to baste NY strips and will definitely use the basting sauce again.
 </p><div class="review-footer" data-reactid="9"><span class="credit" data-reactid="10"><!-- react-text: 11 -->kim_in_mn<!-- /react-text --><!-- react-text: 12 --> from Chanhassen, MN<!-- /react-text --><!-- react-text: 13 --> / <!-- /react-text --></span><div class="reportproblem" data-reactid="14"><div class="flag" data-reactid="15"><a data-reactid="16">flag if inappropriate</a></div></div></div></div></li></ul></div>]

In [135]:
nulabel = soup.select('div.main-content span.nutri-label')
nulabel

[<span class="nutri-label">Calories</span>,
 <span class="nutri-label">Carbohydrates</span>,
 <span class="nutri-label">Fat</span>,
 <span class="nutri-label">Protein</span>,
 <span class="nutri-label">Saturated Fat</span>,
 <span class="nutri-label">Sodium</span>,
 <span class="nutri-label">Polyunsaturated Fat</span>,
 <span class="nutri-label">Fiber</span>,
 <span class="nutri-label">Monounsaturated Fat</span>,
 <span class="nutri-label">Cholesterol</span>]

In [137]:
nuval = soup.select('div.main-content span.nutri-data')
nuval

[<span class="nutri-data" itemprop="calories">339</span>,
 <span class="nutri-data" itemprop="carbohydrateContent">3 g(1%)</span>,
 <span class="nutri-data" itemprop="fatContent">22 g(35%)</span>,
 <span class="nutri-data" itemprop="proteinContent">31 g(61%)</span>,
 <span class="nutri-data" itemprop="saturatedFatContent">5 g(27%)</span>,
 <span class="nutri-data" itemprop="sodiumContent">776 mg(32%)</span>,
 <span class="nutri-data">2 g</span>,
 <span class="nutri-data" itemprop="fiberContent">1 g(4%)</span>,
 <span class="nutri-data">14 g</span>,
 <span class="nutri-data" itemprop="cholesterolContent">77 mg(26%)</span>]

In [138]:
len(nulabel) == len(nuval)

True

In [139]:
n = len(nulabel)
nuinfo = {}
for i in range(n):
    n = str(nulabel[i])
    n = re.search(r'(\">)(.+)(<\/)', n).group(2)
    v = str(nuval[i])
    v = re.search(r'(\">)(.+)(<\/)', v).group(2)
    nuinfo[n] = v
nuinfo

{'Calories': '339',
 'Carbohydrates': '3 g(1%)',
 'Fat': '22 g(35%)',
 'Protein': '31 g(61%)',
 'Saturated Fat': '5 g(27%)',
 'Sodium': '776 mg(32%)',
 'Polyunsaturated Fat': '2 g',
 'Fiber': '1 g(4%)',
 'Monounsaturated Fat': '14 g',
 'Cholesterol': '77 mg(26%)'}

In [143]:
title = soup.select('div.main-content div.title-source h1')
title
title = re.search(r'(\">)(.+)(<\/)', str(title)).group(2).strip()
title

'Grilled Bistecca with Herby Fish Sauce'

## wrap detail of a recipe page into a recipe class

In [1]:
class Recipe:
    # properties
    title = ''
    url = ''
    imgurl = ''
    desc = ''
    size = ''
    rating = ''
    #rawreview = ''
    ingredients = []
    steps = []
    tags = []
    nutritions = {}
    #soup
    
    # init empty recipe
    def __init__(self):
        self.title = ''
        self.url = ''
        self.imgurl = ''
        self.desc = ''
        self.size = ''
        self.rating = ''
        #self.rawreview = ''
        self.ingredients = []
        self.steps = []
        self.tags = []
        self.nutritions = {}
    
    # init recipe only with title
    """
    def __init__(self, title, imgurl):
        self.title = title
        self.imgurl = ''
        self.desc = ''
        self.size = ''
        self.rating = ''
        #self.rawreview = ''
        self.ingredients = []
        self.steps = []
        self.tags = []
        self.nutritions = {}
    """
    
    # init recipe using a recipe detial page
    def __init__(self, url):
        import requests
        from bs4 import BeautifulSoup
        import re
        self.url = url
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'lxml')
        self.title = self.build_title(soup)
        print(self.title, self.url)
        self.imgurl = self.build_imgurl(soup)
        self.desc = self.build_desc(soup)
        self.size = self.build_size(soup)
        self.rating = self.build_rating(soup)
        #self.rawreview = self.get_rawreview(page)
        self.ingredients = self.build_ingredients(soup)
        self.steps = self.build_steps(soup)
        self.tags = self.build_tags(soup)
        self.nutritions = self.build_nutritions(soup)
    
    # fuctions: get infomation from a page
    def build_title(self, soup):
        try:
            el = soup.select('div.main-content div.title-source h1')
            title = re.search(r'(\">)(.+)(<\/)', str(el)).group(2).strip()
            return title
        except:
            print("title Wrong: "+str(el))
            return ""
    
    def build_imgurl(self, soup):
        try:
            el = soup.select('div.main-content img["srcset"]')
            imgurl = re.search(r'(srcset=\")(.+)(")', str(el)).group(2)
            return imgurl
        except:
            print("img Wrong: "+str(el))
            return ""

    def build_desc(self, soup):
        try:
            el = soup.select('div.main-content div.dek')
            desc = re.search(r'(<p>)(.+)(<\/p>)', str(el)).group(2)     
            return desc
        except:
            print("desc Wrong: "+str(el))
            return ""
    
    def build_size(self, soup):
        try:
            el = soup.select('div.main-content span.per-serving')
            size = re.search(r'(\">)(.*)(<\/)',str(el)).group(2)
            return size
        except:
            #print("size Wrong: "+str(el))
            return ""
    
    
    def build_rating(self, soup):
        try:
            el = soup.select('div.main-content span.rating')
            rating = re.search(r'(\">)(.+)(<\/)', str(el)).group(2)
            return rating
        except:
            print("rating Wrong: "+str(el))
            return ""
    
    #def get_rawreview(self, page):
    #    return self.rawreview

    def build_ingredients(self, soup):
        try:
            ingredients = []
            ins = soup.select('div.main-content div.ingredients-info li.ingredient')
            for i in ins:
                ingredients.append(re.search(r'(<.+>)(.+)(<.+>)', str(i)).group(2))
            return ingredients
        except:
            print("in Wrong: "+str(ins))
            return []

    def build_steps(self, soup):
        try:
            steps = []
            ss = soup.select('div.main-content div.instructions li.preparation-step')
            for s in ss:
                steps.append(re.search(r'(.+>\s+)(.+)(\s+.+)', str(s)).group(2))
            return steps
        except:
            print("step Wrong: "+str(ss))
            return []

    def build_tags(self, soup):
        try:
            tags = []
            ts = soup.select('div.main-content dl.tags a')
            for t in ts:
                tags.append(re.search(r'(.+\">)(.+)(<\/d)', str(t)).group(2))
            return tags
        except:
            print("tags Wrong: "+str(ts))
            return []

    def build_nutritions(self, soup):
        try:
            nutritions = {}
            labels = soup.select('div.main-content span.nutri-label')
            values = soup.select('div.main-content span.nutri-data')
            for i in range(len(labels)):
                n = re.search(r'(\">)(.*)(<\/)', str(labels[i])).group(2)
                v = re.search(r'(\">)(.*)(<\/)', str(values[i])).group(2)
                nutritions[n] = v
            return nutritions
        except:
            print("nu Wrong: "+str(labels))
            return {}
    
    # to string
    def __str__(self):
        s = self.title +"\n"+ self.desc +"\n"+ self.url
        return s

    # getters
    def get_title(self):
        return self.title
    
    def get_imgurl(self):
        return self.imgurl
    
    def get_desc(self):
        return self.desc
    
    def get_size(self):
        return self.size
    
    def get_rating(self):
        return self.rating
    
    #def get_rawreview(self):
    #    return self.rawreview

    def get_ingredients(self):
        return self.ingredients
    
    def get_steps(self):
        return self.steps
    
    def get_tags(self):
        return self.tags
    
    def get_nutritions(self):
        return self.nutritions

## test the class

In [233]:
url = 'https://www.epicurious.com/recipes/food/views/grilled-bistecca-porterhouse-steaks-with-herby-fish-sauce'
r = Recipe(url)
#print(r)

Grilled Bistecca with Herby Fish Sauce https://www.epicurious.com/recipes/food/views/grilled-bistecca-porterhouse-steaks-with-herby-fish-sauce


In [234]:
r.get_nutritions()

{'Calories': '339',
 'Carbohydrates': '3 g(1%)',
 'Fat': '22 g(35%)',
 'Protein': '31 g(61%)',
 'Saturated Fat': '5 g(27%)',
 'Sodium': '776 mg(32%)',
 'Polyunsaturated Fat': '2 g',
 'Fiber': '1 g(4%)',
 'Monounsaturated Fat': '14 g',
 'Cholesterol': '77 mg(26%)'}

In [235]:
r.get_size()

'per serving (8 servings)'

## OK, now the class worked
## next step: 
1. collect all recipes in a search result page
2. write them in a json file

In [236]:
website0 = 'https://www.epicurious.com/search/?content=recipe&page='
websites = [website0+str(i) for i in range(1,1970)]

In [6]:
def get_url_title(el):
    el = str(el)
    title = re.search(r'(href.+\">)(.+)(</a>)', el).group(2)
    layout = "https://www.epicurious.com"
    url = layout + re.search(r'(href=\")(.+)(\">)', el).group(2)
    return (title, url)

def get_recipe_links(page):
    import re
    recipe_links = []
    soup = BeautifulSoup(page.text,'lxml')
    els = soup.select('article.recipe-content-card h4 a')
    return [get_url_title(el) for el in els]

In [7]:
page = requests.get("https://www.epicurious.com/search?content=recipe&page=1")
links = get_recipe_links(page)
len(links)

18

In [8]:
recipes = []
for link in links:
    #print(link)
    recipes.append(Recipe(link[1]))

Chicken Salad with Apricots, Celery, and Blue Cheese https://www.epicurious.com/recipes/food/views/chicken-salad-with-apricots-celery-and-blue-cheese
Grilled Short Ribs and Lettuces with Mustard-Orange Dressing https://www.epicurious.com/recipes/food/views/grilled-short-ribs-and-lettuces-with-mustard-orange-dressing
Grilled Lemon-Pepper Chicken https://www.epicurious.com/recipes/food/views/grilled-lemon-pepper-chicken
Instant Pot Chicken Stock https://www.epicurious.com/recipes/food/views/instant-pot-chicken-stock
Instant Pot Refried Black Beans https://www.epicurious.com/recipes/food/views/instant-pot-refried-black-beans
Instant Pot Bolognese https://www.epicurious.com/recipes/food/views/instant-pot-bolognese
Instant Pot Mushroom Risotto https://www.epicurious.com/recipes/food/views/instant-pot-mushroom-risotto
Big-Batch Black Beans https://www.epicurious.com/recipes/food/views/big-batch-black-beans
Gin Rocket https://www.epicurious.com/recipes/food/views/gin-rocket-fennel-arugula-lim

In [10]:
recipes[8].get_nutritions()

{'Calories': '285',
 'Carbohydrates': '35 g(12%)',
 'Fat': '1 g(1%)',
 'Protein': '3 g(6%)',
 'Saturated Fat': '0 g(1%)',
 'Sodium': '137 mg(6%)',
 'Polyunsaturated Fat': '0 g',
 'Fiber': '7 g(30%)',
 'Monounsaturated Fat': '0 g',
 '': ''}