In [1]:
import datetime
import json
import re

import requests
from bs4 import BeautifulSoup

# Web Crawler for Epicurious

## 1. check how many pages of recipe search results there are
####  https://www.epicurious.com/search?content=recipe&page=1
####  https://www.epicurious.com/search?content=recipe&page=1969
#### range from page 1 to 1969

In [2]:
# first page: https://www.epicurious.com/search?content=recipe&page=1 # not start from 0
# last page: https://www.epicurious.com/search?content=recipe&page=1969
website0 = 'https://www.epicurious.com/search/?content=recipe&page='
websites = [website0+str(i) for i in range(1,1970)]
print (websites[0], websites[-1])

https://www.epicurious.com/search/?content=recipe&page=1 https://www.epicurious.com/search/?content=recipe&page=1969


## 2. check the structure of every search result page

In [3]:
page = requests.get(websites[-1])
soup = BeautifulSoup(page.text,'lxml')

In [4]:
title = soup.select('article.recipe-content-card h4 a')
len(title)

17

In [5]:
title[0]

<a data-reactid="71" href="/recipes/food/views/fish-dumplings-234997">Fish Dumplings</a>

## 3. wrap crawler of search result page into a function

#### get all the links from the 1st search result page, return a list of (title, url) tuples
page = requests.get("https://www.epicurious.com/search?content=recipe&page=1 ")  
get_recipe_links(page)

In [12]:
def get_url_title(el):
    """Extract the link text and href from an anchor element.

    Args:
        el: An ``<a ... href="...">text</a>`` element, either as a string or
            as any object whose ``str()`` is that markup.

    Returns:
        A ``(title, url)`` tuple; ``url`` is returned exactly as it appears
        in the ``href`` attribute.

    Raises:
        ValueError: If ``el`` does not look like an anchor with an href.
    """
    # One search yields both groups; the pattern is greedy, so it relies on
    # href being the last attribute of the tag.
    match = re.search(r'<a.+href="(.+)">(.+)<\/a>', str(el))
    if match is None:
        raise ValueError("not an <a href=...> element: " + str(el))
    url, title = match.groups()
    return (title, url)

In [13]:
# test get_url_title(el)
el = '<a data-reactid="71" href="/recipes/food/views/fish-dumplings-234997">Fish Dumplings</a>'
get_url_title(el)

('Fish Dumplings', '/recipes/food/views/fish-dumplings-234997')

In [14]:
def get_recipe_links(page):
    """Collect the recipe links found on one search-result page.

    Args:
        page: A response object whose ``.text`` holds the page HTML
            (e.g. the result of ``requests.get``).

    Returns:
        A list with one ``get_url_title`` result per anchor matched by
        ``article.recipe-content-card h4 a``.
    """
    soup = BeautifulSoup(page.text, 'lxml')
    els = soup.select('article.recipe-content-card h4 a')
    return [get_url_title(el) for el in els]

#### There are 18 recipe links on the 1st search result page. Since new recipes keep coming in (which luckily hasn't affected the total page count so far), I'd better record a date here. But I didn't find any date information on the search result page...

In [15]:
# test get_recipe_links(page)
page = requests.get("https://www.epicurious.com/search?content=recipe&page=1")
links = get_recipe_links(page)
len(links)

18

In [16]:
links[0][1]

'/recipes/food/views/chicken-salad-with-apricots-celery-and-blue-cheese'

## 4. check the detail in a recipe page

In [76]:
page = requests.get('https://www.epicurious.com/recipes/food/views/chicken-salad-with-apricots-celery-and-blue-cheese')
soup = BeautifulSoup(page.text,'lxml')

In [77]:
main = soup.select('div.main-content')
main

[<div class="main-content" itemscope="" itemtype="https://schema.org/Recipe"> <aside class="site-pushdown-ad-wrap ad-wrapper" itemscope="" itemtype="https://schema.org/WPAdBlock" role="complementary"></aside>
 <div class="recipe-title-sidebar ">
 <div class="recipe-title-wrapper">
 <div class="title-source">
 <div class="ad-wrapper sponsor-slug">
 <aside class="sponsorSlug-ad-wrap ad-wrapper slug" itemscope="" itemtype="https://schema.org/WPAdBlock" role="complementary"></aside>
 </div>
 <h1 itemprop="name">Chicken Salad with Apricots, Celery, and Blue Cheese </h1>
 <div class="byline-source">
 <cite class="contributors" data-react-checksum="183323855" data-reactid="1" data-reactroot=""><a class="contributor" data-reactid="2" href="/contributors/anna-stockwell" itemprop="author" rel="author" title="Anna Stockwell">Anna Stockwell</a></cite>
 <span class="source" itemprop="publisher">Epicurious</span>
 <span class="pub-date">June 2018</span>
 </div>
 </div> <div class="recipe-sidebar">
 

#### there is some useful information in this page:
1. img: img["srcset"]
2. dek (could be none): div.dek
3. yield (serving size), active-time, total-time (could be none)
4. INGREDIENTS: div.ingredient
5. PREPARATION: div.instructions
6. NUTRITIONAL INFO (could be none)
7. TAGS: div.tags
8. ratings (could be a float num from 0 to 4), num of ratings, make-it-again percentage
9. reviews: div.review

In [95]:
l = {"red":1, "blue":2}
str(l)

"{'red': 1, 'blue': 2}"

### 4.1 imgurl

In [25]:
img = soup.select('div.main-content img["srcset"]')
img

[<img alt="Chicken Salad With Celery, Apricots, and Blue Cheese / Photo by Chelsea Kyle, Food Styling by Anna Stockwell" class="photo loaded" data-reactid="10" src="/static/img/px/background.png" srcset="https://assets.epicurious.com/photos/5b23ce065eea062731adf5a3/6:4/w_274%2Ch_169/Chicken-Salad-With-Celery-Apricots-and-Blue-Cheese-30052918.jpg"/>]

In [26]:
imgurl = re.search(r'srcset="(.+)"\/>', str(img)).group(1)
imgurl

'https://assets.epicurious.com/photos/5b23ce065eea062731adf5a3/6:4/w_274%2Ch_169/Chicken-Salad-With-Celery-Apricots-and-Blue-Cheese-30052918.jpg'

### 4.2 desc

In [27]:
dek = soup.select('div.main-content div.dek')
dek

[<div class="dek" data-reactid="18" itemprop="description"><p>This summer-season chicken salad is a dance between sweet, creamy, and acidic flavors and tender, crunchy, and juicy ingredients. Make it in early summer when apricots are peaking and choose a fairly nonassertive blue cheese that won’t overwhelm the mix.</p></div>]

In [28]:
dekcontent = re.search(r'<p>(.+)<\/p>', str(dek)).group(1)
dekcontent

'This summer-season chicken salad is a dance between sweet, creamy, and acidic flavors and tender, crunchy, and juicy ingredients. Make it in early summer when apricots are peaking and choose a fairly nonassertive blue cheese that won’t overwhelm the mix.'

### 4.3 yield, active time, total time

In [46]:
y = soup.select('div.main-content dd.yield')
y

[<dd class="yield" itemprop="recipeYield">4–6 servings</dd>]

In [47]:
yv = re.search(r'\">(.+)<', str(y)).group(1)
yv

'4–6 servings'

In [48]:
at = soup.select('div.main-content dd.active-time')
at

[<dd class="active-time">25 minutes</dd>]

In [49]:
atv = re.search(r'\">(.+)<', str(at)).group(1)
atv

'25 minutes'

In [51]:
tt = soup.select('div.main-content dd.total-time')
tt

[<dd class="total-time">25 minutes</dd>]

In [52]:
ttv = re.search(r'\">(.+)<', str(tt)).group(1)
ttv

'25 minutes'

### 4.4 ingredients

In [29]:
ingrs = soup.select('div.main-content div.ingredients-info li.ingredient')
ingrs

[<li class="ingredient" itemprop="ingredients">1 small rotisserie chicken, skin removed, meat shredded (about 4 cups)</li>,
 <li class="ingredient" itemprop="ingredients">1 bunch of celery, stalks separated, thinly sliced on a diagonal, leaves reserved</li>,
 <li class="ingredient" itemprop="ingredients">1 1/2 tsp. kosher salt, divided</li>,
 <li class="ingredient" itemprop="ingredients">1/2 cup buttermilk</li>,
 <li class="ingredient" itemprop="ingredients">3 Tbsp. white wine vinegar</li>,
 <li class="ingredient" itemprop="ingredients">1 tsp. honey</li>,
 <li class="ingredient" itemprop="ingredients">1 tsp. freshly ground black pepper, plus more</li>,
 <li class="ingredient" itemprop="ingredients">3 large or 4 small apricots, sliced</li>,
 <li class="ingredient" itemprop="ingredients">4 oz. mild blue cheese, sliced into shards</li>,
 <li class="ingredient" itemprop="ingredients">1 cup mint leaves</li>]

In [33]:
ingrscontent = []
for ingr in ingrs:
    ingr = str(ingr)
    ingrscontent.append(re.search(r'<li.+>(\s*.+\s*)<\/li>', ingr).group(1).strip())
ingrscontent

['1 small rotisserie chicken, skin removed, meat shredded (about 4 cups)',
 '1 bunch of celery, stalks separated, thinly sliced on a diagonal, leaves reserved',
 '1 1/2 tsp. kosher salt, divided',
 '1/2 cup buttermilk',
 '3 Tbsp. white wine vinegar',
 '1 tsp. honey',
 '1 tsp. freshly ground black pepper, plus more',
 '3 large or 4 small apricots, sliced',
 '4 oz. mild blue cheese, sliced into shards',
 '1 cup mint leaves']

### 4.5 steps

In [31]:
steps = soup.select('div.main-content div.instructions li.preparation-step')
steps

[<li class="preparation-step">                                        Toss chicken, celery stalks, and 1 tsp. salt in a large bowl. Let sit until ready to serve.
                                     </li>,
 <li class="preparation-step">                                        Whisk buttermilk, vinegar, honey, 1 tsp. pepper, and remaining 1/2 tsp. salt in a small bowl or measuring cup. Pour over celery mixture. Add apricots, blue cheese, mint, and celery leaves and toss just to coat.
                                     </li>,
 <li class="preparation-step">                                        Transfer salad to a platter and season with more pepper.
                                     </li>]

In [34]:
ss = []
for s in steps:
    s = re.search(r'<li.+>(\s*.+\s*)<\/li>', str(s)).group(1).strip()
    ss.append(s)
ss

['Toss chicken, celery stalks, and 1 tsp. salt in a large bowl. Let sit until ready to serve.',
 'Whisk buttermilk, vinegar, honey, 1 tsp. pepper, and remaining 1/2 tsp. salt in a small bowl or measuring cup. Pour over celery mixture. Add apricots, blue cheese, mint, and celery leaves and toss just to coat.',
 'Transfer salad to a platter and season with more pepper.']

### 4.6 tags

In [35]:
tags = soup.select('div.main-content dl.tags a')
tags

[<a href="/type/salad"><dt itemprop="recipeCategory">Salad</dt></a>,
 <a href="/ingredient/chicken"><dt itemprop="recipeCategory">Chicken</dt></a>,
 <a href="/ingredient/celery"><dt itemprop="recipeCategory">Celery</dt></a>,
 <a href="/ingredient/buttermilk"><dt itemprop="recipeCategory">Buttermilk</dt></a>,
 <a href="/ingredient/vinegar"><dt itemprop="recipeCategory">Vinegar</dt></a>,
 <a href="/ingredient/honey"><dt itemprop="recipeCategory">Honey</dt></a>,
 <a href="/ingredient/apricot"><dt itemprop="recipeCategory">Apricot</dt></a>,
 <a href="/ingredient/blue-cheese"><dt itemprop="recipeCategory">Blue Cheese</dt></a>,
 <a href="/ingredient/mint"><dt itemprop="recipeCategory">Mint</dt></a>,
 <a href="/meal/dinner"><dt itemprop="recipeCategory">Dinner</dt></a>,
 <a href="/occasion/summer"><dt itemprop="recipeCategory">Summer</dt></a>,
 <a href="/meal/lunch"><dt itemprop="recipeCategory">Lunch</dt></a>,
 <a href="/special-consideration/quick-and-easy"><dt itemprop="recipeCategory">Qui

In [36]:
ts = []
for t in tags:
    t = re.search(r'\".+>(.+)<\/dt>', str(t)).group(1)
    ts.append(t)
ts

['Salad',
 'Chicken',
 'Celery',
 'Buttermilk',
 'Vinegar',
 'Honey',
 'Apricot',
 'Blue Cheese',
 'Mint',
 'Dinner',
 'Summer',
 'Lunch',
 'Quick &amp; Easy',
 'Picnic',
 'Spring',
 'Wheat/Gluten-Free']

### 4.7 nutrition facts

In [37]:
nulabel = soup.select('div.main-content span.nutri-label')
nulabel

[<span class="nutri-label">Calories</span>,
 <span class="nutri-label">Carbohydrates</span>,
 <span class="nutri-label">Fat</span>,
 <span class="nutri-label">Protein</span>,
 <span class="nutri-label">Saturated Fat</span>,
 <span class="nutri-label">Sodium</span>,
 <span class="nutri-label">Polyunsaturated Fat</span>,
 <span class="nutri-label">Fiber</span>,
 <span class="nutri-label">Monounsaturated Fat</span>,
 <span class="nutri-label">Cholesterol</span>]

In [38]:
nuval = soup.select('div.main-content span.nutri-data')
nuval

[<span class="nutri-data" itemprop="calories">301</span>,
 <span class="nutri-data" itemprop="carbohydrateContent">9 g(3%)</span>,
 <span class="nutri-data" itemprop="fatContent">19 g(30%)</span>,
 <span class="nutri-data" itemprop="proteinContent">22 g(43%)</span>,
 <span class="nutri-data" itemprop="saturatedFatContent">9 g(43%)</span>,
 <span class="nutri-data" itemprop="sodiumContent">605 mg(25%)</span>,
 <span class="nutri-data">3 g</span>,
 <span class="nutri-data" itemprop="fiberContent">3 g(11%)</span>,
 <span class="nutri-data">7 g</span>,
 <span class="nutri-data" itemprop="cholesterolContent">76 mg(25%)</span>]

In [39]:
# Build {nutrient label: value text} by pairing the label and value spans
# positionally. zip() stops at the shorter list, and distinct names are used
# for the label and the value so no counter variable gets overwritten.
nuinfo = {}
for label_el, value_el in zip(nulabel, nuval):
    label = re.search(r'\">(.+)<\/', str(label_el)).group(1)
    value = re.search(r'\">(.+)<\/', str(value_el)).group(1)
    nuinfo[label] = value
nuinfo

{'Calories': '301',
 'Carbohydrates': '9 g(3%)',
 'Fat': '19 g(30%)',
 'Protein': '22 g(43%)',
 'Saturated Fat': '9 g(43%)',
 'Sodium': '605 mg(25%)',
 'Polyunsaturated Fat': '3 g',
 'Fiber': '3 g(11%)',
 'Monounsaturated Fat': '7 g',
 'Cholesterol': '76 mg(25%)'}

In [100]:
v = '9 g(3%)'
float(v.split(" ")[0])

9.0

In [44]:
serving = soup.select('div.main-content span.per-serving')
serving

[<span class="per-serving">per serving (4 servings)</span>]

In [45]:
# Search only inside the element text (after the tag's '>') and let (\d+)
# take the whole number; a greedy leading '.+' would leave just the last
# digit, turning "12 servings" into "2".
serving = re.search(r'>[^<>]*?(\d+)', str(serving)).group(1)
serving

'4'

### 4.8 ratings

In [67]:
rating = soup.select('div.main-content span.rating')
rating

[<span class="rating">3.5/4</span>]

In [68]:
rating = re.search(r'\">(.+)\/\d', str(rating)).group(1)
rating

'3.5'

In [69]:
rc = soup.select('div.main-content span.reviews-count')
rc

[<span class="reviews-count" itemprop="reviewCount">5</span>]

In [70]:
rcv = re.search(r'(\d+)', str(rc)).group(1)
rcv

'5'

In [71]:
ma = soup.select('div.main-content div.prepare-again-rating span')
ma

[<span>100%</span>]

In [72]:
mav = re.search(r'(\d+)', str(ma)).group(1)
mav

'100'

### 4.9 first several reviews

In [87]:
reviews = soup.select('div.main-content div.reviews li div.review-text p')
reviews

[<p data-reactid="8">Sounds delicious.  Four of my favorite flavors, all in one dish.  The big problem for me would be finding decent apricots... it seems that those that turn up in my suburban markets are mostly woolly and flavorless.   I wonder... how about subbing in some rehydrated DRIED apricots ??</p>,
 <p data-reactid="20">Sounds great but instead of rotisserie chicken, I would use roasted chicken.  Not as soft and moist as rotisserie chicken.  I think it will hold up a little better. I never thought of using fresh apricots but I will make it and try it.  We grow our own apricots and they are almost ready.  Thank you for your recipe.</p>,
 <p data-reactid="32">what an enticing combination of flavors and textures!!  as a 33 year vegetarian I will be enjoying this with white beans or chickpeas, vs chicken, and fully expect it to be delightful..thank you  
 
 </p>,
 <p data-reactid="44">This is going into permanent rotation! Who knew that chicken salad was crying out for the beauti

In [90]:
# Pull the text out of every review paragraph. Paragraphs the pattern cannot
# parse are printed for inspection instead of being hidden by a bare except.
reviewc = []
for r in reviews:
    match = re.search(r'\">(.+[\s\S]*)<', str(r))
    if match is None:
        print(r)
    else:
        reviewc.append(match.group(1).strip())
reviewc

['Sounds delicious.  Four of my favorite flavors, all in one dish.  The big problem for me would be finding decent apricots... it seems that those that turn up in my suburban markets are mostly woolly and flavorless.   I wonder... how about subbing in some rehydrated DRIED apricots ??',
 'Sounds great but instead of rotisserie chicken, I would use roasted chicken.  Not as soft and moist as rotisserie chicken.  I think it will hold up a little better. I never thought of using fresh apricots but I will make it and try it.  We grow our own apricots and they are almost ready.  Thank you for your recipe.',
 'what an enticing combination of flavors and textures!!  as a 33 year vegetarian I will be enjoying this with white beans or chickpeas, vs chicken, and fully expect it to be delightful..thank you',
 "This is going into permanent rotation! Who knew that chicken salad was crying out for the beautiful tang of blue cheese? The cheese adds a nice salty point against the sweetness of the apric

### 4.10 title

In [91]:
title = soup.select('div.main-content div.title-source h1')
title

[<h1 itemprop="name">Chicken Salad with Apricots, Celery, and Blue Cheese </h1>]

In [92]:
title = re.search(r'\">(.+)<', str(title)).group(1).strip()
title

'Chicken Salad with Apricots, Celery, and Blue Cheese'

## 5. wrap detail of a recipe page into a recipe class

#### write the class in src/Recipe.py, use "from src import Recipe"

## require requests, BeautifulSoup, re, json
class Recipe:
    """One recipe scraped from a recipe detail page.

    Attributes (all plain, JSON-serialisable values):
        title, url, imgurl, desc: strings ('' when unavailable).
        ingredients, steps, tags, reviews: lists of strings.
        nutritions: dict mapping nutrient label to a float amount.
        serving, rating, reviewcount, makeagain: numbers (0 when unavailable).
        yiel, activetime, totaltime: strings ('' when unavailable).
    """

    # ------------------------------------------------------------ construction
    def __init__(self, url=None):
        """Create an empty recipe, or scrape one from ``url``.

        Args:
            url: Address of a recipe detail page. When omitted, every field
                keeps its empty default and no network request is made.
        """
        # Defaults are assigned first, in a fixed order, so that __dict__
        # always lists the fields in this order (title, url, imgurl, ...).
        self.title = ''
        self.url = ''
        self.imgurl = ''
        self.desc = ''

        self.ingredients = []
        self.steps = []
        self.tags = []
        self.nutritions = {}
        self.serving = 0

        self.yiel = ''
        self.activetime = ''
        self.totaltime = ''

        self.rating = 0
        self.reviewcount = 0
        self.makeagain = 0
        self.reviews = []

        if url is None:
            return

        # Set the url before parsing: the build_* error messages read it.
        self.url = url
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'lxml')

        self.title = self.build_title(soup)
        self.imgurl = self.build_imgurl(soup)
        self.desc = self.build_desc(soup)

        self.ingredients = self.build_ingredients(soup)
        self.steps = self.build_steps(soup)
        self.tags = self.build_tags(soup)
        self.nutritions = self.build_nutritions(soup)
        self.serving = self.build_serving(soup)

        self.yiel = self.build_yield(soup)
        self.activetime = self.build_activetime(soup)
        self.totaltime = self.build_totaltime(soup)

        self.rating = self.build_rating(soup)
        self.reviewcount = self.build_reviewcount(soup)
        self.makeagain = self.build_makeagain(soup)
        self.reviews = self.build_reviews(soup)

    # ------------------------------------------- functions: parse a recipe page
    # Every build_* method takes the parsed page (an object with .select())
    # and returns the extracted value, or an empty default when the page does
    # not contain it in the expected form.

    def build_title(self, soup):
        """Return the recipe title, or '' (with a printed notice) on failure."""
        try:
            el = soup.select('div.main-content div.title-source h1')
            title = re.search(r'\">(.+)<', str(el)).group(1).strip()
            return title
        except Exception:
            print("Title Wrong: "+self.url)
            return ''

    def build_imgurl(self, soup):
        """Return the image's srcset value, or '' (with a notice) on failure."""
        try:
            el = soup.select('div.main-content img["srcset"]')
            imgurl = re.search(r'srcset="(.+)"\/>', str(el)).group(1)
            return imgurl
        except Exception:
            print("IMG Wrong: "+self.url)
            return ''

    def build_desc(self, soup):
        """Return the description paragraph, or '' when there is none."""
        try:
            el = soup.select('div.main-content div.dek')
            desc = re.search(r'<p>(.+)<\/p>', str(el)).group(1).strip()
            return desc
        except Exception:
            # No notice is printed here: a missing description is treated as
            # an empty field.
            return ''

    def build_ingredients(self, soup):
        """Return the ingredient lines, or [] (with a notice) on failure."""
        try:
            ingredients = []
            els = soup.select('div.main-content div.ingredients-info li.ingredient')
            for el in els:
                ingredients.append(re.search(r'<li.+>(\s*.+\s*)<\/li>', str(el)).group(1).strip())
            return ingredients
        except Exception:
            print("Ingredients Wrong: "+self.url)
            return []

    def build_steps(self, soup):
        """Return the preparation steps, or [] (with a notice) on failure."""
        try:
            steps = []
            els = soup.select('div.main-content div.instructions li.preparation-step')
            for el in els:
                steps.append(re.search(r'<li.+>(\s*.+\s*)<\/li>', str(el)).group(1).strip())
            return steps
        except Exception:
            print("Steps Wrong: "+self.url)
            return []

    def build_tags(self, soup):
        """Return the tag names, or [] (with a notice) on failure."""
        try:
            tags = []
            els = soup.select('div.main-content dl.tags a')
            for el in els:
                tags.append(re.search(r'\".+>(.+)<\/dt>', str(el)).group(1))
            return tags
        except Exception:
            print("Tags Wrong: "+self.url)
            return []

    def build_nutritions(self, soup):
        """Return {label: amount} for the nutrition table, or {} on failure.

        The amount is the leading number of the value text, as a float
        (the text before the first space, e.g. '9 g(3%)' gives 9.0).
        """
        try:
            nutritions = {}
            labels = soup.select('div.main-content span.nutri-label')
            values = soup.select('div.main-content span.nutri-data')
            for i in range(len(labels)):
                label = re.search(r'\">(.+)<\/', str(labels[i])).group(1)
                amount = re.search(r'\">(.+)<\/', str(values[i])).group(1)
                nutritions[label] = float(amount.split(' ')[0])
            return nutritions
        except Exception:
            # No notice is printed here: missing nutrition data is treated as
            # an empty field.
            return {}

    def build_serving(self, soup):
        """Return the serving count from the per-serving note, or 0."""
        try:
            el = soup.select('div.main-content span.per-serving')
            # Search only inside the element text and let (\d+) take the whole
            # number; a greedy leading '.+' would keep just the last digit.
            serving = re.search(r'>[^<>]*?(\d+)', str(el)).group(1)
            return float(serving)
        except Exception:
            return 0

    def build_yield(self, soup):
        """Return the yield text, or '' when absent."""
        try:
            el = soup.select('div.main-content dd.yield')
            y = re.search(r'\">(.+)<', str(el)).group(1)
            return y
        except Exception:
            return ''

    def build_activetime(self, soup):
        """Return the active-time text, or '' when absent."""
        try:
            el = soup.select('div.main-content dd.active-time')
            active = re.search(r'\">(.+)<', str(el)).group(1)
            return active
        except Exception:
            return ''

    def build_totaltime(self, soup):
        """Return the total-time text, or '' when absent."""
        try:
            el = soup.select('div.main-content dd.total-time')
            total = re.search(r'\">(.+)<', str(el)).group(1)
            return total
        except Exception:
            return ''

    def build_rating(self, soup):
        """Return the rating (the number before the '/'), or 0 on failure."""
        try:
            el = soup.select('div.main-content span.rating')
            rating = re.search(r'\">(.+)\/\d', str(el)).group(1)
            return float(rating)
        except Exception:
            print("Rating Wrong: "+self.url)
            return 0

    def build_reviewcount(self, soup):
        """Return the number of reviews as a float, or 0 on failure."""
        try:
            el = soup.select('div.main-content span.reviews-count')
            reviewcount = re.search(r'(\d+)', str(el)).group(1)
            return float(reviewcount)
        except Exception:
            print("Reviewcount Wrong: "+self.url)
            return 0

    def build_makeagain(self, soup):
        """Return the make-again percentage as a float, or 0 on failure."""
        try:
            el = soup.select('div.main-content div.prepare-again-rating span')
            makeagain = re.search(r'(\d+)', str(el)).group(1)
            return float(makeagain)
        except Exception:
            print("MakeagainRating Wrong: "+self.url)
            return 0

    def build_reviews(self, soup):
        """Return the review texts found on the page, or [] on failure."""
        try:
            reviews = []
            els = soup.select('div.main-content div.reviews li div.review-text p')
            for el in els:
                # Parse the loop element itself (not an unrelated name).
                review = re.search(r'\">(.+[\s\S]*)<', str(el)).group(1).strip()
                reviews.append(review)
            return reviews
        except Exception:
            return []

    # ------------------------------------------------------------ serialisation
    def __str__(self):
        """Return a multi-line, human-readable dump of every field."""
        string = (
            "\nTitle: " + self.title
            + "\nURL: " + self.url
            + "\nIMG URL: " + self.imgurl
            + "\nDesc: " + self.desc
            + "\nIngredients: " + str(self.ingredients)
            + "\nSteps: " + str(self.steps)
            + "\nTags: " + str(self.tags)
            + "\nNutritions: " + str(self.nutritions)
            + "\nServing: " + str(self.serving)
            + "\nYield: " + self.yiel
            + "\nActive time: " + self.activetime
            + "\nTotal time: " + self.totaltime
            + "\nRating: " + str(self.rating)
            + "\nReview count: " + str(self.reviewcount)
            + "\nMake again: " + str(self.makeagain)
            + "\nReviews: " + str(self.reviews)
        )
        return string

    def to_json(self):
        """Return the recipe's fields as a JSON object string."""
        return json.dumps(self.__dict__)

    # ------------------------------------------------------------------ getters
    def get_title(self):
        return self.title

    def get_url(self):
        return self.url

    def get_imgurl(self):
        return self.imgurl

    def get_desc(self):
        return self.desc

    def get_ingredients(self):
        return self.ingredients

    def get_steps(self):
        return self.steps

    def get_tags(self):
        return self.tags

    def get_nutritions(self):
        return self.nutritions

    def get_serving(self):
        return self.serving

    def get_yield(self):
        return self.yiel

    def get_activetime(self):
        return self.activetime

    def get_totaltime(self):
        return self.totaltime

    def get_rating(self):
        return self.rating

    def get_reviewcount(self):
        return self.reviewcount

    def get_makeagain(self):
        return self.makeagain

    def get_reviews(self):
        return self.reviews

## test the class

In [121]:
url = 'https://www.epicurious.com/recipes/food/views/grilled-bistecca-porterhouse-steaks-with-herby-fish-sauce'
r = Recipe(url)
print(r)


Title: Grilled Bistecca with Herby Fish Sauce
URL: https://www.epicurious.com/recipes/food/views/grilled-bistecca-porterhouse-steaks-with-herby-fish-sauce
IMG URL: https://assets.epicurious.com/photos/5ad7863b59e81a5d899942ba/6:4/w_274%2Ch_169/grilled-bistecca-steak-with-herby-fish-sauce-recipe-BA-041818.jpg
Desc: Why baste your steak with fish sauce? It's called umami. Get into it.
Ingredients: ['2 (1 1/2"–2"-thick) porterhouse steaks (about 6 lb. total)', '1/2 cup extra-virgin olive oil, divided', 'Kosher salt', '3 Tbsp. freshly ground black pepper', '2 garlic cloves, finely grated', '1/4 cup fish sauce', '2 Tbsp. coarsely chopped marjoram, rosemary, and/or thyme']
Steps: ['Rub steak with 1/4 cup oil; season with salt and sprinkle with pepper (it should nearly cover both sides). Let sit at room temperature 30 minutes.', 'Mix garlic, fish sauce, herbs, and remaining 1/4 cup oil in a small bowl.', 'Prepare a grill for medium-high indirect heat (for a gas grill, leave one or two burner

In [122]:
r.get_nutritions()

{'Calories': 339.0,
 'Carbohydrates': 3.0,
 'Fat': 22.0,
 'Protein': 31.0,
 'Saturated Fat': 5.0,
 'Sodium': 776.0,
 'Polyunsaturated Fat': 2.0,
 'Fiber': 1.0,
 'Monounsaturated Fat': 14.0,
 'Cholesterol': 77.0}

In [123]:
r.get_serving()

8.0

In [124]:
r.to_json()

'{"title": "Grilled Bistecca with Herby Fish Sauce", "url": "https://www.epicurious.com/recipes/food/views/grilled-bistecca-porterhouse-steaks-with-herby-fish-sauce", "imgurl": "https://assets.epicurious.com/photos/5ad7863b59e81a5d899942ba/6:4/w_274%2Ch_169/grilled-bistecca-steak-with-herby-fish-sauce-recipe-BA-041818.jpg", "desc": "Why baste your steak with fish sauce? It\'s called umami. Get into it.", "ingredients": ["2 (1 1/2\\"\\u20132\\"-thick) porterhouse steaks (about 6 lb. total)", "1/2 cup extra-virgin olive oil, divided", "Kosher salt", "3 Tbsp. freshly ground black pepper", "2 garlic cloves, finely grated", "1/4 cup fish sauce", "2 Tbsp. coarsely chopped marjoram, rosemary, and/or thyme"], "steps": ["Rub steak with 1/4 cup oil; season with salt and sprinkle with pepper (it should nearly cover both sides). Let sit at room temperature 30 minutes.", "Mix garlic, fish sauce, herbs, and remaining 1/4 cup oil in a small bowl.", "Prepare a grill for medium-high indirect heat (for 

In [126]:
r2 = Recipe("https://www.epicurious.com/recipes/food/views/pasta-with-15-minute-burst-cherry-tomato-sauce-56390060")
print(r2)


Title: Pasta with 15-Minute Burst Cherry Tomato Sauce
URL: https://www.epicurious.com/recipes/food/views/pasta-with-15-minute-burst-cherry-tomato-sauce-56390060
IMG URL: https://assets.epicurious.com/photos/55f72d733c346243461d496e/6:4/w_274%2Ch_169/09112015_15minute_pastasauce_tomato.jpg
Desc: Juicy sweet cherry tomatoes burst open in warm olive oil, creating a luxuriously silky sauce that comes together in minutes.
Ingredients: ['1 pound pasta', 'Kosher salt', '1/2 cup olive oil', '2 large garlic cloves, finely chopped', '3 pints cherry tomatoes', '1/2 teaspoon freshly ground black pepper', 'Pinch of sugar', '1 cup coarsely chopped fresh basil', 'Freshly grated Parmesan (for serving)']
Steps: ['Cook pasta in a large pot of boiling salted water, stirring occasionally, until al dente; drain and transfer to a large bowl.', 'Meanwhile, heat oil in a 12" skillet or wide heavy saucepan over medium-high. Add garlic, then tomatoes, pepper, sugar, and 1 tsp. salt. Cook, stirring occasionally

In [146]:
with open('test.json', 'w') as fp:
    json.dump(r.__dict__, fp)
    fp.write("\n")

In [147]:
with open('test.json', 'a') as fp:
    json.dump(r2.__dict__, fp)
    fp.write("\n")

## 6. Crawl all the recipes in one search result page and write them into a JSON file

In [148]:
website0 = 'https://www.epicurious.com/search/?content=recipe&page='
websites = [website0+str(i) for i in range(1,1970)]

In [149]:
def get_url_title(el):
    """Return ``(title, absolute_url)`` for one recipe anchor element.

    The href found in the element is appended to the site root, so the
    returned url can be requested directly.
    """
    markup = str(el)
    text_match = re.search(r'(href.+\">)(.+)(</a>)', markup)
    link_text = text_match.group(2)
    href_match = re.search(r'(href=\")(.+)(\">)', markup)
    absolute_url = "https://www.epicurious.com" + href_match.group(2)
    return (link_text, absolute_url)

def get_recipe_links(page):
    """Collect the recipe links found on one search-result page.

    Args:
        page: A response object whose ``.text`` holds the page HTML
            (e.g. the result of ``requests.get``).

    Returns:
        A list with one ``get_url_title`` result per anchor matched by
        ``article.recipe-content-card h4 a``.
    """
    soup = BeautifulSoup(page.text, 'lxml')
    els = soup.select('article.recipe-content-card h4 a')
    return [get_url_title(el) for el in els]

In [150]:
page = requests.get("https://www.epicurious.com/search?content=recipe&page=1")
links = get_recipe_links(page)
len(links)

18

In [152]:
recipes = []
for link in links:
    recipes.append(Recipe(link[1]))

In [153]:
len(recipes)

18

In [154]:
print(recipes[10])


Title: Thai-Style Squid and Cucumber Salad
URL: https://www.epicurious.com/recipes/food/views/thai-style-squid-and-cucumber-salad
IMG URL: https://assets.epicurious.com/photos/5b19aa4a18f10d6be39984cc/6:4/w_274%2Ch_169/Thai-Style-Squid-and-Cucumber-Salad-recipe-30052918.jpg
Desc: The secret to cooking squid so it’s tender, not tough, is to cook it very quickly in small batches. Tossed with crunchy cucumbers, peanuts, and fresh red chiles, it makes a refreshing summer dinner spooned over rice or on its own.
Ingredients: ['1/4 cup fresh lime juice (from about 3 limes)', '1 garlic clove, finely grated', '1 Tbsp. fish sauce', '1 1/2 tsp. dark brown sugar', '1/4 cup plus 1 Tbsp. (or more) vegetable oil', '2 tsp. kosher salt, divided', '4 mini seedless or Persian cucumbers, halved lengthwise, sliced diagonally into 1/4"-thick slices', '1–2 Fresno chiles, seeded, thinly sliced', '1/2 cup salted, roasted peanuts', '1 1/2 lb. cleaned squid, patted dry', '1/4 cup cilantro sprigs', 'Steamed whit

In [159]:
with open('data/test.json', 'a') as fp:
    for r in recipes:
        json.dump(r.__dict__, fp)
        fp.write("\n")   

## 7. get all the links from all search result pages

## 8. get all the recipes from the links and write them into the json file