In [33]:
import requests
from bs4 import BeautifulSoup
import re
import dill
import json
import datetime

# Web Crawler for Epicurious

## 1. Get the links of all recipes and write them to "data/epi-links.pkd"

In [2]:
# One search-results URL per page number, covering pages 1 through 1969 inclusive.
search_result_pages = list(map(
    lambda page_number: "https://www.epicurious.com/search/?content=recipe&page=" + str(page_number),
    range(1, 1970),
))

In [3]:
def get_url_title(el):
    try:
        url = re.search(r'<a.+href="(.+)">(.+[\s\S]+)<\/a>', str(el)).group(1)
        title = re.search(r'<a.+href="(.+)">(.+[\s\S]+)<\/a>', str(el)).group(2).strip()
        #text = requests.get(""+url).text
        return (title, url)
    except:
        print(el)
        return None

def get_recipe_links(page):
    """Parse one search-results response into a list of recipe links.

    Args:
        page: Response-like object whose ``.text`` attribute holds the HTML
            of a search-results page.

    Returns:
        A list with one ``get_url_title`` result per anchor matched by the
        recipe-card selector. An entry is ``None`` wherever that helper
        could not parse the anchor.
    """
    soup = BeautifulSoup(page.text, 'lxml')
    anchors = soup.select('article.recipe-content-card h4 a')
    return [get_url_title(anchor) for anchor in anchors]

In [9]:
from requests_futures.sessions import FuturesSession

session = FuturesSession(max_workers=5)

# Submit every search-page request up front so the session's workers can
# fetch them concurrently; results are then consumed in submission order.
pending_pages = [session.get(page_url) for page_url in search_result_pages]

# Flatten the per-page link lists into a single list.
links = [entry
         for pending in pending_pages
         for entry in get_recipe_links(pending.result())]

In [10]:
#len(links)

35441

In [14]:
#links[0]

('Chicken Salad with Apricots, Celery, and Blue Cheese',
 '/recipes/food/views/chicken-salad-with-apricots-celery-and-blue-cheese')

In [15]:
#dill.dump(links, open('../data/epi-links.pkd', 'wb'))

In [34]:
# Reload the saved (title, path) links from disk.
# NOTE: dill.load can execute arbitrary code embedded in the file, so only
# load files produced by this notebook or another trusted source.
with open('../data/epi-links.pkd', 'rb') as links_file:
    link_list = dill.load(links_file)
len(link_list)

35441

In [35]:
# Display the last loaded entry as a quick sanity check of the loaded data.
link_list[-1]

('Portobello Mushroom Salad à la Grecque',
 '/recipes/food/views/portobello-mushroom-salad-a-la-grecque-recipe')

## 2. Get all the recipes and write them to "data/epi-recipes.pkd" and "data/epi-recipes-json.json"

In [36]:
## require requests, BeautifulSoup, re, json
class Recipe:
    """One Epicurious recipe, scraped from its detail page.

    ``Recipe()`` creates an empty recipe whose fields hold their defaults.
    ``Recipe(path)`` downloads ``"https://www.epicurious.com" + path`` and
    fills every field through the ``build_*`` methods.

    Every ``build_*`` method is best-effort: it takes the parsed page
    (``soup``), and when the expected markup is missing or malformed it
    returns an empty default (``''``, ``[]``, ``{}`` or ``0``) instead of
    raising. Some of them additionally print a ``"... Wrong: <url>"`` line.
    """

    # ------------------------------------------------------------ construction
    def __init__(self, path=None):
        """Create an empty recipe, or scrape one from a recipe detail page.

        Args:
            path: Site-relative path of the recipe page (for example
                ``'/recipes/food/views/...'``). When ``None`` no request is
                made and all fields keep their empty defaults.

        Raises:
            Whatever ``requests.get`` raises when the page cannot be fetched.
        """
        # Defaults. ``url`` is assigned first so that ``__dict__`` keeps the
        # same key order as a scraped recipe had before the two constructors
        # were merged.
        self.url = ''
        self.title = ''
        self.imgurl = ''
        self.desc = ''

        self.ingredients = []
        self.steps = []
        self.tags = []
        self.nutritions = {}
        self.serving = 0

        self.yiel = ''
        self.activetime = ''
        self.totaltime = ''

        self.rating = 0
        self.reviewcount = 0
        self.makeagain = 0
        self.reviews = []

        if path is None:
            return

        # The url must be set before any build_* call: their failure
        # messages include it.
        self.url = "https://www.epicurious.com" + path
        page = requests.get(self.url)
        soup = BeautifulSoup(page.text, 'lxml')

        self.title = self.build_title(soup)
        self.imgurl = self.build_imgurl(soup)
        self.desc = self.build_desc(soup)

        self.ingredients = self.build_ingredients(soup)
        self.steps = self.build_steps(soup)
        self.tags = self.build_tags(soup)
        self.nutritions = self.build_nutritions(soup)
        self.serving = self.build_serving(soup)

        self.yiel = self.build_yield(soup)
        self.activetime = self.build_activetime(soup)
        self.totaltime = self.build_totaltime(soup)

        self.rating = self.build_rating(soup)
        self.reviewcount = self.build_reviewcount(soup)
        self.makeagain = self.build_makeagain(soup)
        self.reviews = self.build_reviews(soup)

    # ------------------------------------- functions: get information from a page
    # ``except Exception`` (rather than a bare ``except``) keeps the
    # best-effort behaviour while still letting Ctrl-C interrupt a long crawl.

    def build_title(self, soup):
        """Return the recipe title, or ``''`` (with a printed notice) on failure."""
        try:
            el = soup.select('div.main-content div.title-source h1')
            title = re.search(r'\">(.+)<', str(el)).group(1).strip()
            return title
        except Exception:
            print("Title Wrong: "+self.url)
            return ''

    def build_imgurl(self, soup):
        """Return the image ``srcset`` value, or ``''`` on failure."""
        try:
            # NOTE(review): the selector quotes the attribute name
            # (img["srcset"]); confirm the CSS engine in use accepts this.
            el = soup.select('div.main-content img["srcset"]')
            imgurl = re.search(r'srcset="(.+)"\/>', str(el)).group(1)
            return imgurl
        except Exception:
            #print("IMG Wrong: "+self.url)
            return ''

    def build_desc(self, soup):
        """Return the text of the description paragraph, or ``''`` on failure."""
        try:
            el = soup.select('div.main-content div.dek')
            desc = re.search(r'<p>(.+)<\/p>', str(el)).group(1).strip()
            return desc
        except Exception:
            #print("Desc Wrong: "+self.url)
            return ''

    def build_ingredients(self, soup):
        """Return the list of ingredient strings.

        If any single ingredient fails to parse, a notice is printed and the
        whole result is ``[]``.
        """
        try:
            ingredients = []
            els = soup.select('div.main-content div.ingredients-info li.ingredient')
            for el in els:
                ingredients.append(re.search(r'<li.+>(\s*.+\s*)<\/li>', str(el)).group(1).strip())
            return ingredients
        except Exception:
            print("Ingredients Wrong: "+self.url)
            return []

    def build_steps(self, soup):
        """Return the list of preparation-step strings.

        If any single step fails to parse, a notice is printed and the whole
        result is ``[]``.
        """
        try:
            steps = []
            els = soup.select('div.main-content div.instructions li.preparation-step')
            for el in els:
                steps.append(re.search(r'<li.+>(\s*.+\s*)<\/li>', str(el)).group(1).strip())
            return steps
        except Exception:
            print("Steps Wrong: "+self.url)
            return []

    def build_tags(self, soup):
        """Return the list of tag strings, or ``[]`` (with a printed notice) on failure."""
        try:
            tags = []
            els = soup.select('div.main-content dl.tags a')
            for el in els:
                tags.append(re.search(r'\".+>(.+)<\/dt>', str(el)).group(1))
            return tags
        except Exception:
            print("Tags Wrong: "+self.url)
            return []

    def build_nutritions(self, soup):
        """Return a ``{label: value}`` dict of nutrition facts, or ``{}`` on failure.

        Each value is the float parsed from the text before the first space
        of the corresponding data span.
        """
        try:
            nutritions = {}
            labels = soup.select('div.main-content span.nutri-label')
            values = soup.select('div.main-content span.nutri-data')
            # Labels and values are paired by position; a missing value makes
            # the whole result {} rather than a partial dict.
            for i in range(len(labels)):
                n = re.search(r'\">(.+)<\/', str(labels[i])).group(1)
                v = re.search(r'\">(.+)<\/', str(values[i])).group(1)
                v = float(v.split(' ')[0])
                nutritions[n] = v
            return nutritions
        except Exception:
            #print("Nutritions Wrong: "+self.url)
            return {}

    def build_serving(self, soup):
        """Return the number of servings as a float, or ``0`` on failure.

        The last run of digits in the per-serving element is used. The
        previous greedy pattern ``.+(\\d+).+`` captured only the final digit
        of that number (``12`` was read as ``2``).
        """
        try:
            el = soup.select('div.main-content span.per-serving')
            serving = re.findall(r'\d+', str(el))[-1]
            return float(serving)
        except Exception:
            #print("Serving Wrong: "+self.url)
            return 0

    def build_yield(self, soup):
        """Return the yield text, or ``''`` on failure."""
        try:
            el = soup.select('div.main-content dd.yield')
            y = re.search(r'\">(.+)<', str(el)).group(1)
            return y
        except Exception:
            #print("Yield Wrong: "+self.url)
            return ''

    def build_activetime(self, soup):
        """Return the active-time text, or ``''`` on failure."""
        try:
            el = soup.select('div.main-content dd.active-time')
            active = re.search(r'\">(.+)<', str(el)).group(1)
            return active
        except Exception:
            #print("Activetime Wrong: "+self.url)
            return ''

    def build_totaltime(self, soup):
        """Return the total-time text, or ``''`` on failure."""
        try:
            el = soup.select('div.main-content dd.total-time')
            total = re.search(r'\">(.+)<', str(el)).group(1)
            return total
        except Exception:
            #print("Totaltime Wrong: "+self.url)
            return ''

    def build_rating(self, soup):
        """Return the rating as a float, or ``0`` (with a printed notice) on failure."""
        try:
            el = soup.select('div.main-content span.rating')
            rating = re.search(r'\">(.+)\/\d', str(el)).group(1)
            return float(rating)
        except Exception:
            print("Rating Wrong: "+self.url)
            return 0

    def build_reviewcount(self, soup):
        """Return the review count as a float, or ``0`` (with a printed notice) on failure."""
        try:
            el = soup.select('div.main-content span.reviews-count')
            reviewcount = re.search(r'(\d+)', str(el)).group(1)
            return float(reviewcount)
        except Exception:
            print("Reviewcount Wrong: "+self.url)
            return 0

    def build_makeagain(self, soup):
        """Return the "make again" figure as a float, or ``0`` (with a printed notice) on failure."""
        try:
            el = soup.select('div.main-content div.prepare-again-rating span')
            makeagain = re.search(r'(\d+)', str(el)).group(1)
            return float(makeagain)
        except Exception:
            print("MakeagainRating Wrong: "+self.url)
            return 0

    def build_reviews(self, soup):
        """Return the list of review texts, or ``[]`` on failure.

        If any single review fails to parse, the whole result is ``[]``.
        """
        try:
            reviews = []
            els = soup.select('div.main-content div.reviews li div.review-text p')
            for el in els:
                # Parse the loop element itself. The earlier code referenced
                # an undefined name here, so the error handler always
                # returned [] whenever a page had reviews.
                review = re.search(r'\">(.+[\s\S]*)<', str(el)).group(1).strip()
                reviews.append(review)
            return reviews
        except Exception:
            #print("Reviews Wrong: "+self.url)
            return []

    # ---------------------------------------------------------------- to string
    def __str__(self):
        """Return a multi-line, human-readable dump of every field."""
        string = (
            "\nTitle: " + self.title
            + "\nURL: " + self.url
            + "\nIMG URL: " + self.imgurl
            + "\nDesc: " + self.desc
            + "\nIngredients: " + str(self.ingredients)
            + "\nSteps: " + str(self.steps)
            + "\nTags: " + str(self.tags)
            + "\nNutritions: " + str(self.nutritions)
            + "\nServing: " + str(self.serving)
            + "\nYield: " + self.yiel
            + "\nActive time: " + self.activetime
            + "\nTotal time: " + self.totaltime
            + "\nRating: " + str(self.rating)
            + "\nReview count: " + str(self.reviewcount)
            + "\nMake again: " + str(self.makeagain)
            + "\nReviews: " + str(self.reviews)
        )
        return string

    # ------------------------------------------------------------------ getters
    def get_title(self):
        """Return the recipe title."""
        return self.title

    def get_url(self):
        """Return the full URL of the recipe page."""
        return self.url

    def get_imgurl(self):
        """Return the image ``srcset`` value."""
        return self.imgurl

    def get_desc(self):
        """Return the recipe description."""
        return self.desc

    def get_ingredients(self):
        """Return the list of ingredients."""
        return self.ingredients

    def get_steps(self):
        """Return the list of preparation steps."""
        return self.steps

    def get_tags(self):
        """Return the list of tags."""
        return self.tags

    def get_nutritions(self):
        """Return the nutrition ``{label: value}`` dict."""
        return self.nutritions

    def get_serving(self):
        """Return the number of servings."""
        return self.serving

    def get_yield(self):
        """Return the yield text."""
        return self.yiel

    def get_activetime(self):
        """Return the active-time text."""
        return self.activetime

    def get_totaltime(self):
        """Return the total-time text."""
        return self.totaltime

    def get_rating(self):
        """Return the rating."""
        return self.rating

    def get_reviewcount(self):
        """Return the review count."""
        return self.reviewcount

    def get_makeagain(self):
        """Return the "make again" figure."""
        return self.makeagain

    def get_reviews(self):
        """Return the list of review texts."""
        return self.reviews


In [37]:
recipes = []
count = 0

# Scrape every recipe page in order. ``count`` is the number of pages
# completed so far (1-based), so the progress line printed every 5000 pages
# reports the true number of finished pages. The earlier counter started at
# 1 and was incremented before the check, so it ran one ahead.
for count, link in enumerate(link_list, start=1):
    # link is a (title, path) tuple; Recipe takes the site-relative path.
    recipes.append(Recipe(link[1]))
    if count % 5000 == 0:
        print("Finish %d pages at " % count + str(datetime.datetime.today()))

Steps Wrong: https://www.epicurious.com/recipes/food/views/sams-spring-cucumber-radish-fattoush-salad
Steps Wrong: https://www.epicurious.com/recipes/food/views/vietnamese-candy-pork
Steps Wrong: https://www.epicurious.com/recipes/food/views/salted-caramel-chocolate-tart
Steps Wrong: https://www.epicurious.com/recipes/food/views/spicy-dry-fried-beef
Steps Wrong: https://www.epicurious.com/recipes/food/views/spinach-yogurt-dip-with-sizzled-mint
Steps Wrong: https://www.epicurious.com/recipes/food/views/perfect-pot-roast
Steps Wrong: https://www.epicurious.com/recipes/food/views/roasted-acorn-squash-with-sage-and-honey
Steps Wrong: https://www.epicurious.com/recipes/food/views/paella-with-tomatoes-and-eggs
Steps Wrong: https://www.epicurious.com/recipes/food/views/muffin-cup-vegetarian-veggie-omelets
Steps Wrong: https://www.epicurious.com/recipes/food/views/pan-roasted-chicken-with-shallots-and-dates
Steps Wrong: https://www.epicurious.com/recipes/food/views/blood-orange-poppy-polenta-s

Rating Wrong: https://www.epicurious.com/recipes/food/views/chicken-salad-with-creme-fraiche-and-rye-51236450
Reviewcount Wrong: https://www.epicurious.com/recipes/food/views/chicken-salad-with-creme-fraiche-and-rye-51236450
MakeagainRating Wrong: https://www.epicurious.com/recipes/food/views/chicken-salad-with-creme-fraiche-and-rye-51236450
Rating Wrong: https://www.epicurious.com/recipes/food/views/sesame-rice-noodles-with-shrimp-51235820
Reviewcount Wrong: https://www.epicurious.com/recipes/food/views/sesame-rice-noodles-with-shrimp-51235820
MakeagainRating Wrong: https://www.epicurious.com/recipes/food/views/sesame-rice-noodles-with-shrimp-51235820
Rating Wrong: https://www.epicurious.com/recipes/food/views/linguine-and-clams-with-almonds-and-herbs-51231650
Reviewcount Wrong: https://www.epicurious.com/recipes/food/views/linguine-and-clams-with-almonds-and-herbs-51231650
MakeagainRating Wrong: https://www.epicurious.com/recipes/food/views/linguine-and-clams-with-almonds-and-herbs-5

Ingredients Wrong: https://www.epicurious.com/recipes/food/views/bibimbap-352271
Rating Wrong: https://www.epicurious.com/recipes/food/views/tandoori-chicken-238388
Reviewcount Wrong: https://www.epicurious.com/recipes/food/views/tandoori-chicken-238388
MakeagainRating Wrong: https://www.epicurious.com/recipes/food/views/tandoori-chicken-238388
Steps Wrong: https://www.epicurious.com/recipes/food/views/50-whole-wheat-sandwich-bread-351231
Steps Wrong: https://www.epicurious.com/recipes/food/views/basic-soft-white-sandwich-loaf-351269
Steps Wrong: https://www.epicurious.com/recipes/food/views/golden-dinner-rolls-351232
Steps Wrong: https://www.epicurious.com/recipes/food/views/rosemary-flat-bread-351249
Steps Wrong: https://www.epicurious.com/recipes/food/views/basic-brioche-351237
Steps Wrong: https://www.epicurious.com/recipes/food/views/basic-sourdough-bread-351236
Finish 10000 pages at 2018-06-26 00:47:42.818190
Steps Wrong: https://www.epicurious.com/recipes/food/views/grilled-pizz

In [38]:
# Display how many Recipe objects were collected by the crawl loop.
len(recipes)

35441

In [39]:
# Persist the scraped Recipe objects. The context manager guarantees the
# file is flushed and closed even if serialization fails part-way.
with open('../data/epi-recipes.pkd', 'wb') as recipes_file:
    dill.dump(recipes, recipes_file)

In [42]:
# Write one JSON document per line, built from each recipe's attribute dict.
# The file is opened in append mode, so re-running this cell adds the records
# again after whatever the file already contains.
with open('../data/epi-recipes-json.json', 'a') as fp:
    for r in recipes:
        fp.write(json.dumps(r.__dict__) + "\n")