# Greenr day 1 - attempting to parse recipe data (YL)

---
## Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path

---
## Get data

### Scrape

In [None]:
def get_ingredients(section):
    """Scrape the ingredient lists of the recipes promoted on a BBC Food page.

    Args:
        section: Path below ``https://www.bbc.co.uk/food/`` to scrape,
            e.g. ``'cuisines/mexican'``.

    Returns:
        pandas.DataFrame with one row per recipe link found on the page and
        one column per ingredient position. The frame is empty when the page
        contains no matching promo links.
    """
    page = requests.get(f'https://www.bbc.co.uk/food/{section}')
    soup = BeautifulSoup(page.content, 'html.parser')
    ingredients = []

    for link in soup.find_all('a', class_="promo promo__main_course"):
        # Follow each promo link to the recipe page itself.
        recipe_page = requests.get(f'https://www.bbc.co.uk{link["href"]}')
        recipe_soup = BeautifulSoup(recipe_page.content, 'html.parser')
        items = recipe_soup.find_all('li', class_="recipe-ingredients__list-item")
        ingredients.append([item.get_text() for item in items])

    # Build the frame once, after the loop: converting inside the loop turned
    # the accumulator into a DataFrame, which cannot be appended to in place.
    return pd.DataFrame(ingredients)

In [None]:
# Scrape the recipes promoted on the BBC Food 'cuisines/mexican' page.
df_scraped = get_ingredients('cuisines/mexican')

In [None]:
df_scraped.head()

### NYT demo data

In [None]:
# Load the NYT ingredients snapshot CSV straight from the
# mtlynch/ingredient-phrase-tagger repository on GitHub (needs network access).
df_nyt = pd.read_csv('https://raw.githubusercontent.com/mtlynch/ingredient-phrase-tagger/master/nyt-ingredients-snapshot-2015.csv')

In [None]:
df_nyt.shape

In [None]:
display(df_nyt.head(10)), df_nyt.head(10).input[0]

In [None]:
len(df_nyt.name.unique())

In [None]:
# The `name` column of the NYT data, one entry per row (duplicates included).
ingredient_list = df_nyt.name

In [None]:
from collections import Counter

In [None]:
# Count how many times each distinct value occurs in ingredient_list.
ingredient_counter = Counter(ingredient_list)

In [None]:
import _pickle as cPickle

In [None]:
# most_common() without an argument returns every (value, count) pair,
# ordered from the most to the least frequent.
most_common_ingredients = ingredient_counter.most_common()

In [None]:
# Persist the (value, count) pairs. The context manager closes the file
# deterministically instead of leaving the handle to the garbage collector.
with open('ingredients_counts.pkl', 'wb') as pkl_file:
    cPickle.dump(most_common_ingredients, pkl_file)

In [None]:
! ls

In [None]:
tmp = cPickle.load?

In [None]:
# Read the pickled counts back. The context manager closes the file as soon
# as the object has been loaded.
with open('ingredients_counts.pkl', 'rb') as pkl_file:
    var = cPickle.load(pkl_file)

## Allrecipes - ingredient names incl. amounts

In [4]:
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup

import urllib.parse
import urllib.request

import re

class AllRecipes(object):
    """Small HTML scraper for allrecipes.com search results and recipe pages.

    Both public entry points are static methods and each performs one HTTP
    request. In this variant ``get`` reads every ingredient from the text of
    its ``ingredients-item-name`` span.
    """

    @staticmethod
    def _fetch_soup(url):
        """Download *url* and return the page parsed with ``html.parser``."""
        req = urllib.request.Request(url)
        # Presumably skips the EU consent interstitial - TODO confirm.
        req.add_header('Cookie', 'euConsent=true')
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(req) as response:
            html_content = response.read()
        return BeautifulSoup(html_content, 'html.parser')

    @staticmethod
    def search(query_dict):
        """
        Search recipes by parsing the returned html data.

        Args:
            query_dict: Mapping of query-string parameters; it is url-encoded
                and appended to the search URL as-is.

        Returns:
            List of dicts with the keys "name", "description", "url",
            "image" and "rating" (``None`` when no rating can be parsed).
            Cards lacking any of the first four values are left out. The
            list is empty when the page holds fewer than two recipe cards.
        """
        base_url = "https://allrecipes.com/search/results/?"
        soup = AllRecipes._fetch_soup(base_url + urllib.parse.urlencode(query_dict))

        recipe_link = re.compile('^https://www.allrecipes.com/recipe/')
        articles = soup.find_all("article", {"class": "fixed-recipe-card"})

        search_data = []
        # The first card is always skipped. Slicing (rather than calling
        # next() on an iterator) also copes with an empty result page, where
        # next() would raise StopIteration.
        for article in articles[1:]:
            try:
                link = article.find("a", href=recipe_link)
                data = {
                    "name": article.find("h3", {"class": "fixed-recipe-card__h3"}).get_text().strip(' \t\n\r'),
                    "description": article.find("div", {"class": "fixed-recipe-card__description"}).get_text().strip(' \t\n\r'),
                    "url": link['href'],
                    # Cards without an image are dropped - it's probably an
                    # ad or something else you do not want in your result.
                    "image": link.find("img")["data-original-src"],
                }
            except (AttributeError, KeyError, TypeError):
                # A required element or attribute is missing from this card.
                continue

            try:
                data["rating"] = float(article.find("div", {"class": "fixed-recipe-card__ratings"}).find("span")["data-ratingstars"])
            except (AttributeError, KeyError, TypeError, ValueError):
                # Missing or non-numeric rating: keep the recipe, without a score.
                data["rating"] = None

            search_data.append(data)

        return search_data

    @staticmethod
    def get(url):
        """
        Fetch one recipe page and extract its name, ingredients and steps.

        Args:
            url: Absolute URL of the recipe page, e.g. the "url" value of a
                ``search`` result.

        Returns:
            Dict with "ingredients" (list of stripped strings), "steps"
            (list of strings) and "name" (headline text with "®" removed).

        Raises:
            AttributeError: If the headline, or the name span of an
                ingredient item, is not found on the page.
            urllib.error.URLError: If the page cannot be downloaded.
        """
        soup = AllRecipes._fetch_soup(url)

        # NOTE: extraction of rating, prep_time, cook_time and total_time was
        # attempted here and marked as not currently working, so those fields
        # are not part of the result.
        ingredients = soup.find_all("li", {"class": "ingredients-item"})
        steps = soup.find_all("span", {"class": "recipe-directions__list--item"})
        name = soup.find("h1", {"class": "headline heading-content"}).get_text().replace("®", "")

        data = {
            "ingredients": [],
            "steps": [],
            "name": name,
        }

        for ingredient in ingredients:
            str_ing = ingredient.find("span", {"class": "ingredients-item-name"}).get_text()
            # Skip empty entries and the "add all" pseudo-item.
            if str_ing and str_ing != "Add all ingredients to list":
                data["ingredients"].append(str_ing.strip())

        for step in steps:
            str_step = step.get_text()
            if str_step:
                data["steps"].append(str_step)

        return data

In [None]:
query_options = {
  "wt": "duck curry",         # Query keywords
  "ingIncl": "olives",        # 'Must be included' ingredients (optional)
  "ingExcl": "onions salad",  # 'Must not be included' ingredients (optional)
  "sort": "re"                # Sorting options : 're' for relevance, 'ra' for rating, 'p' for popular (optional)
}
query_result = AllRecipes.search(query_options)

if not query_result:
    # Indexing an empty result list would raise IndexError.
    print("No recipes found for this query.")
else:
    # Get the details of the first returned recipe (most relevant in our case)
    main_recipe_url = query_result[0]['url']
    detailed_recipe = AllRecipes.get(main_recipe_url)

    # Display result :
    print(f"## {detailed_recipe['name']} :")  # Name of the recipe

    # Ingredients are already stripped by AllRecipes.get.
    for ingredient in detailed_recipe['ingredients']:  # List of ingredients
        print(f"- {ingredient}")

    for step in detailed_recipe['steps']:  # List of cooking steps
        print(f"# {step}")

## Allrecipes - ingredient names only 

In [None]:
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup

import urllib.parse
import urllib.request

import re

class AllRecipes(object):
    """Small HTML scraper for allrecipes.com search results and recipe pages.

    Both public entry points are static methods and each performs one HTTP
    request. In this variant ``get`` reads every ingredient from the
    ``value`` attribute of its ``checkbox-list-input`` element.
    """

    @staticmethod
    def _fetch_soup(url):
        """Download *url* and return the page parsed with ``html.parser``."""
        req = urllib.request.Request(url)
        # Presumably skips the EU consent interstitial - TODO confirm.
        req.add_header('Cookie', 'euConsent=true')
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(req) as response:
            html_content = response.read()
        return BeautifulSoup(html_content, 'html.parser')

    @staticmethod
    def search(query_dict):
        """
        Search recipes by parsing the returned html data.

        Args:
            query_dict: Mapping of query-string parameters; it is url-encoded
                and appended to the search URL as-is.

        Returns:
            List of dicts with the keys "name", "description", "url",
            "image" and "rating" (``None`` when no rating can be parsed).
            Cards lacking any of the first four values are left out. The
            list is empty when the page holds fewer than two recipe cards.
        """
        base_url = "https://allrecipes.com/search/results/?"
        soup = AllRecipes._fetch_soup(base_url + urllib.parse.urlencode(query_dict))

        recipe_link = re.compile('^https://www.allrecipes.com/recipe/')
        articles = soup.find_all("article", {"class": "fixed-recipe-card"})

        search_data = []
        # The first card is always skipped. Slicing (rather than calling
        # next() on an iterator) also copes with an empty result page, where
        # next() would raise StopIteration.
        for article in articles[1:]:
            try:
                link = article.find("a", href=recipe_link)
                data = {
                    "name": article.find("h3", {"class": "fixed-recipe-card__h3"}).get_text().strip(' \t\n\r'),
                    "description": article.find("div", {"class": "fixed-recipe-card__description"}).get_text().strip(' \t\n\r'),
                    "url": link['href'],
                    # Cards without an image are dropped - it's probably an
                    # ad or something else you do not want in your result.
                    "image": link.find("img")["data-original-src"],
                }
            except (AttributeError, KeyError, TypeError):
                # A required element or attribute is missing from this card.
                continue

            try:
                data["rating"] = float(article.find("div", {"class": "fixed-recipe-card__ratings"}).find("span")["data-ratingstars"])
            except (AttributeError, KeyError, TypeError, ValueError):
                # Missing or non-numeric rating: keep the recipe, without a score.
                data["rating"] = None

            search_data.append(data)

        return search_data

    @staticmethod
    def get(url):
        """
        Fetch one recipe page and extract its name, ingredients and steps.

        Args:
            url: Absolute URL of the recipe page, e.g. the "url" value of a
                ``search`` result.

        Returns:
            Dict with "ingredients" (list of stripped strings), "steps"
            (list of strings) and "name" (headline text with "®" removed).

        Raises:
            AttributeError: If the headline is not found on the page.
            TypeError: If an ingredient item has no ``checkbox-list-input``
                element.
            KeyError: If that element has no ``value`` attribute.
            urllib.error.URLError: If the page cannot be downloaded.
        """
        soup = AllRecipes._fetch_soup(url)

        # NOTE: extraction of rating, prep_time, cook_time and total_time was
        # attempted here and marked as not currently working, so those fields
        # are not part of the result.
        ingredients = soup.find_all("li", {"class": "ingredients-item"})
        steps = soup.find_all("span", {"class": "recipe-directions__list--item"})
        name = soup.find("h1", {"class": "headline heading-content"}).get_text().replace("®", "")

        data = {
            "ingredients": [],
            "steps": [],
            "name": name,
        }

        for ingredient in ingredients:
            str_ing = ingredient.find("input", {"class": "checkbox-list-input"})['value']
            # Skip empty entries and the "add all" pseudo-item.
            if str_ing and str_ing != "Add all ingredients to list":
                data["ingredients"].append(str_ing.strip())

        for step in steps:
            str_step = step.get_text()
            if str_step:
                data["steps"].append(str_step)

        return data

In [None]:
query_options = {
  "wt": "duck curry",         # Query keywords
  "ingIncl": "olives",        # 'Must be included' ingredients (optional)
  "ingExcl": "onions salad",  # 'Must not be included' ingredients (optional)
  "sort": "re"                # Sorting options : 're' for relevance, 'ra' for rating, 'p' for popular (optional)
}
query_result = AllRecipes.search(query_options)

if not query_result:
    # Indexing an empty result list would raise IndexError.
    print("No recipes found for this query.")
else:
    # Get the details of the first returned recipe (most relevant in our case)
    main_recipe_url = query_result[0]['url']
    detailed_recipe = AllRecipes.get(main_recipe_url)

    # Display result :
    print(f"## {detailed_recipe['name']} :")  # Name of the recipe

    # Ingredients are already stripped by AllRecipes.get.
    for ingredient in detailed_recipe['ingredients']:  # List of ingredients
        print(f"- {ingredient}")

    for step in detailed_recipe['steps']:  # List of cooking steps
        print(f"# {step}")

### Testing a bit

In [None]:
# Search for beef recipes that must include potato, sorted by rating.
query_options = {
  "wt": "beef",         # Query keywords
  "ingIncl": "potato",        # 'Must be included' ingredients (optional)
#  "ingExcl": "onions salad",  # 'Must not be included' ingredients (optional)
  "sort": "ra"                # Sorting options : 're' for relevance, 'ra' for rating, 'p' for popular (optional)
}
query_result = AllRecipes.search(query_options)

In [None]:
query_result

In [None]:
# One ingredient list per search hit, in the order of query_result.
ingredient_database = []

for result in query_result:
    main_recipe_url = result['url']
    print(main_recipe_url)  # progress output: the page about to be fetched
    
    # One HTTP request per hit; only the 'ingredients' entry is kept.
    ingredient_database.append(AllRecipes.get(main_recipe_url)['ingredients'])


## Scraping until they kick me out (incl. amounts)

In [5]:
# Accumulator for the scraping loop: one ingredient list per recipe fetched.
scraped_ingredients = []

In [6]:
# Walk recipe ids upwards from 7000 until 1000 recipes have been collected.
j = 0     # recipes scraped successfully so far
i = 7000  # recipe id requested next
# Give up after this many failures in a row: if every request starts failing
# (e.g. the site blocks us), j would never advance and the loop would not end.
max_consecutive_failures = 100
consecutive_failures = 0
while j < 1000:
    try:
        url = f'https://www.allrecipes.com/recipe/{i}'
        scraped_ingredients.append(AllRecipes.get(url)['ingredients'])
    except Exception:
        # Catch Exception rather than using a bare except, so that Ctrl-C
        # (KeyboardInterrupt) still stops the run.
        print(f'-- {i} ({j})')
        consecutive_failures += 1
        if consecutive_failures >= max_consecutive_failures:
            print(f'Stopping: {max_consecutive_failures} consecutive failures, last id tried {i}')
            break
    else:
        print(f'{i},({j})')
        j += 1
        consecutive_failures = 0
    i += 1


        

7000,(0)
7001,(1)
7002,(2)
7003,(3)
7004,(4)
7005,(5)
7006,(6)
7007,(7)
7008,(8)
7009,(9)
7010,(10)
7011,(11)
7012,(12)
7013,(13)
7014,(14)
7015,(15)
7016,(16)
7017,(17)
7018,(18)
7019,(19)
7020,(20)
7021,(21)
7022,(22)
7023,(23)
7024,(24)
7025,(25)
7026,(26)
7027,(27)
7028,(28)
-- 7029 (29)
7030,(29)
7031,(30)
7032,(31)
7033,(32)
7034,(33)
7035,(34)
7036,(35)
7037,(36)
7038,(37)
7039,(38)
7040,(39)
7041,(40)
7042,(41)
7043,(42)
7044,(43)
7045,(44)
7046,(45)
7047,(46)
7048,(47)
7049,(48)
7050,(49)
7051,(50)
7052,(51)
7053,(52)
7054,(53)
7055,(54)
7056,(55)
7057,(56)
7058,(57)
7059,(58)
7060,(59)
7061,(60)
7062,(61)
7063,(62)
7064,(63)
7065,(64)
7066,(65)
7067,(66)
7068,(67)
7069,(68)
7070,(69)
7071,(70)
7072,(71)
7073,(72)
7074,(73)
7075,(74)
7076,(75)
7077,(76)
7078,(77)
7079,(78)
7080,(79)
7081,(80)
7082,(81)
7083,(82)
7084,(83)
7085,(84)
7086,(85)
7087,(86)
7088,(87)
7089,(88)
7090,(89)
7091,(90)
7092,(91)
7093,(92)
7094,(93)
7095,(94)
7096,(95)
7097,(96)
7098,(97)
7099,(98)
7100,(9

7755,(753)
7756,(754)
7757,(755)
7758,(756)
7759,(757)
7760,(758)
7761,(759)
7762,(760)
7763,(761)
7764,(762)
7765,(763)
7766,(764)
7767,(765)
7768,(766)
7769,(767)
7770,(768)
7771,(769)
7772,(770)
7773,(771)
7774,(772)
7775,(773)
7776,(774)
7777,(775)
7778,(776)
7779,(777)
7780,(778)
7781,(779)
7782,(780)
7783,(781)
7784,(782)
7785,(783)
7786,(784)
7787,(785)
7788,(786)
7789,(787)
7790,(788)
7791,(789)
7792,(790)
7793,(791)
7794,(792)
7795,(793)
7796,(794)
7797,(795)
7798,(796)
7799,(797)
7800,(798)
7801,(799)
7802,(800)
7803,(801)
7804,(802)
7805,(803)
7806,(804)
7807,(805)
7808,(806)
7809,(807)
7810,(808)
7811,(809)
7812,(810)
7813,(811)
7814,(812)
7815,(813)
7816,(814)
7817,(815)
7818,(816)
7819,(817)
7820,(818)
7821,(819)
7822,(820)
7823,(821)
7824,(822)
7825,(823)
7826,(824)
7827,(825)
7828,(826)
7829,(827)
7830,(828)
7831,(829)
7832,(830)
7833,(831)
7834,(832)
7835,(833)
7836,(834)
7837,(835)
7838,(836)
7839,(837)
7840,(838)
7841,(839)
7842,(840)
7843,(841)
7844,(842)
7845,(843)

In [7]:
len(scraped_ingredients)

1000

In [8]:
import _pickle as cPickle

# Persist the scraped ingredient lists. The context manager closes the file
# deterministically instead of leaving the handle to the garbage collector.
with open('scraped_recipe_ingredients_1000.pkl', 'wb') as pkl_file:
    cPickle.dump(scraped_ingredients, pkl_file)

In [None]:
! ls