In [8]:
import logging
import time 
from urllib.request import urlopen
from bs4 import BeautifulSoup
from tqdm import tqdm 

# Layout of every log line: level, timestamp, then the message.
LOG_FORMAT = "%(levelname)s %(asctime)s: %(message)s"
# Configure the root logger for the notebook at DEBUG level and above.
logging.basicConfig(format=LOG_FORMAT, level=logging.DEBUG)
# No name is passed, so this is the root logger.
logger = logging.getLogger()

In [9]:
def get_recipes_urls(url):
  """Collect the recipe links listed on one index page of the site.

  Parameters
  ----------
  url: str
    Address of the listing page to scan.

  Return
  ------
  url_recipes: list of str
    The ``href`` of the first link inside each matching ``<article>``.
    Articles without a link, or whose link has no ``href``, are skipped.
  """
  # The context manager closes the HTTP response once the page is parsed.
  with urlopen(url) as html:
    bs = BeautifulSoup(html, 'html.parser')
  articles = bs.find_all('article', {'class': {'article-loop', 'asap-columns-3'}})
  url_recipes = []
  for article in articles:
    link = article.a
    # article.a is None when the article contains no <a> tag.
    href = link.get('href') if link is not None else None
    if href:
      url_recipes.append(href)
  return url_recipes

In [73]:
def get_info_recipe(url_recipe):
  """Scrape a single recipe page into a dictionary.

  Parameters
  ----------
  url_recipe: str
    Address of the recipe page.

  Return
  ------
  recipe: dict or None
    Dictionary with the keys ``title``, ``ingredients``, ``procedimiento``,
    ``url_video`` and ``url_image``. ``None`` when one of the elements
    looked up after the title is missing from the page.

  Raises
  ------
  AttributeError
    If the page has no ``<h1>`` element; the title is read outside the
    guarded section. Errors raised by ``urlopen`` also propagate.
  """
  # Close the HTTP response as soon as the page has been parsed.
  with urlopen(url_recipe) as html:
    bs = BeautifulSoup(html, 'html.parser')
  recipe = {'title': bs.h1.get_text(), 'ingredients': []}
  # All-or-nothing: a recipe missing any of the parts below is discarded.
  try:
    # NOTE(review): class_=None presumably restricts the match to tags
    # without a class attribute - confirm against the Beautiful Soup docs.
    ingredient_tags = bs.find_all('ul', class_=None)[0].find_all('li')
    recipe['ingredients'] = [tag.get_text() for tag in ingredient_tags]
    # The first and last matching paragraphs are left out of the procedure.
    paragraphs = bs.find('div', class_='the-content').find_all('p', class_=None)[1:-1]
    description = "".join(paragraph.get_text() for paragraph in paragraphs)
    url_video = bs.find('iframe').get('src')
    url_image = bs.find('div', class_='post-thumbnail').find('img').get('src')
  except (AttributeError, IndexError):
    # IndexError: no matching <ul>; AttributeError: a lookup returned None.
    return None
  recipe['procedimiento'] = description
  recipe['url_video'] = url_video
  recipe['url_image'] = url_image
  return recipe

In [68]:
# Example of a recipe page that has neither ingredients nor a description
# html = urlopen('https://recetas-mexicanas.com.mx/pimienta-negra')
# bs = BeautifulSoup(html, 'html.parser')
#bs.findAll('ul', class_=None)[0]

In [62]:

def retrieve_recipes(pages=1):
  """Retrieve the recipes listed on one page of https://recetas-mexicanas.com.mx

  Parameters
  ----------
  pages: int
    Number of the listing page to scrape (interpolated into ``/page/{pages}``).

  Return
  ------
  recipes: list of dict
    Recipes scraped successfully from that page. A recipe is skipped when
    scraping it raises an error or when it comes back as ``None``.
  """
  recipes = []
  # One listing page is expected to hold about 10 recipes.
  url_recipes = get_recipes_urls(f'https://recetas-mexicanas.com.mx/page/{pages}')
  for url_recipe in url_recipes:
    try:
      recipe = get_info_recipe(url_recipe)
    except Exception:
      # Reset the name so a failed scrape never re-appends the previous recipe.
      logger.warning('Error al desestructurar', exc_info=True)
      recipe = None
    if recipe is not None:
      recipes.append(recipe)
      logger.info(f'Agregando {recipe["title"]}...')
    # Pause between requests, whether or not the scrape succeeded.
    time.sleep(2)
  return recipes

Example page
~~~
https://recetas-mexicanas.com.mx/salsa-roja-tatemada
https://recetas-mexicanas.com.mx/tacos-labio-res
https://recetas-mexicanas.com.mx/ejotes-carne-cerdo
https://recetas-mexicanas.com.mx/menudo-rojo
https://recetas-mexicanas.com.mx/lengua-salsa-verde
https://recetas-mexicanas.com.mx/costillas-rancheras
https://recetas-mexicanas.com.mx/espagueti-verde
https://recetas-mexicanas.com.mx/pescado-mojo-ajo
https://recetas-mexicanas.com.mx/pozole-rojo-cerdo
https://recetas-mexicanas.com.mx/costillas-salsa-chiles-secos
~~~

JSON structure
~~~
{'title': 'Salsa roja tatemada',
 'ingredients': ['3 Tomates rojos',
  '2 Chiles guajillo',
  '2 Jalapeños',
  '1 Cebolla blanca',
  '3 Dientes de ajo',
  '2 Cucharadas de sal'],
 'procedimiento': 'El primer paso para...',
 'url_video': 'https://www.youtube.com/embed/Qd75WMF7JLc',
 'url_image': 'https://recetas-mexicanas.com.mx/wp-content/uploads/2021/06/salsa-roja-tatemada-945x630.jpg'}
 ~~~


## Insertando en la db


~~~
docker run --name mongo-container -d -p 27017:27017 mongo

Si ya existe un contenedor ejecutar:
docker start mongo-container

Ejecuta el cliente del contenedor
docker exec -it mongo-container mongo
~~~

In [58]:
from pymongo import MongoClient

# Connect to the MongoDB server published on the local port 27017.
client = MongoClient(host='localhost', port=27017)
# Database and collection handles used for the scraped recipes.
db = client['chef-casa']
recipes_collection = db['recipes']
# Last expression of the cell: the number of documents in the collection.
recipes_collection.count_documents({})

In [65]:

def schedule(step=1, begin_page=1, end_page=10):
  """Scrape a range of listing pages and store the recipes in MongoDB.

  Parameters
  ----------
  step: int
    Seconds to wait after each page has been retrieved.
  begin_page: int
    First listing page to retrieve (inclusive).
  end_page: int
    Last listing page to retrieve (inclusive).
  """
  recipes = []
  for page in tqdm(range(begin_page, end_page+1)):
    # Keep only real documents; None entries cannot be inserted.
    recipes.extend(recipe for recipe in retrieve_recipes(page) if recipe is not None)
    time.sleep(step)
  # insert_many rejects an empty list, so write only when something was scraped.
  if recipes:
    recipes_collection.insert_many(recipes)
      

