In [80]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from recipe_scrapers import scrape_me
from tqdm import tqdm
import time
import concurrent.futures
from itertools import repeat

In [50]:
# Load the table of category pages to crawl and replace every missing value
# with the string 'na', which the rest of the notebook uses as its
# "no label" marker.
links_df = pd.read_csv('allrecipes_categories_links.csv').fillna('na')

In [68]:
# Display the loaded category-link table.
links_df

Unnamed: 0,region,subregion,links_to_scrape,meal_type
0,latin_american,mexican,https://www.allrecipes.com/recipes/1217/world-...,dessert
1,latin_american,mexican,https://www.allrecipes.com/recipes/1214/world-...,appetizer
2,latin_american,mexican,https://www.allrecipes.com/recipes/1215/world-...,soups_and_stews
3,latin_american,mexican,https://www.allrecipes.com/recipes/1470/world-...,na
4,latin_american,mexican,https://www.allrecipes.com/recipes/1525/world-...,bread
...,...,...,...,...
315,usa,cajun_creole,https://www.allrecipes.com/recipes/272/us-reci...,na
316,usa,cajun_creole,https://www.allrecipes.com/recipes/1425/soups-...,soups_and_stews
317,usa,cajun_creole,https://www.allrecipes.com/recipes/1426/soups-...,soups_and_stews
318,usa,cajun_creole,https://www.allrecipes.com/recipes/1428/soups-...,soups_and_stews


In [None]:
# Image URL that is compared against each scraped recipe's image; recipes
# whose image equals this placeholder are not turned into rows.
PLACEHOLDER_IMG_LINK = 'https://www.allrecipes.com/thmb/dgnxqxdc3s1YI1CE-1_O-hu9dnU=/1500x0/filters:no_upscale():max_bytes(150000):strip_icc()/ar_placeholder-6681e9afa32045a78d8f2632de8b28b7.jpg'

# Empty frame that declares the column layout of the scraped-recipe table.
recipe_df = pd.DataFrame(
    columns=[
        'title',
        'recipe_link',
        'img_link',
        'ingredients',
        'yield',
        'nutrients',
        'region',
        'subregion',
        'meal_type',
    ]
)


def scrape_sites(url, timeout=30):
    """Collect the recipe-page links found on one page.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled connection
        would block the whole crawl indefinitely.

    Returns
    -------
    list of str
        Every anchor ``href`` on the page that starts with
        ``https://www.allrecipes.com/recipe/``, in document order.
        Duplicates are kept.

    Raises
    ------
    requests.exceptions.RequestException
        If the download fails or exceeds ``timeout``.
    """
    RECIPE_PATTERN = 'https://www.allrecipes.com/recipe/'
    reqs = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(reqs.text, 'html.parser')
    # href=True skips anchors that carry no href at all, so there is no need
    # to stringify a possible None before testing the prefix.
    return [
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith(RECIPE_PATTERN)
    ]


def scrape_recipe_to_series(link, row=None):
    """Scrape one recipe page and return its fields as a ``pd.Series``.

    The function only downloads and packages data; it neither reads nor
    writes ``recipe_df``, so it is safe to run from worker threads.

    Parameters
    ----------
    link : str
        URL of the recipe page, handed to ``scrape_me``.
    row : object with ``region``, ``subregion`` and ``meal_type`` attributes, optional
        The ``links_df`` row describing the category page on which ``link``
        was found. When omitted, the three labels are set to ``'na'``.

    Returns
    -------
    pd.Series or None
        ``None`` when the recipe's image equals ``PLACEHOLDER_IMG_LINK``;
        otherwise a Series with the keys ``title``, ``recipe_link``,
        ``img_link``, ``ingredients``, ``yield``, ``nutrients``, ``region``,
        ``subregion`` and ``meal_type``.
    """
    scraper = scrape_me(link)
    img_link = scraper.image()
    if img_link == PLACEHOLDER_IMG_LINK:
        return None
    return pd.Series({
        'title': scraper.title(),
        'recipe_link': link,
        'img_link': img_link,
        'ingredients': scraper.ingredients(),
        'yield': scraper.yields(),
        'nutrients': scraper.nutrients(),
        'region': 'na' if row is None else row.region,
        'subregion': 'na' if row is None else row.subregion,
        'meal_type': 'na' if row is None else row.meal_type,
    })


def _merge_recipe(records, position_by_title, recipe):
    """Append ``recipe`` to ``records``, or fill label gaps of the entry that already has its title."""
    known_at = position_by_title.get(recipe['title'])
    if known_at is None:
        position_by_title[recipe['title']] = len(records)
        records.append(recipe)
        return
    known = records[known_at]
    # A recipe reached again through a more specific category page may supply
    # a label that the first sighting lacked; existing labels are never
    # overwritten.
    for label in ('subregion', 'meal_type'):
        if known[label] == 'na' and recipe[label] != 'na':
            known[label] = recipe[label]


recipe_records = []      # one dict per unique recipe title, in first-seen order
position_by_title = {}   # title -> index of that recipe in recipe_records

for _, row in tqdm(links_df.iterrows(), total=links_df.shape[0]):
    recipe_links = scrape_sites(row.links_to_scrape)
    # Threads rather than processes: the work is network-bound, and functions
    # defined in a notebook cell cannot be shipped to ProcessPoolExecutor
    # workers on platforms that spawn fresh interpreters.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(scrape_recipe_to_series, link, row) for link in recipe_links]
        # Consume results in submission order so that "first sighting wins"
        # is deterministic.
        for link, future in zip(recipe_links, futures):
            try:
                recipe_series = future.result()
            except Exception as exc:
                # One unreachable or unparsable recipe must not abort the
                # whole crawl; report it and move on.
                tqdm.write(f'Skipping {link}: {exc!r}')
                continue
            if recipe_series is not None:
                _merge_recipe(recipe_records, position_by_title, recipe_series.to_dict())

# Build the frame once instead of concatenating inside the loop; reuse the
# column layout declared on the empty recipe_df.
recipe_df = pd.DataFrame(recipe_records, columns=recipe_df.columns)
recipe_df.to_csv('recipe_links.csv', index=False)

In [97]:
# Read the saved recipe table back from disk and display it.
recipe_links_df = pd.read_csv('recipe_links.csv')
recipe_links_df

Unnamed: 0,title,recipe_link,img_link,ingredients,yield,nutrients,region,subregion,meal_type
0,Tres Leches (Milk Cake),https://www.allrecipes.com/recipe/7399/tres-le...,https://www.allrecipes.com/thmb/KfnfyrWfrla-34...,"['1.5 cups all-purpose flour', '1 teaspoon bak...",24 servings,"{'calories': '280 kcal', 'carbohydrateContent'...",latin_american,mexican,dessert
1,Churros,https://www.allrecipes.com/recipe/24700/churros/,https://www.allrecipes.com/thmb/LG2tkbuVe8D-wR...,"['1 cup water', '2.5 tablespoons white sugar',...",4 servings,"{'calories': '691 kcal', 'carbohydrateContent'...",latin_american,mexican,dessert
2,Sopapilla Cheesecake Pie,https://www.allrecipes.com/recipe/169305/sopap...,https://www.allrecipes.com/thmb/K-JgYMxUjrG-mP...,"['cooking spray', '2 (8 ounce) packages cream ...",12 servings,"{'calories': '481 kcal', 'carbohydrateContent'...",latin_american,mexican,dessert
3,Mexican Wedding Cookies,https://www.allrecipes.com/recipe/15542/mexica...,https://www.allrecipes.com/thmb/fUPJOX2pYUPQa5...,"['1 cup unsalted butter, softened', '0.5 cup w...",36 servings,"{'calories': '104 kcal', 'carbohydrateContent'...",latin_american,mexican,dessert
4,Pumpkin Empanadas,https://www.allrecipes.com/recipe/216489/mexic...,https://www.allrecipes.com/thmb/je83M2D1YLXytb...,"['3 cups all-purpose flour', '0.33333334326744...",12 servings,"{'calories': '384 kcal', 'carbohydrateContent'...",latin_american,mexican,dessert
...,...,...,...,...,...,...,...,...,...
13313,Moist Herman Coffee Cake,https://www.allrecipes.com/recipe/7963/moist-h...,https://www.allrecipes.com/thmb/X1oHHQdhfwQGJa...,"['1 cup sourdough starter', '1 cup white sugar...",36 servings,"{'calories': '204 kcal', 'carbohydrateContent'...",usa,amish_mennonite,na
13314,Amish Chocolate Pudding,https://www.allrecipes.com/recipe/278333/amish...,https://www.allrecipes.com/thmb/FMpkzxP8mwFLn5...,"['2 cups milk, or more as needed', '2 cups whi...",8 servings,"{'calories': '274 kcal', 'carbohydrateContent'...",usa,amish_mennonite,na
13315,Gera's Amish Funny Cake,https://www.allrecipes.com/recipe/139695/geras...,https://www.allrecipes.com/thmb/gmhyOssY2NkQzd...,"['1 cup white sugar', '0.5 cup cocoa powder', ...",16 servings,"{'calories': '343 kcal', 'carbohydrateContent'...",usa,amish_mennonite,na
13316,Quick Banana Nut Cake,https://www.allrecipes.com/recipe/232881/quick...,https://www.allrecipes.com/thmb/3RqQT9khBm7fvM...,"['1 cup white sugar', '0.5 cup butter', '0.25 ...",12 servings,"{'calories': '273 kcal', 'carbohydrateContent'...",usa,amish_mennonite,na
