In [1]:
import pandas as pd

import time
import pickle
import random
from datetime import datetime
import re
from dateutil.parser import parse

from pymongo import MongoClient
from pprint import pprint

import requests
from bs4 import BeautifulSoup

### List of all recipes on the New York Times Cooking site

In [2]:
# Search endpoint of NYT Cooking; page numbers are appended to it as a query string.
base_url = 'https://cooking.nytimes.com/search'

# Empty accumulators for recipe and image URLs (not referenced by the cells visible here).
recipe_urls, image_urls = [], []

In [3]:
# Smoke test: fetch the search landing page and print the HTTP status code
# before starting the full crawl.
response = requests.get(base_url)
print(response.status_code)

200


In [4]:
# recipe urls
def get_recipe_and_image_urls(url):
    '''Scrape one page of the NYT recipe search and return its recipes.

    Parameters
    ----------
    url : str
        URL of a page in the NYT Cooking recipe search.

    Returns
    -------
    list of (str, str or None)
        One `(recipe_url, image_url)` tuple per recipe card on the page.
        `image_url` is None when the card has no image attribute or the
        attribute does not point at a `.jpg` file.
    '''
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")

    recipes_and_images = []
    for article in soup.find_all('article', attrs={"class": "card recipe-card"}):
        # Tag.get() returns None instead of raising KeyError, so one malformed
        # card cannot abort a crawl that spans hundreds of pages.
        recipe_path = article.get('data-url')
        if not recipe_path:
            # Without a link there is nothing to scrape later; skip the card.
            continue

        image_url = article.get('data-seo-image-url')
        if not image_url or '.jpg' not in image_url:
            image_url = None

        recipes_and_images.append(('https://cooking.nytimes.com' + recipe_path, image_url))

    return recipes_and_images

In [24]:
recipes = []

# 416 pages of recipes in search - this will only go up with time
for page_num in range(1, 417):
    # Random pause of 0.5-2.5 seconds between page requests.
    time.sleep(.5 + 2 * random.random())
    search_url = f'{base_url}?q=&page={page_num}'
    recipes.extend(get_recipe_and_image_urls(search_url))

In [31]:
# Sanity check: total number of scraped recipes and the last three entries.
# A negative slice keeps showing exactly the tail even as the recipe count grows.
print(len(recipes))
print(recipes[-3:])

19933
[('https://cooking.nytimes.com/recipes/1016999-rhubarb-big-crumb-coffeecake', 'https://static01.nyt.com/images/2015/03/16/dining/big-crumb-coffee-cake/big-crumb-coffee-cake-superJumbo.jpg'), ('https://cooking.nytimes.com/recipes/1015959-myra-waldos-swedish-lamb', 'https://static01.nyt.com/images/2014/01/15/dining/recipes-myrawaldoswedishlamb/recipes-myrawaldoswedishlamb-superJumbo.jpg'), ('https://cooking.nytimes.com/recipes/1017332-salty-dog', 'https://static01.nyt.com/images/2015/04/03/dining/salty-dog/salty-dog-superJumbo-v2.jpg')]


In [33]:
# Persist the (recipe_url, image_url) tuples so the crawl does not have to be repeated.
with open('urls_recipe_image.pickle', mode='wb') as pickle_file:
    pickle.dump(recipes, pickle_file)

### Get recipe descriptions and data, as well as the URLs of the full articles that link to them

In [2]:
# Open a MongoDB client; no host or port is passed, so the driver's default
# connection settings are used.
client = MongoClient()

In [3]:
# Handle to the 'recipes' database on this client.
# NOTE(review): MongoDB presumably creates the database lazily on first write - confirm.
recipe_db = client['recipes']

In [4]:
# Collection holding one document per recipe (URLs, title, description, author, ...).
descr_col = recipe_db['descriptions']

In [18]:
# add recipe documents to the collection, and track which recipe websites don't work

# Failures expected when an optional element is absent from a recipe page:
# find() returned None (AttributeError on the chained call, TypeError when
# subscripting None) or the tag lacks the requested attribute (KeyError).
MISSING_ELEMENT_ERRORS = (AttributeError, KeyError, TypeError)


def _date_from_url(url):
    '''Return the first YYYY/M/D date embedded in `url` as a datetime.

    Returns None when `url` is not a string, contains no such pattern, or the
    matched digits do not form a valid calendar date.
    '''
    if not isinstance(url, str):
        return None
    dt_list = re.findall(r'\b\d{4}/\d\d?/\d\d?\b', url)
    if not dt_list:
        return None
    try:
        return datetime.strptime(dt_list[0], '%Y/%m/%d')
    except ValueError:
        return None


no_website_list = []

for recipe in recipes:
    time.sleep(.5+2*random.random())
    response = requests.get(recipe[0])

    # add the recipe url to a list of broken links if it doesn't work, and print the recipe url
    if response.status_code != 200:
        no_website_list.append(recipe[0])
        print(recipe[0])
        continue

    page = response.text
    soup = BeautifulSoup(page, "lxml")
    recipe_dict = {
        'recipe_url': recipe[0],  # recipe url
        'image_url': recipe[1]  # image url
    }

    # Every field below is optional: a missing element simply leaves the key
    # out of the stored document.

    # article
    try:
        recipe_dict['article_url'] = soup.find('p',attrs={"class":"related-article"}).find('a')['href']
    except MISSING_ELEMENT_ERRORS:
        pass
    # recipe title
    try:
        recipe_dict['recipe_title'] = soup.find('div',attrs={"class":"title-container"}).find('h1')['data-name']
    except MISSING_ELEMENT_ERRORS:
        pass
    # recipe description
    try:
        recipe_dict['recipe_description'] = soup.find(itemprop='description').find('p').text
    except MISSING_ELEMENT_ERRORS:
        pass
    # recipe author
    try:
        recipe_dict['recipe_author'] = soup.find('div',attrs={"class":"recipe-subhead"}).find('h3').find('a')['data-author']
    except MISSING_ELEMENT_ERRORS:
        pass
    # recipe date: prefer the date in the article URL, fall back to the image URL.
    # (This previously read from an undefined name `doc`, so the NameError was
    # swallowed and no recipe ever received a date.)
    recipe_date = _date_from_url(recipe_dict.get('article_url'))
    if recipe_date is None:
        recipe_date = _date_from_url(recipe_dict['image_url'])
    if recipe_date is not None:
        recipe_dict['recipe_date'] = recipe_date

    # add the dictionary to the database
    descr_col.insert_one(recipe_dict)

https://cooking.nytimes.com/recipes/8478-candied-squash-and-ginger-relish
https://cooking.nytimes.com/recipes/8476-roasted-squash-puree
https://cooking.nytimes.com/recipes/8321-monterey-county-jail-oatmeal
https://cooking.nytimes.com/recipes/10625-oatmeal-cookies
https://cooking.nytimes.com/recipes/9689-green-glory-juice
https://cooking.nytimes.com/recipes/1372-hummus
https://cooking.nytimes.com/recipes/2461-oatmeal-raisin-cookies


### Create a separate collection for the articles that link to recipes, with their associated information, and a list of all recipes that link to them

In [15]:
# Count on the server instead of downloading every document just to take len().
has_article = {'article_url': {'$exists': True}}

print(f"number of recipes: {descr_col.count_documents({})}")
print(f"number of recipes with an article: {descr_col.count_documents(has_article)}")
print(f"number of unique articles: {len(descr_col.distinct('article_url', has_article))}")

number of recipes: 19926
number of recipes with an article: 18868
number of unique articles: 8215


In [5]:
# Collection holding one document per article, keyed by 'article_url', with the
# list of recipes that link to it.
article_col = recipe_db['articles']

In [70]:
# cursor that we'll use to loop through all the recipes that link to an article
recipe_with_article_cursor = descr_col.find({'article_url': {'$exists':True}})

try:
    for recipe_doc in recipe_with_article_cursor:
        query = {'article_url': recipe_doc['article_url']}
        # One atomic round trip per recipe: the upsert creates the article
        # document on first sight (taking 'article_url' from the query), and
        # $addToSet appends the recipe URL only if it is not already listed,
        # so re-running this cell does not duplicate entries.
        new_values = {'$addToSet': {'linked_recipes': recipe_doc['recipe_url']}}
        article_col.update_one(query, new_values, upsert=True)
finally:
    # Release the server-side cursor even if an update fails part-way through.
    recipe_with_article_cursor.close()

In [73]:
def get_cleaned_article(article_soup):
    '''Extract the body text of an article page as one whitespace-normalised string.

    Collects the text of every <p> found inside the <div>s nested in each
    "StoryBodyCompanionColumn" div except the last one, strips three fixed
    NYT Cooking boilerplate sentences, and collapses all runs of whitespace.

    Parameters
    ----------
    article_soup : BeautifulSoup
        Parsed article page.

    Returns
    -------
    str
        The cleaned article text.

    Raises
    ------
    ValueError
        If no article text is left after cleaning.
    '''
    view_str = '(View this recipe in NYT Cooking.)'
    save_str = 'Save these essentials to your NYT Cooking recipe box.'
    ten_str = '10 Essential Recipes is a new occasional feature that explores different cuisines.'

    # The final body column is deliberately skipped, as in the original scrape.
    body_columns = article_soup.find_all('div', attrs={"class": "StoryBodyCompanionColumn"})[:-1]

    # find_all('p') is simply empty for a div without paragraphs, so no
    # separate existence check is needed.
    paragraphs = [
        p.text
        for column in body_columns
        for inner_div in column.find_all('div')
        for p in inner_div.find_all('p')
    ]

    article_body = ' '.join(paragraphs)
    for boilerplate in (view_str, save_str, ten_str):
        article_body = article_body.replace(boilerplate, '')

    cleaned_article = ' '.join(article_body.split())

    if not cleaned_article:
        raise ValueError('no article body text found in the parsed page')

    return cleaned_article

In [None]:
# Visit every article once and store its body, author, date and title.
article_cursor = article_col.find()

try:
    for article_doc in article_cursor:
        time.sleep(.5+2*random.random())

        article_url = article_doc['article_url']
        doc_id = article_doc['_id']
        response = requests.get(article_url)

        # Parse only a page that was actually received. Without this reset a
        # failed request would reuse the previous iteration's soup and store
        # another article's body, author and title on this document.
        soup = BeautifulSoup(response.text, "lxml") if response.status_code == 200 else None

        # Only the fields scraped in this pass are written back; '_id' and the
        # existing fields are never re-sent.
        new_fields = {}

        if soup is not None:
            # article body
            try:
                new_fields['article_body'] = get_cleaned_article(soup)
            except Exception:
                # get_cleaned_article signals "no body text" by raising; the
                # body is optional, so the field is simply left out.
                pass
            # article author
            try:
                new_fields['article_author'] = soup.find('p',attrs={"itemprop":"author"}).find('span',attrs={"itemprop":"name"}).text
            except AttributeError:
                # check to make sure there's actually an article body before assuming that it doesn't name an author
                if 'article_body' in new_fields:
                    new_fields['article_author'] = 'unnamed'
            # article title
            try:
                new_fields['article_title'] = soup.find('h1', attrs={'itemprop':'headline'}).text
            except AttributeError:
                pass

        # article date: taken from the URL, so it does not depend on the download
        dt_list = re.findall(r'\b\d{4}/\d\d?/\d\d?\b', article_url)
        if dt_list:
            try:
                new_fields['article_date'] = datetime.strptime(dt_list[0], '%Y/%m/%d')
            except ValueError:
                pass

        # Skip the write when nothing was scraped: an empty $set is an error
        # on some server versions.
        if new_fields:
            query = {'_id': doc_id}
            new_values = {"$set": new_fields}
            article_col.update_one(query, new_values)
finally:
    # Release the server-side cursor even if a request or update fails.
    article_cursor.close()

In [17]:
# Close the MongoDB client now that all scraping and updates are finished.
client.close()