In [1]:
import csv
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import logging

In [2]:
# Nutrient attribute codes looked up in each recipe's 'nutritionEstimates' list.
# ORDER MATTERS: preprocess() appends one CSV column per entry, in this order.
# Unit annotations are carried over from the original author's notes and have
# not been verified against the dataset -- TODO confirm.
selected_nutrition_attributes = [
    # energy and macronutrients
    'ENERC_KJ',                          # energy; annotated [kcal], though the code name suggests kJ -- TODO confirm
    'FAT', 'CHOCDF', 'PROCNT',           # fat, carbs, protein [g]
    # vitamins
    'TOCPHA', 'VITC', 'VITB12',          # vitamin E, vitamin C, vitamin B12 [g]
    'VITD-',                             # vitamin D [IU]
    'VITK',                              # vitamin K [g]
    'VITA_IU',                           # vitamin A (no unit annotated; presumably IU)
    'FOLFD', 'VITB6A',                   # vitamin B9 (folate), vitamin B6 [g]
    # fats, sodium, fiber, sugar
    'FASAT', 'CHOLE',                    # fatty acids, cholesterol [g]
    'NA',                                # annotated as salt [g]; presumably sodium -- TODO confirm
    'FIBTG', 'SUGAR',                    # fiber, sugar [g]
    # minerals
    'CA', 'FE', 'MG', 'ZN', 'K',         # calcium, iron, magnesium, zinc, potassium [g]
    # other
    'CARTB',                             # beta carotene [g]
]

# --- logging -----------------------------------------------------------------
# Everything from DEBUG upwards goes to 'preprocessing.log'.
logging.basicConfig(level=logging.DEBUG, filename='preprocessing.log')

# --- web scraping ------------------------------------------------------------
chrome_options = Options()

# experimental options: Chrome preferences and switches to exclude
prefs = {"profile.managed_default_content_settings.images": 2} # disable image loading
chrome_options.add_experimental_option("prefs", prefs)
chrome_options.add_experimental_option("excludeSwitches", ["disable-popup-blocking"])

# command-line switches
chrome_options.add_argument("--ignore-certificate-errors-spki-list")
chrome_options.add_argument("--disable-logging") # removed pages are not that interesting

In [3]:
def _scrape_ingredients(url, driver_path):
    """Load `url` in a fresh Chrome session and return the ingredient texts.

    Returns a list with the `.text` of every element whose class name is
    'ingredient'. The browser is shut down in a `finally` block so that a
    failing page load or lookup does not leave a Chrome/chromedriver process
    behind.
    """
    # NOTE: `executable_path=` and `find_elements_by_class_name` are the
    # Selenium 3 style API this notebook was written against; newer Selenium
    # releases replace them (Service objects / find_elements(By.CLASS_NAME, ...)).
    browser = webdriver.Chrome(executable_path=driver_path, options=chrome_options)
    try:
        browser.get(url)
        time.sleep(3) # hotfix to counter weird chromedriver bug if redirected from 404 to page
        return [el.text for el in browser.find_elements_by_class_name('ingredient')]
    finally:
        # quit() closes every window and ends the driver process
        browser.quit()


def _nutrition_values(data, attributes):
    """Return one value per entry of `attributes`, in the same order.

    Values come from the dicts in data['nutritionEstimates']; when an
    attribute occurs more than once the first occurrence wins, and attributes
    that do not occur map to None. Raises (e.g. TypeError) when
    'nutritionEstimates' is missing or malformed, so the caller can log it.
    """
    first_value = {}
    for estimate in data.get('nutritionEstimates'):
        # setdefault keeps the first match, like a linear search with `break`
        first_value.setdefault(estimate.get('attribute'), estimate.get('value'))
    return [first_value.get(attribute) for attribute in attributes]


def preprocess(start_i, end_i, data_dir='./Yummly28K', out_path='./yummly28k.csv',
               driver_path=r'C:\Users\Andreas\Downloads\chromedriver_win64\chromedriver.exe'):
    """Convert recipe files meta<start_i> .. meta<end_i - 1> into CSV rows.

    For every index i in range(start_i, end_i) the file
    '<data_dir>/meta<i zero-padded to 5 digits>.json' is loaded, the recipe's
    source page is scraped for ingredients, and one row is appended to
    `out_path` with the columns:

        id, name, numberOfServings, totalTimeInSeconds, flavors (JSON),
        cuisine (JSON), ingredients (JSON list),
        then one column per entry of `selected_nutrition_attributes`.

    Every row has the same number of columns: if scraping fails the
    ingredients column is '[]', and if nutrition extraction fails all
    nutrition columns are left empty. Failures are logged and never abort the
    run; recipes whose JSON file cannot be loaded are skipped entirely.

    Parameters
    ----------
    start_i, end_i : int
        Half-open range of recipe indices to process.
    data_dir : str
        Directory containing the meta*.json files.
    out_path : str
        CSV file that rows are appended to.
    driver_path : str
        Location of the chromedriver executable.
    """
    for i in range(start_i, end_i):
        # get recipe file and load from json to dict
        stringcount = format(i, '05d') # zero-padding
        jsonfile = f'{data_dir}/meta{stringcount}.json'
        try:
            # `with` closes the file even if parsing fails; JSON text is UTF-8,
            # so do not depend on the platform's default encoding
            with open(jsonfile, encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            logging.error(f'{stringcount}.json failed during file load: {str(e)}')
            continue
        if not isinstance(data, dict):
            # a non-object root would make every .get() below raise and abort the run
            logging.error(
                f'{stringcount}.json failed during file load: '
                f'expected a JSON object, got {type(data).__name__}'
            )
            continue

        # tolerate a missing or null 'attributes' entry
        attributes = data.get('attributes')
        cuisine = attributes.get('cuisine', {}) if isinstance(attributes, dict) else {}

        dish = [
            stringcount,
            data.get('name'),
            data.get('numberOfServings'),
            data.get('totalTimeInSeconds'),
            json.dumps(data.get('flavors')),
            json.dumps(cuisine)
        ]

        # extract url from recipe dict and scrape clean ingredient data from web
        ingredients = json.dumps([]) # same type on success and on failure
        try:
            attribution = data.get('attribution')
            url = attribution.get('url') if isinstance(attribution, dict) else None
            if not url:
                # do not launch a browser when there is nothing to visit
                raise ValueError('recipe has no attribution url')
            ingredients = json.dumps(_scrape_ingredients(url, driver_path))
        except Exception as e:
            logging.error(f'{stringcount}.json failed during ingredient scraping: {str(e)}')

        dish.append(ingredients)

        # Retrieve nutritional values from unindexed list
        try:
            nutrition = _nutrition_values(data, selected_nutrition_attributes)
        except Exception as e:
            logging.error(f'{stringcount}.json failed during nutrition extraction: {str(e)}')
            # pad so the row keeps the full column count
            nutrition = [None] * len(selected_nutrition_attributes)
        dish.extend(nutrition)

        # out-file is opened every iteration to continuously update in case of crash
        with open(out_path, 'a', encoding='UTF8', newline='') as dataset:
            csv.writer(dataset).writerow(dish)
        if i % 50 == 0:
            logging.debug(f'Wrote {stringcount} to file')
        
# Process recipe indices 17501 through 19999 (end_i is exclusive: preprocess iterates range(start_i, end_i)).
preprocess(start_i=17501, end_i=20000)

KeyboardInterrupt: 