In [None]:
from concurrent.futures import thread
import csv
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from multiprocessing import Process
import time

In [None]:
# Nutrition attribute codes exported as CSV columns, in exactly this order.
# The bracketed units are carried over from the original annotations and have
# not been verified against the dataset -- TODO confirm before relying on them.
selected_nutrition_attributes = [
    'ENERC_KJ',  # energy; annotated as [kcal] (NOTE(review): code name suggests kJ -- confirm)
    'FAT',       # fat [g]
    'CHOCDF',    # carbohydrates [g]
    'PROCNT',    # protein [g]
    'TOCPHA',    # vitamin E [g]
    'VITC',      # vitamin C [g]
    'VITB12',    # vitamin B12 [g]
    'VITD-',     # vitamin D [IU]
    'VITK',      # vitamin K [g]
    'VITA_IU',   # vitamin A (no unit annotated; presumably IU given the suffix)
    'FOLFD',     # annotated as "vit 9" [g]
    'VITB6A',    # vitamin B6 [g]
    'FASAT',     # annotated as "fatty acids" [g]
    'CHOLE',     # cholesterol [g]
    'NA',        # annotated as "salt" [g]
    'FIBTG',     # fiber [g]
    'SUGAR',     # sugar [g]
    'CA',        # calcium [g]
    'FE',        # iron [g]
    'MG',        # magnesium [g]
    'ZN',        # zinc [g]
    'K',         # potassium [g]
    'CARTB',     # beta carotene [g]
]

# Chrome preferences shared by every scraping browser instance.
prefs = {
    "profile.managed_default_content_settings.images": 2,  # disable image loading
}

# Options object handed to each webdriver.Chrome(...) call.
chrome_options = Options()
chrome_options.add_experimental_option("prefs", prefs)
chrome_options.add_argument("--ignore-certificate-errors-spki-list")
chrome_options.add_argument('log-level=3')

In [None]:
def partitioner(start_i, end_i, partition, base_dir='./milestone-1',
                driver_path=r'C:\Users\Andreas\Downloads\chromedriver_win64\chromedriver.exe'):
    """Export recipes ``start_i + 1`` .. ``end_i`` to ``yummly28k-{partition}.csv``.

    For every recipe number the file ``{base_dir}/Yummly28K/meta{number:05d}.json``
    is loaded, the ingredient texts are scraped from the recipe's attribution
    URL with a Chrome webdriver, and one row is appended to the partition CSV:
    id, name, servings, total time, flavors (JSON), cuisine (JSON),
    ingredients (JSON), then one value per ``selected_nutrition_attributes``
    entry. Every row has the same number of columns, even when scraping or
    nutrition extraction fails (failed parts become ``[]`` / empty cells).

    Args:
        start_i: Zero-based inclusive start; the first recipe handled is
            number ``start_i + 1``.
        end_i: Zero-based exclusive end; the last recipe handled is ``end_i``.
        partition: Label embedded in the output CSV file name.
        base_dir: Directory containing the ``Yummly28K`` folder; the CSV is
            written here as well.
        driver_path: Path of the chromedriver executable.

    Raises:
        OSError: If a recipe JSON file or the CSV cannot be opened.
        ValueError: If a recipe file does not contain valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """

    def scrapeIngredients(url):
        """Return the text of every element with class 'ingredient' at `url`."""
        if not url:
            # Fail before paying for a browser launch that cannot succeed.
            raise ValueError('recipe has no attribution url')
        # NOTE(review): newer Selenium releases configure the driver path via a
        # Service object instead of `executable_path`; the keyword is kept to
        # match the Selenium version this notebook was written for -- confirm.
        browser = webdriver.Chrome(executable_path=driver_path, options=chrome_options)
        try:
            browser.get(url)
            webelements = browser.find_elements(By.CLASS_NAME, 'ingredient')
            return [el.text for el in webelements]
        finally:
            # Always shut the browser down, otherwise every failed page load
            # leaves an orphaned Chrome/chromedriver process behind.
            browser.quit()

    def importLine(i, writer):
        """Build the CSV row for recipe number `i` and write it with `writer`."""
        # Get recipe file and load from json to dict
        stringcount = format(i, '05d')  # zero-padding
        jsonfile = f'{base_dir}/Yummly28K/meta{stringcount}.json'
        # Binary mode lets json detect the encoding itself instead of relying
        # on the platform's locale encoding.
        with open(jsonfile, 'rb') as f:
            data = json.load(f)

        # `or {}` also covers an explicit null for "attributes".
        attributes = data.get('attributes') or {}
        dish = [
            stringcount,
            data.get('name'),
            data.get('numberOfServings'),
            data.get('totalTimeInSeconds'),
            json.dumps(data.get('flavors')),
            json.dumps(attributes.get('cuisine', {})),
        ]

        # Extract url from recipe dict and scrape clean ingredient data from web
        try:
            url = data.get('attribution', {}).get('url')
            ingredients = json.dumps(scrapeIngredients(url))
        except Exception as e:
            print(f'{stringcount}.json failed during ingredient scraping: {str(e)}')
            ingredients = json.dumps([])
        dish.append(ingredients)

        # Retrieve nutritional values from the unindexed list. The first entry
        # for an attribute wins; attributes without an entry become None.
        try:
            first_value = {}
            for nutrition_dict in data.get('nutritionEstimates') or []:
                first_value.setdefault(nutrition_dict.get('attribute'), nutrition_dict.get('value'))
            nutrition_values = [first_value.get(attribute) for attribute in selected_nutrition_attributes]
        except Exception as e:
            print(f'{stringcount}.json failed during nutrition extraction: {str(e)}')
            # Pad to full width so the CSV columns never go out of alignment.
            nutrition_values = [None] * len(selected_nutrition_attributes)
        dish.extend(nutrition_values)

        writer.writerow(dish)

    for i in range(start_i, end_i):
        # The CSV is re-opened in append mode for every recipe so each finished
        # row is on disk before the next (slow) scrape starts.
        with open(f'{base_dir}/yummly28k-{partition}.csv', 'a', encoding='UTF8', newline='') as dataset:
            writer = csv.writer(dataset)
            importLine(i + 1, writer)


In [None]:
if __name__ == '__main__':
    # Total number of recipe files to process and how many worker processes
    # share them.
    number_of_files = 30
    partitions = 5

    # Cut [0, number_of_files) into `partitions` contiguous slices whose sizes
    # differ by at most one; the first `rest` slices get the extra element.
    # Each entry is (start, end, partition_label).
    k, rest = divmod(number_of_files, partitions)
    splits = [
        (i * k + min(i, rest), (i + 1) * k + min(i + 1, rest), i)
        for i in range(partitions)
    ]

    # NOTE(review): with the 'spawn' start method child processes have to be
    # able to import the target function; a function that only exists in a
    # notebook cell may not be importable there -- confirm in your environment.
    processes = []
    for split in splits:
        p = Process(target=partitioner, args=split)
        p.start()
        processes.append(p)
        time.sleep(1)  # start the workers one second apart

    # Block until every worker has finished its slice.
    for p in processes:
        p.join()