Secondary notebook, called by the main notebook and containing the algorithm to access the US Department of Agriculture database.

In [1]:
# Imports
import json
import re
import urllib.request
from urllib.parse import quote
from urllib.request import urlopen

import numpy as np
import pandas as pd

# Relative path of the data directory. It is not referenced by the functions
# defined in this notebook; presumably the calling notebook uses it — TODO confirm.
DATA_FOLDER = './data/'

In [11]:
# Sentinel used for error handling: `scrap` compares the decoded search
# response against this exact structure to detect a search with no results.
# The message text must stay byte-identical, because the comparison is a
# plain dict equality.
error_ = {
    "errors": {
        "error": [
            dict(
                status=400,
                parameter="results",
                message="Your search resulted in zero results.Change your parameters and try again",
            ),
        ],
    },
}

def catch_nutriment_value(nutri_dict, id_):
    """Return the value of the nutriment whose ``nutrient_id`` equals ``id_``.

    ``nutri_dict`` is a sequence of nutrient entries, each exposing the keys
    ``'nutrient_id'`` and ``'value'``. The id is compared after conversion to
    ``int`` and the value is returned as a ``float``. When several entries
    share the id, the last one wins; when none matches, ``0`` is returned.
    """
    matching_values = [
        float(entry['value'])
        for entry in nutri_dict
        if int(entry['nutrient_id']) == id_
    ]
    if not matching_values:
        return 0
    return matching_values[-1]

def catch_fruit_or_veg(raw_aliment):
    """Tell whether an aliment belongs to a fruit, vegetable or legume group.

    Reads ``raw_aliment['group']`` and returns ``1`` when it is one of the
    three food groups listed below, ``0`` otherwise.
    """
    plant_groups = (
        'Fruits and Fruit Juices',
        'Vegetables and Vegetable Products',
        'Legumes and Legume Products',
    )
    return 1 if raw_aliment['group'] in plant_groups else 0

def find_raw_aliment(search_dict):
    """Pick the most 'raw' aliment among the results of a search.

    The raw aliment is not always the first search result, so every result
    is scored and the best one is returned:

    * +1 when its name contains the whole word ``raw`` or ``unprepared``
      (case-insensitive; a whole-word match is required so that names such
      as "Strawberries" do not count as raw);
    * +1 when its group is a fruit, vegetable or legume group.

    Parameters:
        search_dict: decoded search response; the results are read from
            ``search_dict['list']['item']``, each result exposing the keys
            ``'name'`` and ``'group'``.

    Returns:
        The best-scoring result. Ties are resolved in favour of the earliest
        entry, since the results are already sorted by relevance. ``None`` is
        returned when the result list is empty.
    """
    aliment_list = search_dict['list']['item']
    if not aliment_list:
        return None

    bonus_groups = {
        'Fruits and Fruit Juices',
        'Vegetables and Vegetable Products',
        'Legumes and Legume Products',
    }

    def rawness_score(aliment):
        is_raw = re.search(r'\b(?:raw|unprepared)\b', aliment['name'], re.IGNORECASE)
        return int(is_raw is not None) + int(aliment['group'] in bonus_groups)

    # max() returns the first maximal element, which keeps the relevance
    # ordering of the search results as the tie-breaker.
    return max(aliment_list, key=rawness_score)

def _fetch_json(url):
    """Download ``url`` and return its body decoded from JSON."""
    # The context manager closes the HTTP response even when decoding fails.
    with urlopen(url) as response:
        if response.code != 200:
            # Raised explicitly (instead of using `assert`) so the check is
            # not stripped under `python -O`; the exception type is kept for
            # callers that may rely on it.
            raise AssertionError(
                'Unexpected HTTP status {} for {}'.format(response.code, url))
        return json.loads(response.read())


def scrap(query_, ds_='Standard%20Reference', type_ = 'b'):
    """Scrap nutriment values from the US Department of Agriculture database.

    A search is run on the data source ``ds_``; when it yields no result, the
    search is retried on the 'Branded Food Products' data source. The most
    'raw' search result (see ``find_raw_aliment``) is then looked up in the
    report endpoint and its nutriment values are extracted.

    Parameters:
        query_: search terms. They are percent-encoded before being placed in
            the URL; an existing ``%`` is left untouched so that an already
            encoded query is not encoded twice.
        ds_: data source, already percent-encoded as it is inserted verbatim
            in the URL.
        type_: report type passed verbatim to the report endpoint.

    Returns:
        A dict with the keys ``Name``, ``kJ``, ``Proteins``, ``Sugars``,
        ``Sat_fats``, ``Fibers``, ``Sodium``, ``Lipids`` and
        ``Fruit_Veg_content``. Every value is ``np.nan`` when both searches
        return no result.

    Raises:
        AssertionError: when a request succeeds with a status other than 200.
        Errors raised by ``urlopen`` itself propagate unchanged.

    Side effects:
        Prints every URL that is requested.
    """
    # Encode the whole query (spaces, '&', non-ASCII characters, ...), not
    # only the spaces, so it cannot break or alter the URL parameters.
    query_ = quote(query_, safe='%')

    # Parameters
    # NOTE(review): the API key is hard-coded in the source; consider moving
    # it to configuration or an environment variable.
    # NOTE(review): this NDB endpoint may have been superseded by the
    # FoodData Central API — confirm it is still served.
    api_key_ = 'HOEmuSjOUY4TSTXC4DM3I9CeOXOtypKAfpqi8Fuv' # Official API key for access to US gov database
    format1_ = 'json' # Output format
    sort_ = 'r' # Sort by relevance
    max_ = '20' # Number of search result(s)
    offset_ = '0' # Beginning row in the result

    def search(data_source):
        """Query the search endpoint on ``data_source`` and decode the answer."""
        url_search = 'https://api.nal.usda.gov/ndb/search/' + '?format=' + format1_ + '&q=' + query_ + \
                     '&max=' + max_ + '&sort=' + sort_ + '&offset=' + offset_ + '&ds=' + data_source + \
                     '&api_key=' + api_key_
        print(url_search)
        return _fetch_json(url_search)

    # Query the API (will list all the possible results)
    search_dict = search(ds_)

    # Error handling: fall back on the branded products when nothing is found
    if search_dict == error_:
        search_dict = search('Branded%20Food%20Products')

        if search_dict == error_:
            return {'Name' : np.nan,'kJ': np.nan,'Proteins' : np.nan,'Sugars' : np.nan,'Sat_fats' : np.nan,'Fibers' : np.nan,
                    'Sodium': np.nan,'Lipids' : np.nan,'Fruit_Veg_content' : np.nan}

    # From the possible results list, we now have to choose the best product.
    # NB: this could be another product than the top product from the list;
    # we want the most 'raw' one. The search response fetched above is reused
    # rather than downloaded a second time.
    raw_aliment = find_raw_aliment(search_dict)

    # Identification number in the database
    ndbno_ = raw_aliment['ndbno']

    # Get the proper report and load it
    url_food_report = 'https://api.nal.usda.gov/ndb/reports/' + '?ndbno=' + ndbno_ + '&type=' + type_ + \
                      '&format=' + format1_ + '&api_key=' + api_key_
    print(url_food_report)
    food_report_dict = _fetch_json(url_food_report)

    nutri_dict = food_report_dict['report']['food']['nutrients']

    # Nutriment 208 is multiplied by this factor to fill the 'kJ' entry
    kcal_to_kJ = 4.184

    # Catch nutriments using ID from the US database
    nutri_values = {
        'Name' : raw_aliment['name'],
        'kJ': catch_nutriment_value(nutri_dict, 208) * kcal_to_kJ,
        'Proteins' : catch_nutriment_value(nutri_dict, 203),
        'Sugars' : catch_nutriment_value(nutri_dict, 269),
        'Sat_fats' : catch_nutriment_value(nutri_dict, 606),
        'Fibers' : catch_nutriment_value(nutri_dict, 291),
        'Sodium' : catch_nutriment_value(nutri_dict, 307),
        'Lipids' : catch_nutriment_value(nutri_dict, 204),
        'Fruit_Veg_content' : catch_fruit_or_veg(raw_aliment)
    }

    return nutri_values

In [10]:
# Example
#scrap(query_ = 'pear')

https://api.nal.usda.gov/ndb/search/?format=json&q=pear&max=20&sort=r&offset=0&ds=Standard%20Reference&api_key=HOEmuSjOUY4TSTXC4DM3I9CeOXOtypKAfpqi8Fuv
https://api.nal.usda.gov/ndb/reports/?ndbno=09252&type=b&format=json&api_key=HOEmuSjOUY4TSTXC4DM3I9CeOXOtypKAfpqi8Fuv


{'Name': 'Pears, raw',
 'kJ': 238.488,
 'Proteins': 0.36,
 'Sugars': 9.75,
 'Sat_fats': 0.022,
 'Fibers': 3.1,
 'Sodium': 1.0,
 'Lipids': 0.14,
 'Fruit_Veg_content': 1}