In [None]:
"""This notebook scrapes the USDA market research website for historical wholesale produce data
The data is retrieved by requesting the server's report output directly, in HTML form.
Only one year of data can be requested at a time, so this notebook loops over years.

To use this in the future, don't forget to look up the short city code for each city, as well as the short produce name for each item.
"""

In [1]:
import requests
import os
import time
import random
import sys

In [2]:
# 06/26/18: temporarily modified to fetch only 06/01/2018 through 06/25/2018, in order to extend the produce data previously downloaded through 05/31/2018.

In [3]:
def fetch_data(city, shortcity, producename, shortname, year, directory):
    """Download one USDA terminal-price report and save it as an HTML file.

    NOTE: the report window is currently hard-coded to 06/01 through 06/25 of
    ``year`` (see the ``repDate``/``endDate`` query parameters), not a full year.

    Args:
        city: URL-encoded city name as used by the site, e.g. ``'NEW+YORK'``.
        shortcity: short location code for ``city``, e.g. ``'NX'``.
        producename: URL-encoded commodity name, e.g. ``'CARROTS'``.
        shortname: short commodity code for ``producename``, e.g. ``'CARR'``.
        year: four-digit year (string or int); must be convertible with ``int()``.
        directory: folder that receives ``<city>_<producename>_<year>.html``.
            It is created if missing; a trailing path separator is optional.

    Returns:
        True if the report was downloaded and written, False if the target file
        already existed and the download was skipped.

    Raises:
        SystemExit: if the request times out twice in a row.
        OSError: any other ``requests`` failure (its exceptions derive from
            ``IOError``) or a file-system error while saving the report.
    """
    filename = str(city) + '_' + str(producename) + '_' + str(year) + '.html'
    directory = str(directory)
    # Create the target folder up front so a fresh checkout does not fail on every item.
    os.makedirs(directory, exist_ok=True)
    # Use one joined path for both the skip check and the write, so a directory given
    # without a trailing separator no longer writes outside the folder.
    target = os.path.join(directory, filename)
    if os.path.exists(target):
        return False

    # this is modified to retrieve 06/01/18 to 06/25/18
    url = (
        'https://www.marketnews.usda.gov/mnp/fv-report?&commAbr=' + str(shortname)
        + '&repType=termPriceDaily&repTypeChanger=termPriceDaily&locAbrPass=' + str(producename)
        + '%7C%7C' + str(shortname)
        + '&step3date=true&locName=' + str(city)
        + '&type=termPrice&locChoose=location&refine=false&_environment=1&locAbrlength=1'
        + '&organic=NO&environment=&locAbr=' + str(shortcity)
        + '&commodityClass=allcommodity&Run=Run&repDate=06%2F01%2F' + str(year)
        + '&endDate=06%2F25%2F' + str(int(year))
        + '&format=excel&rebuild=false'
    )

    # The USDA server is flaky: allow exactly one retry after a timeout.
    for attempt in range(2):
        try:
            r = requests.get(url, allow_redirects=True, timeout=300)
        except requests.exceptions.Timeout:
            if attempt == 0:
                print('request timed out, trying again...')
                continue
            print('request timed out again, exiting...')
            sys.exit()
        # NOTE(review): the HTTP status is not checked, so an error page would be
        # saved and then skipped on later runs -- confirm whether that is acceptable.
        with open(target, 'wb') as out_file:
            out_file.write(r.content)
        return True

In [4]:
# Sample arguments for fetching a single item by hand:
# a city with its short code, plus one produce name and one year.
test_city, test_short = 'NEW+YORK', 'NX'
test_producename, test_year = 'CARROTS', '2007'

In [5]:
#fetch_data(test_city, test_short, test_producename, 'CARR', test_year, './raw_data/')
#veggies = ['APPLES','APRICOTS','ASPARAGUS','AVOCADOS','BANANAS','BEANS','BEETS','BLACKBERRIES','BLUEBERRIES','BROCCOLI','BRUSSELS+SPROUTS','CABBAGE','CANTALOUPS','CARROTS','CAULIFLOWER','CELERY','CHERRIES','CLEMENTINES','CORN-SWEET','CRANBERRIES', 'CUCUMBERS','EGGPLANT','ENDIVE','GARLIC','GINGER+ROOT','GRAPEFRUIT','GRAPES','HONEYDEWS','KALE+GREENS','KIWIFRUIT','KOHLRABI','LEMONS','LETTUCE%2C+ICEBERG','LETTUCE%2C+ROMAINE','LETTUCE%2C+RED+LEAF','LETTUCE%2C+GREEN+LEAF','LETTUCE%2C+BIBB']
#shortveg = ['APL','APR','ASP','AVOC','BAN','BNS','BTS','BLKBERI-V','BLUBY','BROC','BRSPT','CAB','CANT','CARR','CAUL','CEL','CHER','CLEM','CORN','CRBY','CUX','EGPLT','END','GARLIC','GNGRT','GRPFT','GRPS','HDEW','KALEGRNS','KIWI','KOHLRABI','LEM','LETT','LETTR','RDLFLET-V','GRNLFLET-V','BIBBLET-V']

In [6]:
# Configuration for the bulk download: which cities, produce items and years
# to request, and the directory the raw HTML files are dumped into.

# City names (URL-encoded) and their short codes; the two lists are parallel.
cities = [
    'NEW+YORK',
    'LOS+ANGELES',
]
shortcities = [
    'NX',
    'HC',
]

# Produce names (URL-encoded) and their short codes; the two lists are parallel
# and laid out row-for-row so that entry i of one matches entry i of the other.
veggies = [
    'APPLES', 'APRICOTS', 'ASPARAGUS', 'AVOCADOS', 'BANANAS', 'BEANS', 'BEETS',
    'BLACKBERRIES', 'BLUEBERRIES', 'BROCCOLI', 'BRUSSELS+SPROUTS', 'CABBAGE',
    'CANTALOUPS', 'CARROTS', 'CAULIFLOWER', 'CELERY', 'CHERRIES', 'CLEMENTINES',
    'CORN-SWEET', 'CRANBERRIES', 'CUCUMBERS', 'EGGPLANT', 'ENDIVE', 'GARLIC',
    'GINGER+ROOT', 'GRAPEFRUIT', 'GRAPES', 'HONEYDEWS', 'KALE+GREENS', 'KIWIFRUIT',
    'KOHLRABI', 'LEMONS', 'LETTUCE%2C+ICEBERG', 'LETTUCE%2C+ROMAINE',
    'LETTUCE%2C+RED+LEAF', 'LETTUCE%2C+GREEN+LEAF', 'LETTUCE%2C+BIBB', 'LIMES',
    'MANGOES', 'MUSHROOMS', 'NECTARINES', 'OKRA', 'ORANGES', 'PEACHES', 'PEARS',
    'PEAS+GREEN', 'PEPPERS%2C+BELL+TYPE', 'PINEAPPLES', 'PLUMS', 'POTATOES',
    'PUMPKINS', 'RADISHES', 'RASPBERRIES', 'RHUBARB', 'SPINACH', 'SQUASH',
    'STRAWBERRIES', 'SWEET+POTATOES', 'TOMATOES', 'TURNIPS', 'WATERMELONS',
]
shortveg = [
    'APL', 'APR', 'ASP', 'AVOC', 'BAN', 'BNS', 'BTS',
    'BLKBERI-V', 'BLUBY', 'BROC', 'BRSPT', 'CAB',
    'CANT', 'CARR', 'CAUL', 'CEL', 'CHER', 'CLEM',
    'CORN', 'CRBY', 'CUX', 'EGPLT', 'END', 'GARLIC',
    'GNGRT', 'GRPFT', 'GRPS', 'HDEW', 'KALEGRNS', 'KIWI',
    'KOHLRABI', 'LEM', 'LETT', 'LETTR',
    'RDLFLET-V', 'GRNLFLET-V', 'BIBBLET-V', 'LIM',
    'MANGO', 'MUSH', 'NECT', 'OKRA', 'ORG', 'PCH', 'PEAR',
    'PEASG', 'PEP', 'PINE', 'PLUM', 'POTS',
    'PUMP', 'RAD', 'RASP', 'RHUB', 'SPIN', 'SQU',
    'STRBY', 'SWPOT', 'TOM', 'TRNP', 'WMEL',
]

# Previously used year list:
#years = ['2007','2008', '2009','2010','2011','2012','2013','2014','2015','2016','2017']
# Currently restricted to the 2018 update.
years = ['2018']

# Destination folder for the downloaded files.
targdir = './raw_data/'

In [12]:
# Loop over all the produce items, cities and years specified above. This can take a while.

# The name lists and short-code lists are paired by position; fail fast, before any
# request is made, if they have drifted out of step.
if len(cities) != len(shortcities):
    raise ValueError('cities and shortcities must have the same length')
if len(veggies) != len(shortveg):
    raise ValueError('veggies and shortveg must have the same length')

# Messages printed after the first and second failed attempt for an item.
# A third consecutive OSError is re-raised and stops the run.
retry_notices = (
    'problem fetching, trying again...',
    'problem fetching, trying one last time...',
)

initial_time = time.time()
for c, short_c in zip(cities, shortcities):
    for v, short_v in zip(veggies, shortveg):
        for y in years:
            # Random pause between successful downloads (no explicit reseeding needed:
            # the random module seeds itself on import).
            sleeptime = random.randint(8, 18)
            for attempt in range(len(retry_notices) + 1):
                try:
                    # fetch_data returns False for items already on disk; those are
                    # skipped without sleeping.
                    if fetch_data(c, short_c, v, short_v, y, targdir):
                        print('fetched '+str(c)+' '+str(v)+' '+str(y)+', sleeping for '+str(sleeptime)+'s')
                        time.sleep(sleeptime)
                    break
                except OSError:
                    if attempt == len(retry_notices):
                        raise
                    print(retry_notices[attempt])
final_time = time.time()
print('fetched in '+str(round(final_time-initial_time, 2))+'s')

fetched LOS+ANGELES LETTUCE%2C+BIBB 2018, sleeping for 14s
fetched LOS+ANGELES LIMES 2018, sleeping for 17s
problem fetching, trying again...
request timed out, trying again...
fetched LOS+ANGELES MANGOES 2018, sleeping for 14s
fetched LOS+ANGELES MUSHROOMS 2018, sleeping for 18s
fetched LOS+ANGELES NECTARINES 2018, sleeping for 16s
fetched LOS+ANGELES OKRA 2018, sleeping for 11s
fetched LOS+ANGELES ORANGES 2018, sleeping for 14s
fetched LOS+ANGELES PEACHES 2018, sleeping for 15s
fetched LOS+ANGELES PEARS 2018, sleeping for 12s
fetched LOS+ANGELES PEAS+GREEN 2018, sleeping for 15s
fetched LOS+ANGELES PEPPERS%2C+BELL+TYPE 2018, sleeping for 12s
fetched LOS+ANGELES PINEAPPLES 2018, sleeping for 18s
fetched LOS+ANGELES PLUMS 2018, sleeping for 11s
fetched LOS+ANGELES POTATOES 2018, sleeping for 17s
fetched LOS+ANGELES PUMPKINS 2018, sleeping for 9s
fetched LOS+ANGELES RADISHES 2018, sleeping for 16s
fetched LOS+ANGELES RASPBERRIES 2018, sleeping for 10s
fetched LOS+ANGELES RHUBARB 2018, s