In [109]:
import requests
import pandas as pd

from utils import state_names, state_names_short, state_alphas

## Crop yields

In [49]:
api_key = 'YOUR-API-KEY-HERE'
api_key = '210BA222-FC6E-3FB2-B4D7-DA2DAA1CC829'

In [195]:
def get_crop(crop_info, states):
    """
    Downloads county-level yield data from USDA NASS API for 1979 - 2020.
    Input: crop_info = dictionary with API request information
           states = list of U.S. state abbreviations
    Output: pandas dataframe
    """
    # Retain only these columns
    retain_cols = ['Value', 'short_desc', 'year', 'county_code', 'state_fips_code']
    
    # Build API request
    request = 'http://quickstats.nass.usda.gov/api/api_GET/?key=' + api_key + '&source_desc=SURVEY&sector_desc=CROPS&group_desc=FIELD CROPS&agg_level_desc=COUNTY&year__GE=1979&year__LE=2020'
    for key in crop_info:
        request += key + '=' + crop_info[key]
    
    # Dataframe for results
    df_out = pd.DataFrame()

    # Loop through states
    for state in states:
        data = requests.get(request + '&state_alpha=' + state)
        if data.status_code == 200:
            data = data.json()
            data = pd.DataFrame(data["data"])
            df_out = pd.concat([df_out, data], ignore_index=True)

    return df_out[retain_cols]

In [16]:
%%time
############# Maize ##############
maize_yield_request = {'&commodity_desc':'CORN', '&util_practice_desc':'GRAIN', '&statisticcat_desc':'YIELD', '&unit_desc':'BU / ACRE'}
maize_area_request = {'&commodity_desc':'CORN', '&util_practice_desc':'GRAIN', '&statisticcat_desc':'AREA HARVESTED'}
maize_yield = get_crop(maize_yield_request, state_names_short)
maize_area = get_crop(maize_area_request, state_names_short)

############## Soy ##############
soy_yield_request = {'&commodity_desc':'SOYBEANS', '&statisticcat_desc':'YIELD', '&unit_desc':'BU / ACRE'}
soy_area_request = {'&commodity_desc':'SOYBEANS', '&statisticcat_desc':'AREA HARVESTED'}
soy_yield = get_crop(soy_yield_request, state_names_short)
soy_area = get_crop(soy_area_request, state_names_short)

CPU times: user 10.5 s, sys: 661 ms, total: 11.1 s
Wall time: 4min 44s


In [26]:
def tidy_yield_area(yield_in, area_in):
    """
    Merge and tidy yield and area data from USDA API
    """
    yields = yield_in.copy()
    areas = area_in.copy()
    
    # Yield tidy
    yields.rename(columns = {'Value':'yield', 'state_fips_code':'state'}, inplace=True)
    yields = yields[yields['county_code'] != '998']
    yields['fips'] = yields['state'] + yields['county_code']
    yields.drop(columns = ['county_code', 'short_desc'], inplace=True)
    yields['yield'] = yields['yield'].astype(float)
    
    # Area tidy
    areas.rename(columns = {'Value':'area', 'state_fips_code':'state'}, inplace=True)
    areas = areas[areas['county_code'] != '998']
    areas['fips'] = areas['state'] + areas['county_code']
    areas.drop(columns = ['county_code', 'short_desc'], inplace=True)
    areas['area'] = areas['area'].str.replace(',','').astype(float)
    
    # Merge
    return pd.merge(yields.drop_duplicates(subset=['fips','year']),
                    areas.drop_duplicates(subset=['fips','year']),
                    on = ['fips', 'year', 'state'],
                    how='inner')

In [29]:
# Maize
print(maize_yield.shape, maize_area.shape)

maize_all = tidy_yield_area(maize_yield, maize_area)
print(maize_all.shape)

maize_all.to_csv('../input_data/usda_maize_yields_1979-2020.csv', index=False)

(69062, 5) (69006, 5)
(50769, 5)


In [30]:
# Soy
print(soy_yield.shape, soy_area.shape)

soy_all = tidy_yield_area(soy_yield, soy_area)
print(soy_all.shape)

soy_all.to_csv('../input_data/usda_soy_yields_1979-2020.csv', index=False)

(61053, 5) (61044, 5)
(46697, 5)


## Crop prices

In [None]:
api_key = 'YOUR-API-KEY-HERE'
api_key = '210BA222-FC6E-3FB2-B4D7-DA2DAA1CC829'

In [46]:
def tidy_prices(prices_in):
    """
    Tidies national historical crop prices from USDA NASS API
    """
    # Output will be pandas dataframe
    prices_in = prices_in.json()
    df_out = pd.DataFrame(prices_in['data'])
    
    # Annual prices
    df_out = df_out[df_out['reference_period_desc'] == 'MARKETING YEAR']
    
    # Rename and select final columns
    df_out['value'] = df_out['Value'].apply(lambda x: float(x))
    df_out.rename(columns = {'value':'price', 'state_fips_code':'state'}, inplace=True)
    
    return df_out[['year', 'price']]

In [47]:
# Maize
maize_price = requests.get('http://quickstats.nass.usda.gov/api/api_GET/?key=' + api_key + '&source_desc=SURVEY&sector_desc=CROPS&group_desc=FIELD CROPS&commodity_desc=CORN' + 
                            '&statisticcat_desc=PRICE RECEIVED&unit_desc=$ / BU&agg_level_desc=NATIONAL&year__GE=1950&year__LE=2016')
maize_price = tidy_prices(maize_price)

# Soy 
soy_price = requests.get('http://quickstats.nass.usda.gov/api/api_GET/?key=' + api_key + '&source_desc=SURVEY&sector_desc=CROPS&group_desc=FIELD CROPS&commodity_desc=CORN' + 
                            '&statisticcat_desc=PRICE RECEIVED&unit_desc=$ / BU&agg_level_desc=NATIONAL&year__GE=1950&year__LE=2016')
soy_price = tidy_prices(soy_price)

In [110]:
############################
# Deflate with PPI data
############################

In [51]:
api_key = 'YOUR-API-KEY-HERE'
api_key = 'b775498337369c2e29920e4cb778c874'

In [97]:
##### Producer Price Index by Commodity: Farm Products
##### Index 1982=100
##### Not Seasonally Adjusted 
##### https://fred.stlouisfed.org/series/WPU01

# HTTP request from STL FRED
data = requests.get('https://api.stlouisfed.org/fred/series/observations?series_id=WPU01&api_key=' + api_key + '&file_type=json')
data = data.json()

# Pandas dataframe
ppi = pd.DataFrame(data['observations'])[['date','value']]
ppi['year'] = pd.to_datetime(ppi['date'])
ppi['value'] = ppi['value'].astype(float)

# We use annual prices
ppi_annual = ppi.set_index('year').resample('1Y').mean()
ppi_annual.rename(columns={'value':'ppi'}, inplace=True)
ppi_annual.index = ppi_annual.index.year

In [105]:
# Merge prices and adjust for inflation (set to 2018 USD)
ppi_2018 = ppi_annual.loc[2018]['ppi']

# Maize
maize_ppi = pd.merge(maize_price, ppi_annual.reset_index(), on='year', how='inner')
maize_ppi['price_ppi'] = maize_ppi['price'] / maize_ppi['ppi'] * ppi_2018
maize_ppi = maize_ppi.sort_values(by='year')

# Soy
soy_ppi = pd.merge(soy_price, ppi_annual.reset_index(), on='year', how='inner')
soy_ppi['price_ppi'] = soy_ppi['price'] / soy_ppi['ppi'] * ppi_2018
soy_ppi = soy_ppi.sort_values(by='year')

In [189]:
# Store
maize_ppi[['year', 'price', 'price_ppi']].to_csv('../input_data/usda_maize_prices_deflated_1950-2016.csv', index=False)
soy_ppi[['year', 'price', 'price_ppi']].to_csv('../input_data/usda_soy_prices_deflated_1950-2016.csv', index=False)

## Irrigation: Water applied

In [145]:
api_key = 'YOUR-API-KEY-HERE'
api_key = '210BA222-FC6E-3FB2-B4D7-DA2DAA1CC829'

In [197]:
def get_crop_water_applied(crop_name, states):
    """
    Downloads state-level irrigation data (water applied) from USDA NASS API for 2013, 2018.
    """
    # Retain only these columns
    retain_cols = ['Value', 'short_desc', 'year', 'state_fips_code', 'state_name']
    
    # Build API request
    request = 'http://quickstats.nass.usda.gov/api/api_GET/?key=' + api_key + '&source_desc=CENSUS&sector_desc=CROPS&group_desc=FIELD CROPS&agg_level_desc=STATE&commodity_desc=' + crop_name + '&statisticcat_desc=WATER APPLIED&unit_desc=ACRE FEET / ACRE&domain_desc=TOTAL'
    
    # Dataframe for results
    df_out = pd.DataFrame()

    # Loop through states
    for state in states:
        data = requests.get(request + '&state_alpha=' + state)
        if data.status_code == 200:
            data = data.json()
            data = pd.DataFrame(data["data"])
            df_out = pd.concat([df_out, data], ignore_index=True)

    return df_out[retain_cols]

In [192]:
%%time
# Maize
maize_irr = get_crop_water_applied('CORN', state_names_short)

# Soy
soy_irr = get_crop_water_applied('SOYBEANS', state_names_short)

CPU times: user 750 ms, sys: 32.5 ms, total: 782 ms
Wall time: 13.1 s


In [193]:
#################
# Tidy and store
#################

retain_cols = ['year', 'state_fips_code', 'state_name', 'water_applied']

# Maize
maize_irr = maize_irr[maize_irr['short_desc'] == 'CORN, GRAIN, IRRIGATED - WATER APPLIED, MEASURED IN ACRE FEET / ACRE'] # subset grain for maize
maize_irr['Value'] = maize_irr['Value'].astype(float) * 304.8 # acre-feet/acre -> mm/acre
maize_irr.rename(columns = {'Value' : 'water_applied'}, inplace=True)
maize_irr[retain_cols].to_csv('../input_data/usda_maize_water_applied_2013-2018.csv', index=False)

# Soy
soy_irr['Value'] = soy_irr['Value'].astype(float) * 304.8 # acre-feet/acre -> mm/acre
soy_irr.rename(columns = {'Value' : 'water_applied'}, inplace=True)
soy_irr[retain_cols].to_csv('../input_data/usda_soy_water_applied_2013-2018.csv', index=False)

## Irrigation: Acres harvested

In [207]:
api_key = 'YOUR-API-KEY-HERE'
api_key = '210BA222-FC6E-3FB2-B4D7-DA2DAA1CC829'

In [240]:
def get_crop_irr_area(crop_name, states):
    """
    Downloads county-level irrigated acreage data from USDA NASS API
    """
    # Retain only these columns
    retain_cols = ['Value', 'short_desc', 'year', 'county_code', 'state_fips_code']
    
    # Build API request
    request = 'http://quickstats.nass.usda.gov/api/api_GET/?key=' + api_key + '&source_desc=CENSUS&sector_desc=CROPS&group_desc=FIELD CROPS&agg_level_desc=COUNTY&commodity_desc=' + crop_name + '&prodn_practice_desc=IRRIGATED&statisticcat_desc=AREA HARVESTED'
    if crop_name == 'CORN':
        request += '&util_practice_desc=GRAIN'
    # Dataframe for results
    df_out = pd.DataFrame()

    # Loop through states
    for state in states:
        data = requests.get(request + '&state_alpha=' + state)
        if data.status_code == 200:
            data = data.json()
            data = pd.DataFrame(data["data"])
            df_out = pd.concat([df_out, data], ignore_index=True)

    return df_out[retain_cols]

In [241]:
%%time
# Maize
maize_irr = get_crop_irr_area('CORN', state_names_short)

# Soy
soy_irr = get_crop_irr_area('SOYBEANS', state_names_short)

CPU times: user 1.29 s, sys: 62.6 ms, total: 1.35 s
Wall time: 57.2 s


In [244]:
def tidy_irr_area(irr_in):
    """
    Tidy irrigated acreage data from USDA API
    """
    irr = irr_in.copy()
    
    # Yield tidy
    irr.rename(columns = {'Value':'irrigated_acreage', 'state_fips_code':'state'}, inplace=True)
    irr = irr[irr['county_code'] != '998']
    irr['fips'] = irr['state'] + irr['county_code']
    irr.drop(columns = ['county_code', 'short_desc'], inplace=True)
    irr['irrigated_acreage'] = irr['irrigated_acreage'].astype(float)
    
    return irr

In [245]:
# Maize
print(maize_irr.shape)

maize_irr_out = tidy_irr_area(maize_irr)
print(maize_irr_out.shape)

# maize_all.to_csv('../input_data/usda_maize_yields_1979-2020.csv', index=False)

(8876, 5)


ValueError: could not convert string to float: '                 (D)'

In [30]:
# Soy
print(soy_yield.shape, soy_area.shape)

soy_all = tidy_yield_area(soy_yield, soy_area)
print(soy_all.shape)

soy_all.to_csv('../input_data/usda_soy_yields_1979-2020.csv', index=False)

(61053, 5) (61044, 5)
(46697, 5)
