# Normalizing Supply & Demand Data

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import requests
from geopy.geocoders import Nominatim

#supressing fragmented data warnings as the techniques used here do not degrade performance too badly
import warnings  
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

## Input Data
Read more about Input daa in the Readme file

In [2]:
# Input Data Files

# Crop and Consumption Data
cdl_codes = pd.read_csv("input-data/cdl-codes.csv")
county_pixels = pd.read_csv("input-data/county-crops-conus-all.csv")
state_fp_codes = pd.read_csv("input-data/state-fp-codes.csv")
scd_calories = pd.read_csv("input-data/stability_crop_diversity.csv")
cdl_scd_crosswalk = pd.read_csv("input-data/crosswalk-cdl<>stability-crop-diversity.csv")
income_consumption = pd.read_csv("input-data/income-consumption.csv")
lafa_calories = pd.read_csv("input-data/food-availability-2007-2017.csv")

# Imports/Exports Data
food_exports = pd.read_csv("input-data/food-exports-2017.csv", low_memory=False)
food_imports = pd.read_csv("input-data/food-imports-2017.csv")
imports_crosswalk = pd.read_csv("input-data/import-commodities.csv")
port_codes = pd.read_csv("input-data/port-codes.csv", delimiter="|")

# Population Data
age_sex_pop = pd.read_csv("input-data/ACSST5Y2021.S0101-Data.csv", low_memory=False) 
poverty_pop = pd.read_csv("input-data/ACSST5Y2021.S1701-Data.csv", low_memory=False)
race_pop = pd.read_csv("input-data/ACSST5Y2021.S1701-Data.csv", low_memory=False)

# County geo data
county_boundaries = gpd.read_file("input-data/geo-boundaries-2022/cb_2022_us_county_500k.shp")
counties_2021 = gpd.read_file('input-data/geo-boundaries-2021/cb_2021_us_county_500k.shp')



## Output Data
There are several intermediate outputs from this work that we've also made available. (will explain these in more detail in the readme)
- Grown Supply (county level)
- Imported Supply
- Ports geojson
- Consumption by demographics pdf conversions
- Population by county (CONUS and Full US)
- County distance matrix with geojson lines
- Pixels of crops per county
- Pixels of crops per 1km grid

# Crop Data Normalization

## CDL Data Cleaning

In [3]:
# Drops all blank crop codes from the CDL code list
cdl_codes = cdl_codes[cdl_codes.Class_Names != ' ']
cdl_codes = cdl_codes.reset_index(drop=True)

In [4]:
# removing any rows that have less than 100 pixels of crops which is roughly 1 square kilometer
for c in county_pixels.columns:
    if c.startswith("crop"):
        county_pixels[c] = county_pixels[c].mask(county_pixels[c] < 100, 0)
        # might need to change all 0's to NaNs

In [5]:
# Joins state FIP codes to the counties crops data. This is used to join the state name to the counties crops data
state_fp_codes.columns = state_fp_codes.columns.str.lower()
county_pixels["statefp"] = county_pixels["statefp"].astype(int)
state_fp_codes = state_fp_codes[state_fp_codes["state"].notna()]
state_fp_codes["fips"] = state_fp_codes["fips"].astype(int)
county_pixels["statefp"] = county_pixels["statefp"].astype(str)
state_fp_codes["fips"] = state_fp_codes["fips"].astype(str)

# join State column and Postal column to counties based on FIPS and STATEFP
county_pixels = county_pixels.merge(state_fp_codes, left_on="statefp", right_on="fips")
county_pixels = county_pixels.drop(columns=["unnamed: 3", "unnamed: 4", "unnamed: 5", "fips"])

In [6]:
# removing any rows that have less than 100 pixels of crops which is roughly 1 square kilometer
# for every column that begins with crop
for c in county_pixels.columns:
    if c.startswith("crop"):
        # if a value in c is less than 100, replace it with 0
        county_pixels[c] = county_pixels[c].mask(county_pixels[c] < 100, 0)


## Crosswalk 1 CDL -> Stabilty Crop Diversity

In [7]:
# Some of the crop names in the CDL Data refer to a single crop name in the stability crop diversity data
# In these cases we "rollup" these crop values into a single crop value
county_pixels["crop_42"] = county_pixels["crop_42"] + county_pixels["crop_51"]
county_pixels["crop_22"] = county_pixels["crop_22"] + county_pixels["crop_23"] + county_pixels["crop_24"]
county_pixels = county_pixels.drop(columns=["crop_51", "crop_23", "crop_24"])

## Crosswalk 2 Stabilty Crop Diversity -> CDL

In [8]:
# production has data from many years 
# This takes each crop from each state and adds the row that is closest to 2017 (the year our cdl data is from) to a dataframe called rel_prod
rel_prod_list = []
for s in scd_calories["State_Abbr"].unique():
    state = scd_calories[scd_calories["State_Abbr"] == s]
    for c in state["Crop_Name"].unique():
        crop = state[state["Crop_Name"] == c]
        rel_prod_list.append(crop.iloc[(crop['Year']-2017).abs().argsort()[:1]])


rel_prod = pd.concat(rel_prod_list)

In [9]:
# This function controls how crop names from the stability crop diversity dataset are consolidated
# This will be called for crops in the stability crop diversity dataset that need to be combined into one CDL crop code
def crop_rollup(crops, rel_prod):
    roll_df = pd.DataFrame()
    for s in rel_prod["State_Abbr"].unique():
        state = rel_prod[rel_prod["State_Abbr"] == s]
        temp = state[state["Crop_Name"].isin(crops)]
        if len(temp) > 0:
            roll_df = pd.concat([
                roll_df, 
                pd.DataFrame({
                    "State_Abbr": s, 
                    "Year": temp["Year"].max(), 
                    "Crop_Name": crops[0], 
                    "Price_Received_USD_kg": temp["Price_Received_USD_kg"].mean(), 
                    "Crop_Area_ha": temp["Crop_Area_ha"].sum(), 
                    "Production_kg": temp["Production_kg"].sum(), 
                    "FIPS": temp["FIPS"].iloc[0], 
                    "kcal_kg": temp["kcal_kg"].mean(),
                    "Production_kcal": temp["Production_kcal"].sum(), 
                    "Production_USD": temp["Production_USD"].sum()
                }, 
                index=[0])
            ])
    roll_df = roll_df.drop_duplicates(subset="State_Abbr", keep="first")
    rel_prod = rel_prod[~rel_prod["Crop_Name"].isin(crops)]
    rel_prod = pd.concat([rel_prod, roll_df])
    return rel_prod

In [10]:
roll1 = ["LETTUCE-HEAD", "LETTUCE-LEAF", "LETTUCE-ROMAINE"]
rel_prod = crop_rollup(roll1, rel_prod)

roll2 = ["PEAS-GREEN", "PEAS-DRY EDIBLE", "PEAS-AUSTRIAN WINTER"]
rel_prod = crop_rollup(roll2, rel_prod)

roll3 = [ "GRAPEFRUIT-ALL CLASSES", "LEMONS-ALL CLASSES", "TANGELOS-ALL CLASSES", "TANGERINES-ALL CLASSES", ]
rel_prod = crop_rollup(roll3, rel_prod)

rel_prod = rel_prod.reset_index(drop=True)

In [11]:
# This adds crop_code to rel_prod and drops any rows from rel_prod that are not in final_crops (e.g. Tobacco)
# MIGHT NEED TO DO MORE HERE TO ROLL UP ROWS AND THEN MAKE THE DROPS
for r in rel_prod["Crop_Name"]:
    if r in cdl_scd_crosswalk["production_name"].values:
        # add the crop_code to rel_prod
        rel_prod.loc[rel_prod["Crop_Name"] == r, "crop_code"] = cdl_scd_crosswalk.loc[cdl_scd_crosswalk["production_name"] == r, "crop_code"].iloc[0]
    else:
        # drop the row from rel_prod if it is not in final_crops
        rel_prod = rel_prod[rel_prod["Crop_Name"] != r]
        print ("dropping "+ r)
rel_prod = rel_prod.reset_index(drop=True)

dropping COTTON-UPLAND
dropping HAY-ALL CLASSES
dropping HOPS-ALL CLASSES
dropping COTTON-PIMA
dropping ARTICHOKES-ALL CLASSES
dropping BEANS-SNAP
dropping BRUSSELS SPROUTS-ALL CLASSES
dropping MELONS-HONEYDEW
dropping PEPPERS-BELL
dropping ESCAROLE & ENDIVE-ALL CLASSES
dropping BOYSENBERRIES-ALL CLASSES
dropping DATES-ALL CLASSES
dropping FIGS-ALL CLASSES
dropping KIWIFRUIT-ALL CLASSES
dropping RASPBERRIES-ALL CLASSES
dropping BEANS-GREEN, LIMA
dropping SPINACH-ALL CLASSES
dropping HAY-ALL CLASSES
dropping BEANS-SNAP
dropping BEANS-GREEN, LIMA
dropping BEANS-SNAP
dropping COTTON-UPLAND
dropping HAY-ALL CLASSES
dropping TOBACCO-ALL CLASSES
dropping ESCAROLE & ENDIVE-ALL CLASSES
dropping PEPPERS-BELL
dropping OKRA-ALL CLASSES
dropping HAY-ALL CLASSES
dropping BEANS-SNAP
dropping BEANS-GREEN, LIMA
dropping HAY-ALL CLASSES
dropping TOBACCO-ALL CLASSES
dropping BEANS-SNAP
dropping HAY-ALL CLASSES
dropping TOBACCO-ALL CLASSES
dropping BEANS-SNAP
dropping BEANS-GREEN, LIMA
dropping BEANS-SNA

In [12]:
# in counties drop any column whose column name starts with "crop" and is not in final_crop[crop_code] column
# this is to remove any columns that are not in the final crop list
for c in county_pixels.columns:
    if c.startswith("crop") and c not in cdl_scd_crosswalk["crop_code"].values:
        county_pixels = county_pixels.drop(c, axis=1)

In [13]:
# drop crop_215 and 210 (Avocadoes and Prunes) from rel_prod because it is not in the CDL data
# For some reason, there were no pixels of avocadoes or prunes in the CDL data
rel_prod = rel_prod[rel_prod["crop_code"] != "crop_215"]
rel_prod = rel_prod[rel_prod["crop_code"] != "crop_210"]
rel_prod = rel_prod.reset_index(drop=True)

In [14]:
# adding total state pixels to rel_prod
for i in range(len(rel_prod)):
    # get the crop_code from the i'th row of rel_prod
    crop_code = rel_prod.loc[i, "crop_code"]
    # get the state from the i'th row of rel_prod
    state = rel_prod.loc[i, "State_Abbr"]
    # get the value of the column in county_pixels that matches crop_code and where rel_counties["Postal"] matches state
    # then, sum the values in that column this is the total number of pixels in that state that are of that crop type.
    state_pixels = county_pixels.loc[county_pixels["postal"] == state, crop_code].sum()
    # add the value of state_pixels to the row in rel_prod
    rel_prod.loc[i, "state_pixels"] = state_pixels

# add kcal_pixel_state as a column to rel_prod
#this column is the ratio of kcal/pixel for that specific state
rel_prod["kcal_pixel_state"] = rel_prod["Production_kcal"] / rel_prod["state_pixels"]
#this column is the ratio of kcal/pixel for that specific state
rel_prod["kg_pixel_state"] = rel_prod["Production_kg"] / rel_prod["state_pixels"]

# change infinite values to NaN
rel_prod.loc[rel_prod["kcal_pixel_state"] == np.inf, "kcal_pixel_state"] = np.nan

In [15]:
for i in range(len(county_pixels)):
    # total_state_kcal = 0
    # total_state_kg = 0
    state = county_pixels.loc[i, "postal"]
    for c in county_pixels.columns:
        if c.startswith("crop") and not pd.isna(county_pixels.loc[i, c]):
            crop_code = c
            crop_pixels = county_pixels.loc[i, c]
            state_kcal_col_name = "kcal_state_"+crop_code
            state_kg_col_name = "kg_state_"+crop_code
            try:
                # kcal calculation
                state_kcal = rel_prod.loc[(rel_prod["crop_code"] == crop_code) & (rel_prod["State_Abbr"] == state), "kcal_pixel_state"].iloc[0]
                # kg calculation
                state_kg = rel_prod.loc[(rel_prod["crop_code"] == crop_code) & (rel_prod["State_Abbr"] == state), "kg_pixel_state"].iloc[0]
            except:
                state_kcal = np.nan
                state_kg = np.nan
            county_pixels.loc[i, state_kcal_col_name] = state_kcal * crop_pixels
            county_pixels.loc[i, state_kg_col_name] = state_kg * crop_pixels

  county_pixels.loc[i, state_kg_col_name] = state_kg * crop_pixels


In [16]:
# using this to do a spot check on the kcal values
check_dict = {}
for c in county_pixels.columns:
    if c.startswith("crop") or c.startswith("kcal") or c.startswith("kg"):
        check_dict[c] = county_pixels[c].sum()
check_dict

{'crop_1': 373150787,
 'crop_3': 9583913,
 'crop_4': 23490620,
 'crop_5': 367925803,
 'crop_6': 5519649,
 'crop_10': 5671798,
 'crop_12': 626744,
 'crop_21': 9739556,
 'crop_22': 188895736,
 'crop_27': 2866515,
 'crop_28': 7069477,
 'crop_29': 3095268,
 'crop_31': 8670916,
 'crop_32': 940553,
 'crop_33': 743887,
 'crop_35': 371859,
 'crop_41': 4742563,
 'crop_42': 8792263,
 'crop_43': 3730983,
 'crop_46': 538012,
 'crop_48': 230276,
 'crop_49': 691123,
 'crop_50': 274165,
 'crop_52': 5086219,
 'crop_53': 6035383,
 'crop_54': 606999,
 'crop_66': 748669,
 'crop_67': 195679,
 'crop_68': 1501536,
 'crop_69': 4383980,
 'crop_72': 452376,
 'crop_74': 2716823,
 'crop_75': 5503767,
 'crop_76': 2355737,
 'crop_77': 159730,
 'crop_204': 1786478,
 'crop_206': 219677,
 'crop_207': 12979,
 'crop_208': 89736,
 'crop_209': 167057,
 'crop_211': 156680,
 'crop_212': 3561138,
 'crop_214': 68989,
 'crop_216': 85220,
 'crop_218': 11913,
 'crop_220': 239092,
 'crop_221': 101253,
 'crop_222': 131161,
 'crop

## Income Consumption Data Cleaning

In [17]:
# keep only most recent data from income consumption data 2007-08
for c in income_consumption.columns:
    if c.startswith("low") or c.startswith("high"):
        if "2007" not in c:
            income_consumption = income_consumption.drop(c, axis=1)

# convert pounds to kilograms
income_consumption["low_2007-08"] = income_consumption["low_2007-08"] * 0.453592
income_consumption["high_2007-08"] = income_consumption["high_2007-08"] * 0.453592

# make an average column that is the average of low and high and income consumption
#this is used for our "other population" later on
income_consumption["avg_2007-08"] = (income_consumption["low_2007-08"] + income_consumption["high_2007-08"]) / 2

income_consumption

Unnamed: 0,Food Type,low_2007-08,high_2007-08,avg_2007-08
0,Fruit; total,53.233557,54.762162,53.997860
1,Apples; total,14.106711,13.063450,13.585080
2,Apples from fruit,5.402281,6.300393,5.851337
3,Apple juice,8.758862,6.504509,7.631685
4,Bananas,4.295516,4.944153,4.619835
...,...,...,...,...
58,Oils; other,0.462664,0.589670,0.526167
59,Caloric sweeteners,36.772703,35.720370,36.246537
60,Nuts; total,2.653513,4.694677,3.674095
61,Peanuts,2.023020,3.016387,2.519704


In [18]:
income_consumption = income_consumption.replace("Apples; total", "Apples")
income_consumption = income_consumption.replace("Oranges; total", "Oranges")

# These are the Crops we're using moving forward. These were selected based on what could be matched to CDL data
curated_crop_names = ['Tree nuts', 'Apples', 'Stone fruits', 'Legumes', 'Berries', 'Broccoli and cauliflower', 'Carrots', 'Celery', 'Corn flour', 'Cucumbers', 'Grapes', 'Lettuce', 'Melons', 'Oat flour', 'Onions', 'Oranges', 'Peanuts', 'Green peas', 'Peppers', 'Potatoes', 'Rice dried', 'Sweet corn', 'Tomatoes', 'Wheat flour', 'Other citrus fruits', "Grains; total", "Brassica; total", "Bananas", "Tropical fruits"]
# Drop all rows that are not in curated_crop_names
mask = income_consumption['Food Type'].isin(curated_crop_names)
income_consumption = income_consumption.loc[mask]
income_consumption = income_consumption.reset_index(drop=True)

In [19]:
# creating an other grain row from Total grain - ('Corn flour' + 'Wheat flour' + 'Rice dried' = 'Oat flour')
grain_rows = income_consumption[income_consumption['Food Type'].isin(['Corn flour', 'Wheat flour', 'Rice dried', 'Oat flour'])]
grain_sum = pd.DataFrame(grain_rows.sum()).transpose()
grain_sum["Food Type"] = "Other grain"
grain_sum["low_2007-08"] = income_consumption.loc[income_consumption["Food Type"] == "Grains; total", "low_2007-08"].iloc[0] - grain_sum["low_2007-08"]
grain_sum["high_2007-08"] = income_consumption.loc[income_consumption["Food Type"] == "Grains; total", "high_2007-08"].iloc[0] - grain_sum["high_2007-08"]
grain_sum["avg_2007-08"] = income_consumption.loc[income_consumption["Food Type"] == "Grains; total", "avg_2007-08"].iloc[0] - grain_sum["avg_2007-08"]
income_consumption = pd.concat([income_consumption, grain_sum], ignore_index=True)
income_consumption = income_consumption[income_consumption['Food Type'] != 'Grains; total']
income_consumption

Unnamed: 0,Food Type,low_2007-08,high_2007-08,avg_2007-08
0,Apples,14.106711,13.06345,13.58508
1,Bananas,4.295516,4.944153,4.619835
2,Berries,2.476612,3.347509,2.912061
3,Grapes,3.96893,4.331804,4.150367
4,Melons,1.873335,2.544651,2.208993
5,Oranges,15.540062,15.06379,15.301926
6,Other citrus fruits,2.458469,2.875773,2.667121
7,Stone fruits,2.662585,3.528946,3.095765
8,Tropical fruits,2.889381,2.394966,2.642173
9,Brassica; total,3.40194,4.599423,4.000681


In [20]:
# create a new row called radishes that is Brassica; total- Broccoli and cauliflower / 2
Radishes = income_consumption[income_consumption['Food Type'] == 'Brassica; total'].copy()
Radishes.loc[Radishes['Food Type'] == "Brassica; total", 'Food Type'] = "Radishes"
Radishes.loc[:, 'low_2007-08'] = (Radishes['low_2007-08'] - income_consumption.loc[income_consumption['Food Type'] == 'Broccoli and cauliflower', 'low_2007-08'].iloc[0]) / 2
Radishes.loc[:, 'high_2007-08'] = (Radishes['high_2007-08'] - income_consumption.loc[income_consumption['Food Type'] == 'Broccoli and cauliflower', 'high_2007-08'].iloc[0]) / 2
Radishes.loc[:, 'avg_2007-08'] = (Radishes['avg_2007-08'] - income_consumption.loc[income_consumption['Food Type'] == 'Broccoli and cauliflower', 'avg_2007-08'].iloc[0]) / 2

# create a new row called Cabbage that is Brassica; total- Broccoli and cauliflower / 2
Cabbage = income_consumption[income_consumption['Food Type'] == 'Brassica; total'].copy()
Cabbage.loc[Cabbage['Food Type'] == 'Brassica; total', 'Food Type'] = 'Cabbage'
Cabbage.loc[:, 'low_2007-08'] = (Cabbage['low_2007-08'] - income_consumption.loc[income_consumption['Food Type'] == 'Broccoli and cauliflower', 'low_2007-08'].iloc[0]) / 2
Cabbage.loc[:, 'high_2007-08'] = (Cabbage['high_2007-08'] - income_consumption.loc[income_consumption['Food Type'] == 'Broccoli and cauliflower', 'high_2007-08'].iloc[0]) / 2
Cabbage.loc[:, 'avg_2007-08'] = (Cabbage['avg_2007-08'] - income_consumption.loc[income_consumption['Food Type'] == 'Broccoli and cauliflower', 'avg_2007-08'].iloc[0]) / 2


income_consumption = pd.concat([income_consumption, Radishes, Cabbage], ignore_index=True)
# income_consumption = income_consumption.drop("Brassica; total", axis=1)
income_consumption

Unnamed: 0,Food Type,low_2007-08,high_2007-08,avg_2007-08
0,Apples,14.106711,13.06345,13.58508
1,Bananas,4.295516,4.944153,4.619835
2,Berries,2.476612,3.347509,2.912061
3,Grapes,3.96893,4.331804,4.150367
4,Melons,1.873335,2.544651,2.208993
5,Oranges,15.540062,15.06379,15.301926
6,Other citrus fruits,2.458469,2.875773,2.667121
7,Stone fruits,2.662585,3.528946,3.095765
8,Tropical fruits,2.889381,2.394966,2.642173
9,Brassica; total,3.40194,4.599423,4.000681


In [21]:
# Transposing income consumption to make it easier to work with during the next steps
income_consumption = income_consumption.set_index("Food Type").transpose()

## Crosswalk 3 County Production -> Consumption by income
1. drop irrelevent crops, do the rollups as needed, and rename all columns to match the demographic consumption data
2. adjust the county_pixelsuction data by how much was consumed by each demographic

*Note we are using income as it's most indicative of the consumption*

In [22]:
# many of the categories originally from CDL data need to be grouped to match up with the consumption by income data

# create Tree Nut column from crop_75, crop_74, crop_204, crop_76
county_pixels["pixels_Tree nuts"] = county_pixels["crop_75"] + county_pixels["crop_74"] + county_pixels["crop_204"] + county_pixels["crop_76"]
county_pixels["kcal_Tree nuts"] = county_pixels["kcal_state_crop_75"] + county_pixels["kcal_state_crop_74"] + county_pixels["kcal_state_crop_204"] + county_pixels["kcal_state_crop_76"]
county_pixels["kg_Tree nuts"] = county_pixels["kg_state_crop_75"] + county_pixels["kg_state_crop_74"] + county_pixels["kg_state_crop_204"] + county_pixels["kg_state_crop_76"]
county_pixels = county_pixels.drop(["crop_75", "crop_74", "crop_204", "crop_76"], axis=1)
county_pixels = county_pixels.drop(["kcal_state_crop_75", "kcal_state_crop_74", "kcal_state_crop_204", "kcal_state_crop_76"], axis=1)
county_pixels = county_pixels.drop(["kg_state_crop_75", "kg_state_crop_74", "kg_state_crop_204", "kg_state_crop_76"], axis=1)

# create Other grain column from crop_27 and crop_21
county_pixels["pixels_Other grain"] = county_pixels["crop_27"] + county_pixels["crop_21"]
county_pixels["kcal_Other grain"] = county_pixels["kcal_state_crop_27"] + county_pixels["kcal_state_crop_21"]
county_pixels["kg_Other grain"] = county_pixels["kg_state_crop_27"] + county_pixels["kg_state_crop_21"]
county_pixels = county_pixels.drop(["crop_27", "crop_21"], axis=1)
county_pixels = county_pixels.drop(["kcal_state_crop_27", "kcal_state_crop_21"], axis=1)
county_pixels = county_pixels.drop(["kg_state_crop_27", "kg_state_crop_21"], axis=1)

# create Broccoli and cauliflower column from crop_214 and crop_244
county_pixels["pixels_Broccoli and cauliflower"] = county_pixels["crop_214"] + county_pixels["crop_244"]
county_pixels["kcal_Broccoli and cauliflower"] = county_pixels["kcal_state_crop_214"] + county_pixels["kcal_state_crop_244"]
county_pixels["kg_Broccoli and cauliflower"] = county_pixels["kg_state_crop_214"] + county_pixels["kg_state_crop_244"]
county_pixels = county_pixels.drop(["crop_214", "crop_244"], axis=1)
county_pixels = county_pixels.drop(["kcal_state_crop_214", "kcal_state_crop_244"], axis=1)
county_pixels = county_pixels.drop(["kg_state_crop_214", "kg_state_crop_244"], axis=1)

# create Stone fruits column from crop_223  crop_66  crop_67  crop_220  crop_210
county_pixels["pixels_Stone fruits"] = county_pixels["crop_223"] + county_pixels["crop_66"] + county_pixels["crop_67"] + county_pixels["crop_220"]
county_pixels["kcal_Stone fruits"] = county_pixels["kcal_state_crop_223"] + county_pixels["kcal_state_crop_66"] + county_pixels["kcal_state_crop_67"] + county_pixels["kcal_state_crop_220"]
county_pixels["kg_Stone fruits"] = county_pixels["kg_state_crop_223"] + county_pixels["kg_state_crop_66"] + county_pixels["kg_state_crop_67"] + county_pixels["kg_state_crop_220"]
county_pixels = county_pixels.drop(["crop_223", "crop_66", "crop_67", "crop_220"], axis=1)
county_pixels = county_pixels.drop(["kcal_state_crop_223", "kcal_state_crop_66", "kcal_state_crop_67", "kcal_state_crop_220"], axis=1)
county_pixels = county_pixels.drop(["kg_state_crop_223", "kg_state_crop_66", "kg_state_crop_67", "kg_state_crop_220"], axis=1)

# create Berries column from crop_242 crop_250 crop_221
county_pixels["pixels_Berries"] = county_pixels["crop_242"] + county_pixels["crop_250"] + county_pixels["crop_221"]
county_pixels["kcal_Berries"] = county_pixels["kcal_state_crop_242"] + county_pixels["kcal_state_crop_250"] + county_pixels["kcal_state_crop_221"]
county_pixels["kg_Berries"] = county_pixels["kg_state_crop_242"] + county_pixels["kg_state_crop_250"] + county_pixels["kg_state_crop_221"]
county_pixels = county_pixels.drop(["crop_242", "crop_250", "crop_221"], axis=1)
county_pixels = county_pixels.drop(["kcal_state_crop_242", "kcal_state_crop_250", "kcal_state_crop_221"], axis=1)
county_pixels = county_pixels.drop(["kg_state_crop_242", "kg_state_crop_250", "kg_state_crop_221"], axis=1)

# create Legumes column from crop_42 crop_52
county_pixels["pixels_Legumes"] = county_pixels["crop_42"] + county_pixels["crop_52"]
county_pixels["kcal_Legumes"] = county_pixels["kcal_state_crop_42"] + county_pixels["kcal_state_crop_52"]
county_pixels["kg_Legumes"] = county_pixels["kg_state_crop_42"] + county_pixels["kg_state_crop_52"]
county_pixels = county_pixels.drop(["crop_42", "crop_52"], axis=1)
county_pixels = county_pixels.drop(["kcal_state_crop_42", "kcal_state_crop_52"], axis=1)
county_pixels = county_pixels.drop(["kg_state_crop_42", "kg_state_crop_52"], axis=1)

# create Melons column from crop_209 crop_48
county_pixels["pixels_Melons"] = county_pixels["crop_209"] + county_pixels["crop_48"]
county_pixels["kcal_Melons"] = county_pixels["kcal_state_crop_209"] + county_pixels["kcal_state_crop_48"]
county_pixels["kg_Melons"] = county_pixels["kg_state_crop_209"] + county_pixels["kg_state_crop_48"]
county_pixels = county_pixels.drop(["crop_209", "crop_48"], axis=1)
county_pixels = county_pixels.drop(["kcal_state_crop_209", "kcal_state_crop_48"], axis=1)
county_pixels = county_pixels.drop(["kg_state_crop_209", "kg_state_crop_48"], axis=1)

In [23]:
# drop crops not listed in consumption data
drop_list = [ "crop_207", "crop_215", "crop_31", "crop_248", "crop_32", "crop_208", "crop_29", "crop_35", "crop_218", "crop_211", "crop_77", "crop_229", "crop_33", "crop_4", "crop_5", "crop_222", "crop_41", "crop_6", "crop_46", ]
kcal_drop_list = ["kcal_state_" + item for item in drop_list]
kg_drop_list = ["kg_state_" + item for item in drop_list]

# for every value in drop_list drop the column from county_pixels if it exists
for item in drop_list:
    if item in county_pixels.columns:
        county_pixels = county_pixels.drop(item, axis=1)
for item in kcal_drop_list:
    if item in county_pixels.columns:
        county_pixels = county_pixels.drop(item, axis=1)
for item in kg_drop_list:
    if item in county_pixels.columns:
        county_pixels = county_pixels.drop(item, axis=1)

In [24]:
# making a dictionary that will be used to rename columns in county_pixels
code_name_map = { 'crop_68': 'Apples', 'crop_206': 'Carrots', 'crop_245': 'Celery', 'crop_1': 'Corn flour', 'crop_50': 'Cucumbers', 'crop_69': 'Grapes', 'crop_227': 'Lettuce', 'crop_28': 'Oat flour', 'crop_49': 'Onions', 'crop_212': 'Oranges', 'crop_10': 'Peanuts', 'crop_53': 'Green peas', 'crop_216': 'Peppers', 'crop_43': 'Potatoes', 'crop_3': 'Rice dried', 'crop_12': 'Sweet corn', 'crop_54': 'Tomatoes', 'crop_22': 'Wheat flour', 'crop_72': 'Other citrus fruits', 'crop_243': 'Cabbage', 'crop_246': 'Radishes'}
kcal_code_name_map = { "kcal_state_" + key: value for (key, value) in code_name_map.items() }
kcal_code_name_map = { key: "kcal_" + value for (key, value) in kcal_code_name_map.items() }
kg_code_name_map = { "kg_state_" + key: value for (key, value) in code_name_map.items() }
kg_code_name_map = { key: "kg_" + value for (key, value) in kg_code_name_map.items() }
code_name_map = { key: "pixels_" + value for (key, value) in code_name_map.items() }

In [25]:
# for every column in county_prod that matches a key in code_name_map, rename the column to the value in code_name_map
county_pixels = county_pixels.rename(columns=code_name_map)
county_pixels = county_pixels.rename(columns=kcal_code_name_map)
county_pixels = county_pixels.rename(columns=kg_code_name_map)
for c in county_pixels.columns:
    print (c)

statefp
countyfp
countyns
geoid
name
namelsad
lsad
classfp
mtfcc
csafp
cbsafp
metdivfp
funcstat
aland
awater
intptlat
intptlon
pixels_Corn flour
pixels_Rice dried
pixels_Peanuts
pixels_Sweet corn
pixels_Wheat flour
pixels_Oat flour
pixels_Potatoes
pixels_Onions
pixels_Cucumbers
pixels_Green peas
pixels_Tomatoes
pixels_Apples
pixels_Grapes
pixels_Other citrus fruits
pixels_Carrots
pixels_Oranges
pixels_Peppers
pixels_Lettuce
pixels_Cabbage
pixels_Celery
pixels_Radishes
state
postal
kcal_Corn flour
kg_Corn flour
kcal_Rice dried
kg_Rice dried
kcal_Peanuts
kg_Peanuts
kcal_Sweet corn
kg_Sweet corn
kcal_Wheat flour
kg_Wheat flour
kcal_Oat flour
kg_Oat flour
kcal_Potatoes
kg_Potatoes
kcal_Onions
kg_Onions
kcal_Cucumbers
kg_Cucumbers
kcal_Green peas
kg_Green peas
kcal_Tomatoes
kg_Tomatoes
kcal_Apples
kg_Apples
kcal_Grapes
kg_Grapes
kcal_Other citrus fruits
kg_Other citrus fruits
kcal_Carrots
kg_Carrots
kcal_Oranges
kg_Oranges
kcal_Peppers
kg_Peppers
kcal_Lettuce
kg_Lettuce
kcal_Cabbage
kg_Cabb

## Crosswalk 4 2007 consumption -> 2017 consumption
This is necessary as our prodction data is from 2017 but our consumption data is currently from 2007. We're going to scale this by the Food abailability per capita data

In [26]:
# cleaning income_consumption data
# income_consumption = income_consumption.rename(columns={"Unnamed: 0": "Income"})
income_consumption = income_consumption.drop("Brassica; total", axis=1)

In [27]:
lafa_calories = lafa_calories.set_index("Year")

In [28]:
# for every column in income_consumption multiply that columns values by the value in the 3rd row of the food_availability column with the same name
for c in income_consumption.columns:
    income_consumption[c] = income_consumption[c] / lafa_calories.loc["ratio 2007/2017"][c]
income_consumption = income_consumption.rename(index={"low_2007-08": "low_2017", "high_2007-08": "high_2017", "avg_2007-08": "avg_2017"})

# Population Data

## Age and Sex

In [29]:
# Keeping only relevant columns
rel_columns = ["Geography", "Geographic Area Name", "Estimate!!Total!!Total population", "Estimate!!Male!!Total population", "Estimate!!Male!!Total population!!AGE!!Under 5 years", "Estimate!!Male!!Total population!!AGE!!5 to 9 years", "Estimate!!Male!!Total population!!AGE!!10 to 14 years", "Estimate!!Male!!Total population!!AGE!!15 to 19 years", "Estimate!!Male!!Total population!!AGE!!20 to 24 years", "Estimate!!Male!!Total population!!AGE!!25 to 29 years", "Estimate!!Male!!Total population!!AGE!!30 to 34 years", "Estimate!!Male!!Total population!!AGE!!35 to 39 years", "Estimate!!Male!!Total population!!AGE!!40 to 44 years", "Estimate!!Male!!Total population!!AGE!!45 to 49 years", "Estimate!!Male!!Total population!!AGE!!50 to 54 years", "Estimate!!Male!!Total population!!AGE!!55 to 59 years", "Estimate!!Male!!Total population!!AGE!!60 to 64 years", "Estimate!!Male!!Total population!!AGE!!65 to 69 years", "Estimate!!Male!!Total population!!AGE!!70 to 74 years", "Estimate!!Male!!Total population!!AGE!!75 to 79 years", "Estimate!!Male!!Total population!!AGE!!80 to 84 years", "Estimate!!Male!!Total population!!AGE!!85 years and over", "Estimate!!Female!!Total population", "Estimate!!Female!!Total population!!AGE!!Under 5 years", "Estimate!!Female!!Total population!!AGE!!5 to 9 years", "Estimate!!Female!!Total population!!AGE!!10 to 14 years", "Estimate!!Female!!Total population!!AGE!!15 to 19 years", "Estimate!!Female!!Total population!!AGE!!20 to 24 years", "Estimate!!Female!!Total population!!AGE!!25 to 29 years", "Estimate!!Female!!Total population!!AGE!!30 to 34 years", "Estimate!!Female!!Total population!!AGE!!35 to 39 years", "Estimate!!Female!!Total population!!AGE!!40 to 44 years", "Estimate!!Female!!Total population!!AGE!!45 to 49 years", "Estimate!!Female!!Total population!!AGE!!50 to 54 years", "Estimate!!Female!!Total population!!AGE!!55 to 59 years", "Estimate!!Female!!Total population!!AGE!!60 to 64 years", "Estimate!!Female!!Total population!!AGE!!65 to 69 years", "Estimate!!Female!!Total population!!AGE!!70 to 74 years", "Estimate!!Female!!Total population!!AGE!!75 to 79 years", "Estimate!!Female!!Total population!!AGE!!80 to 84 years", "Estimate!!Female!!Total population!!AGE!!85 years and over", ]
age_sex_pop = age_sex_pop[[col for col in age_sex_pop.columns if col in rel_columns]]

In [30]:
# Cleaning column names
age_sex_pop.columns = age_sex_pop.columns.str.replace('Estimate!!', '')
age_sex_pop.columns = age_sex_pop.columns.str.replace('AGE!!', '')
age_sex_pop.columns = age_sex_pop.columns.str.replace('Male!!Total population!!', 'male_')
age_sex_pop.columns = age_sex_pop.columns.str.replace('Female!!Total population!!', 'female_')
age_sex_pop.columns = age_sex_pop.columns.str.replace('Male!!Total population', 'male_total')
age_sex_pop.columns = age_sex_pop.columns.str.replace('Female!!Total population', 'female_total')
age_sex_pop.columns = age_sex_pop.columns.str.replace('Total!!Total population', 'total_population')
age_sex_pop.columns = age_sex_pop.columns.str.replace(' to ', '-')
age_sex_pop.columns = age_sex_pop.columns.str.replace(' years', '')
age_sex_pop.columns = age_sex_pop.columns.str.replace(' and over', '+')
age_sex_pop.columns = age_sex_pop.columns.str.replace(' ', '_')
age_sex_pop.columns = age_sex_pop.columns.str.lower()

In [31]:
# Grouping into kids and adults
grouped_pop = age_sex_pop[['geography', 'geographic_area_name', 'total_population', 'male_total', 'female_total',]].copy()
grouped_pop['male_kids'] = age_sex_pop.loc[:, 'male_under_5':'male_15-19'].sum(axis=1)
grouped_pop['male_adults'] = age_sex_pop.loc[:, 'male_20-24':'male_85+'].sum(axis=1)
grouped_pop['female_kids'] = age_sex_pop.loc[:, 'female_under_5':'female_15-19'].sum(axis=1)
grouped_pop['female_adults'] = age_sex_pop.loc[:, 'female_20-24':'female_85+'].sum(axis=1)
grouped_pop['kid_total'] = grouped_pop['female_kids'] + grouped_pop['male_kids']
grouped_pop['adult_total'] = grouped_pop['female_adults'] + grouped_pop['male_adults']

## Poverty

In [32]:
poverty_pop = poverty_pop[[col for col in poverty_pop.columns if 'Annotation' not in col]]
poverty_pop = poverty_pop[[col for col in poverty_pop.columns if 'Margin of Error' not in col]]

In [33]:
pov_columns = ["Geography", "Geographic Area Name", "Estimate!!Total!!Population for whom poverty status is determined", "Estimate!!Total!!Population for whom poverty status is determined!!ALL INDIVIDUALS WITH INCOME BELOW THE FOLLOWING POVERTY RATIOS!!185 percent of poverty level"]
poverty_pop = poverty_pop[[col for col in poverty_pop.columns if col in pov_columns]]

In [34]:
# Grouping into high and low income. Also adding total population column
poverty_pop = poverty_pop.rename(columns={'Geography': 'geography'})
poverty_pop = poverty_pop.merge(grouped_pop[['geography', 'total_population']], on='geography', how='left')

# Cleaning up column names
poverty_pop = poverty_pop.rename(columns={'Estimate!!Total!!Population for whom poverty status is determined!!ALL INDIVIDUALS WITH INCOME BELOW THE FOLLOWING POVERTY RATIOS!!185 percent of poverty level': 'low_income'})
poverty_pop = poverty_pop.rename(columns={'Estimate!!Total!!Population for whom poverty status is determined': 'total_population_determined'})
poverty_pop['high_income'] = poverty_pop['total_population_determined'] - poverty_pop['low_income']
poverty_pop['unknown_income'] = poverty_pop['total_population'] - poverty_pop['total_population_determined']
poverty_pop.columns = poverty_pop.columns.str.lower()
poverty_pop.columns = poverty_pop.columns.str.replace(' ', '_')

## Race

In [35]:
race_columns = ["Geography", "Geographic Area Name", "Estimate!!Total!!Population for whom poverty status is determined", "Estimate!!Total!!Population for whom poverty status is determined!!RACE AND HISPANIC OR LATINO ORIGIN!!White alone, not Hispanic or Latino", "Estimate!!Total!!Population for whom poverty status is determined!!RACE AND HISPANIC OR LATINO ORIGIN!!Black or African American alone", "Estimate!!Total!!Population for whom poverty status is determined!!RACE AND HISPANIC OR LATINO ORIGIN!!Hispanic or Latino origin (of any race)"]
race_pop = race_pop[[col for col in race_pop.columns if col in race_columns]]

In [36]:
# grouping columns and cleaning up column names
race_pop = race_pop.rename(columns={'Geography': 'geography'})
race_pop = race_pop.merge(grouped_pop[['geography', 'total_population']], on='geography', how='left')
race_pop = race_pop.rename(columns={'Estimate!!Total!!Population for whom poverty status is determined': 'total_population_determined'})
race_pop = race_pop.rename(columns={'Estimate!!Total!!Population for whom poverty status is determined!!RACE AND HISPANIC OR LATINO ORIGIN!!Black or African American alone': 'black-african_american'})
race_pop = race_pop.rename(columns={'Estimate!!Total!!Population for whom poverty status is determined!!RACE AND HISPANIC OR LATINO ORIGIN!!Hispanic or Latino origin (of any race)': 'hispanic-latino'})
race_pop = race_pop.rename(columns={'Estimate!!Total!!Population for whom poverty status is determined!!RACE AND HISPANIC OR LATINO ORIGIN!!White alone, not Hispanic or Latino': 'white'})
race_pop['other_race'] = race_pop['total_population_determined']- (race_pop['black-african_american'] + race_pop['hispanic-latino'] + race_pop['white'])
race_pop['unknown_race'] = race_pop['total_population'] - race_pop['total_population_determined']
race_pop.columns = race_pop.columns.str.lower()
race_pop.columns = race_pop.columns.str.replace(' ', '_')

## Joining Population Data

In [37]:
race_pop.columns = ['race_' + col if col not in ['geographic_area_name', 'geography'] else col for col in race_pop.columns]
poverty_pop.columns = ['income_' + col if col not in ['geographic_area_name', 'geography'] else col for col in poverty_pop.columns]
grouped_pop.columns = ['agesex_' + col if col not in ['geographic_area_name', 'geography'] else col for col in grouped_pop.columns]

In [38]:
race_pov = poverty_pop.merge(race_pop, on='geography')
agesex_race_pov = race_pov.merge(grouped_pop, on='geography')
columns_to_keep = ['geography', 'geographic_area_name', 'agesex_total_population', 'agesex_male_total', 'agesex_female_total', 'agesex_male_kids', 'agesex_male_adults', 'agesex_female_kids', 'agesex_female_adults', 'agesex_kid_total', 'agesex_adult_total', 'income_total_population_determined', 'income_low_income', 'income_total_population', 'income_high_income', 'income_unknown_income', 'race_total_population_determined', 'race_black-african_american', 'race_hispanic-latino', 'race_white', 'race_total_population', 'race_other_race', 'race_unknown_race']
joined_pop = agesex_race_pov[columns_to_keep]

In [39]:
joined_pop = joined_pop.rename(columns={'agesex_total_population': 'total_population'})
joined_pop = joined_pop.drop(columns=['income_total_population', 'race_total_population'])

In [40]:
# write to output csv
joined_pop.to_csv('output-data/population-data.csv', index=False)

## Joining Population to County Geodata

In [41]:
county_boundaries = county_boundaries.set_geometry("geometry")
county_boundaries = county_boundaries.to_crs(epsg=4326)

In [42]:
# this is to deal with Connecticut now using management regions instead of counties
counties_2021 = counties_2021.set_geometry("geometry")
counties_2021 = counties_2021.to_crs(epsg=4326)
county_boundaries = county_boundaries[county_boundaries["STUSPS"] != "CT"]
ct_df = counties_2021[counties_2021["STUSPS"] == "CT"]
ct_df = ct_df.set_geometry("geometry")
ct_df = ct_df.to_crs(epsg=4326)

# add CT boundaries to county_boundaries
county_boundaries = pd.concat([county_boundaries, ct_df], ignore_index=True)  
county_boundaries.columns = county_boundaries.columns.str.lower()
county_boundaries = county_boundaries.reset_index(drop=True)

In [43]:
# merging population with counties
pop_counties = joined_pop.merge(county_boundaries, left_on='geography', right_on='affgeoid')
pop_counties = pop_counties.drop(columns=['affgeoid'])
pop_counties.columns = pop_counties.columns.str.lower()
pop_counties = gpd.GeoDataFrame(pop_counties, geometry='geometry')
pop_counties = pop_counties.to_crs(epsg=4326)
pop_counties_conus = pop_counties[~pop_counties['state_name'].isin(['Puerto Rico', 'Alaska', 'Hawaii'])]

In [44]:
pop_counties.to_file("output-data/population-demographics-county.geojson", driver='GeoJSON')

# Import Data

## Creating Import Data

In [45]:
# This was run to get all food imported to and exported from the USA in 2017
# This was saved to a csv file and read in at the beginning of this notebook.
# To run this code change fetch_imports to True

fetch_imports = True

if fetch_imports == True:
    import_url = 'https://api.census.gov/data/timeseries/intltrade/imports/porths?get=PORT,CTY_CODE,I_COMMODITY,I_COMMODITY_SDESC,GEN_VAL_MO,AIR_VAL_MO,AIR_WGT_MO,CNT_WGT_MO,CNT_VAL_MO&YEAR=2017&SUMMARY_LVL=DET&COMM_LVL=HS6&I_COMMODITY='
    export_url = 'https://api.census.gov/data/timeseries/intltrade/exports/porths?get=PORT,CTY_CODE,E_COMMODITY,E_COMMODITY_SDESC,AIR_VAL_MO,AIR_WGT_MO,CNT_WGT_MO,CNT_VAL_MO&YEAR=2017&SUMMARY_LVL=DET&COMM_LVL=HS6&E_COMMODITY='
    categories = ['07*', '08*', '10*', '11*', '20*']
    food_imports = pd.DataFrame()
    food_exports = pd.DataFrame()
    for c in categories:
        r = requests.get(export_url+c)
        print (c + " export request made")
        food_exports = pd.concat([food_exports, pd.DataFrame(r.json()[1:], columns=r.json()[0])])
        r = requests.get(import_url+c)
        print (c + " import request made")
        food_imports = pd.concat([food_imports, pd.DataFrame(r.json()[1:], columns=r.json()[0])])
else:
    try:
        food_imports = pd.read_csv("input-data/food-imports.csv")
        food_exports = pd.read_csv("input-data/food-exports.csv")
    except:
        print ("No file found. Try setting fetch_imports to True and run again.")


07* export request made
07* import request made
08* export request made
08* import request made
10* export request made
10* import request made
11* export request made
11* import request made
20* export request made
20* import request made


In [46]:
# this cell adds total calories column, adds FDC ID column, and drops rows with no total calories
# converts kg to kcal using the import commodities table
def import_kg_to_kcal(commodity, kg):
    return int(kg) * imports_crosswalk[imports_crosswalk['I_COMMODITY_SDESC'] == commodity]['kcal/kg'].values[0]
def import_commodity_to_FDCID(commodity):
    return imports_crosswalk[imports_crosswalk['I_COMMODITY_SDESC'] == commodity]['FDC ID'].values[0]
def import_commodity_to_croptype(commodity):
    return imports_crosswalk[imports_crosswalk['I_COMMODITY_SDESC'] == commodity]['crop_type'].values[0]

# create a total_kg column of food_imports that is the sum of the CNT_WGT_MO column and the AIR_WGT_MO column
food_imports['total_kg'] = food_imports['CNT_WGT_MO'] + food_imports['AIR_WGT_MO']
# for each row in food_imports, call import_kg_to_kcal with the I_COMMODITY_SDESC and total_kg columns as arguments and assign the result to a new column called total_kcal
food_imports['total_kcal'] = food_imports.apply(lambda row: import_kg_to_kcal(row['I_COMMODITY_SDESC'], row['total_kg']), axis=1)
# for each row in food_imports, call import_commodity_to_croptype with the I_COMMODITY_SDESC and total_kg columns as arguments and assign the result to a new column called crop_type
food_imports['crop_type'] = food_imports.apply(lambda row: import_commodity_to_croptype(row['I_COMMODITY_SDESC']), axis=1)
# for each row in food_imports, call import_commodity_to_FDCID with the I_COMMODITY_SDESC and total_kg columns as arguments and assign the result to a new column called FDC ID
food_imports['FDC ID'] = food_imports.apply(lambda row: import_commodity_to_FDCID(row['I_COMMODITY_SDESC']), axis=1)

# Sometimes imports are only reported in value, not weight. this section drops those rows
food_imports = food_imports.dropna(subset=['total_kcal'])

food_imports.reset_index(inplace=True, drop=True)


In [47]:
# This adds geocoordinates for each of the ports
geolocator = Nominatim(user_agent="https://theplotline.org/")
for port in port_codes['Name']:
    try:
        location = geolocator.geocode(port)
        lat = location.raw['lat']
        lon = location.raw['lon']
        port_codes.loc[port_codes['Name'] == port, 'lat'] = lat
        port_codes.loc[port_codes['Name'] == port, 'lon'] = lon
    except:
        port_codes.loc[port_codes['Name'] == port, 'lat'] = np.nan
        port_codes.loc[port_codes['Name'] == port, 'lon'] = np.nan

# creating a geodataframe from port_codes dataframe
port_codes_gdf = gpd.GeoDataFrame(port_codes, geometry=gpd.points_from_xy(port_codes['lon'], port_codes['lat']))
port_codes_gdf = port_codes_gdf.set_crs(epsg=4326)
# drop anything without a lat or lon
port_codes_gdf = port_codes_gdf.dropna(subset=['lon'])
# drop lat and lon as we have a geometry column now
port_codes_gdf.drop(['lat', 'lon'], axis=1, inplace=True)
port_codes_gdf.reset_index(inplace=True, drop=True)


In [48]:
# reformatting port codes as strings rather than floats and ensuring all are in proper 4 digit format
port_codes_gdf['Port Code'] = port_codes_gdf['Port Code'].astype(str)
port_codes_gdf['Port Code'] = port_codes_gdf['Port Code'].astype(str).str.replace('.0', '')
port_codes_gdf['Port Code'] = port_codes_gdf['Port Code'].apply(lambda x: '{0:0>4}'.format(x))
port_codes_gdf['Port Code'] = port_codes_gdf['Port Code'].replace('0nan', np.nan)


In [49]:
# adding county names to port_codes_gdf
port_codes_gdf = gpd.sjoin(port_codes_gdf, pop_counties[['geoid', 'geography', 'geographic_area_name', 'geometry']], how='left', op='within')
port_codes_gdf.drop('index_right', axis=1, inplace=True)
#writing to output file
port_codes_gdf.to_file("output-data/us-port-locations.geojson", driver='GeoJSON')

  if await self.run_code(code, result, async_=asy):


In [50]:
# joining food imports and port codes. Note, port codes includes a geometry and county name at this point.
food_imports = food_imports.merge(port_codes_gdf, left_on='PORT', right_on='Port Code', how='left')
food_imports.drop('Port Code', axis=1, inplace=True)

# drop all values with drop as crop_type
food_imports = food_imports[food_imports['crop_type'] != 'drop']
food_imports = food_imports.dropna(subset=['geoid'])

# drop any values that are total_kg = 0
food_imports = food_imports[food_imports['total_kg'] != 0]
food_imports['geoid'] = food_imports['geoid'].astype(str).replace('\.0$', '', regex=True)
food_imports.reset_index(inplace=True, drop=True)

food_imports.to_csv("output-data/food-imports-ports.csv", index=False)

In [51]:
food_imports = food_imports.drop('I_COMMODITY', axis=1)
food_imports

Unnamed: 0,PORT,CTY_CODE,I_COMMODITY_SDESC,GEN_VAL_MO,AIR_VAL_MO,AIR_WGT_MO,CNT_WGT_MO,CNT_VAL_MO,YEAR,SUMMARY_LVL,...,total_kg,total_kcal,crop_type,FDC ID,District Code,Name,geometry,geoid,geography,geographic_area_name
0,0106,1220,"POTATOES, EXCEPT SEED, FRESH OR CHILLED, NESOI",6384710,0,0,0,0,2017,DET,...,00,0.0,Potatoes,2344876.0,,"HOULTON, ME",POINT (-67.84079 46.12513),23003,0500000US23003,"Aroostook County, Maine"
1,0108,1220,"POTATOES, EXCEPT SEED, FRESH OR CHILLED, NESOI",79716,0,0,0,0,2017,DET,...,00,0.0,Potatoes,2344876.0,,"VAN BUREN, ME",POINT (-67.93533 47.15751),23003,0500000US23003,"Aroostook County, Maine"
2,0115,1220,"POTATOES, EXCEPT SEED, FRESH OR CHILLED, NESOI",766670,0,0,0,0,2017,DET,...,00,0.0,Potatoes,2344876.0,,"CALAIS, ME",POINT (-67.27814 45.18903),23029,0500000US23029,"Washington County, Maine"
3,0127,1220,"POTATOES, EXCEPT SEED, FRESH OR CHILLED, NESOI",486864,0,0,0,0,2017,DET,...,00,0.0,Potatoes,2344876.0,,"BRIDGEWATER, ME",POINT (-67.84401 46.42792),23003,0500000US23003,"Aroostook County, Maine"
4,0209,1220,"POTATOES, EXCEPT SEED, FRESH OR CHILLED, NESOI",184956,0,0,0,0,2017,DET,...,00,0.0,Potatoes,2344876.0,,"DERBY LINE, VT",POINT (-72.09963 45.00533),50019,0500000US50019,"Orleans County, Vermont"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75969,4909,3330,"CRANBERRY JUICE, LINGONBERRY JUICE",0,0,0,0,0,2017,DET,...,00,0.0,Berries,2003594.0,,"SAN JUAN, PR",POINT (-66.11667 18.46530),72127,0500000US72127,"San Juan Municipio, Puerto Rico"
75970,5201,4280,"CRANBERRY JUICE, LINGONBERRY JUICE",0,0,0,0,0,2017,DET,...,00,0.0,Berries,2003594.0,,"MIAMI, FL",POINT (-80.19362 25.77417),12086,0500000US12086,"Miami-Dade County, Florida"
75971,5201,4700,"CRANBERRY JUICE, LINGONBERRY JUICE",0,0,0,0,0,2017,DET,...,00,0.0,Berries,2003594.0,,"MIAMI, FL",POINT (-80.19362 25.77417),12086,0500000US12086,"Miami-Dade County, Florida"
75972,5201,7290,"CRANBERRY JUICE, LINGONBERRY JUICE",0,0,0,0,0,2017,DET,...,00,0.0,Berries,2003594.0,,"MIAMI, FL",POINT (-80.19362 25.77417),12086,0500000US12086,"Miami-Dade County, Florida"


In [52]:
#convert food_imports to a geodataframe and write to file
food_imports_gdf = gpd.GeoDataFrame(food_imports, geometry='geometry')
food_imports_gdf = food_imports_gdf.set_crs(epsg=4326)
food_imports_gdf.to_file("output-data/food-imports-ports.geojson", driver='GeoJSON')

## Integrating Import Data with Production Data

In [53]:
# Banana and Tropical fruit handler. Because none of this category is grown in the US we need to manually add these columns to the county_prod dataframe
# adding a "pixels_Bananas" and "pixels_Tropical fruits column to county_prod in case we want to use later
county_pixels["pixels_Bananas"] = 0
county_pixels["kcal_Bananas"] = 0
county_pixels["kg_Bananas"] = 0
county_pixels["pixels_Tropical fruits"] = 0
county_pixels["kcal_Tropical fruits"] = 0
county_pixels["kg_Tropical fruits"] = 0

In [54]:
# doing this to make math work when adding pixels to NaN rows
county_pixels = county_pixels.fillna(0)

for r in food_imports.index:

    county_row = county_pixels[county_pixels["geoid"] == int(food_imports.loc[r, "geoid"])]
    crop_type = food_imports.loc[r, "crop_type"]
    crop_kg = int(food_imports.loc[r, "total_kg"])
    crop_kcal = int(food_imports.loc[r, "total_kcal"])

    # add crop_kg to the value in column of county_row that matches "kg_"+crop_type and kcal_crop_type. 
    # Rows we skip are non-CONUS geoids
    try:
        county_pixels.loc[county_pixels["geoid"] == int(food_imports.loc[r, "geoid"]), "kg_"+crop_type] = county_row["kg_"+crop_type].iloc[0] + int(crop_kg)
        county_pixels.loc[county_pixels["geoid"] == int(food_imports.loc[r, "geoid"]), "kcal_"+crop_type] = county_row["kcal_"+crop_type].iloc[0] + int(crop_kcal)
    except:
        print ("skipping " + str(food_imports.loc[r, "geoid"]))

skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 15003
skipping 72127
skipping 72127
skipping 72127
skipping 15003
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 15003
skipping 15003
skipping 15003
skipping 72127
skipping 72127
skipping 15003
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 15003
skipping 15003
skipping 15003
skipping 72127
skipping 15003
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 15003
skipping 72127
skipping 15003
skipping 15003
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 72127
skipping 15003
skipping 72127
skipping 15003
skipping 15003
skipping 15003
skipping 72127
skipping 7

# Scaling Production to Consumption

In [55]:
county_pixels["geoid"] = county_pixels["geoid"].astype(int)
county_pixels = county_pixels.set_index("geoid")
pop_counties_conus["geoid"] = pop_counties_conus["geoid"].astype(int)
pop_counties_conus = pop_counties_conus.set_index("geoid")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [56]:
# this adds consumption columns to pop_counties_conus
# these columns are named in the following schema low_kg_consumed_crop type
# for example low_kg_consumed_apples
for c in income_consumption.columns:
    low_consumption_kg = income_consumption[c]["low_2017"] * pop_counties_conus["income_low_income"]
    high_consumption_kg = income_consumption[c]["high_2017"] * pop_counties_conus["income_high_income"]
    other_consumption_kg = income_consumption[c]["avg_2017"] * pop_counties_conus["income_unknown_income"]
    pop_counties_conus["low_kg_consumed_" + c] = low_consumption_kg
    pop_counties_conus["high_kg_consumed_" + c] = high_consumption_kg
    pop_counties_conus["other_kg_consumed_" + c] = other_consumption_kg
    pop_counties_conus["total_kg_consumed_" + c] = low_consumption_kg + high_consumption_kg + other_consumption_kg


In [57]:
# This calculates a kcal/kg ratio and adds a column to pop_counties_conus with the number of kcal's consumed by income demographic
for c in county_pixels.columns:
    if c.startswith("kg_"):
        crop_name = c[3:]
        kcal_kg_ratio = county_pixels["kcal_" + crop_name].sum() / county_pixels[c].sum()
        pop_counties_conus["low_kcal_consumed_"+crop_name] = pop_counties_conus["low_kg_consumed_" + crop_name] * kcal_kg_ratio
        pop_counties_conus["high_kcal_consumed_"+crop_name] = pop_counties_conus["high_kg_consumed_" + crop_name] * kcal_kg_ratio
        pop_counties_conus["other_kcal_consumed_"+crop_name] = pop_counties_conus["other_kg_consumed_" + crop_name] * kcal_kg_ratio
        pop_counties_conus["total_kcal_consumed_"+crop_name] = pop_counties_conus["total_kg_consumed_" + crop_name] * kcal_kg_ratio

In [58]:
# write county_pixels to output csv
county_pixels.to_csv('output-data/full_production_2017.csv')

In [59]:
# this creates a new dataframe with the values from county_prod scaled to match the total kcal consumed
for c in county_pixels.columns:
    if c.startswith("total_kcal_consumed_"):
        crop_name = c[20:]
        crop_total_consumed = county_pixels[c].sum()
        scale_ratio = crop_total_consumed / county_pixels["kcal_" + crop_name].sum()
        county_pixels["kcal_" + crop_name] = county_pixels["kcal_" + crop_name] * scale_ratio
        county_pixels["kg_" + crop_name] = county_pixels["kg_" + crop_name] * scale_ratio


In [60]:
# this joins kg and kcal values from county_prod_scaled to pop_counties_conus on geoid
pop_counties_conus = pop_counties_conus.join(county_pixels.filter(regex="^kg_|^kcal_"))

for c in pop_counties_conus.columns:
    if c.startswith("kg_"):
        crop_name = c[3:]
        pop_counties_conus = pop_counties_conus.rename(columns={c: "kg_produced_" + crop_name})
    if c.startswith("kcal_"):
        crop_name = c[5:]
        pop_counties_conus = pop_counties_conus.rename(columns={c: "kcal_produced_" + crop_name})

In [61]:
# This is a double check on our data to make sure it's scaled correctly. Summed production and consumption should be the same
for c in pop_counties_conus.columns:
    if c.startswith("kcal_produced_"):
        dc_crop_name = c[14:]
        double_check = pop_counties_conus[c].sum() - pop_counties_conus["kcal_produced_"+dc_crop_name].sum()
        if double_check == 0:
            print (dc_crop_name + " numbers look good!")
        else: 
            print (dc_crop_name + " numbers don't match, difference is " + str(double_check))


Corn flour numbers look good!
Rice dried numbers look good!
Peanuts numbers look good!
Sweet corn numbers look good!
Wheat flour numbers look good!
Oat flour numbers look good!
Potatoes numbers look good!
Onions numbers look good!
Cucumbers numbers look good!
Green peas numbers look good!
Tomatoes numbers look good!
Apples numbers look good!
Grapes numbers look good!
Other citrus fruits numbers look good!
Carrots numbers look good!
Oranges numbers look good!
Peppers numbers look good!
Lettuce numbers look good!
Cabbage numbers look good!
Celery numbers look good!
Radishes numbers look good!
Tree nuts numbers look good!
Other grain numbers look good!
Broccoli and cauliflower numbers look good!
Stone fruits numbers look good!
Berries numbers look good!
Legumes numbers look good!
Melons numbers look good!
Bananas numbers look good!
Tropical fruits numbers look good!


In [62]:
pop_counties_conus.to_file("output-data/county-population-consumption-production-scaled.geojson", driver="GeoJSON")
pop_counties_conus.to_csv("output-data/county-population-consumption-production-scaled.csv")