# Regression analysis: fit the yield model

We fit each county individually ensuring coefficients exhibit the correct sign. We then check for which counties the yield model adds significant skill by using an F-test.

In [1]:
import numpy as np
import pandas as pd
import geopandas as gp
from sklearn.linear_model import LinearRegression
import sklearn.preprocessing as sklp
import sklearn.metrics as sklm
import matplotlib as mpl
import matplotlib.pyplot as plt

from matplotlib.colors import LinearSegmentedColormap
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib import gridspec
plt.rcParams.update({'font.size': 18})

###Â Data processing

In [18]:
# GMFD
gmfd = pd.read_csv('../agvars/GMFD/agvar_historical_gmfd.csv')
gmfd = gmfd.query("Year <= 2005")
gmfd.sort_values(by = ['GEOID', 'Year'], inplace = True)
gmfd['prcp2'] = gmfd['prcp']**2
gmfd["GEOID"] = gmfd["GEOID"].astype(str).str.zfill(5)
gmfd.set_index(['GEOID', 'Year'], inplace = True)

# USDA with county trends
usda_county = pd.read_csv('../USDA/output/USDA_county_yields_w_county_quad_trends.csv')
usda_county = usda_county.query("year <= 2005")
usda_county['state_fips_code'] = usda_county["state_fips_code"].astype(str).str.zfill(2)
usda_county["GEOID"] = usda_county["GEOID"].astype(str).str.zfill(7)
usda_county.sort_values(by = ['GEOID', 'year'], inplace = True)
usda_county.set_index(['state_fips_code', 'GEOID', 'year'], inplace = True)

def get_features(usda_data, minYears = 23):
    # Merge
    features = pd.merge(usda_data.reset_index().filter(['GEOID', 'year', 'yield_anom', 'log_yield', 'quad_fit']), gmfd.reset_index(), left_on = ['GEOID', 'year'], right_on = ['GEOID', 'Year'], how = 'inner')
    features.drop(columns = ['Year'], inplace = True)
    features.set_index(['GEOID', 'year'], inplace = True)
    
    # Select minimum number of years for each county
    features['count'] = 1.
    minFeatures = features.groupby('GEOID').sum().query('count >= ' + str(minYears)).index
    # Select minimum number of years for each county
    features = features.loc[minFeatures].drop(columns = ['count'])
    print('Final shape of features: ' + str(features.shape))
    return features

In [19]:
usda_county.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,asd_code,county_name,yield,state_alpha,county_code,area,log_yield,quad_fit,yield_anom
state_fips_code,GEOID,year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
56,5604520,1986,20,WESTON,95.0,WY,45,100,4.553877,4.337998,0.215879
56,5604520,1986,20,WESTON,95.0,WY,45,100,4.553877,4.337998,0.215879
56,5604520,1986,20,WESTON,95.0,WY,45,100,4.553877,4.337998,0.215879
56,5604520,1986,20,WESTON,95.0,WY,45,100,4.553877,4.337998,0.215879
56,5604520,1986,20,WESTON,95.0,WY,45,100,4.553877,4.337998,0.215879
