In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, GridSearchCV

In [2]:
rand_state = 19

In [3]:
gdd_data = pd.read_csv('/Users/coleromanyk/Downloads/corn_gdd.csv')

In [4]:
gdd_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15651 entries, 0 to 15650
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Year             15651 non-null  object 
 1   Month            15650 non-null  float64
 2   Day              15650 non-null  float64
 3   Accumulated GDD  15650 non-null  float64
dtypes: float64(3), object(1)
memory usage: 489.2+ KB


In [5]:
gdd_data.Year.unique()

array(['Average', 'Median', '1981', '1982', '1983', '1984', '1985',
       '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993',
       '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
       '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018', '2019', '2020', '2021', "''"], dtype=object)

In [6]:
non_year = ['Average', 'Median', "''"]

In [7]:
gdd_data = gdd_data[~gdd_data.Year.isin(non_year)]

In [8]:
gdd_data.Year = pd.to_numeric(gdd_data.Year)

In [9]:
year_range = list(range(1990, 2018))

In [10]:
gdd_filter = gdd_data[(gdd_data['Year'].isin(year_range)) & \
                      (gdd_data['Month'] == gdd_data.Month.max()) & (gdd_data['Day'] == gdd_data.Day.max())]

In [11]:
gdd = gdd_filter.drop(['Month', 'Day'], axis=1)

In [12]:
hold_out = pd.read_csv('/Users/coleromanyk/Documents/GitHub/Capstone 2/hold_out_data300.csv')
train_data= pd.read_csv('/Users/coleromanyk/Documents/GitHub/Capstone 2/SMOTE300_data.csv')

In [13]:
hold_out = hold_out.merge(gdd, how='left')
train_data = train_data.merge(gdd, how='left')

In [14]:
train_data.head()

Unnamed: 0,Year,Per Acre Emissions,N,P,Potash,IN Corn Yield per Acre,Precipitation,Average Temperature,IN No Till Corn (Thousands of acres),IN Con Till Corn (Thousands of acres),IN Corn Cover Crop (Thousands of acres),Conventional Till,efficiency,Accumulated GDD
0,1993,2.86059,134.0,68.0,114.0,132,50.78,52.0,1211.769,1536.438,0.0,2651.793,46.144325,3231.0
1,2007,3.367331,149.0,69.0,124.0,154,36.75,55.0,1542.152,2202.153,0.0,2625.695,45.733543,3778.0
2,1999,3.341237,154.0,56.0,116.0,132,32.4,54.5,1044.103,1548.732,0.0,3077.165,39.506327,3680.0
3,2001,2.929382,140.0,66.0,121.0,156,41.91,54.2,1092.997,1532.988,0.0,3044.015,53.253548,3607.0
4,1990,3.306718,139.0,75.0,111.0,129,50.44,54.4,479.255,824.2,0.0,4146.545,39.011486,3310.0


In [15]:
X_train = train_data.drop(['IN Corn Yield per Acre'], axis=1)
y_train = train_data['efficiency']

In [16]:
X_test = hold_out.drop(['efficiency', 'IN Corn Yield per Acre'], axis=1)
y_test = hold_out['efficiency']

In [17]:
# X_train = train_data.drop(['Per Acre Emissions', 'efficiency', 'IN Corn Yield per Acre'], axis=1)
# y_train = train_data['efficiency']

In [18]:
# X_test = hold_out.drop(['Per Acre Emissions', 'efficiency', 'IN Corn Yield per Acre'], axis=1)
# y_test = hold_out['efficiency']

In [19]:
X_train[X_train.N == X_train.N.max()]

Unnamed: 0,Year,Per Acre Emissions,N,P,Potash,Precipitation,Average Temperature,IN No Till Corn (Thousands of acres),IN Con Till Corn (Thousands of acres),IN Corn Cover Crop (Thousands of acres),Conventional Till,efficiency,Accumulated GDD
16,2010,3.420192,178.0,69.0,119.0,33.89,54.6,1270.35,2146.1,0.0,2303.55,45.903862,3845.0
96,2010,3.420192,178.0,69.0,119.0,33.89,54.6,1270.35,2146.1,0.0,2303.55,45.903862,3845.0
108,2010,3.420192,178.0,69.0,119.0,33.89,54.6,1270.35,2146.1,0.0,2303.55,45.903862,3845.0
112,2010,3.420192,178.0,69.0,119.0,33.89,54.6,1270.35,2146.1,0.0,2303.55,45.903862,3845.0
182,2010,3.420192,178.0,69.0,119.0,33.89,54.6,1270.35,2146.1,0.0,2303.55,45.903862,3845.0
250,2010,3.420192,178.0,69.0,119.0,33.89,54.6,1270.35,2146.1,0.0,2303.55,45.903862,3845.0
293,2010,3.420192,178.0,69.0,119.0,33.89,54.6,1270.35,2146.1,0.0,2303.55,45.903862,3845.0


In [20]:
# X_train['total_acres'] = X_train['IN No Till Corn (Thousands of acres)']+\
# X_train['IN Con Till Corn (Thousands of acres)']+X_train['Conventional Till']

In [21]:
# X_train['full_till_ratio'] = X_train['Conventional Till'] / X_train['total_acres']

In [22]:
# X_train['conservation_till_ratio'] = 1 - X_train['full_till_ratio']

In [23]:
# X_train['cover_crop_ratio'] = X_train['IN Corn Cover Crop (Thousands of acres)'] / X_train['total_acres']

In [24]:
dfs = [X_train, X_test]
for df in dfs:
    df['total_acres'] = df['IN No Till Corn (Thousands of acres)']+\
    df['IN Con Till Corn (Thousands of acres)']+df['Conventional Till']
    
    df['full_till_ratio'] = df['Conventional Till'] / df['total_acres']
    
    df['conservation_till_ratio'] = 1 - df['full_till_ratio']
    
    df['cover_crop_ratio'] = df['IN Corn Cover Crop (Thousands of acres)'] / df['total_acres']

In [25]:
X_train['temp_variation'] = X_train['Average Temperature'] - X_train['Average Temperature'].mean()

In [26]:
X_train['precip_variation'] = X_train['Precipitation'] - X_train['Precipitation'].mean()

In [27]:
X_train['N_variation'] = X_train['N'] - X_train['N'].mean()

In [28]:
X_train[(X_train['Per Acre Emissions'] > 3.7) & (X_train['Per Acre Emissions'] < 4.3) & (X_train['IN Corn Cover Crop (Thousands of acres)'] > 0)] 

Unnamed: 0,Year,Per Acre Emissions,N,P,Potash,Precipitation,Average Temperature,IN No Till Corn (Thousands of acres),IN Con Till Corn (Thousands of acres),IN Corn Cover Crop (Thousands of acres),Conventional Till,efficiency,Accumulated GDD,total_acres,full_till_ratio,conservation_till_ratio,cover_crop_ratio,temp_variation,precip_variation,N_variation
13,2012,3.948001,160.0,75.0,127.0,38.04,56.8,1281.5,2195.55,139.65,2552.95,25.075985,3539.0,6030.0,0.423375,0.576625,0.023159,2.715336,-6.953009,6.442958
25,2014,3.948001,162.603244,73.603244,114.157827,45.95413,54.390706,1333.669748,2132.130037,380.858499,2147.032338,58.908347,3383.0,5612.832122,0.382522,0.617478,0.067855,0.306042,0.961121,9.046202
81,2012,3.948001,160.0,75.0,127.0,38.04,56.8,1281.5,2195.55,139.65,2552.95,25.075985,3539.0,6030.0,0.423375,0.576625,0.023159,2.715336,-6.953009,6.442958
82,2014,3.948001,162.603244,73.603244,114.157827,45.95413,54.390706,1333.669748,2132.130037,380.858499,2147.032338,58.908347,3383.0,5612.832122,0.382522,0.617478,0.067855,0.306042,0.961121,9.046202
88,2014,3.948001,162.603244,73.603244,114.157827,45.95413,54.390706,1333.669748,2132.130037,380.858499,2147.032338,58.908347,3383.0,5612.832122,0.382522,0.617478,0.067855,0.306042,0.961121,9.046202
122,2012,3.948001,160.0,75.0,127.0,38.04,56.8,1281.5,2195.55,139.65,2552.95,25.075985,3539.0,6030.0,0.423375,0.576625,0.023159,2.715336,-6.953009,6.442958
148,2014,3.948001,162.603244,73.603244,114.157827,45.95413,54.390706,1333.669748,2132.130037,380.858499,2147.032338,58.908347,3383.0,5612.832122,0.382522,0.617478,0.067855,0.306042,0.961121,9.046202
196,2014,3.948001,162.603244,73.603244,114.157827,45.95413,54.390706,1333.669748,2132.130037,380.858499,2147.032338,58.908347,3383.0,5612.832122,0.382522,0.617478,0.067855,0.306042,0.961121,9.046202
222,2014,3.948001,162.603244,73.603244,114.157827,45.95413,54.390706,1333.669748,2132.130037,380.858499,2147.032338,58.908347,3383.0,5612.832122,0.382522,0.617478,0.067855,0.306042,0.961121,9.046202
251,2012,3.948001,160.0,75.0,127.0,38.04,56.8,1281.5,2195.55,139.65,2552.95,25.075985,3539.0,6030.0,0.423375,0.576625,0.023159,2.715336,-6.953009,6.442958


In [29]:
X_train[(X_train['N'] > 175) & (X_train['IN Corn Cover Crop (Thousands of acres)'] == 0)] 

Unnamed: 0,Year,Per Acre Emissions,N,P,Potash,Precipitation,Average Temperature,IN No Till Corn (Thousands of acres),IN Con Till Corn (Thousands of acres),IN Corn Cover Crop (Thousands of acres),Conventional Till,efficiency,Accumulated GDD,total_acres,full_till_ratio,conservation_till_ratio,cover_crop_ratio,temp_variation,precip_variation,N_variation
16,2010,3.420192,178.0,69.0,119.0,33.89,54.6,1270.35,2146.1,0.0,2303.55,45.903862,3845.0,5720.0,0.402719,0.597281,0.0,0.515336,-11.103009,24.442958
96,2010,3.420192,178.0,69.0,119.0,33.89,54.6,1270.35,2146.1,0.0,2303.55,45.903862,3845.0,5720.0,0.402719,0.597281,0.0,0.515336,-11.103009,24.442958
108,2010,3.420192,178.0,69.0,119.0,33.89,54.6,1270.35,2146.1,0.0,2303.55,45.903862,3845.0,5720.0,0.402719,0.597281,0.0,0.515336,-11.103009,24.442958
112,2010,3.420192,178.0,69.0,119.0,33.89,54.6,1270.35,2146.1,0.0,2303.55,45.903862,3845.0,5720.0,0.402719,0.597281,0.0,0.515336,-11.103009,24.442958
182,2010,3.420192,178.0,69.0,119.0,33.89,54.6,1270.35,2146.1,0.0,2303.55,45.903862,3845.0,5720.0,0.402719,0.597281,0.0,0.515336,-11.103009,24.442958
250,2010,3.420192,178.0,69.0,119.0,33.89,54.6,1270.35,2146.1,0.0,2303.55,45.903862,3845.0,5720.0,0.402719,0.597281,0.0,0.515336,-11.103009,24.442958
293,2010,3.420192,178.0,69.0,119.0,33.89,54.6,1270.35,2146.1,0.0,2303.55,45.903862,3845.0,5720.0,0.402719,0.597281,0.0,0.515336,-11.103009,24.442958


In [30]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300 entries, 0 to 299
Data columns (total 20 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Year                                     300 non-null    int64  
 1   Per Acre Emissions                       300 non-null    float64
 2   N                                        300 non-null    float64
 3   P                                        300 non-null    float64
 4   Potash                                   300 non-null    float64
 5   Precipitation                            300 non-null    float64
 6   Average Temperature                      300 non-null    float64
 7   IN No Till Corn (Thousands of acres)     300 non-null    float64
 8   IN Con Till Corn (Thousands of acres)    300 non-null    float64
 9   IN Corn Cover Crop (Thousands of acres)  300 non-null    float64
 10  Conventional Till                        300 non-n