In [51]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xg
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [5]:
# Directory holding the project CSVs. The default is the original author's local
# checkout; set the CAPSTONE_DATA_DIR environment variable to run the notebook
# from another machine without editing this cell.
DATA_DIR = Path(os.environ.get('CAPSTONE_DATA_DIR', '/Users/coleromanyk/Documents/GitHub/Capstone 2'))

# hold_out is used further down as the test split, train_data as the training split.
hold_out = pd.read_csv(DATA_DIR / 'hold_out_data.csv')
train_data = pd.read_csv(DATA_DIR / 'SMOTE_data.csv')

In [8]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
train_data.info()
# Last expression of the cell: rich display of the first rows.
train_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172 entries, 0 to 171
Data columns (total 12 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Year                                     172 non-null    int64  
 1   Per Acre Emissions                       172 non-null    float64
 2   N                                        172 non-null    float64
 3   P                                        172 non-null    float64
 4   Potash                                   172 non-null    float64
 5   IN Corn Yield per Acre                   172 non-null    int64  
 6   Precipitation                            172 non-null    float64
 7   Average Temperature                      172 non-null    float64
 8   IN No Till Corn (Thousands of acres)     172 non-null    float64
 9   IN Con Till Corn (Thousands of acres)    172 non-null    float64
 10  IN Corn Cover Crop (Thousands of acres)  172 non-n

Unnamed: 0,Year,Per Acre Emissions,N,P,Potash,IN Corn Yield per Acre,Precipitation,Average Temperature,IN No Till Corn (Thousands of acres),IN Con Till Corn (Thousands of acres),IN Corn Cover Crop (Thousands of acres),Conventional Till
0,1993,2.86059,134.0,68.0,114.0,132,50.78,52.0,1211.769,1536.438,0.0,2651.793
1,2007,3.367331,149.0,69.0,124.0,154,36.75,55.0,1542.152,2202.153,0.0,2625.695
2,1999,3.341237,154.0,56.0,116.0,132,32.4,54.5,1044.103,1548.732,0.0,3077.165
3,2001,2.929382,140.0,66.0,121.0,156,41.91,54.2,1092.997,1532.988,0.0,3044.015
4,1990,3.306718,139.0,75.0,111.0,129,50.44,54.4,479.255,824.2,0.0,4146.545


In [9]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
hold_out.info()
# Last expression of the cell: rich display of the first rows.
hold_out.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 12 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Year                                     6 non-null      int64  
 1   Per Acre Emissions                       6 non-null      float64
 2   N                                        6 non-null      float64
 3   P                                        6 non-null      float64
 4   Potash                                   6 non-null      float64
 5   IN Corn Yield per Acre                   6 non-null      int64  
 6   Precipitation                            6 non-null      float64
 7   Average Temperature                      6 non-null      float64
 8   IN No Till Corn (Thousands of acres)     6 non-null      float64
 9   IN Con Till Corn (Thousands of acres)    6 non-null      float64
 10  IN Corn Cover Crop (Thousands of acres)  6 non-null   

Unnamed: 0,Year,Per Acre Emissions,N,P,Potash,IN Corn Yield per Acre,Precipitation,Average Temperature,IN No Till Corn (Thousands of acres),IN Con Till Corn (Thousands of acres),IN Corn Cover Crop (Thousands of acres),Conventional Till
0,2006,2.75826,148.0,69.0,122.0,157,51.06,54.5,1365.257,1953.378,0.0,2061.365
1,2017,2.802987,166.0,72.0,114.0,180,47.45,55.6,1134.432,1816.156,362.494,2249.412
2,2005,3.096088,147.0,77.0,124.0,154,43.74,53.9,1188.362,1704.603,0.0,2877.035
3,2008,2.802926,149.0,70.0,126.0,160,49.04,52.7,1393.276,2095.076,0.0,1971.648
4,1994,3.331703,147.0,74.0,112.0,144,31.63,53.3,1131.817,1534.523,0.0,3293.66


In [14]:
# Separate the regression target ('Per Acre Emissions') from the predictors
# in the training frame; train_data itself is left unmodified.
y_train = train_data['Per Acre Emissions']
X_train = train_data.drop(columns='Per Acre Emissions')

In [23]:
# Separate the regression target ('Per Acre Emissions') from the predictors
# in the hold-out frame; hold_out itself is left unmodified.
y_test = hold_out['Per Acre Emissions']
X_test = hold_out.drop(columns='Per Acre Emissions')

In [35]:
# Parameters for a linear-booster XGBoost regressor.
# "reg:squarederror" is the current name of the squared-error objective;
# the former spelling "reg:linear" is a deprecated alias for the same loss.
xg_params = {"booster": "gblinear", "objective": "reg:squarederror"}

In [52]:
# Candidate regressors, all with library-default hyperparameters.
# NOTE(review): no random_state is set, so the random forest result is not
# reproducible run to run. The constructor arguments are left untouched here
# because the pipeline cell uses str(model) as the step name.
ridge, lasso = Ridge(), Lasso()
rf, xgb = RandomForestRegressor(), xg.XGBRegressor()

In [53]:
# Candidate models in the order the pipelines are later built and indexed:
# position 0 = ridge, 1 = lasso, 2 = random forest, 3 = XGBoost.
models = [
    ridge,
    lasso,
    rf,
    xgb,
]

In [54]:
# (step name, transformer) pair used as the first step of every pipeline.
scaler = 'Scale', StandardScaler()

In [55]:
# Build one "scale -> estimator" pipeline per candidate model, in `models` order.
#
# Each pipeline receives its own unfitted clone of the scaler. Pipeline.fit fits
# its steps in place, so handing the same StandardScaler instance to every
# pipeline would let fitting any one of them overwrite the scaling statistics
# used by all the others.
#
# The estimator step keeps str(model) as its name so the step names (and any
# "<step>__<param>" keys built from them) stay exactly as before.
scaler_name, scaler_template = scaler
pipelines = [
    Pipeline([(scaler_name, clone(scaler_template)), (str(model), model)])
    for model in models
]

In [56]:
# Give the first four pipelines readable names; the order matches `models`
# (ridge, lasso, random forest, XGBoost).
ridge_pipe, lasso_pipe, rf_pipe, xgb_pipe = pipelines[:4]

In [69]:
# Pipelines to evaluate, in reporting order.
pipes = [
    ridge_pipe,
    lasso_pipe,
    rf_pipe,
    xgb_pipe,
]

In [91]:
# 5-fold cross-validated error for every pipeline on the training split.
# Each entry of `scores` is (estimator, rmse), where:
#   * estimator is the pipeline's second step (the model object itself, not a string),
#   * rmse is the mean over folds of sqrt(|neg_mean_squared_error|).
scores = []
for pipe in pipes:
    estimator = pipe[1]
    fold_neg_mse = cross_val_score(
        pipe,
        X_train,
        y_train,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    # The scorer returns negated MSE per fold; take the magnitude before the root.
    fold_rmse = np.sqrt(np.abs(fold_neg_mse))
    scores.append((estimator, np.mean(fold_rmse)))

In [96]:
# Print one (estimator, rmse) tuple per line.
# The separator must be the newline escape '\n'; the previous '/n' was a typo
# and had no effect anyway because print() was given a single argument per call.
print(*scores, sep='\n')

(Ridge(), 0.09018447356516088)
(Lasso(), 0.19742629764805658)
(RandomForestRegressor(), 0.062274716397280494)
(XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None), 0.054399917901490014)
