In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, GridSearchCV

Load train and test data

In [2]:
hold_out = pd.read_csv('/Users/coleromanyk/Documents/GitHub/Capstone 2/hold_out_data.csv')
train_data= pd.read_csv('/Users/coleromanyk/Documents/GitHub/Capstone 2/SMOTE_data.csv')

In [3]:
print(train_data.info())
train_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172 entries, 0 to 171
Data columns (total 13 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Year                                     172 non-null    int64  
 1   Per Acre Emissions                       172 non-null    float64
 2   N                                        172 non-null    float64
 3   P                                        172 non-null    float64
 4   Potash                                   172 non-null    float64
 5   IN Corn Yield per Acre                   172 non-null    int64  
 6   Precipitation                            172 non-null    float64
 7   Average Temperature                      172 non-null    float64
 8   IN No Till Corn (Thousands of acres)     172 non-null    float64
 9   IN Con Till Corn (Thousands of acres)    172 non-null    float64
 10  IN Corn Cover Crop (Thousands of acres)  172 non-n

Unnamed: 0,Year,Per Acre Emissions,N,P,Potash,IN Corn Yield per Acre,Precipitation,Average Temperature,IN No Till Corn (Thousands of acres),IN Con Till Corn (Thousands of acres),IN Corn Cover Crop (Thousands of acres),Conventional Till,efficiency
0,1993,2.86059,134.0,68.0,114.0,132,50.78,52.0,1211.769,1536.438,0.0,2651.793,46.144325
1,2007,3.367331,149.0,69.0,124.0,154,36.75,55.0,1542.152,2202.153,0.0,2625.695,45.733543
2,1999,3.341237,154.0,56.0,116.0,132,32.4,54.5,1044.103,1548.732,0.0,3077.165,39.506327
3,2001,2.929382,140.0,66.0,121.0,156,41.91,54.2,1092.997,1532.988,0.0,3044.015,53.253548
4,1990,3.306718,139.0,75.0,111.0,129,50.44,54.4,479.255,824.2,0.0,4146.545,39.011486


In [4]:
print(hold_out.info())
hold_out.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 13 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Year                                     6 non-null      int64  
 1   Per Acre Emissions                       6 non-null      float64
 2   N                                        6 non-null      float64
 3   P                                        6 non-null      float64
 4   Potash                                   6 non-null      float64
 5   IN Corn Yield per Acre                   6 non-null      int64  
 6   Precipitation                            6 non-null      float64
 7   Average Temperature                      6 non-null      float64
 8   IN No Till Corn (Thousands of acres)     6 non-null      float64
 9   IN Con Till Corn (Thousands of acres)    6 non-null      float64
 10  IN Corn Cover Crop (Thousands of acres)  6 non-null   

Unnamed: 0,Year,Per Acre Emissions,N,P,Potash,IN Corn Yield per Acre,Precipitation,Average Temperature,IN No Till Corn (Thousands of acres),IN Con Till Corn (Thousands of acres),IN Corn Cover Crop (Thousands of acres),Conventional Till,efficiency
0,2006,2.75826,148.0,69.0,122.0,157,51.06,54.5,1365.257,1953.378,0.0,2061.365,56.919946
1,2017,2.802987,166.0,72.0,114.0,180,47.45,55.6,1134.432,1816.156,362.494,2249.412,64.217218
2,2005,3.096088,147.0,77.0,124.0,154,43.74,53.9,1188.362,1704.603,0.0,2877.035,49.740185
3,2008,2.802926,149.0,70.0,126.0,160,49.04,52.7,1393.276,2095.076,0.0,1971.648,57.083199
4,1994,3.331703,147.0,74.0,112.0,144,31.63,53.3,1131.817,1534.523,0.0,3293.66,43.221141


Drop components features of efficiency

In [5]:
X_train = train_data.drop(['Per Acre Emissions', 'efficiency', 'IN Corn Yield per Acre'], axis=1)
y_train = train_data['efficiency']

In [6]:
X_test = hold_out.drop(['Per Acre Emissions','efficiency','IN Corn Yield per Acre'], axis=1)
y_test = hold_out['efficiency']

Define XGBoost parameters

In [7]:
xgb_params = {"objective":"reg:squarederror"}

Define models for comparison

In [8]:
ridge = Ridge()
lasso = Lasso()
rf = RandomForestRegressor()
xg = xgb.XGBRegressor(objective='reg:squarederror')

Define scaler varible and scaler parameter for pipelines

In [9]:
scaler = StandardScaler()
scaler_param = ('Scale', scaler)

Create and run model pipelines, printing the score to compare RMSE and R^2 returned by each model

In [10]:
ridge_pipe = Pipeline([scaler_param, ('Ridge', ridge)])
lasso_pipe = Pipeline([scaler_param, ('Lasso', lasso)])
rf_pipe = Pipeline([scaler_param, ('Random Forrest', rf)])
xgb_pipe = Pipeline([scaler_param, ('XGB Regressor', xg)])

In [11]:
pipes = [ridge_pipe, lasso_pipe, rf_pipe, xgb_pipe]
for pipe in pipes:
    cv_score = cross_val_score(pipe, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
    score = (str(pipe[1]), np.mean(np.sqrt(np.abs(cv_score))))
    print(score)

('Ridge()', 2.0542068382797014)
('Lasso()', 2.592099790619153)
('RandomForestRegressor()', 1.2290192041261392)
("XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,\n             colsample_bynode=None, colsample_bytree=None, gamma=None,\n             gpu_id=None, importance_type='gain', interaction_constraints=None,\n             learning_rate=None, max_delta_step=None, max_depth=None,\n             min_child_weight=None, missing=nan, monotone_constraints=None,\n             n_estimators=100, n_jobs=None, num_parallel_tree=None,\n             random_state=None, reg_alpha=None, reg_lambda=None,\n             scale_pos_weight=None, subsample=None, tree_method=None,\n             validate_parameters=None, verbosity=None)", 0.9000113961347272)


In [12]:
pipes = [ridge_pipe, lasso_pipe, rf_pipe, xgb_pipe]
for pipe in pipes:
    cv_score = cross_val_score(pipe, X_train, y_train, scoring='r2', cv=5)
    score = (str(pipe[1]), np.mean(np.sqrt(np.abs(cv_score))))
    print(score)

('Ridge()', 0.8508520102521562)
('Lasso()', 0.5392397871751451)
('RandomForestRegressor()', 0.9690238051275657)
("XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,\n             colsample_bynode=None, colsample_bytree=None, gamma=None,\n             gpu_id=None, importance_type='gain', interaction_constraints=None,\n             learning_rate=None, max_delta_step=None, max_depth=None,\n             min_child_weight=None, missing=nan, monotone_constraints=None,\n             n_estimators=100, n_jobs=None, num_parallel_tree=None,\n             random_state=None, reg_alpha=None, reg_lambda=None,\n             scale_pos_weight=None, subsample=None, tree_method=None,\n             validate_parameters=None, verbosity=None)", 0.9636337877900765)


Perform Grid and Randomized Search Params for XG Boost

In [13]:
X_scaled = scaler.fit_transform(X_train)
y_scaled = scaler.fit_transform(np.array(y_train).reshape(-1, 1))

In [14]:
cv_score = cross_val_score(xgb.XGBRegressor(objective="reg:squarederror"), X_scaled, y_scaled, scoring='neg_mean_squared_error', cv=5)
print(np.mean(np.sqrt(np.abs(cv_score))))

0.16876815645548215


In [15]:
params = {'n_estimators':range(15,51), 'max_depth':range(3,11), 'learning_rate':[ .1, .3, .5, .7], 'subsample':[.2,.3,.5,.6,.7,.8]}

In [16]:
X_dmatrix = xgb.DMatrix(data=X_scaled, label=y_scaled)

In [45]:
grid_mse = GridSearchCV(estimator=xgb.XGBRegressor(objective="reg:squarederror"), param_grid=params, scoring='neg_root_mean_squared_error', cv=5, verbose=1)

In [46]:
# grid_mse.fit(X_train, y_train)

Fitting 5 folds for each of 6912 candidates, totalling 34560 fits


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=...
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parame

In [37]:
randomized_mse = RandomizedSearchCV(estimator=xgb.XGBRegressor(objective="reg:squarederror"), param_distributions=params, scoring='neg_root_mean_squared_error', n_iter=86, cv=5, verbose=1)

In [38]:
randomized_mse.fit(X_train, y_train)

Fitting 5 folds for each of 86 candidates, totalling 430 fits


RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_estimators=100, n...
                                          num_parallel_tree=None,
                                          random_state=None, reg_alpha=None,
                                          reg_l

In [49]:
# print("Best Grid parameters found: ", grid_mse.best_params_)
# print("Lowest Grid RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))

Best Grid parameters found:  {'learning_rate': 0.7, 'max_depth': 4, 'n_estimators': 50, 'subsample': 0.7}
Lowest Grid RMSE found:  1.014475142385794


Best Grid parameters found:  {'learning_rate': 0.7, 'max_depth': 4, 'n_estimators': 50, 'subsample': 0.7}
Lowest Grid RMSE found:  1.014475142385794

In [48]:
print("Best Random parameters found: ", randomized_mse.best_params_)
print("Lowest Random RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

Best Random parameters found:  {'subsample': 0.7, 'n_estimators': 29, 'max_depth': 4, 'learning_rate': 0.7}
Lowest Random RMSE found:  1.017821599127864
