In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib as matplotlib
import matplotlib.pyplot as plt

import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    ShuffleSplit, GroupKFold, cross_validate, GridSearchCV, train_test_split)
from sklearn.linear_model import LinearRegression, Lasso, lasso_path
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor

from xgboost import XGBRegressor

# Turn off pandas' chained-assignment warning for the whole notebook
# (pandas would otherwise warn, see the default noted on the line below).
pd.options.mode.chained_assignment = None  # default='warn'
# Load IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from utils import REPO_PATH, DATA_PATH, DATA_CLEAN_PATH
from utils import set_plt_settings
from train_functions import scoring, run_cv

# Apply the plotting settings helper imported from utils.
# NOTE(review): its exact effects are defined in utils and are not visible here.
set_plt_settings()

## Make analytic datasets

In [3]:
# Load the cleaned analysis table.
df = pd.read_csv(DATA_CLEAN_PATH / 'analysis_df.csv')

# Features: every column whose name begins with 'ndvi' or 'lst'.
X = df.filter(regex=r'^(?:ndvi|lst)')
# Target: total divided by area, i.e. short tons per km^2.
y = (df['total'] / df['area_km']).to_numpy()
# State FIPS codes, kept alongside the features.
g_state = df['statefp']

# Standardise the features (StandardScaler defaults: centre and scale to unit
# variance) and wrap the result back into a DataFrame with the original names.
# NOTE(review): the scaler is fit on every row, before any train/test or
# cross-validation split -- confirm that is acceptable for the reported scores.
scaler = StandardScaler().fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)

## Split data into train and test

In [None]:
# Hold out 10% of the rows as a test set. The split is shuffled (the
# train_test_split default) and random_state pins it so reruns give the same
# partition. train_test_split comes from sklearn.model_selection and must be
# imported in the notebook's import cell.
# NOTE(review): this splits the unscaled X, while the cross-validation cells
# that follow are run on the full X_scaled / y -- confirm which data the
# model comparison is meant to use.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=1)

## Cross-validate contending models

In [4]:
# Baseline: scikit-learn's DummyRegressor, with an empty hyperparameter grid.
params = dict()
cv_mean, cv_mean_min = run_cv(
    estimator=DummyRegressor(),
    params=params,
    X=X_scaled,
    y=y,
    scoring=scoring,
)
# Display the second value returned by run_cv for this model.
cv_mean_min

model                          DummyRegressor()
params                                       {}
neg_root_mean_squared_error        -4382.952289
mean_test_r_squared                   -0.001913
Name: 0, dtype: object

In [5]:
# Ordinary least squares on the scaled features; nothing to tune, so the
# hyperparameter grid is empty.
params = dict()
cv_lm, cv_lm_min = run_cv(
    estimator=LinearRegression(),
    params=params,
    X=X_scaled,
    y=y,
    scoring=scoring,
)
# Display the second value returned by run_cv for this model.
cv_lm_min

model                          LinearRegression()
params                                         {}
neg_root_mean_squared_error            -1989.4228
mean_test_r_squared                      0.793126
Name: 0, dtype: object

In [6]:
# Lasso regression: search alpha over 10, 20, ..., 80 and 100, 200, ..., 800.
# NOTE(review): in the recorded run the selected alpha (10) is the smallest
# value in this grid, so smaller penalties may be worth searching.
params = {'alpha': [mult * 10**power for power in (1, 2) for mult in range(1, 9)]}
cv_lasso, cv_lasso_min = run_cv(
    estimator=Lasso(),
    params=params,
    X=X_scaled,
    y=y,
    scoring=scoring,
)
# Display the second value returned by run_cv for this model.
cv_lasso_min

model                                Lasso()
params                         {'alpha': 10}
neg_root_mean_squared_error     -2013.503508
mean_test_r_squared                 0.788329
Name: 0, dtype: object

In [7]:
# Decision tree regression: grid over minimum leaf size and maximum depth
# (None leaves the depth unbounded).
params = dict(
    min_samples_leaf=[1, 5, 10, 50, 100],
    max_depth=[None, 100, 50, 10, 5],
)
cv_dt, cv_dt_min = run_cv(
    estimator=DecisionTreeRegressor(),
    params=params,
    X=X_scaled,
    y=y,
    scoring=scoring,
)
# Display the second value returned by run_cv for this model.
cv_dt_min

model                                            DecisionTreeRegressor()
params                         {'max_depth': 10, 'min_samples_leaf': 10}
neg_root_mean_squared_error                                 -1708.643698
mean_test_r_squared                                             0.847266
Name: 17, dtype: object

In [8]:
# Random forest regression: grid over minimum leaf size and maximum depth
# (None leaves the depth unbounded).
# random_state pins the forest's bootstrap and feature sampling so the
# cross-validation scores are reproducible from one run to the next.
# Earlier, narrower grids were also tried (min_samples_leaf in 3-8, max_depth
# in 90-200); a note left with them read "7, 100" -- presumably the best
# min_samples_leaf / max_depth found, TODO confirm.
params = {'min_samples_leaf': [1, 5, 10, 50, 100], 'max_depth': [None, 100, 50, 10, 5]}
cv_rf, cv_rf_min = run_cv(
    X=X_scaled,
    y=y,
    estimator=RandomForestRegressor(random_state=1),
    params=params,
    scoring=scoring,
)
# Display the second value returned by run_cv for this model.
cv_rf_min


KeyboardInterrupt



In [None]:
# XGBoost regression: grid over the L1 (reg_alpha) and L2 (reg_lambda)
# regularisation strengths and the tree construction method.
# The reg_lambda grid previously listed 0.1 twice (`0.1, .1`), which made the
# search fit identical configurations twice; it now starts at .01 so that it
# mirrors the reg_alpha grid.
params = {
    'reg_alpha': [.01, .1, 1, 10, 100],
    'reg_lambda': [.01, .1, 1, 10, 100],
    'tree_method': ['approx', 'hist'],
}
cv_xgb, cv_xgb_min = run_cv(
    X=X_scaled,
    y=y,
    estimator=XGBRegressor(objective="reg:squarederror"),
    params=params,
    scoring=scoring,
)
# Display the second value returned by run_cv for this model.
cv_xgb_min

In [None]:
# Stack the per-model summary rows (the *_min results from the cells above)
# into a single comparison table, one row per model.
cv = pd.DataFrame(
    (
        cv_mean_min,
        cv_lm_min,
        cv_lasso_min,
        cv_dt_min,
        cv_rf_min,
        cv_xgb_min,
    )
)
cv