# Modeling

## Imports

In [2]:
import pickle
from pathlib import Path

import dtreeviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [3]:
# Load the cleaned food-waste dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/clean_us_food_waste.csv")

### Feature Selection

In [4]:
# Preview the first five rows to see which columns are available as features.
df.head(n=5)

Unnamed: 0,year,tons_surplus,tons_supply,us_dollars_surplus,tons_waste,tons_uneaten,tons_inedible_parts,tons_not_fit_for_human_consumption,tons_donated,tons_biomaterial_processing,...,food_type_dairy_&_eggs,food_type_dry_goods,food_type_fresh_meat_&_seafood,food_type_frozen,food_type_prepared_foods,food_type_produce,food_type_ready-to-drink_beverages,tons_productive_surplus,tons_unproductive_surplus,tons_consumed
0,2022,169741.1,5675411.0,204422500.0,169741.1,169741.1,0.0,82917.64,0.0,0.0,...,0,1,0,0,0,0,0,0.0,169741.1,5505670.0
1,2022,1759.466,71372.74,1592708.0,1697.192,1754.997,0.0,844.4716,4.469649,0.0,...,0,1,0,0,0,0,0,62.274196,1697.192,69613.27
2,2022,2754374.0,7201575.0,1764610000.0,2169203.0,2712374.0,275437.4305,1462724.0,41999.93657,0.0,...,0,0,0,0,0,1,0,585171.71377,2169203.0,4447201.0
3,2022,12204.09,38281.62,12063120.0,9641.038,12020.13,854.285977,6462.121,183.959352,0.0,...,0,0,0,0,0,1,0,2563.046946,9641.038,26077.53
4,2022,3942.489,39575.25,4730987.0,2870.689,3867.242,2365.493625,2544.075,75.247693,0.0,...,0,0,0,0,0,1,0,1071.800077,2870.689,35632.76


In [5]:
# Tonnage columns used as predictors.
tonnage_cols = [
    "tons_consumed",
    "tons_donated",
    "tons_biomaterial_processing",
    "tons_animal_feed",
    "tons_anaerobically_digested",
    "tons_composted",
    "tons_incinerated",
    "tons_land_application",
    "tons_landfilled",
    "tons_sewer",
    "tons_refuse_discards",
]

# sector_* indicator columns.
sector_cols = [
    "sector_farm",
    "sector_foodservice",
    "sector_manufacturing",
    "sector_residential",
    "sector_retail",
]

# food_type_* indicator columns.
food_type_cols = [
    "food_type_breads_&_bakery",
    "food_type_dairy_&_eggs",
    "food_type_dry_goods",
    "food_type_fresh_meat_&_seafood",
    "food_type_frozen",
    "food_type_prepared_foods",
    "food_type_produce",
    "food_type_ready-to-drink_beverages",
]

# Feature matrix (column order: tonnage, sector, food type) and target.
X = df[tonnage_cols + sector_cols + food_type_cols]
y = df["total_mtco2e_footprint"]

## Preprocessing

### Splitting and Scaling

In [6]:
# Hold out a test set. No test_size is given, so scikit-learn falls back to
# its default test fraction (25%). The fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=51,
)

In [7]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the test data never influences it.
ss = StandardScaler().fit(X_train)

X_train_ss = ss.transform(X_train)
X_test_ss = ss.transform(X_test)

### Model Instantiation

In [8]:
# Base estimators. LASSO, ridge, decision tree and random forest are tuned
# with GridSearchCV in their own sections; the shared seed (51) keeps the
# stochastic estimators reproducible.
lr = LinearRegression()
lasso = Lasso(
    max_iter=10_000,
    tol=1e-3,
    random_state=51,
)
ridge = Ridge(random_state=51)
dtr = DecisionTreeRegressor(random_state=51)
rfr = RandomForestRegressor(random_state=51)

## Model Selection and Evaluation

### Linear Regression

In [9]:
# Fit ordinary least squares on the standardised training features.
# `fit` returns the estimator itself, so model1 and lr are the same object.
lr.fit(X_train_ss, y_train)
model1 = lr

In [10]:
# R^2 on the training split, then on the held-out test split.
print(
    model1.score(X_train_ss, y_train),
    model1.score(X_test_ss, y_test),
    sep="\n",
)

0.949913850558376
0.96708519062423


In [11]:
# Linear-regression predictions for the training split.
# NOTE(review): the error metrics derived from y_pred_1 are therefore
# in-sample (training) errors, not held-out test errors.
y_pred_1 = lr.predict(X_train_ss)

In [12]:
# Mean absolute error of the linear model on the training split.
mae1 = metrics.mean_absolute_error(y_true=y_train, y_pred=y_pred_1)
mae1

129960.00847716266

In [13]:
# Mean squared error of the linear model on the training split.
mse1 = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_1)
mse1

88982004054.34956

In [14]:
# Root mean squared error: the square root of the training MSE, which puts
# the error back in the units of the target.
rmse1 = np.sqrt(mse1)
rmse1

298298.5150052705

### LASSO Regression

In [15]:
# Candidate regularisation strengths for the LASSO grid search.
lasso_params = {"alpha": [1, 5, 10]}

In [16]:
# 5-fold cross-validated search over lasso_params. No `scoring` is passed,
# so candidates are ranked by the estimator's own score method.
lasso_gridsearch = GridSearchCV(
    estimator=lasso,
    param_grid=lasso_params,
    cv=5,
    verbose=1,
)

In [17]:
# Run the search on the standardised training data. `fit` returns the search
# object itself, so model2 and lasso_gridsearch are the same object.
lasso_gridsearch.fit(X_train_ss, y_train)
model2 = lasso_gridsearch

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [18]:
# Mean cross-validated score of the best LASSO candidate.
model2.best_score_

0.9461107791194404

In [19]:
# Hyperparameters of the candidate with the best cross-validated score.
model2.best_params_

{'alpha': 1}

In [20]:
# GridSearchCV (refit=True by default) has already refit the winning LASSO
# on the full training data passed to `fit`, so best_estimator_ is ready to
# use; fitting it again would only repeat that work.
best_model2 = model2.best_estimator_
best_model2

In [21]:
# R^2 of the tuned LASSO on the training split, then on the test split.
print(
    best_model2.score(X_train_ss, y_train),
    best_model2.score(X_test_ss, y_test),
    sep="\n",
)

0.9499138503102171
0.9670853222598297


In [22]:
# Tuned-LASSO predictions for the training split.
# NOTE(review): the error metrics derived from y_pred_2 are therefore
# in-sample (training) errors, not held-out test errors.
y_pred_2 = best_model2.predict(X_train_ss)

In [23]:
# Mean absolute error of the tuned LASSO on the training split.
mae2 = metrics.mean_absolute_error(y_true=y_train, y_pred=y_pred_2)
mae2

129958.57737636422

In [24]:
# Mean squared error of the tuned LASSO on the training split.
mse2 = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_2)
mse2

88982004495.22322

In [25]:
# Root mean squared error: the square root of the training MSE, which puts
# the error back in the units of the target.
rmse2 = np.sqrt(mse2)
rmse2

298298.5157442511

### Ridge Regression

In [26]:
# Candidate regularisation strengths for the ridge grid search.
ridge_params = {"alpha": [1, 5, 10]}

In [27]:
# 5-fold cross-validated search over ridge_params. No `scoring` is passed,
# so candidates are ranked by the estimator's own score method.
ridge_gridsearch = GridSearchCV(
    estimator=ridge,
    param_grid=ridge_params,
    cv=5,
    verbose=1,
)

In [28]:
# Run the search on the standardised training data. `fit` returns the search
# object itself, so model3 and ridge_gridsearch are the same object.
ridge_gridsearch.fit(X_train_ss, y_train)
model3 = ridge_gridsearch

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [29]:
# Mean cross-validated score of the best ridge candidate.
model3.best_score_

0.9461289854753006

In [30]:
# Hyperparameters of the candidate with the best cross-validated score.
model3.best_params_

{'alpha': 5}

In [31]:
# GridSearchCV (refit=True by default) has already refit the winning ridge
# model on the full training data passed to `fit`, so best_estimator_ is
# ready to use; fitting it again would only repeat that work.
best_model3 = model3.best_estimator_
best_model3

In [32]:
# R^2 of the tuned ridge model on the training split, then on the test split.
print(
    best_model3.score(X_train_ss, y_train),
    best_model3.score(X_test_ss, y_test),
    sep="\n",
)

0.949904819105736
0.9670669157194531


In [33]:
# Tuned-ridge predictions for the training split.
# NOTE(review): the error metrics derived from y_pred_3 are therefore
# in-sample (training) errors, not held-out test errors.
y_pred_3 = best_model3.predict(X_train_ss)

In [34]:
# Mean absolute error of the tuned ridge model on the training split.
mae3 = metrics.mean_absolute_error(y_true=y_train, y_pred=y_pred_3)
mae3

130016.04553069883

In [35]:
# Mean squared error of the tuned ridge model on the training split.
mse3 = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_3)
mse3

88998049143.94781

In [36]:
# Root mean squared error: the square root of the training MSE, which puts
# the error back in the units of the target.
rmse3 = np.sqrt(mse3)
rmse3

298325.40814343625

### Decision Tree Regression

In [37]:
# Search space for the decision tree: depth limit plus the minimum sample
# counts required to split a node and to form a leaf (5 x 3 x 4 = 60 combos).
dtr_params = dict(
    max_depth=[10, 15, 25, 50, 100],
    min_samples_split=[2, 4, 6],
    min_samples_leaf=[1, 3, 5, 10],
)

In [38]:
# 5-fold cross-validated search over dtr_params. No `scoring` is passed,
# so candidates are ranked by the estimator's own score method.
dtr_gridsearch = GridSearchCV(
    estimator=dtr,
    param_grid=dtr_params,
    cv=5,
    verbose=1,
)

In [39]:
# Run the search on the unscaled training features. `fit` returns the search
# object itself, so model4 and dtr_gridsearch are the same object.
dtr_gridsearch.fit(X_train, y_train)
model4 = dtr_gridsearch

Fitting 5 folds for each of 60 candidates, totalling 300 fits


In [40]:
# Mean cross-validated score of the best decision-tree candidate.
model4.best_score_

0.9829690671131346

In [41]:
# Hyperparameters of the candidate with the best cross-validated score.
model4.best_params_

{'max_depth': 25, 'min_samples_leaf': 1, 'min_samples_split': 4}

In [42]:
# GridSearchCV (refit=True by default) has already refit the winning tree on
# the full (unscaled) training data passed to `fit`, so best_estimator_ is
# ready to use; fitting it again would only repeat that work.
best_model4 = model4.best_estimator_
best_model4

In [43]:
# R^2 of the tuned decision tree on the training split, then on the test split.
print(
    best_model4.score(X_train, y_train),
    best_model4.score(X_test, y_test),
    sep="\n",
)

0.9997010159766507
0.996169535706307


In [44]:
# Tuned decision-tree predictions for the (unscaled) training split.
# NOTE(review): the tree was fit on these same rows, so the error metrics
# derived from y_pred_4 are in-sample errors, not held-out test errors.
y_pred_4 = best_model4.predict(X_train)

In [45]:
# Mean absolute error of the tuned decision tree on the training split.
mae4 = metrics.mean_absolute_error(y_true=y_train, y_pred=y_pred_4)
mae4

2814.4308194313653

In [46]:
# Mean squared error of the tuned decision tree on the training split.
mse4 = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_4)
mse4

531168753.72613484

In [47]:
# Root mean squared error: the square root of the training MSE, which puts
# the error back in the units of the target.
rmse4 = np.sqrt(mse4)
rmse4

23047.098596702683

### Random Forest Regression

In [31]:
# Search space for the random forest: number of trees, depth limit, and the
# minimum sample counts to split a node and to form a leaf
# (3 x 3 x 2 x 2 = 36 combos).
rfr_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 50, 100],
    min_samples_split=[2, 4],
    min_samples_leaf=[1, 5],
)

In [32]:
# 5-fold cross-validated search over rfr_params. No `scoring` is passed,
# so candidates are ranked by the estimator's own score method.
rfr_gridsearch = GridSearchCV(
    estimator=rfr,
    param_grid=rfr_params,
    cv=5,
    verbose=1,
)

In [33]:
# Run the search on the unscaled training features. `fit` returns the search
# object itself, so model5 and rfr_gridsearch are the same object.
rfr_gridsearch.fit(X_train, y_train)
model5 = rfr_gridsearch

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [34]:
# Mean cross-validated score of the best random-forest candidate.
model5.best_score_

0.9943095896704441

In [35]:
# Hyperparameters of the candidate with the best cross-validated score.
model5.best_params_

{'max_depth': 50,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 50}

In [36]:
# GridSearchCV (refit=True by default) has already refit the winning forest
# on the full (unscaled) training data passed to `fit`, so best_estimator_
# is ready to use; fitting it again would only rebuild the same forest.
best_model5 = model5.best_estimator_
best_model5

In [37]:
# R^2 of the tuned random forest on the training split, then on the test split.
print(
    best_model5.score(X_train, y_train),
    best_model5.score(X_test, y_test),
    sep="\n",
)

0.999341616848391
0.9962975501461497


In [77]:
# Tuned random-forest predictions for the (unscaled) training split.
# NOTE(review): the forest was fit on these same rows, so the error metrics
# derived from y_pred_5 are in-sample errors, not held-out test errors.
y_pred_5 = best_model5.predict(X_train)

In [78]:
# Mean absolute error of the tuned random forest on the training split.
mae5 = metrics.mean_absolute_error(y_true=y_train, y_pred=y_pred_5)
mae5

4076.171018926899

In [79]:
# Mean squared error of the tuned random forest on the training split.
mse5 = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_5)
mse5

1169669717.4548814

In [80]:
# Root mean squared error: the square root of the training MSE, which puts
# the error back in the units of the target.
rmse5 = np.sqrt(mse5)
rmse5

34200.43446295502

### Pickling the Best Model

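After comparing the MAE, MSE, and RMSE of each model, the Decision Tree Regression model has the lowest error, so it is the model that will be used to create the predictive web app. Note that these three metrics were computed on the training split; on the held-out test split the Random Forest scores marginally higher (R² ≈ 0.9963 vs. 0.9962), and its cross-validated score is also higher (≈ 0.994 vs. ≈ 0.983), so this choice should be re-checked against test-set error.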

In [53]:
# Persist the tuned decision tree for the predictive web app.
# best_model4 was fit on the unscaled X_train, so whatever loads this file
# must pass raw (unscaled) feature values in the same column order as X.
model_path = Path('../models/co2_prediction.pkl')

# Create ../models/ if it is missing so that a fresh checkout does not fail
# with FileNotFoundError when the file is opened for writing.
model_path.parent.mkdir(parents=True, exist_ok=True)

with open(model_path, 'wb') as f:
    pickle.dump(best_model4, f)