# Regression Modeling

## Library Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [56]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Apply seaborn's 'darkgrid' theme to every figure drawn after this cell.
sns.set_style('darkgrid')

## Data Import 

In [4]:
# Load the cleaned concrete dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv('./data/cleaned_concrete.csv')

In [5]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Concrete compressive strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


## Modeling

### Feature Selection

In [6]:
# Target: the compressive strength column. Features: every remaining column.
y = df['Concrete compressive strength']
X = df.drop(columns=[y.name])
# A reduced alternative would be the four columns
# ['Cement', 'Superplasticizer', 'Age', 'Water']; the full feature set is used here.

In this modeling, we will use all the features, as there are not too many of them, and it is important to consider all the ingredients of concrete. 

### Data Split

In [7]:
# Hold out 25% of the rows as the test split; the fixed seed keeps the
# partition identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

In [9]:
# Shared 5-fold splitter reused by every grid search in this notebook.
# Shuffling with a fixed seed gives each model the same folds.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

### Linear Regression

In [64]:
# Ordinary least-squares model with scikit-learn's default settings.
lr = LinearRegression()

In [65]:
# The parameter grid is empty, so the search evaluates a single candidate
# (the estimator as configured) on the shared K-fold splits. This keeps the
# workflow identical to the tuned models.
grid_lr = GridSearchCV(
    estimator=lr,
    param_grid={},
    cv=kf,
)

In [66]:
# Cross-validate on the training split only; the test split stays untouched.
grid_lr.fit(X_train, y_train)

0,1,2
,estimator,LinearRegression()
,param_grid,{}
,scoring,
,n_jobs,
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [67]:
# Cross-validated score of the best candidate. No `scoring` was passed, so
# this is the estimator's default score (R2 for this regressor).
grid_lr.best_score_

np.float64(0.6001310221694599)

In [68]:
# Keep a handle on the model the search refit after cross-validation.
best_lr = grid_lr.best_estimator_

In [69]:
# R2 of the refit linear model on the held-out test split.
best_lr.score(X_test, y_test)

0.6249829353885578

For linear regression, the best performing model from the grid search cross-validation got an R2 score of 0.625 on the test set.

### Ridge Regression

In [None]:
# Ridge regression with default settings; alpha and solver are tuned by the
# grid search in the next cell.
# NOTE(review): this cell has no execution count in the saved notebook
# (`In [None]`) — restart the kernel and run all cells to confirm `r` is
# defined before it is used.
r = Ridge()

In [16]:
# Try ridge penalty strengths from 0.01 to 100 with three solvers, scoring
# every combination on the shared K-fold splits.
grid_r = GridSearchCV(
    estimator=r,
    param_grid={
        'alpha': [0.01, 0.1, 1, 10, 100],
        "solver": ["auto", "svd", "lsqr"],
    },
    cv=kf,
)

In [17]:
# Run the ridge search on the training split only.
# NOTE(review): the features are passed unscaled, so the penalty acts
# unevenly across columns with different ranges — confirm this is intended.
grid_r.fit(X_train, y_train)

0,1,2
,estimator,Ridge()
,param_grid,"{'alpha': [0.01, 0.1, ...], 'solver': ['auto', 'svd', ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,100
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [19]:
# Hyperparameter combination with the best cross-validated score.
grid_r.best_params_

{'alpha': 100, 'solver': 'auto'}

In [20]:
# Cross-validated score (default scorer, R2) of the best ridge candidate.
grid_r.best_score_

np.float64(0.6001370030592948)

In [21]:
# Ridge model refit with the best hyperparameters found by the search.
best_r = grid_r.best_estimator_

In [57]:
# R2 of the tuned ridge model on the held-out test split.
best_r.score(X_test, y_test)

0.6250256615532552

For ridge regression, the best performing model got an R2 score of 0.625. This model had hyperparameters of alpha=100, and solver=auto.

This is essentially the same as it was for linear regression, so ridge's regularization has little to no effect on this data.

### Lasso Regression

In [None]:
# Lasso regression with default settings; alpha is tuned by the grid search
# in the next cell.
# NOTE(review): this cell has no execution count in the saved notebook
# (`In [None]`) — restart the kernel and run all cells to confirm `l` is
# defined. The single-letter name `l` is also easy to misread as `1` or `I`;
# renaming it would require updating the cell that builds `grid_l` as well.
l = Lasso()

In [22]:
# Try lasso penalty strengths from 0.01 to 100, scoring each one on the
# shared K-fold splits.
grid_l = GridSearchCV(
    estimator=l,
    param_grid={
        'alpha': [0.01, 0.1, 1, 10, 100],
    },
    cv=kf,
)

In [23]:
# Run the lasso search on the training split only.
grid_l.fit(X_train, y_train)

0,1,2
,estimator,Lasso()
,param_grid,"{'alpha': [0.01, 0.1, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,0.01
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [24]:
# Penalty strength with the best cross-validated score.
grid_l.best_params_

{'alpha': 0.01}

In [25]:
# Cross-validated score (default scorer, R2) of the best lasso candidate.
grid_l.best_score_

np.float64(0.6001308901250643)

In [27]:
# Lasso model refit with the best alpha found by the search.
best_l = grid_l.best_estimator_

In [58]:
# R2 of the tuned lasso model on the held-out test split.
best_l.score(X_test, y_test)

0.6249913033440581

For lasso regression, the best performing model got an R2 score of 0.625. This model had a hyperparameter of alpha=0.01.

This is essentially the same as it was for linear regression, so lasso's regularization has little to no effect on this data.

### RMSE Scores

In [28]:
# Test-set predictions from the three linear models, in the order
# linear regression, ridge, lasso.
lr_preds, r_preds, l_preds = (
    model.predict(X_test) for model in (best_lr, best_r, best_l)
)

In [29]:
# Naive reference predictor: an array shaped like y_test in which every
# entry is the mean of y_test.
# dtype is forced to float because np.full_like otherwise inherits the dtype
# of y_test and would silently truncate the mean for an integer-typed target.
# NOTE(review): the mean comes from y_test, not y_train, so this is the best
# possible constant prediction for the test split — confirm that is the
# intended baseline.
baseline = np.full_like(y_test, y_test.mean(), dtype=float)

In [30]:
# Test-set RMSE of the linear regression predictions.
root_mean_squared_error(y_test, lr_preds)

10.078759627529536

In [31]:
# Test-set RMSE of the ridge predictions.
root_mean_squared_error(y_test, r_preds)

10.078185468310409

In [32]:
# Test-set RMSE of the lasso predictions.
root_mean_squared_error(y_test, l_preds)

10.078647180536453

In [33]:
# RMSE of the constant-mean baseline; the model RMSEs are compared against this.
root_mean_squared_error(y_test, baseline)

16.458171086274376

All 3 of the linear models got nearly the same RMSE score of ~10.08. This beats the baseline RMSE of ~16.46, so all of the models perform better than just predicting the mean.

### Random Forest

In [35]:
# Random forest with default hyperparameters; the fixed seed makes the
# fitted forest reproducible across re-runs.
rf = RandomForestRegressor(random_state=42)

In [36]:
# The parameter grid is empty, so the search only cross-validates the forest
# as configured (one candidate) on the shared K-fold splits.
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid={},
    cv=kf,
)

In [37]:
# Cross-validate the forest on the training split only.
grid_rf.fit(X_train, y_train)

0,1,2
,estimator,RandomForestR...ndom_state=42)
,param_grid,{}
,scoring,
,n_jobs,
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [38]:
# Cross-validated score (default scorer, R2) of the random forest.
grid_rf.best_score_

np.float64(0.8999203314725724)

In [39]:
# Test-set score of the fitted search object. The other models call `.score`
# on their `best_*` estimator instead.
# NOTE(review): with the default refit and no custom scoring this should
# match `grid_rf.best_estimator_.score(X_test, y_test)` — confirm.
grid_rf.score(X_test, y_test)

0.8887109879229889

In [40]:
# Forest refit by the search after cross-validation.
best_rf = grid_rf.best_estimator_

In [41]:
# Test-set predictions from the random forest.
rf_preds = best_rf.predict(X_test)

In [42]:
# Test-set RMSE of the random forest predictions.
root_mean_squared_error(y_test, rf_preds)

5.490447158991832

The random forest model performs with an R2 score of 0.89 and gets an RMSE score of 5.49. 

This means that it can explain 89% of the variance of the concrete strength, and its predictions are typically within ~6 MPa of the true strength.

### K Nearest Neighbors

In [43]:
# Standardizer for the KNN features, whose predictions depend on distances
# between rows.
s = StandardScaler()

In [44]:
# Fit the scaler on the training split, then reuse those fitted statistics
# to transform the test split.
# NOTE(review): the scaler is fitted on all of X_train before the
# cross-validated search, so each validation fold contributes to the scaling
# statistics. Wrapping the scaler and KNN in a Pipeline would avoid this —
# confirm whether that matters here.
X_train_scaled = s.fit_transform(X_train)
X_test_scaled = s.transform(X_test)

In [45]:
# KNN regressor with default settings; neighbors, weighting and metric are
# tuned by the grid search in the next cell.
knn = KNeighborsRegressor()

In [46]:
# Search odd neighbor counts from 3 to 29, both weighting schemes and two
# distance metrics, scoring every combination on the shared K-fold splits.
grid_knn = GridSearchCV(
    estimator=knn,
    param_grid={
        'n_neighbors': np.arange(3, 31, 2),
        'weights': ['uniform', 'distance'],
        'metric': ['minkowski', 'manhattan'],
    },
    cv=kf,
)

In [47]:
# Run the KNN search on the standardized training features.
grid_knn.fit(X_train_scaled, y_train)

0,1,2
,estimator,KNeighborsRegressor()
,param_grid,"{'metric': ['minkowski', 'manhattan'], 'n_neighbors': array([ 3, 5..., 25, 27, 29]), 'weights': ['uniform', 'distance']}"
,scoring,
,n_jobs,
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_neighbors,np.int64(7)
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [48]:
# Hyperparameter combination with the best cross-validated score.
grid_knn.best_params_

{'metric': 'minkowski', 'n_neighbors': np.int64(7), 'weights': 'distance'}

In [49]:
# Cross-validated score (default scorer, R2) of the best KNN candidate.
grid_knn.best_score_

np.float64(0.7379720371951318)

In [50]:
# KNN model refit with the best hyperparameters found by the search.
best_knn = grid_knn.best_estimator_

In [51]:
# Predict from the *scaled* test features, since the model was fitted on
# scaled training features.
knn_preds = best_knn.predict(X_test_scaled)

In [52]:
# Test-set RMSE of the KNN predictions.
root_mean_squared_error(y_test, knn_preds)

7.837509637066959

In [53]:
# R2 of the tuned KNN model on the scaled held-out test split.
best_knn.score(X_test_scaled, y_test)

0.7732261201055246

The KNN model at its best performance got an R2 score of 0.77, and an RMSE score of 7.84.

The best hyperparameters for this model were the Minkowski distance metric, 7 neighbors, and distance-based weighting of each neighbor.

## Pickling the best model

The best test R2 score was 0.89, achieved by the Random Forest Regressor.

In [54]:
import pickle

In [55]:
# Saving the chosen model is currently disabled: both lines below are
# commented out, so running this cell writes nothing to disk.
# NOTE(review): presumably left off to avoid overwriting an existing
# model.pkl on re-runs — confirm, then either re-enable or delete this cell.
# with open('model.pkl', 'wb') as file:
#     pickle.dump(best_rf, file)