# Models

## Setup

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [11]:
concrete = pd.read_csv('data/concrete.csv')
co2 = pd.read_csv('data/co2.csv')

concrete['co2_lower'] = sum([concrete[col] * co2.loc[co2.ingredient == col, 'lower_bound'].values[0] for col in concrete.columns[:7]])
concrete['co2_upper'] = sum([concrete[col] * co2.loc[co2.ingredient == col, 'upper_bound'].values[0] for col in concrete.columns[:7]])

concrete = concrete[concrete['age'] < 120]

concrete_as = concrete[(concrete['ash'] > 0) & (concrete['slag'] > 0)]
concrete_a = concrete[(concrete['ash'] > 0) & (concrete['slag'] == 0)]
concrete_s = concrete[(concrete['ash'] == 0) & (concrete['slag'] > 0)]
concrete_ = concrete[(concrete['ash'] == 0) & (concrete['slag'] == 0)]

concrete_train, concrete_test = train_test_split(concrete,
                                                 shuffle=True,
                                                 random_state=487)

features = concrete.columns[:-3]

In [12]:
# This function was modified from stackexchange user hughdbrown 
# at this link, 
# https://stackoverflow.com/questions/1482308/how-to-get-all-subsets-of-a-set-powerset

# This returns the power set of a set minus the empty set
def powerset(s):
    power_set = []
    x = len(s)
    for i in range(1 << x):
        power_set.append([s[j] for j in range(x) if (i & (1 << j))])
        
    return power_set[1:]


## Linear Model

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# for future use, this function gets mean squared error without constantly copy-pasting

def get_slr_mses(data, features_list, y, n_splits=5, rs=97):
    # data is the dataframe
    # features_list is a list of all lists of features we wish to compare
    # eg [[], ['feature1'], ['feature1', 'feature2, 'feature5']]
    # if one list is [], then we make a baseline prediction
    # y is the y feature we are predicting
    # k is the number of cross-validation splits
    # rs is the random_state for kfold
    kfold = KFold(n_splits,
              shuffle=True,
              random_state=rs)
    mses=np.zeros((n_splits, len(features_list)))

    i = 0
    # cross-validation
    for train_index, test_index in kfold.split(data):
        data_tt = data.iloc[train_index]
        data_ho = data.iloc[test_index]

        j = 0
        for features in features_list:
            if features == []:
                # baseline prediction
                pred = data_tt[y].values.mean() * np.ones(len(data_ho))
            else:
                reg = LinearRegression(copy_X=True)
                reg.fit(data_tt[features], data_tt[y])
                pred = reg.predict(data_ho[features])
            
            mses[i, j] = mean_squared_error(y_true=data_ho[y],
                                            y_pred=pred)
            j += 1
        
        i += 1

    return np.mean(mses, axis=0)



We attempt to improve the linear model a little by picking the exponential factor with all features in consideration

In [17]:
features2 = ['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg',
       'fineagg', 'exp_age']

powerset(features2)

[['cement'],
 ['slag'],
 ['cement', 'slag'],
 ['ash'],
 ['cement', 'ash'],
 ['slag', 'ash'],
 ['cement', 'slag', 'ash'],
 ['water'],
 ['cement', 'water'],
 ['slag', 'water'],
 ['cement', 'slag', 'water'],
 ['ash', 'water'],
 ['cement', 'ash', 'water'],
 ['slag', 'ash', 'water'],
 ['cement', 'slag', 'ash', 'water'],
 ['superplastic'],
 ['cement', 'superplastic'],
 ['slag', 'superplastic'],
 ['cement', 'slag', 'superplastic'],
 ['ash', 'superplastic'],
 ['cement', 'ash', 'superplastic'],
 ['slag', 'ash', 'superplastic'],
 ['cement', 'slag', 'ash', 'superplastic'],
 ['water', 'superplastic'],
 ['cement', 'water', 'superplastic'],
 ['slag', 'water', 'superplastic'],
 ['cement', 'slag', 'water', 'superplastic'],
 ['ash', 'water', 'superplastic'],
 ['cement', 'ash', 'water', 'superplastic'],
 ['slag', 'ash', 'water', 'superplastic'],
 ['cement', 'slag', 'ash', 'water', 'superplastic'],
 ['coarseagg'],
 ['cement', 'coarseagg'],
 ['slag', 'coarseagg'],
 ['cement', 'slag', 'coarseagg'],
 ['ash'

In [19]:
features2 = ['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg',
       'fineagg', 'exp_age']

for factor in np.arange(.01, .1, .01):
    concrete_train['exp_age'] = np.exp(-factor * concrete_train['age'])
    print(factor, np.min(get_slr_mses(concrete_train, powerset(features2), 'strength')))

0.01 57.70330104192804
0.02 51.14081744397187
0.03 47.781495017706575
0.04 47.158922486225194
0.05 48.18470524244171
0.060000000000000005 50.0325012084111
0.06999999999999999 52.172666716534025
0.08 54.32070776381877
0.09 56.351408732591054


In [20]:
best_mses = []

for factor in np.arange(.03, .05, .001):
    concrete_train['exp_age'] = np.exp(-factor * concrete_train['age'])
    best_mses.append(np.min(get_slr_mses(concrete_train, powerset(features2), 'strength')))


print(np.arange(.03, .05, .001)[np.argmin(best_mses)], np.min(best_mses))

0.038000000000000006 47.11634037831618


In [21]:
concrete_train['exp_age'] = np.exp(-.038 * concrete_train['age'])

## Random Forest

In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [28]:
max_depths = [5, 10, 50]
features = ['cement', 'slag', 'water', 'superplastic', 'coarseagg', 'fineagg', 'age', 'exp_age']
n_trees = [100, 500]

grid_cv = GridSearchCV(RandomForestRegressor(), # first put the model object here
                          param_grid = {'max_depth':max_depths, # place the grid values for max_depth and
                                        'n_estimators':n_trees}, # and n_estimators here
                          scoring = 'neg_mean_squared_error', # put the metric we are trying to optimize here as a string, "accuracy"
                          cv = 5) # put the number of cv splits here

## you fit it just like a model
grid_cv.fit(concrete_train[features], concrete_train['strength'])

print(grid_cv.best_params_)
print(-grid_cv.best_score_)

{'max_depth': 50, 'n_estimators': 500}
29.591726590750163


## Neural Network

In [37]:
from sklearn.preprocessing import StandardScaler
scale = StandardScaler(copy=True)

In [38]:
concrete_tt, concrete_val = train_test_split(concrete_train)

X_tt, y_tt = scale.fit_transform(concrete_tt[features]), concrete_tt['strength']
X_val, y_val = scale.fit_transform(concrete_val[features]), concrete_val['strength']

In [30]:
from sklearn.neural_network import MLPRegressor

In [39]:

mlp1 = MLPRegressor(max_iter=10000,
                     hidden_layer_sizes=(5, 5))

## Fit the model object
mlp1.fit(X_tt, y_tt)
mean_squared_error(y_val, mlp1.predict(X_val))

47.026201631608075

In [40]:

mlp1 = MLPRegressor(max_iter=10000,
                     hidden_layer_sizes=(100, 100))

## Fit the model object
mlp1.fit(X_tt, y_tt)
mean_squared_error(y_val, mlp1.predict(X_val))

28.739044222672337

In [33]:
## Import the following
import keras
from keras import models
from keras import layers
from keras import optimizers
from keras import losses
from keras import metrics

In [44]:
model1 = keras.Sequential()
model1.add(layers.Dense(100, activation='relu', input_shape=(X_tt.shape[1],)))
model1.add(layers.Dense(100, activation='relu'))
model1.add(layers.Dense(1, activation='relu'))

model1.compile(optimizer = 'rmsprop',
                 loss = 'mean_squared_error',
                 metrics = ['mse'])

In [49]:
n_epochs = 100
history1 = model1.fit(X_tt,
                       y_tt,
                       epochs = n_epochs,
                       batch_size = 50,
                       validation_data = (X_val, 
                                          y_val))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [50]:
np.min(history1.history['val_mse'])

32.66041946411133