## Regression Model with Keras ##

In [32]:
import pandas as pd
import numpy as np

In [33]:
concrete_data = pd.read_csv('https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DL0101EN/labs/data/concrete_data.csv')
concrete_data.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [34]:
concrete_data.shape

(1030, 9)

In [35]:
concrete_data.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [37]:
concrete_data.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

## Part A ##

In [41]:
import keras
from keras.models import Sequential
from keras.layers import Dense

# define regression model
def regression_model():
    # create model
    model = Sequential()
    model.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1))
    
    # compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [42]:
from sklearn.model_selection import train_test_split
# splitting data
concrete_data_columns = concrete_data.columns

predictors = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] # all columns except Strength
target = concrete_data['Strength'] # Strength column

# train_test_split
X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size = 0.2)

In [43]:
predictors.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360


In [44]:
target.head()

0    79.99
1    61.89
2    40.27
3    41.05
4    44.30
Name: Strength, dtype: float64

In [45]:
n_cols = predictors.shape[1] # number of predictors

In [46]:
# Buidling, Fitting and Evaluating model
from sklearn.metrics import mean_squared_error
MSEs = []

# build the model
model = regression_model()

for index in range(50) :
    
    # fit the model
    res = model.fit(X_train, y_train, epochs=50, verbose=0)
    
    # predicting and calculating MSE
    y_pred = model.predict(X_test)
    MSEs.append(mean_squared_error(y_test, y_pred))

print(MSEs)    

[80.62578393211501, 69.98307318950452, 65.00932415756304, 56.106644066270285, 57.222245200262186, 54.5619206950602, 54.307342646987784, 53.171492954489594, 55.54776614640659, 53.15869655905529, 68.36793001397497, 52.41265388956171, 61.33745950162794, 60.54570533748734, 52.84529935048334, 52.91505349867478, 52.82973713484343, 55.42346795954943, 53.93857527369763, 59.291451550053466, 57.88077753647722, 56.69021993530148, 53.25450133305301, 51.37825313714675, 54.80743977018341, 54.2454264575059, 56.31923700997158, 51.321172057694106, 49.206507423637305, 51.13137881437535, 47.33408311211343, 52.299300939784004, 48.90663855923677, 49.285084839463416, 50.02804995414567, 45.65544484391284, 47.20103416508569, 47.75270269793968, 46.86780391232471, 44.763682245587965, 50.51189697223857, 43.45955951105647, 45.16226337830766, 44.219398594411985, 44.26373518577449, 46.153635791910915, 50.48137634116056, 44.91436847240441, 43.94023758103334, 50.93356777076487]


In [47]:
import statistics

mean_mse = statistics.mean(MSEs)
stdev = statistics.stdev(MSEs)

print('Mean : %.2f' % mean_mse)
print('Standard Deviation : %.2f' % stdev)

Mean : 53.00
Standard Deviation : 7.18


## Part B ##
### Part A with Normalized Data ###

In [48]:
%%time
# Normalizing data
predictors_norm = (predictors - predictors.mean()) / predictors.std()
n_cols = predictors_norm.shape[1] # number of predictors

# train_test_split
X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size = 0.2)

# Buidling, Fitting and Evaluating model
from sklearn.metrics import mean_squared_error
MSEsB = []

# build the model
modelB = regression_model()

for index in range(50) :
    
    # fit the model
    modelB.fit(X_train, y_train, epochs=50, verbose=0)
    
    # predicting and calculating MSE
    y_pred = modelB.predict(X_test)
    MSEsB.append(mean_squared_error(y_test, y_pred))

    
print(MSEsB)

[133.39482304467032, 107.17077913705197, 89.39437502035332, 79.70916543089967, 69.51829944155826, 53.14421621289056, 51.71692451057679, 49.10318435862816, 48.13169626788781, 45.7656657058075, 43.44749936652944, 42.2611796255986, 41.44826836079362, 41.53877229867046, 40.82921999691847, 41.545457106367074, 41.048562222969885, 40.80565030766517, 41.179131159138, 41.27526796435869, 41.63584856857713, 41.70073945388198, 42.77217854888467, 42.1617749195181, 42.09029449442689, 42.953174946588454, 42.8550282400053, 42.69352437649592, 43.682307166607416, 44.01617171741051, 44.09889025132808, 44.15735196396114, 45.33638994937981, 44.946846639131955, 46.458688861744704, 45.773804300922166, 45.30714619041577, 46.131797354790066, 46.052453436521546, 45.85738735980525, 45.95335875115382, 46.36265800040747, 46.78116646814492, 46.86381215130831, 46.72698597874491, 46.96399693676569, 46.97699856210743, 47.6244103172421, 48.111206201048404, 47.988975462274695]
CPU times: user 1min 4s, sys: 12.7 s, total

In [49]:
mean_mseB = statistics.mean(MSEsB)
stdevB = statistics.stdev(MSEsB)

print('Mean : %.2f' % mean_mseB)
print('Standard Deviation : %.2f' % stdevB)

Mean : 49.87
Standard Deviation : 17.29


In [50]:
print('Difference of means of Model A and Model B is {}', abs(mean_mseB - mean_mse))

Difference of means of Model A and Model B is {} 3.130137925854875


## Part C ##
### Part B with 100 epochs ###

In [51]:
%%time
# Normalizing data
predictors_norm = (predictors - predictors.mean()) / predictors.std()
n_cols = predictors_norm.shape[1] # number of predictors

# train_test_split
X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size = 0.2)

# Buidling, Fitting and Evaluating model
from sklearn.metrics import mean_squared_error
MSEsC = []

# build the model
modelC = regression_model()

for index in range(50) :
    
    # fit the model
    modelC.fit(X_train, y_train, epochs=100, verbose=0)
    
    # predicting and calculating MSE
    y_pred = modelC.predict(X_test)
    MSEsC.append(mean_squared_error(y_test, y_pred))

    
print(MSEsC)

[106.70599883518507, 53.77097549450085, 41.25188947147479, 38.66621216560799, 37.36500284066783, 36.167176378723546, 34.916767251342506, 33.54850504366034, 33.24142771892517, 33.28478671664964, 33.048010734918144, 32.95712539187893, 32.43057129044716, 31.7844140854845, 32.0195107703705, 32.29777823591699, 31.91197973627221, 31.639911951333758, 31.37196031271582, 30.748750335220137, 30.804013820736287, 30.866252114836588, 30.803115795884263, 30.57786070575565, 30.473133683523724, 30.508463378660938, 30.65633215890033, 30.55601690257086, 30.97201822733585, 30.513874734994168, 30.786426586518562, 30.536118969152724, 31.020129712631572, 30.329879137833416, 31.027658435439424, 30.63808767979396, 30.47605969877012, 30.702341399924247, 30.864511281964788, 30.83813947515299, 30.79733364200145, 30.78960544846827, 30.88818164240613, 30.765306503845466, 30.894337782698425, 30.786590657690915, 30.8768981762117, 30.5881178470105, 31.043592433231186, 30.87905916358743]
CPU times: user 1min 56s, sys:

In [52]:
mean_mseC = statistics.mean(MSEsC)
stdevC = statistics.stdev(MSEsC)

print('Mean : %.2f' % mean_mseC)
print('Standard Deviation : %.2f' % stdevC)

# Difference of means of A and B model
print('Difference of means of Model B and Model C is {}', abs(mean_mseB - mean_mseC))

Mean : 33.83
Standard Deviation : 11.19
Difference of means of Model B and Model C is {} 16.04150586300201


## Part D ##
### Part B with 3 hidden layers, each with 10 nodes, and ReLU 

In [53]:
def regression_model_D():
    # create model
    modelD = Sequential()
    modelD.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    modelD.add(Dense(10, activation='relu')) # HL1
    modelD.add(Dense(10, activation='relu')) # HL2
    modelD.add(Dense(10, activation='relu')) # HL3
    modelD.add(Dense(1))
    
    # compile model
    modelD.compile(optimizer='adam', loss='mean_squared_error')
    return modelD

In [54]:
%%time
# Normalizing data
predictors_norm = (predictors - predictors.mean()) / predictors.std()
n_cols = predictors_norm.shape[1] # number of predictors

# train_test_split
X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size = 0.2)

# Buidling, Fitting and Evaluating model
from sklearn.metrics import mean_squared_error
MSEsD = []

# build the model
modelD = regression_model_D()

for index in range(50) :
    
    # fit the model
    modelD.fit(X_train, y_train, epochs=50, verbose=0)
    
    # predicting and calculating MSE
    y_pred = modelD.predict(X_test)
    MSEsD.append(mean_squared_error(y_test, y_pred))

    
print(MSEsD)

[114.47527017965726, 54.64519793517269, 41.405522467991915, 38.712967651296665, 38.96488264467859, 37.266195210744634, 37.36076172958264, 36.62358304887046, 37.84816086986525, 38.486107102219336, 38.39112040953204, 37.63065884189462, 39.38328585323777, 37.113148305083605, 37.87331333963708, 37.07482678191957, 36.91041317848004, 37.888068282522816, 39.13534627564476, 37.99960184500501, 38.312185246517664, 36.3620760252675, 36.85584573138186, 35.97553103214557, 38.04728147257777, 37.3352971187414, 39.69331950016797, 36.86299188621685, 37.502101297299255, 37.803642114010486, 36.3893121924462, 36.5118832950451, 39.03203019315099, 39.06088267731567, 38.013633594076225, 38.12049489158376, 36.54576793071513, 38.40273859670216, 36.65613168903326, 36.76913001871425, 37.29377228543342, 36.76247691673586, 36.33606177131504, 36.556668445933454, 36.31350149964459, 36.525581446891046, 36.674516675705846, 35.79164038496236, 37.02217352525663, 35.99719323419508]
CPU times: user 1min 26s, sys: 22.7 s, 

In [55]:
mean_mseD = statistics.mean(MSEsD)
stdevD = statistics.stdev(MSEsD)

print('Mean : %.2f' % mean_mseD)
print('Standard Deviation : %.2f' % stdevD)

# Difference of means of B and D model
print('Difference of means of Model B and Model D is {}', abs(mean_mseB - mean_mseD))


Mean : 39.41
Standard Deviation : 11.15
Difference of means of Model B and Model D is {} 10.454984209334185
