Training Models

In [24]:


# The dataset elements
CEMENT = 'Cement (component 1)(kg in a m^3 mixture)'; cement='cement'
SLAG ='Blast Furnace Slag (component 2)(kg in a m^3 mixture)'; slag = 'slag'
ASH = 'Fly Ash (component 3)(kg in a m^3 mixture)'; ash='ash'
WATER = 'Water  (component 4)(kg in a m^3 mixture)'; water = 'water'
SP = 'Superplasticizer (component 5)(kg in a m^3 mixture)'; sp = 'sp'
COARSE_AGG = 'Coarse Aggregate  (component 6)(kg in a m^3 mixture)'; coarse_agg = 'coarse_agg'
FINE_AGG = 'Fine Aggregate (component 7)(kg in a m^3 mixture)'; fine_agg = 'fine_agg'
AGE = 'Age (day)'; age= 'age'
STRENGTH = 'Concrete compressive strength(MPa, megapascals) '; strength='strength'

FILE_NAME = 'processed_data.pickle'

In [25]:
def get_column_names(target_col_name = strength):
    x_cols = list(df.columns.values)
    x_cols.remove(target_col_name) 
    return x_cols, [target_col_name]

In [26]:
import pandas as pd
df = pd.read_excel('Concrete_Data.xls')

# rename the column names
df.rename(columns={CEMENT: cement, 
                   SLAG: slag,
                   ASH: ash,
                   WATER: water,
                   SP: sp,
                   COARSE_AGG: coarse_agg,
                   FINE_AGG: fine_agg,
                   AGE: age,
                   STRENGTH: strength
                  }, inplace = True)

# get features and target column names
x_col_names, y_col_name = get_column_names()

In [27]:
df.head()

Unnamed: 0,cement,slag,ash,water,sp,coarse_agg,fine_agg,age,strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


In [28]:
X = df[x_col_names]
y = df[y_col_name]

In [29]:
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size = 0.2, random_state = 43)

In [30]:
print(X_tr.shape, y_tr.shape, X_te.shape, y_te.shape)

(824, 8) (824, 1) (206, 8) (206, 1)


Polynomial transformations
Use PolynomialFeatures from sklearn.preprocessing to add higher degree features (degree=2).

In [31]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2,include_bias=False)
poly.fit(X_tr)
X_tr = poly.transform(X_tr)## YOUR CODE HERE ##
X_te = poly.fit_transform(X_te)## YOUR CODE HERE ##

In [32]:
print(X_tr.shape, X_te.shape)

(824, 44) (206, 44)


Scaling features- Use StandardScaler from sklearn.preprocessing to normalize the training and testing data

In [33]:
from sklearn.preprocessing import StandardScaler

# Train set
X_tr = StandardScaler().fit_transform(X_tr)
y_tr = StandardScaler().fit_transform(y_tr)

# Test set
X_te = StandardScaler().fit_transform(X_tr)
y_te = StandardScaler().fit_transform(y_tr)

In [34]:
# For comparing models
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import numpy as np

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())

In [35]:
# Linear regression on transformed features -- polynomial transformation + scaling
from sklearn.linear_model import LinearRegression
lin_scores = cross_val_score(LinearRegression(),
                             X_tr,
                             y_tr,
                             scoring="neg_mean_squared_error",
                             cv=4)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [0.45886845 0.45112998 0.48153951 0.46719049]
Mean: 0.4646821088514097


In [36]:
# Ridge regression
from sklearn.linear_model import Ridge

param_grid = [{'alpha': [0.001,0.01,0.1,1,10,100,1000,1000]}]
grid_search_rr = GridSearchCV(Ridge(), param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search_rr.fit(X_tr, y_tr)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='warn', n_jobs=None,
             param_grid=[{'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 1000]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [37]:
print(grid_search_rr.best_params_)
print(np.sqrt(-grid_search_rr.best_score_))

{'alpha': 10}
0.4729131440307423


In [38]:
# Lasso regression 

from sklearn.linear_model import Lasso
param_grid = [{'alpha': [0.001,0.01,0.1,1,10,100,1000,1000]}]
grid_search_lr = GridSearchCV(Lasso(), param_grid, cv=3, scoring='neg_mean_absolute_error')
grid_search_lr.fit(X_tr, y_tr)

print(grid_search_lr.best_params_)
print(np.sqrt(-grid_search_lr.best_score_))   
    

  positive)
  positive)
  positive)
  positive)


{'alpha': 0.001}
0.6034455137633751


  positive)


In [39]:
# Elastic Net regression

from sklearn.linear_model import ElasticNet

#elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
#elastic_net.fit(X, y)
#elastic_net.predict([[1.5]])


param_grid = [{'l1_ratio': [0,0.5,1]}]
grid_search_Enr = GridSearchCV(ElasticNet(alpha=0.1, random_state=42), param_grid, cv=3, scoring='neg_mean_squared_error')
#grid_search_Enr = GridSearchCV(ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42), cv=3, scoring='neg_mean_squared_error')

grid_search_Enr.fit(X_tr, y_tr)

print(grid_search_Enr.best_params_)
print(np.sqrt(-grid_search_Enr.best_score_))


{'l1_ratio': 0}
0.4993522133372714


  positive)
  positive)
  positive)
  positive)


In [40]:
# Evaluating your best model on TESTING data
from sklearn.metrics import mean_squared_error

final_model = grid_search_lr.best_estimator_

y_te_estimation = final_model.predict(X_te)

final_mse = mean_squared_error(y_te, y_te_estimation)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

0.44657795602626055
