# Chapter 2 

In [None]:
# Listing 1-1

%matplotlib inline

import time
import random
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import statistics
import numpy as np
from scipy import stats
import seaborn

# from sklearn.grid_search import GridSearchCV # deprecated, moved to model_selection
from sklearn.model_selection import GridSearchCV
# from sklearn.cross_validation import train_test_split # deprecated, moved to model_selection
from sklearn.model_selection import train_test_split
# from sklearn.cross_validation import cross_val_score # deprecated, moved to model_selection
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RANSACRegressor, LinearRegression, TheilSenRegressor
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, median_absolute_error, r2_score

from sklearn.svm import SVR
from sklearn.linear_model import Ridge,Lasso,ElasticNet,BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

from IPython.display import Image

In [None]:
# Listing 1-2

# Read the concrete CSV into a DataFrame; the path is relative to the working directory.
data = pd.read_csv(filepath_or_buffer='datasets/Chapter 2/concrete_data.csv')

In [None]:
# Listing 1-3

# Print the number of rows, then show the leading rows as the cell's output.
print(data.shape[0])
data.head()

In [None]:
# Listing 1-4

# Replace the loaded column labels with short snake_case names. The last
# column ('concrete_strength') is the target used by the cells below.
# NOTE(review): 'flay_ash' looks like a typo for 'fly_ash'; later cells select
# the column by this exact spelling, so it is kept unchanged here.
data.columns = [
    'cement_component',
    'furnace_slag',
    'flay_ash',
    'water_component',
    'superplasticizer',
    'coarse_aggregate',
    'fine_aggregate',
    'age',
    'concrete_strength',
]

In [None]:
# Listing 1-5

# One scatter panel per input column (every column except the last one, which
# is the target) against concrete strength, placed on a 3x3 grid.
plt.figure(figsize=(15, 10.5))

for plot_count, feature in enumerate(data.columns[:-1], start=1):
    plt.subplot(3, 3, plot_count)
    plt.scatter(data[feature], data['concrete_strength'])
    plt.xlabel(feature.replace('_', ' ').title())
    plt.ylabel('Concrete strength')

plt.show()

In [None]:
# Listing 1-6

# Display settings: 100-character wide output, 3 digits of precision.
pd.options.display.width = 100
pd.options.display.precision = 3

# Pairwise Pearson correlation between all columns.
correlations = data.corr(method='pearson')
print(correlations)

In [None]:
# Listing 1-7

# Keep rows that have at least one non-zero value, then draw the pairwise
# regression plots over every column.
has_nonzero_value = (data != 0).any(axis=1)
data_ = data[has_nonzero_value]
seaborn.pairplot(data_, vars=data.columns, kind='reg')
plt.show()

In [None]:
# Listing 1-8

def split_train_test(data, feature, train_index=0.7,
                     target='concrete_strength', random_state=None):
    """Randomly split ``data`` into training/test inputs and targets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature column(s) and the ``target`` column.
    feature : str or list of str
        A single column name, or a list of column names, used as model input.
    train_index : float, default 0.7
        Fraction of rows kept for training; ``1 - train_index`` is passed to
        ``train_test_split`` as ``test_size``.
    target : str, default 'concrete_strength'
        Name of the column to predict.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. Pass an integer to get the same
        split on every run; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    x_train, y_train, x_test, y_test
        When ``feature`` is a list: the x values are DataFrame slices and the
        y values are the ``target`` Series.
        Otherwise: all four are lists of one-element lists (one inner list
        per row), i.e. column-shaped inputs and targets.
    """
    train, test = train_test_split(
        data, test_size=1 - train_index, random_state=random_state
    )

    if isinstance(feature, list):
        # Multi-column case: hand back the pandas objects directly (the old
        # DataFrame.as_matrix() conversion is deprecated and not needed).
        x_train, y_train = train[feature], train[target]
        x_test, y_test = test[feature], test[target]
    else:
        # Single-column case: wrap every scalar so x has shape (n_samples, 1).
        # y is wrapped the same way, so estimators that expect a 1-D target
        # need np.ravel(y_train) at the call site.
        x_train = [[value] for value in train[feature]]
        y_train = [[value] for value in train[target]]

        x_test = [[value] for value in test[feature]]
        y_test = [[value] for value in test[target]]

    return x_train, y_train, x_test, y_test

In [None]:
# Listing 1-9

# Fit a one-variable linear regression for each selected feature and draw the
# fitted line over the held-out points, one panel per feature (2x3 grid).
plt.figure(figsize=(15, 7))

for plot_count, feature in enumerate(
        ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate'],
        start=1):
    # Restrict to target + current feature and drop rows where either is zero.
    data_tr = data[['concrete_strength', feature]]
    data_tr = data_tr[(data_tr != 0).all(axis=1)]

    x_train, y_train, x_test, y_test = split_train_test(data_tr, feature)

    # fit() returns the estimator itself, so construction and training chain.
    regr = LinearRegression().fit(x_train, y_train)
    y_pred = regr.predict(x_test)

    # Held-out points in black, model prediction in blue.
    plt.subplot(2, 3, plot_count)
    plt.scatter(x_test, y_test, color='black')
    plt.plot(x_test, y_pred, color='blue', linewidth=3)
    plt.xlabel(feature.replace('_', ' ').title())
    plt.ylabel('Concrete strength')

    print(feature, r2_score(y_test, y_pred))

plt.show()

In [None]:
# Listing 1-10

# Multi-variable linear regression on the selected feature columns.
features = [
    'cement_component',
    'flay_ash',
    'water_component',
    'superplasticizer',
    'coarse_aggregate',
]

# Drop every row that contains a zero in any column.
# NOTE(review): the filter looks at all columns of `data`, including ones that
# are not in `features` - confirm that this is intended.
data_tr = data[(data != 0).all(axis=1)]

x_train, y_train, x_test, y_test = split_train_test(data_tr, features)

# Train on the training split, predict the held-out rows.
regr = LinearRegression().fit(x_train, y_train)
y_pred = regr.predict(x_test)

# Actual values (black points) and predictions (blue line) by test-row position.
plt.scatter(list(range(len(y_test))), y_test, color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: ', features)
print('R2 score: ', r2_score(y_test, y_pred))
print('Intercept: ', regr.intercept_)
print('Coefficients: ', regr.coef_)

In [None]:
# Listing 1-11

# Candidate regularisation strengths: from 0.1 up to (excluding) 5 in steps of 0.1.
alphas = np.arange(0.1, 5, 0.1)

# Cross-validated grid search over alpha for Ridge regression.
model = Ridge()
cv = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))

y_pred = cv.fit(x_train, y_train).predict(x_test)

# Actual values (black points) and predictions (blue line) by test-row position.
plt.scatter(list(range(len(y_test))), y_test,  color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: ', str(features))
print('R2 score: ', r2_score(y_test, y_pred))
# GridSearchCV fits clones of `model`, so the fitted parameters are read from
# the refitted best estimator (not from `model`, and not from the `regr`
# object of the previous cell).
print('Best alpha: ', cv.best_params_['alpha'])
print('Intercept: ', cv.best_estimator_.intercept_)
print('Coefficients: ', str(cv.best_estimator_.coef_))

In [None]:
# Listing 1-12

# Cross-validated grid search over alpha for Lasso regression.
model = Lasso()
cv = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))

y_pred = cv.fit(x_train, y_train).predict(x_test)

# Actual values (black points) and predictions (blue line) by test-row position.
plt.scatter(list(range(len(y_test))), y_test,  color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: ', str(features))
print('R2 score: ', r2_score(y_test, y_pred))
# GridSearchCV fits clones, so `model` itself is never fitted; the fitted
# parameters are read from the refitted best estimator instead.
print('Best alpha: ', cv.best_params_['alpha'])
print('Intercept: ', cv.best_estimator_.intercept_)
print('Coefficients: ', str(cv.best_estimator_.coef_))

In [None]:
# Listing 1-13

# Cross-validated grid search over alpha for ElasticNet regression.
model = ElasticNet()
cv = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))

y_pred = cv.fit(x_train, y_train).predict(x_test)

# Actual values (black points) and predictions (blue line) by test-row position.
plt.scatter(list(range(len(y_test))), y_test,  color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: ', str(features))
print('R2 score: ', r2_score(y_test, y_pred))
# GridSearchCV fits clones of `model`, so the fitted parameters are read from
# the refitted best estimator (not from the `regr` object of an earlier cell).
print('Best alpha: ', cv.best_params_['alpha'])
print('Intercept: ', cv.best_estimator_.intercept_)
print('Coefficients: ', str(cv.best_estimator_.coef_))

In [None]:
# Listing 1-14

# Fit a one-variable gradient boosting regressor for each selected feature and
# draw its prediction curve over the held-out points (2x3 grid of panels).
plt.figure(figsize=(15, 7))

for plot_count, feature in enumerate(
        ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate'],
        start=1):
    # Restrict to target + current feature and drop rows where either is zero.
    data_tr = data[['concrete_strength', feature]]
    data_tr = data_tr[(data_tr != 0).all(axis=1)]

    x_train, y_train, x_test, y_test = split_train_test(data_tr, feature)

    # Create the gradient boosting regressor
    regr = GradientBoostingRegressor()

    # split_train_test returns y as a column of one-element lists in the
    # single-feature case; flatten it to the 1-D target the estimator expects.
    regr.fit(x_train, np.ravel(y_train))
    y_pred = regr.predict(x_test)

    # Plot outputs
    plt.subplot(2, 3, plot_count)
    plt.scatter(x_test, y_test, color='black')

    # The test rows are in random order and the boosted prediction is not a
    # straight line, so connect the points in increasing x; otherwise the
    # line jumps back and forth across the panel.
    x_flat = np.ravel(x_test)
    order = np.argsort(x_flat)
    plt.plot(x_flat[order], y_pred[order], color='blue', linewidth=3)

    plt.xlabel(feature.replace('_', ' ').title())
    plt.ylabel('Concrete strength')

    print(feature, r2_score(y_test, y_pred))

plt.show()

In [None]:
# Listing 1-15

# Rebuild the multi-feature split. The x_train/y_train/x_test/y_test left in
# the namespace at this point come from the last single-feature iteration of
# the previous cell, so fitting on them would not match the `features` list
# reported below.
data_tr = data[(data != 0).all(axis=1)]
x_train, y_train, x_test, y_test = split_train_test(data_tr, features)

model = GradientBoostingRegressor()

model.fit(x_train, np.ravel(y_train))
y_pred = model.predict(x_test)

# Actual values (black points) and predictions (blue line) by test-row position.
plt.scatter(list(range(len(y_test))), y_test,  color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: ', str(features))
print('R2 score: ', r2_score(y_test, y_pred))
# No intercept/coefficients are printed here: GradientBoostingRegressor has
# no `intercept_` attribute (accessing it raises AttributeError).

In [None]:
# Listing 1-16

# Fit a one-variable linear-kernel SVR for each selected feature and draw the
# fitted line over the held-out points, one panel per feature (2x3 grid).
plt.figure(figsize=(15, 7))

for plot_count, feature in enumerate(
        ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate'],
        start=1):
    # Restrict to target + current feature and drop rows where either is zero.
    data_tr = data[['concrete_strength', feature]]
    data_tr = data_tr[(data_tr != 0).all(axis=1)]

    x_train, y_train, x_test, y_test = split_train_test(data_tr, feature)

    # Support vector regressor with a linear kernel; the column-shaped target
    # is flattened to 1-D before fitting. fit() returns the estimator itself.
    regr = SVR(kernel='linear').fit(x_train, np.ravel(y_train))
    y_pred = regr.predict(x_test)

    # Held-out points in black, model prediction in blue.
    plt.subplot(2, 3, plot_count)
    plt.scatter(x_test, y_test, color='black')
    plt.plot(x_test, y_pred, color='blue', linewidth=3)
    plt.xlabel(feature.replace('_', ' ').title())
    plt.ylabel('Concrete strength')

    print(feature, r2_score(y_test, y_pred))

plt.show()

In [None]:
# Listing 1-17

# Rebuild the multi-feature split. The x_train/y_train/x_test/y_test left in
# the namespace at this point come from the last single-feature iteration of
# the previous cell, so fitting on them would not match the `features` list
# reported below.
data_tr = data[(data != 0).all(axis=1)]
x_train, y_train, x_test, y_test = split_train_test(data_tr, features)

model = SVR(kernel='linear')

y_pred = model.fit(x_train, np.ravel(y_train)).predict(x_test)

# Actual values (black points) and predictions (blue line) by test-row position.
plt.scatter(list(range(len(y_test))), y_test,  color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: ', str(features))
print('R2 score: ', r2_score(y_test, y_pred))

In [None]:
# Listing 1-18

# Predict concrete strength for one new cement-component value using a
# gradient boosting model trained on that single feature.
feature = 'cement_component'
cc_new_data = np.array([213.5])

# Restrict to target + feature and drop rows where either value is zero.
data_tr = data[['concrete_strength', feature]]
data_tr = data_tr[(data_tr != 0).all(axis=1)]

x_train, y_train, x_test, y_test = split_train_test(data_tr, feature)

# Flatten the column-shaped target to 1-D for fitting; fit() returns the estimator.
regr = GradientBoostingRegressor().fit(x_train, np.ravel(y_train))

# predict() expects a 2-D (n_samples, n_features) input, so add a feature axis.
cs_pred = regr.predict(cc_new_data[:, np.newaxis])
print('Predicted value of concrete strength: ', cs_pred)

In [None]:
# Listing 1-19

# Predict concrete strength for one new water-component value using a
# gradient boosting model trained on that single feature.
feature = 'water_component'
wc_new_data = np.array([200])

# Restrict to target + feature and drop rows where either value is zero.
data_tr = data[['concrete_strength', feature]]
data_tr = data_tr[(data_tr != 0).all(axis=1)]

x_train, y_train, x_test, y_test = split_train_test(data_tr, feature)

# Flatten the column-shaped target to 1-D for fitting; fit() returns the estimator.
regr = GradientBoostingRegressor().fit(x_train, np.ravel(y_train))

# predict() expects a 2-D (n_samples, n_features) input, so add a feature axis.
cs_pred = regr.predict(wc_new_data[:, np.newaxis])
print('Predicted value of concrete strength: ', cs_pred)

In [None]:
# Listing 1-20

# Predict concrete strength for one new coarse-aggregate value using a
# gradient boosting model trained on that single feature.
feature = 'coarse_aggregate'
ca_new_data = np.array([1000])

# Restrict to target + feature and drop rows where either value is zero.
data_tr = data[['concrete_strength', feature]]
data_tr = data_tr[(data_tr != 0).all(axis=1)]

x_train, y_train, x_test, y_test = split_train_test(data_tr, feature)

# Flatten the column-shaped target to 1-D for fitting; fit() returns the estimator.
regr = GradientBoostingRegressor().fit(x_train, np.ravel(y_train))

# predict() expects a 2-D (n_samples, n_features) input, so add a feature axis.
cs_pred = regr.predict(ca_new_data[:, np.newaxis])
print('Predicted value of concrete strength: ', cs_pred)