In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score, roc_auc_score, roc_curve

In [2]:
df=pd.read_csv('compresive_strength_concrete.csv')

df.head(10)


Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3
5,266.0,114.0,0.0,228.0,0.0,932.0,670.0,90,47.03
6,380.0,95.0,0.0,228.0,0.0,932.0,594.0,365,43.7
7,380.0,95.0,0.0,228.0,0.0,932.0,594.0,28,36.45
8,266.0,114.0,0.0,228.0,0.0,932.0,670.0,28,45.85
9,475.0,0.0,0.0,228.0,0.0,932.0,594.0,28,39.29


In [3]:
df.isnull().sum()

Cement (component 1)(kg in a m^3 mixture)                0
Blast Furnace Slag (component 2)(kg in a m^3 mixture)    0
Fly Ash (component 3)(kg in a m^3 mixture)               0
Water  (component 4)(kg in a m^3 mixture)                0
Superplasticizer (component 5)(kg in a m^3 mixture)      0
Coarse Aggregate  (component 6)(kg in a m^3 mixture)     0
Fine Aggregate (component 7)(kg in a m^3 mixture)        0
Age (day)                                                0
Concrete compressive strength(MPa, megapascals)          0
dtype: int64

In [4]:
X=df.iloc[:,:-1]

y=df.iloc[:,-1]

print(X.head(6))
print(y.head(6))

Cement (component 1)(kg in a m^3 mixture)  \
0                                      540.0   
1                                      540.0   
2                                      332.5   
3                                      332.5   
4                                      198.6   
5                                      266.0   

   Blast Furnace Slag (component 2)(kg in a m^3 mixture)  \
0                                                0.0       
1                                                0.0       
2                                              142.5       
3                                              142.5       
4                                              132.4       
5                                              114.0       

   Fly Ash (component 3)(kg in a m^3 mixture)  \
0                                         0.0   
1                                         0.0   
2                                         0.0   
3                                         0.0   


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0)



In [6]:
# Scaling the data before fitting models
'''
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform (X_test)
'''

'\nfrom sklearn.preprocessing import MinMaxScaler\nsc = MinMaxScaler()\nX_train = sc.fit_transform(X_train)\nX_test = sc.transform (X_test)\n'

<h3>Experimenting with different Algorithms  </h3>

In [7]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import BayesianRidge


for reg in (SVR(),AdaBoostRegressor(),RandomForestRegressor(),GradientBoostingRegressor(),BayesianRidge(),KNeighborsRegressor()):
    
    reg.fit(X_train,y_train)

    y_pred=reg.predict(X_test)

    rmse=mean_squared_error(y_test,y_pred,squared=False)
    mae=mean_absolute_error(y_test,y_pred)
    r2=r2_score(y_test,y_pred)

    print('rmse      \t\t mae        \t\t r2')
    print(rmse , mae , r2)
    print('\n')





rmse      		 mae        		 r2
13.987504786072574 11.23732069539425 0.2378244714527794


rmse      		 mae        		 r2
7.487675123527697 6.155326843840202 0.7815920260687992


rmse      		 mae        		 r2
5.2011388007474295 3.582608922021882 0.8946167975902231


rmse      		 mae        		 r2
5.096401301969084 3.824054337109506 0.8988183539604994


rmse      		 mae        		 r2
9.648979530100764 7.733760898528134 0.6373085861500805


rmse      		 mae        		 r2
9.507446856106156 7.313818770226536 0.6478705751615099




<font color='red'>GradientBoostingRegressor showed best performance in the initial model screening. So we will proceed with that</font>

<h2>Hyper Parameter tuning of Gradient Boosting Regressor</h2>

In [8]:
from sklearn.model_selection import GridSearchCV 






param_grid = {'n_estimators':[210,220,230],
                'max_depth':[2,3,4],
                'learning_rate':[0.2,0.225,0.23,0.235]
             
              }  
  
grid = GridSearchCV(GradientBoostingRegressor(max_features='log2'), param_grid, refit = True, verbose = 3) 
  
# fitting the model for grid search 
grid.fit(X_train, y_train) 

  0.3s
[CV] learning_rate=0.225, max_depth=2, n_estimators=220 ..............
[CV]  learning_rate=0.225, max_depth=2, n_estimators=220, score=0.913, total=   0.3s
[CV] learning_rate=0.225, max_depth=2, n_estimators=220 ..............
[CV]  learning_rate=0.225, max_depth=2, n_estimators=220, score=0.928, total=   0.4s
[CV] learning_rate=0.225, max_depth=2, n_estimators=220 ..............
[CV]  learning_rate=0.225, max_depth=2, n_estimators=220, score=0.896, total=   0.3s
[CV] learning_rate=0.225, max_depth=2, n_estimators=230 ..............
[CV]  learning_rate=0.225, max_depth=2, n_estimators=230, score=0.925, total=   0.3s
[CV] learning_rate=0.225, max_depth=2, n_estimators=230 ..............
[CV]  learning_rate=0.225, max_depth=2, n_estimators=230, score=0.933, total=   0.3s
[CV] learning_rate=0.225, max_depth=2, n_estimators=230 ..............
[CV]  learning_rate=0.225, max_depth=2, n_estimators=230, score=0.905, total=   0.2s
[CV] learning_rate=0.225, max_depth=2, n_estimators=230 .

GridSearchCV(cv=None, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features='log2',
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_no_change=None,
            

In [9]:
print(grid.best_params_)

{'learning_rate': 0.225, 'max_depth': 4, 'n_estimators': 230}


In [10]:
y_pred=grid.predict(X_test)

rmse=mean_squared_error(y_test,y_pred,squared=False)
mae=mean_absolute_error(y_test,y_pred)
r2=r2_score(y_test,y_pred)

print('rmse      \t\t mae        \t\t r2')
print(rmse , mae , r2)
print('\n')

rmse      		 mae        		 r2
4.076480559090774 2.719468226360525 0.9352640902959037


