In [16]:
import pandas as pd 
from pycaret.regression import *
import shap
import matplotlib.pyplot as plt

# Problem Statement
The performance of concrete is universally determined by its unaxial compressive strength, which is also closely related to other physcial properties of the concrete batch. According to the General Specifications of CEDD. It is required to conduct 1-2 concrete strength test per 100m^3 batch. However, conventional Cube Crushing Test takes too much time, usually up to 28 days or more. Therefore, it may be desirable to obtain a relationship between concrete constituent materials to find its compressive strength. 

In [None]:
df = pd.read_csv('concrete_data.csv')

In [None]:
df.columns

In [None]:
df.describe().T

## Model Testing
The models that have been tested are: 

    1. Light Gradient Boositn Machine 
    2. Ensemble Learning with (Lightgbm, ExtraTree Regressor, Gradient Boosting Regressor)

In [None]:
reg101 = setup(data=df, target='concrete_compressive_strength', session_id=1)

In [None]:
compare_models()

In [None]:
lightgbm = create_model('lightgbm')

In [None]:
# lightgbm_tuned = tune_model(lightgbm)

In [None]:
# print(lightgbm_tuned)

In [None]:
# plot_model(lightgbm_tuned)

In [None]:
# plot_model(lightgbm_tuned, plot='error')

In [None]:
# plot_model(lightgbm_tuned, plot='feature')

In [None]:
predict_model(lightgbm)
# predict_model(lightgbm_tuned)

Ensemble Models 

In [None]:
best_rmse_models_top3 = compare_models(sort = 'RMSE', n_select = 3)

In [None]:
best_rmse_models_top3

In [None]:
blend_models(best_rmse_models_top3)

In [None]:
stacker = stack_models(best_rmse_models_top3)

In [None]:
stacker

In [None]:
plot_model(stacker, plot='residuals')

## Interpret Model

https://astrobenhart.medium.com/how-to-use-shap-with-pycaret-dc9a31278621

In [3]:
df = pd.read_csv('concrete_data.csv')

In [4]:
reg101 = setup(data=df, target='concrete_compressive_strength', session_id=1,train_size=0.8)

Unnamed: 0,Description,Value
0,Session id,1
1,Target,concrete_compressive_strength
2,Target type,Regression
3,Original data shape,"(1030, 9)"
4,Transformed data shape,"(1030, 9)"
5,Transformed train set shape,"(824, 9)"
6,Transformed test set shape,"(206, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


In [5]:
# top5 = compare_models(n_select=5,sort='RMSE')
# tuned_top5 = [tune_model(i, optimize='RMSE') for i in top5]
# ensem_top5 = [ensemble_model(i, n_estimators = 10, optimize='RMSE') for i in tuned_top5]
# blend = blend_models(tuned_top5, optimize='RMSE')
# blend_ensem = blend_models(ensem_top5, optimize='RMSE')
# model = automl(optimize='RMSE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,3.1388,21.7581,4.6103,0.9231,0.1537,0.112,0.485
et,Extra Trees Regressor,3.1224,24.0036,4.8456,0.9149,0.153,0.1109,0.044
gbr,Gradient Boosting Regressor,3.754,25.3993,5.011,0.909,0.1625,0.1303,0.035
rf,Random Forest Regressor,3.5166,26.5611,5.0891,0.9065,0.1693,0.1283,0.08
dt,Decision Tree Regressor,4.3412,48.0636,6.7711,0.8331,0.2212,0.1529,0.005
ada,AdaBoost Regressor,6.3155,58.8828,7.633,0.789,0.2854,0.2698,0.032
knn,K Neighbors Regressor,7.1024,86.3731,9.2318,0.693,0.3136,0.2782,0.005
lar,Least Angle Regression,8.0984,105.8964,10.215,0.6219,0.331,0.3151,0.004
ridge,Ridge Regression,8.0984,105.8964,10.215,0.6219,0.331,0.3151,0.005
lr,Linear Regression,8.0984,105.8964,10.215,0.6219,0.331,0.3151,0.366


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.9555,20.4511,4.5223,0.9263,0.1864,0.1251
1,2.6732,15.3974,3.924,0.9365,0.1469,0.1083
2,2.7165,22.827,4.7778,0.9185,0.1553,0.1062
3,2.6545,11.4935,3.3902,0.963,0.1589,0.1016
4,3.3121,23.3066,4.8277,0.9175,0.1436,0.1088
5,3.0655,22.0702,4.6979,0.9192,0.1466,0.0958
6,3.0071,17.1646,4.143,0.9352,0.1295,0.0942
7,2.6074,12.6018,3.5499,0.9441,0.1325,0.0942
8,3.2889,27.8226,5.2747,0.9245,0.1752,0.1242
9,3.1356,19.2593,4.3885,0.9291,0.1256,0.0959


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,6.4109,66.3177,8.1436,0.7611,0.2983,0.2763
1,6.2414,63.5786,7.9736,0.7376,0.3352,0.3173
2,6.5097,65.6304,8.1013,0.7658,0.3262,0.3095
3,6.2619,53.0542,7.2838,0.8291,0.2975,0.2858
4,6.7177,70.2266,8.3801,0.7514,0.2565,0.2324
5,6.017,54.4281,7.3775,0.8007,0.2502,0.2243
6,5.5954,48.6065,6.9718,0.8166,0.2275,0.1943
7,5.7908,48.3656,6.9545,0.7856,0.2814,0.2621
8,7.1701,85.1247,9.2263,0.7691,0.2913,0.2732
9,6.1619,57.4752,7.5812,0.7884,0.2694,0.243


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.9115,15.4006,3.9244,0.9445,0.1374,0.108
1,2.9761,19.6276,4.4303,0.919,0.1551,0.1149
2,3.1886,24.1985,4.9192,0.9137,0.1757,0.1304
3,2.7999,13.2963,3.6464,0.9572,0.1288,0.0952
4,3.137,21.1056,4.5941,0.9253,0.1304,0.1004
5,3.332,19.38,4.4023,0.929,0.1297,0.099
6,2.9681,16.7889,4.0974,0.9366,0.1283,0.0937
7,3.0427,15.2121,3.9003,0.9326,0.1456,0.117
8,3.6691,31.1418,5.5805,0.9155,0.1628,0.1239
9,3.3456,19.8977,4.4607,0.9267,0.1333,0.1068


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,4.4186,33.7016,5.8053,0.8786,0.2193,0.1811
1,3.8189,26.9047,5.187,0.889,0.2267,0.1826
2,3.9137,30.6475,5.536,0.8906,0.223,0.1764
3,3.7021,21.8782,4.6774,0.9295,0.1689,0.1442
4,4.8176,42.8279,6.5443,0.8484,0.1826,0.1536
5,4.0662,26.9282,5.1892,0.9014,0.1548,0.1321
6,3.9505,25.9613,5.0952,0.902,0.154,0.1247
7,4.1055,28.9488,5.3804,0.8717,0.1981,0.165
8,5.2765,56.8989,7.5431,0.8456,0.2116,0.1788
9,4.2696,32.1745,5.6723,0.8815,0.186,0.1549


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,5.8178,64.521,8.0325,0.7675,0.2463,0.2039
1,5.147,43.5903,6.6023,0.8201,0.2391,0.2055
2,5.4208,62.4416,7.902,0.7772,0.2545,0.2068
3,4.8868,36.29,6.0241,0.8831,0.2184,0.1901
4,6.0,64.9537,8.0594,0.77,0.238,0.1879
5,5.38,55.3634,7.4407,0.7973,0.2183,0.1611
6,4.8599,44.034,6.6358,0.8338,0.2007,0.1555
7,4.8236,44.1505,6.6446,0.8043,0.2402,0.1875
8,6.6575,116.5011,10.7936,0.684,0.2943,0.2137
9,4.6225,37.0686,6.0884,0.8635,0.1713,0.1445


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.8571,17.5812,4.193,0.9367,0.1625,0.1202
1,2.6142,15.9132,3.9891,0.9343,0.1435,0.1053
2,2.641,21.8572,4.6752,0.922,0.1554,0.1046
3,2.5695,11.371,3.3721,0.9634,0.1238,0.0946
4,3.2933,20.9828,4.5807,0.9257,0.1344,0.1066
5,3.0376,17.5911,4.1942,0.9356,0.1334,0.0955
6,2.9286,15.4748,3.9338,0.9416,0.1266,0.095
7,2.7293,14.1822,3.7659,0.9371,0.1352,0.1012
8,3.5918,33.2866,5.7695,0.9097,0.1738,0.1305
9,3.2257,19.0027,4.3592,0.93,0.1314,0.1043


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.5274,23.8152,4.8801,0.9142,0.1762,0.1379
1,3.1528,22.6867,4.7631,0.9064,0.1969,0.1467
2,3.0782,25.518,5.0515,0.9089,0.1801,0.1301
3,3.1578,18.5661,4.3088,0.9402,0.1484,0.1154
4,3.4526,24.2835,4.9278,0.914,0.1341,0.1079
5,3.5705,20.5054,4.5283,0.9249,0.1275,0.1093
6,3.2674,21.6613,4.6542,0.9183,0.1397,0.0991
7,3.2942,18.5889,4.3115,0.9176,0.1465,0.1222
8,4.0919,43.9599,6.6302,0.8807,0.1875,0.1412
9,3.6854,25.6579,5.0654,0.9055,0.1554,0.1235


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.128,18.1352,4.2585,0.9347,0.1533,0.1207
1,3.2024,20.7514,4.5554,0.9144,0.1683,0.1319
2,3.0128,23.0275,4.7987,0.9178,0.1717,0.1255
3,2.8705,13.8242,3.7181,0.9555,0.125,0.1017
4,3.5369,24.1857,4.9179,0.9144,0.1346,0.1092
5,3.4677,19.9116,4.4622,0.9271,0.1394,0.1093
6,3.0198,16.3568,4.0444,0.9383,0.1321,0.0975
7,3.4306,19.8848,4.4592,0.9119,0.1604,0.1301
8,3.8207,32.7925,5.7265,0.911,0.1675,0.1316
9,3.5447,22.1571,4.7071,0.9184,0.1457,0.1169


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.9641,28.8616,5.3723,0.896,0.1958,0.1537
1,3.6473,25.3458,5.0345,0.8954,0.2105,0.1678
2,3.3941,25.341,5.034,0.9096,0.193,0.1458
3,3.6486,21.8252,4.6717,0.9297,0.1684,0.1373
4,4.323,36.7764,6.0644,0.8698,0.1669,0.1375
5,3.9848,25.2823,5.0282,0.9074,0.148,0.1236
6,3.7193,24.5485,4.9546,0.9074,0.1491,0.1166
7,3.902,24.9381,4.9938,0.8895,0.1779,0.149
8,4.9002,50.2441,7.0883,0.8637,0.2052,0.1692
9,4.0568,29.0973,5.3942,0.8929,0.1628,0.1357


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.7901,26.0403,5.103,0.9062,0.1909,0.1517
1,3.831,28.87,5.3731,0.8808,0.2227,0.1731
2,2.8692,23.2474,4.8216,0.917,0.1906,0.1349
3,3.7481,25.4445,5.0443,0.918,0.1739,0.1388
4,4.1284,39.2871,6.2679,0.8609,0.1683,0.1277
5,3.6314,26.0227,5.1012,0.9047,0.1613,0.1113
6,3.5716,24.2392,4.9233,0.9085,0.1577,0.1171
7,3.3417,21.9748,4.6877,0.9026,0.1627,0.1232
8,4.708,52.7374,7.2621,0.8569,0.2152,0.1698
9,3.8013,25.6303,5.0626,0.9056,0.1541,0.1267


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.3065,21.3156,4.6169,0.9232,0.1632,0.1246
1,2.6811,17.2861,4.1577,0.9287,0.1593,0.1165
2,2.6455,22.8296,4.778,0.9185,0.1686,0.1146
3,2.6356,12.8898,3.5902,0.9585,0.1217,0.0936
4,2.9577,18.2843,4.276,0.9353,0.1171,0.0917
5,3.2155,20.7202,4.5519,0.9241,0.1323,0.0936
6,3.0507,20.0049,4.4727,0.9245,0.1406,0.0976
7,2.5454,11.704,3.4211,0.9481,0.1267,0.0987
8,3.946,44.5588,6.6752,0.8791,0.187,0.1352
9,3.0996,20.6361,4.5427,0.924,0.1313,0.097


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.2206,19.2863,4.3916,0.9305,0.1663,0.1284
1,3.0851,19.7268,4.4415,0.9186,0.1801,0.1382
2,2.7051,21.2362,4.6083,0.9242,0.1711,0.1187
3,2.9629,15.3954,3.9237,0.9504,0.1308,0.1051
4,3.5378,25.8812,5.0874,0.9084,0.1374,0.1091
5,3.3315,18.5551,4.3076,0.9321,0.1289,0.1016
6,3.0518,17.4109,4.1726,0.9343,0.1293,0.0961
7,3.1586,17.6326,4.1991,0.9218,0.1471,0.1177
8,4.0485,39.5687,6.2904,0.8927,0.1826,0.1416
9,3.5164,22.5019,4.7436,0.9172,0.1403,0.114


In [6]:
predict_model(model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,3.1936,21.1201,4.5957,0.919,0.1303,0.1001




Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,concrete_compressive_strength,prediction_label
339,297.200012,0.000000,117.500000,174.800003,9.5,1022.799988,753.500000,3,21.910000,23.189181
244,238.100006,0.000000,94.099998,186.699997,7.0,949.900024,847.000000,3,19.930000,14.251810
882,140.000000,133.000000,103.000000,200.000000,7.0,916.000000,753.000000,28,36.439999,31.728161
567,203.500000,135.699997,0.000000,185.699997,0.0,1076.199951,759.299988,7,11.960000,11.809381
923,162.000000,207.000000,172.000000,216.000000,10.0,822.000000,638.000000,28,39.840000,28.816653
...,...,...,...,...,...,...,...,...,...,...
258,212.500000,0.000000,100.400002,159.300003,8.7,1007.799988,903.599976,100,42.919998,44.931956
551,255.000000,0.000000,0.000000,192.000000,0.0,889.799988,945.000000,3,8.200000,6.626353
528,359.000000,19.000000,141.000000,154.000000,10.9,942.000000,801.000000,7,38.610001,38.309511
812,310.000000,0.000000,0.000000,192.000000,0.0,970.000000,850.000000,90,34.680000,34.527364


In [7]:
final_model = finalize_model(model)
save_model(final_model, 'concrete_UCS')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['cement', 'blast_furnace_slag',
                                              'fly_ash', 'water',
                                              'superplasticizer',
                                              'coarse_aggregate',
                                              'fine_aggregate ', 'age'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=[],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('clean_column_names',
                  TransformerWrapper(transformer=CleanColumnNames())),
                 ('actual_estimator',
                  BaggingRegressor(estimator=LGBMRegressor(bagging_fraction=0.8,
                                                           bagging_freq=3,
                 

In [8]:
saved_model = load_model('concrete_UCS')

Transformation Pipeline and Model Successfully Loaded


In [5]:
lightgbm = create_model('lightgbm')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.1518,19.7625,4.4455,0.9288,0.1783,0.1314
1,2.7067,16.0543,4.0068,0.9337,0.162,0.1171
2,2.8942,24.0939,4.9086,0.914,0.1861,0.1293
3,2.7206,13.7125,3.703,0.9558,0.1289,0.0978
4,3.3683,24.4087,4.9405,0.9136,0.1407,0.1054
5,3.3548,21.9338,4.6834,0.9197,0.1563,0.1002
6,2.8803,17.0569,4.13,0.9356,0.1245,0.0876
7,3.0403,17.0189,4.1254,0.9246,0.1441,0.1097
8,3.934,40.6537,6.376,0.8897,0.1866,0.1389
9,3.3366,22.8862,4.7839,0.9157,0.1295,0.1023


In [9]:
train_pipe = saved_model[:-1].transform(df)

In [12]:
print(saved_model)

Pipeline(memory=FastMemory(location=/var/folders/py/yncddnwd6kn7d9dzfmz1kcbm0000gn/T/joblib),
         steps=[('numerical_imputer',
                 TransformerWrapper(include=['cement', 'blast_furnace_slag',
                                             'fly_ash', 'water',
                                             'superplasticizer',
                                             'coarse_aggregate',
                                             'fine_aggregate ', 'age'],
                                    transformer=SimpleImputer())),
                ('categorical_imputer',
                 TransformerWrapper(include=[],
                                    transfor...
                ('clean_column_names',
                 TransformerWrapper(transformer=CleanColumnNames())),
                ('actual_estimator',
                 BaggingRegressor(estimator=LGBMRegressor(bagging_fraction=0.8,
                                                          bagging_freq=3,
                     

In [16]:
print(saved_model.named_steps)

{'numerical_imputer': TransformerWrapper(include=['cement', 'blast_furnace_slag', 'fly_ash', 'water',
                            'superplasticizer', 'coarse_aggregate',
                            'fine_aggregate ', 'age'],
                   transformer=SimpleImputer()), 'categorical_imputer': TransformerWrapper(include=[],
                   transformer=SimpleImputer(strategy='most_frequent')), 'clean_column_names': TransformerWrapper(transformer=CleanColumnNames()), 'actual_estimator': BaggingRegressor(estimator=LGBMRegressor(bagging_fraction=0.8, bagging_freq=3,
                                         feature_fraction=0.5,
                                         learning_rate=0.3,
                                         min_child_samples=26,
                                         min_split_gain=0.8, n_estimators=230,
                                         n_jobs=-1, num_leaves=100,
                                         random_state=1, reg_alpha=0.005,
                   

In [21]:
print(saved_model.named_steps["actual_estimator"])

BaggingRegressor(estimator=LGBMRegressor(bagging_fraction=0.8, bagging_freq=3,
                                         feature_fraction=0.5,
                                         learning_rate=0.3,
                                         min_child_samples=26,
                                         min_split_gain=0.8, n_estimators=230,
                                         n_jobs=-1, num_leaves=100,
                                         random_state=1, reg_alpha=0.005,
                                         reg_lambda=4),
                 random_state=1)


In [7]:
explainer = shap.TreeExplainer(lightgbm)
shap_values = explainer.shap_values(df.iloc[:,:-1],df.iloc[:,-1])

In [26]:
shap.initjs()
concrete_index = 0
# print(shap_values[concrete_index,:])
# print(df.iloc[concrete_index,:].to_numpy())
shap.force_plot(explainer.expected_value, shap_values[concrete_index,:], df.iloc[concrete_index,:-1])

In [24]:
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values, df.iloc[:,:-1])

In [30]:
shap.plots.beeswarm(explainer.expected_value, shap_values, df.iloc[:,:-1])

TypeError: The beeswarm plot requires an `Explanation` object as the `shap_values` argument.