In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

from sklearn import metrics

In [2]:
# Read the concrete mix dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='concrete.csv')

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,Cement,BlastFurnaceSlag,FlyAsh,Water,Superplasticizer,CoarseAggregate,FineAggregate,Age,CompressiveStrength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [11]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(1030, 9)

In [3]:
# Count missing entries per column (isna is the pandas alias of isnull).
data.isna().sum()

Cement                 0
BlastFurnaceSlag       0
FlyAsh                 0
Water                  0
Superplasticizer       0
CoarseAggregate        0
FineAggregate          0
Age                    0
CompressiveStrength    0
dtype: int64

## Baseline Model Score

In [4]:
# Baseline random forest that scores candidate splits by mean absolute error.
# 'absolute_error' is the current spelling of the criterion formerly named 'mae';
# the old alias was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises an error at fit time.
# random_state is fixed so repeated runs build the same forest.
model = RandomForestRegressor(criterion='absolute_error', random_state=11)

In [5]:
# Target vector: the measured compressive strength.
y = data.loc[:, 'CompressiveStrength']

# Feature matrix: every column except the target.
X = data.drop(columns=['CompressiveStrength'])

In [6]:
# 5-fold cross-validation of the baseline model.
# The scorer reports negated MAE (higher is better), so flip the sign to get
# a positive error for each fold.
# NOTE(review): the recorded fifth-fold error is far larger than the others;
# presumably the rows in the CSV are ordered and the folds are not shuffled —
# confirm before reading too much into the mean.
scores = -cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=5,
)
scores

array([ 7.98255583,  6.48144782,  5.76683883,  4.20504733, 17.59681165])

In [7]:
# Average the per-fold errors into a single baseline figure.
b_score = np.mean(scores)
print('Baseline Score: ', b_score)

Baseline Score:  8.406540291262134


### Create some synthetic features

In [8]:
# Add ratio features derived from the raw mix quantities.
# Each lambda receives the frame being built, so every ratio is computed from
# the original columns; the new columns are appended in the order listed.
X = X.assign(
    # Fine-to-coarse and coarse-to-fine aggregate ratios
    FnCsAgg=lambda df: df['FineAggregate'] / df['CoarseAggregate'],
    CsFnAgg=lambda df: df['CoarseAggregate'] / df['FineAggregate'],
    # Total aggregate relative to cement
    AggCm=lambda df: (df['FineAggregate'] + df['CoarseAggregate']) / df['Cement'],
    # Cement relative to water
    CmWater=lambda df: df['Cement'] / df['Water'],
)

X.head()

Unnamed: 0,Cement,BlastFurnaceSlag,FlyAsh,Water,Superplasticizer,CoarseAggregate,FineAggregate,Age,FnCsAgg,CsFnAgg,AggCm,CmWater
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,0.65,1.538462,3.177778,3.333333
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,0.640758,1.560651,3.205556,3.333333
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,0.637339,1.569024,4.589474,1.458333
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,0.637339,1.569024,4.589474,1.458333
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,0.843724,1.185221,9.083082,1.034375


### Model with synthetic features

In [9]:
# Fresh random forest for the engineered feature set (MAE split criterion, seed 11).
# 'absolute_error' is the current spelling of the criterion formerly named 'mae';
# the old alias was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises an error at fit time.
model_2 = RandomForestRegressor(criterion='absolute_error', random_state=11)

# 5-fold cross-validation; the scorer returns negated MAE, so multiply by -1
# to report a positive error for each fold.
scores_2 = -1 * cross_val_score(model_2, X, y, cv=5, scoring='neg_mean_absolute_error')

scores_2

array([ 7.85567015,  6.73362427,  6.00656966,  4.16160267, 13.9344801 ])

In [10]:
# Mean MAE across the five folds for the model with engineered features.
print('Model 2 score: ', np.mean(scores_2))

Model 2 score:  7.738389368932036


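The engineered features lower the mean cross-validated MAE from about 8.41 to 7.74 (roughly 8%). Most of the gain comes from the last fold; folds 2 and 3 are slightly worse than the baseline.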