# What is Feature Engineering
## Derek Caramella

In [14]:
import os

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

import plotly.express as px

In [15]:
# Snake-case labels, applied positionally to the nine columns read from the CSV.
concrete_column_names = [
    'cement',
    'blast_furnace_slag',
    'fly_ash',
    'water',
    'super_plasticizer',
    'coarse_aggregate',
    'fine_aggregate',
    'age',
    'compressive_strength',
]

concrete_df = pd.read_csv('../data/concrete.csv')
concrete_df.columns = concrete_column_names

# Preview the first three rows to confirm the rename.
concrete_df.head(3)

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,super_plasticizer,coarse_aggregate,fine_aggregate,age,compressive_strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27


## Baseline Model

In [17]:
# Target column and the eight raw predictors used by the baseline.
y_train_feature_str = 'compressive_strength'
X_features_ls = [
    'cement',
    'blast_furnace_slag',
    'fly_ash',
    'water',
    'super_plasticizer',
    'coarse_aggregate',
    'fine_aggregate',
    'age',
]

y_train = concrete_df[y_train_feature_str]
X_train = concrete_df.loc[:, X_features_ls].copy()

# Baseline Model: random forest trained on the untouched features.
rf_clf = RandomForestRegressor(criterion='absolute_error', random_state=0)

# 5-fold cross-validation shows how much the score depends on which rows are
# used for training. scikit-learn scorers follow a "higher is better"
# convention, so MAE comes back negated; the absolute value recovers the error.
cv_baseline_score = cross_val_score(
    estimator=rf_clf,
    X=X_train,
    y=y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
)
avg_baseline_score = np.mean(np.abs(cv_baseline_score))

print(f"MAE Baseline: {avg_baseline_score:,.3f}")

MAE Baseline: 8.232


## Ratios
If you ever cook at home, you might know that the ratio of ingredients in a recipe is usually a better predictor of how the recipe turns out than their absolute amounts. We might reason, then, that ratios of the features above would be good predictors of `compressive_strength`.

In [19]:
# Build the ratio features on a new frame so `concrete_df` is left untouched.
# NOTE(review): the divisions assume `cement` and `coarse_aggregate` are never
# zero; a zero denominator would produce inf/NaN features -- confirm against the data.
df = concrete_df.assign(
    # fine aggregate relative to coarse aggregate
    fc_ratio=concrete_df['fine_aggregate'] / concrete_df['coarse_aggregate'],
    # total aggregate relative to cement
    agg_cmt_ratio=(concrete_df['coarse_aggregate'] + concrete_df['fine_aggregate']) / concrete_df['cement'],
    # water relative to cement
    water_cmt_ratio=concrete_df['water'] / concrete_df['cement'],
)

# Raw predictors plus the three engineered ratios.
X_features_ls = [
    'cement', 'blast_furnace_slag', 'fly_ash', 'water', 'super_plasticizer',
    'coarse_aggregate', 'fine_aggregate', 'age',
    'fc_ratio', 'agg_cmt_ratio', 'water_cmt_ratio',
]
y_train_feature_str = 'compressive_strength'

X_train = df[X_features_ls].copy()
y_train = df[y_train_feature_str]

# Ratio Model: same estimator settings as the baseline, so any change in the
# score comes from the added features.
rf_clf = RandomForestRegressor(criterion='absolute_error', random_state=0)

# scikit-learn scorers follow a "higher is better" convention, so MAE comes
# back negated; the absolute value recovers the error. The result is stored
# under ratio-specific names so the baseline score stays available for comparison.
cv_ratio_score = cross_val_score(rf_clf, X=X_train, y=y_train, cv=5, scoring='neg_mean_absolute_error')
avg_ratio_score = np.abs(cv_ratio_score).mean()

print(f"MAE Ratios: {avg_ratio_score:,.3f}")

MAE Baseline: 7.948
