In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
concrete = pd.read_csv('data/concrete.csv')
co2 = pd.read_csv('data/co2.csv')

concrete['co2_lower'] = sum([concrete[col] * co2.loc[co2.ingredient == col, 'lower_bound'].values[0] for col in concrete.columns[:7]])
concrete['co2_upper'] = sum([concrete[col] * co2.loc[co2.ingredient == col, 'upper_bound'].values[0] for col in concrete.columns[:7]])

concrete = concrete[concrete['age'] < 120]

concrete_as = concrete[(concrete['ash'] > 0) & (concrete['slag'] > 0)]
concrete_a = concrete[(concrete['ash'] > 0) & (concrete['slag'] == 0)]
concrete_s = concrete[(concrete['ash'] == 0) & (concrete['slag'] > 0)]
concrete_ = concrete[(concrete['ash'] == 0) & (concrete['slag'] == 0)]

concrete_train, concrete_test = train_test_split(concrete,
                                                 shuffle=True,
                                                 random_state=487)

features = concrete.columns[:-3]

In [3]:
# This function was modified from stackexchange user hughdbrown 
# at this link, 
# https://stackoverflow.com/questions/1482308/how-to-get-all-subsets-of-a-set-powerset

# This returns the power set of a set minus the empty set
def powerset(s):
    power_set = []
    x = len(s)
    for i in range(1 << x):
        power_set.append([s[j] for j in range(x) if (i & (1 << j))])
        
    return power_set[1:]


In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [8]:
def forest_grid(max_depths,features,n_trees,concrete_train):

    grid_cv = GridSearchCV(RandomForestRegressor(), # first put the model object here
                              param_grid = {'max_depth':max_depths, # place the grid values for max_depth and
                                            'n_estimators':n_trees}, # and n_estimators here
                              scoring = 'neg_mean_squared_error', # put the metric we are trying to optimize here as a string, "accuracy"
                              cv = 5) # put the number of cv splits here

    ## you fit it just like a model
    grid_cv.fit(concrete_train[features], concrete_train['strength'])

    return grid_cv.best_params_, -grid_cv.best_score_

In [7]:
features_all = ['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg', 'fineagg', 'exp_age']

max_depths = [11,12,13,14,15,16,17,18]
features = ['cement', 'slag', 'water', 'superplastic', 'coarseagg', 'fineagg', 'exp_age']
features2 = ['cement', 'slag', 'water', 'superplastic', 'coarseagg', 'fineagg', 'age']
n_trees = [100,200,300,400,500]

print(forest_grid(max_depths,features2,n_trees,concrete_train))
for factor in np.arange(.01, .1, .01):
    concrete_train['exp_age'] = np.exp(-factor * concrete_train['age'])
    print(factor, forest_grid(max_depths,features,n_trees,concrete_train))

{'max_depth': 12, 'n_estimators': 100}
29.249699627404482
0.01 ({'max_depth': 12, 'n_estimators': 100}, 29.249699627404482)
{'max_depth': 12, 'n_estimators': 100}
29.522849057716957
0.02 ({'max_depth': 12, 'n_estimators': 100}, 29.522849057716957)
{'max_depth': 50, 'n_estimators': 500}
29.56590868283339
0.03 ({'max_depth': 50, 'n_estimators': 500}, 29.56590868283339)
{'max_depth': 13, 'n_estimators': 100}
29.482969961263166
0.04 ({'max_depth': 13, 'n_estimators': 100}, 29.482969961263166)
{'max_depth': 50, 'n_estimators': 500}
29.664542906290357
0.05 ({'max_depth': 50, 'n_estimators': 500}, 29.664542906290357)
{'max_depth': 12, 'n_estimators': 500}
29.661545634949835
0.060000000000000005 ({'max_depth': 12, 'n_estimators': 500}, 29.661545634949835)
{'max_depth': 13, 'n_estimators': 500}
29.498994101089256
0.06999999999999999 ({'max_depth': 13, 'n_estimators': 500}, 29.498994101089256)
{'max_depth': 50, 'n_estimators': 100}
29.259107475953282
0.08 ({'max_depth': 50, 'n_estimators': 100},

In [10]:
features_all = ['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg', 'fineagg', 'exp_age']

max_depths = [12,13,14,15,16,17]
features = ['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg', 'fineagg', 'exp_age']
features2 = ['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg', 'fineagg', 'age']
n_trees = [100,200,300,400,500]

print(forest_grid(max_depths,features2,n_trees,concrete_train))
for factor in np.arange(.01, .1, .01):
    concrete_train['exp_age'] = np.exp(-factor * concrete_train['age'])
    print(factor, forest_grid(max_depths,features,n_trees,concrete_train))

({'max_depth': 14, 'n_estimators': 500}, 29.277755654021405)
0.01 ({'max_depth': 15, 'n_estimators': 200}, 29.16496070087594)
0.02 ({'max_depth': 17, 'n_estimators': 500}, 29.321819353959334)
0.03 ({'max_depth': 17, 'n_estimators': 300}, 29.113327280204864)
0.04 ({'max_depth': 13, 'n_estimators': 100}, 29.22803766138504)
0.05 ({'max_depth': 13, 'n_estimators': 100}, 29.118157773206832)
0.060000000000000005 ({'max_depth': 15, 'n_estimators': 200}, 29.406813645286398)
0.06999999999999999 ({'max_depth': 15, 'n_estimators': 300}, 29.08688417599177)
0.08 ({'max_depth': 13, 'n_estimators': 500}, 29.177607244480686)
0.09 ({'max_depth': 17, 'n_estimators': 500}, 29.080181351407163)
