In [30]:
import pandas as pd

dat = pd.read_csv('../data/dataset_midterm.csv', sep = ",")

In [3]:
dat.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,dataset
0,0.00632,18.0,2.31,0.0,,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0,train
1,0.02731,,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,train
2,0.02729,,7.07,,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,train
3,0.03237,,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,train
4,0.06905,0.0,2.18,0.0,,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2,test


In [16]:
dat['MEDV'].value_counts()

MEDV
50.0    16
25.0     8
22.0     7
21.7     7
23.1     7
        ..
32.9     1
34.6     1
30.3     1
33.3     1
8.1      1
Name: count, Length: 229, dtype: int64

In [31]:
categorical_variables = ['CHAS', 'RAD']
non_cat_var = list(set(dat.columns) - set(categorical_variables))

In [32]:
from sklearn.preprocessing import OneHotEncoder

In [33]:
# 3) Define "model"
ohe = OneHotEncoder(sparse_output = False, drop = 'first')

# 4) Train "model"
ohe.fit(dat[categorical_variables][dat['dataset'] == 'train'])

# 5) "Predict"
dat_ohe = pd.DataFrame(ohe.transform(dat[categorical_variables]))

# Optional
dat_ohe.columns = ohe.get_feature_names_out()

# Combine numerical and categorical
dat = pd.concat((dat[non_cat_var], dat_ohe), axis=1)

In [34]:
from fancyimpute import IterativeImputer as MICE
from sklearn.ensemble import RandomForestRegressor

In [35]:
numerical_variables = list(set(dat.columns) - {'dataset', 'MEDV'})
# 3) Define "model"
model = MICE(RandomForestRegressor())

# 4) Train "model"
model.fit(dat[numerical_variables][dat['dataset'] == 'train'])

# 5) "Predict"
dat[numerical_variables] = model.transform(dat[numerical_variables])



In [37]:
from sklearn.ensemble import RandomForestRegressor as model_constructor
from sklearn.metrics import mean_absolute_error as metric

In [56]:
# Random Forest
n_estimators_values = [250,500, 1000]
max_features_values = [8, 10, 12]
max_samples_values = [200, dat[dat['dataset'] == 'train'].shape[0]]

params_grid = {'max_features': max_features_values,
              'n_estimators': n_estimators_values,
               'max_samples': max_samples_values}


In [57]:
num_iter = 1
grid_results = pd.DataFrame(columns = ('max_features',
                                       'n_estimators',
                                       'max_samples',
                                       'metric_train',
                                       'metric_val'))

for max_features in params_grid['max_features']:
    for n_estimators in params_grid['n_estimators']:
        for max_samples in params_grid['max_samples']:

                        # Print trace
                        print('Iteracion = ' + str(num_iter))

                        # [3] Define model
                        model = model_constructor(max_features = max_features,
                                                  n_estimators = n_estimators,
                                                  max_samples = max_samples,
                                                  random_state = 0)

                        # [4] Train model
                        model.fit(dat[dat['dataset'] == 'train'].drop(['MEDV', 'dataset'], axis = 1),
                                  dat[dat['dataset'] == 'train'].MEDV.values)


                        # [5] Predict
                        pred_train = model.predict(dat[dat['dataset'] == 'train'].drop(['MEDV', 'dataset'], axis = 1)) # predict!
                        pred_val = model.predict(dat[dat['dataset'] == 'val'].drop(['MEDV', 'dataset'], axis = 1)) # predict!

                        # [6] Compute metric
                        metric_train = metric(dat[dat['dataset'] == 'train'].MEDV.values, pred_train)
                        metric_val = metric(dat[dat['dataset'] == 'val'].MEDV.values, pred_val)

                        # print error
                        print('Metric train = %.2f - Metric validation = %.2f.'
                              % (metric_train, metric_val))

                        # Save iteration results
                        grid_results.loc[num_iter]=[ max_features,
                                                    n_estimators,
                                                    max_samples,
                                                 metric_train,
                                                 metric_val]
                        num_iter += 1



Iteracion = 1
Metric train = 1.31 - Metric validation = 2.30.
Iteracion = 2
Metric train = 0.83 - Metric validation = 2.22.
Iteracion = 3
Metric train = 1.29 - Metric validation = 2.31.
Iteracion = 4
Metric train = 0.82 - Metric validation = 2.22.
Iteracion = 5
Metric train = 1.29 - Metric validation = 2.29.
Iteracion = 6
Metric train = 0.81 - Metric validation = 2.21.
Iteracion = 7
Metric train = 1.31 - Metric validation = 2.32.
Iteracion = 8
Metric train = 0.83 - Metric validation = 2.21.
Iteracion = 9
Metric train = 1.29 - Metric validation = 2.28.
Iteracion = 10
Metric train = 0.82 - Metric validation = 2.21.
Iteracion = 11
Metric train = 1.29 - Metric validation = 2.27.
Iteracion = 12
Metric train = 0.82 - Metric validation = 2.18.
Iteracion = 13
Metric train = 1.30 - Metric validation = 2.32.
Iteracion = 14
Metric train = 0.84 - Metric validation = 2.23.
Iteracion = 15
Metric train = 1.29 - Metric validation = 2.30.
Iteracion = 16
Metric train = 0.84 - Metric validation = 2.21.
I

In [58]:
grid_results_rf = grid_results.sort_values(by = ['metric_val', 'metric_train'], ascending = [True, True])
best_model_rf = grid_results_rf.iloc[0]
best_model_rf

max_features      10.000000
n_estimators    1000.000000
max_samples      354.000000
metric_train       0.823285
metric_val         2.181443
Name: 12, dtype: float64

In [59]:
# [3] define model
model = model_constructor(max_features = int(best_model_rf.max_features),
                          n_estimators = int(best_model_rf.n_estimators),
                          max_samples = int(best_model_rf.max_samples),
                          random_state = 0)

In [61]:
# [4] Train model
model.fit(dat[dat['dataset'] != 'test'].drop(['MEDV', 'dataset'], axis = 1), dat[dat['dataset'] != 'test'].MEDV.values)

In [62]:
# [5] Predict
pred_train = model.predict(dat[dat['dataset'] == 'train'].drop(['MEDV', 'dataset'], axis = 1)) # predict!
pred_val = model.predict(dat[dat['dataset'] == 'val'].drop(['MEDV', 'dataset'], axis = 1)) # predict!
pred_test = model.predict(dat[dat['dataset'] == 'test'].drop(['MEDV', 'dataset'], axis = 1)) # predict!

In [63]:
# [6] Compute metric
metric_train = metric(dat[dat['dataset'] == 'train'].MEDV.values, pred_train)
metric_val = metric(dat[dat['dataset'] == 'val'].MEDV.values, pred_val)
metric_test = metric(dat[dat['dataset'] == 'test'].MEDV.values, pred_test)

In [64]:
# print error
print('Metric train = %.2f - Metric val = %.2f - Metric test = %.2f'
      % (metric_train, metric_val, metric_test))

Metric train = 0.96 - Metric val = 0.95 - Metric test = 2.83


# XGBoost

In [68]:
from xgboost import XGBRegressor as model_constructor

In [89]:
# Xgboost
n_estimators_values = [1000]
learning_rate_values = [0.3, 0.5, 1]
gamma_values = [0, 1, 10]
max_depth_values = [6, 10, 20]
min_child_weight_values = [1, 20, 50]
subsample_values = [0.1, 1]
colsample_bytree_values = [0.5, 1]
num_parallel_tree_values = [10]

params_grid = {'n_estimators': n_estimators_values,
                  'learning_rate': learning_rate_values,
                 'gamma': gamma_values,
                 'max_depth': max_depth_values,
                 'min_child_weight': min_child_weight_values,
                 'subsample': subsample_values,
                 'colsample_bytree': colsample_bytree_values,
                 'num_parallel_tree': num_parallel_tree_values}

In [97]:
%%capture
num_iter = 1
grid_results = pd.DataFrame(columns = ('n_estimators',
                                       'learning_rate',
                                       'gamma',
                                       'max_depth',
                                       'min_child_weight',
                                       'subsample',
                                       'colsample_bytree',
                                       'num_parallel_tree',
                                       'best_iteration',
                                       'metric_train',
                                       'metric_val'))

for n_estimators in params_grid['n_estimators']:
    for learning_rate in params_grid['learning_rate']:
        for gamma in params_grid['gamma']:
            for max_depth in params_grid['max_depth']:
                for min_child_weight in params_grid['min_child_weight']:
                    for subsample in params_grid['subsample']:
                        for colsample_bytree in params_grid['colsample_bytree']:
                            for num_parallel_tree in params_grid['num_parallel_tree']:




                                                # Print trace
                                                print('Iteration = ' + str(num_iter))

                                                # [3] Define model
                                                model = model_constructor(n_estimators = n_estimators,
                                                                      learning_rate = learning_rate,
                                                                      gamma = gamma,
                                                                      max_depth = max_depth,
                                                                      min_child_weight = min_child_weight ,
                                                                      subsample = subsample,
                                                                      colsample_bytree = colsample_bytree,
                                                                      num_parallel_tree = num_parallel_tree,
                                                                      early_stopping_rounds = 10,
                                                                      eval_metric = "mae",
                                                                      reg_lambda= 10,
                                                                          reg_alpha = 5,
                                                                          device= 'cuda',
                                                                      random_state = 0) # nthread!!!

                                                # [4] Train model
                                                model.fit(dat[dat['dataset'] == 'train'].drop(['MEDV', 'dataset'], axis = 1), dat[dat['dataset'] == 'train'].MEDV.values,
                                                          eval_set=[(dat[dat['dataset'] == 'val'].drop(['MEDV', 'dataset'], axis = 1), dat[dat['dataset'] == 'val'].MEDV.values)],
                                                          verbose = False)
                                                best_iteration = model.best_iteration

                                                # [5] Predict
                                                pred_train = model.predict(dat[dat['dataset'] == 'train'].drop(['MEDV', 'dataset'], axis = 1)) # predict_proba!
                                                pred_val = model.predict(dat[dat['dataset'] == 'val'].drop(['MEDV', 'dataset'], axis = 1)) # predict_proba!

                                                # [6] Compute metric
                                                metric_train = metric(dat[dat['dataset'] == 'train'].MEDV.values, pred_train)
                                                metric_val = metric(dat[dat['dataset'] == 'val'].MEDV.values, pred_val)

                                                # print error
                                                print('AUC train = %.2f - AUC validation = %.2f.'
                                                      % (metric_train, metric_val))

                                                # Save iteration results
                                                grid_results.loc[num_iter]=[n_estimators,
                                                                            learning_rate,
                                                                            gamma,
                                                                            max_depth,
                                                                            min_child_weight,
                                                                            subsample,
                                                                            colsample_bytree,
                                                                            num_parallel_tree,
                                                                            best_iteration,
                                                                            metric_train,
                                                                            metric_val]
                                                num_iter += 1

In [98]:
grid_results_xgb = grid_results.sort_values(by = ['metric_val', 'metric_train'], ascending = [True, True])
best_model_xgb = grid_results_xgb.iloc[0]
best_model_xgb

n_estimators         1000.000000
learning_rate           0.300000
gamma                   0.000000
max_depth              20.000000
min_child_weight        1.000000
subsample               0.100000
colsample_bytree        1.000000
num_parallel_tree      10.000000
best_iteration         53.000000
metric_train            1.849894
metric_val              2.011669
Name: 26, dtype: float64

In [99]:
# [3] define model
model = model_constructor(n_estimators = int(best_model_xgb.best_iteration),
                          learning_rate = best_model_xgb.learning_rate,
                          gamma = best_model_xgb.gamma,
                          max_depth = int(best_model_xgb.max_depth),
                          min_child_weight = best_model_xgb.min_child_weight,
                          subsample = best_model_xgb.subsample,
                          colsample_bytree = best_model_xgb.colsample_bytree,
                          num_parallel_tree = int(best_model_xgb.num_parallel_tree),
                          random_state = 0)

In [100]:
%%capture
# [4] Train model
model.fit(dat[dat['dataset'] != 'test'].drop(['MEDV', 'dataset'], axis = 1), dat[dat['dataset'] != 'test'].MEDV.values)

In [101]:
%%capture
# [5] Predict
pred_train = model.predict(dat[dat['dataset'] == 'train'].drop(['MEDV', 'dataset'], axis = 1)) # predict!
pred_val = model.predict(dat[dat['dataset'] == 'val'].drop(['MEDV', 'dataset'], axis = 1)) # predict!
pred_test = model.predict(dat[dat['dataset'] == 'test'].drop(['MEDV', 'dataset'], axis = 1)) # predict!

In [102]:
# [6] Compute metric
metric_train = metric(dat[dat['dataset'] == 'train'].MEDV.values, pred_train)
metric_val = metric(dat[dat['dataset'] == 'val'].MEDV.values, pred_val)
metric_test = metric(dat[dat['dataset'] == 'test'].MEDV.values, pred_test)

In [103]:
# print error
print('Metric train = %.2f - Metric val = %.2f - Metric test = %.2f'
      % (metric_train, metric_val, metric_test))

Metric train = 1.14 - Metric val = 0.99 - Metric test = 2.89
