In [1]:
# Load libraries
import numpy as np
import pandas as pd
import io

#  Midterm Exercise

In this exercise you will have to implement code in the sections inside *Fill with Your Code* (*Load Data*, *Data Preprocessing* and *Create Model*) to create a model to predict the column *MEDV* in the dataset given on Blackboard. The dataset is already split into train, validation, and test subsets. To see to which subset belong each observation, you need to check the *dataset* column.

You must achieve in the last cell of this notebook a **MAE over test of 2.9 or less**.

The dataset to be used here corresponds to the Boston Housing dataset. Documentation of this dataset can be found here: https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html

# Fill with Your Code

### [1] Load Data

In [2]:
from google.colab import files

In [3]:
uploaded = files.upload()

Saving dataset_midterm.csv to dataset_midterm.csv


In [4]:
dat = pd.read_csv(io.BytesIO(uploaded['dataset_midterm.csv']), sep = ",")

In [5]:
dat

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,dataset
0,0.00632,18.0,2.31,0.0,,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0,train
1,0.02731,,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6,train
2,0.02729,,7.07,,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,train
3,0.03237,,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,train
4,0.06905,0.0,2.18,0.0,,7.147,54.2,6.0622,3,222,18.7,396.90,,36.2,test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,,,6.593,69.1,2.4786,1,273,21.0,391.99,,22.4,train
502,0.04527,,11.93,0.0,,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6,test
503,0.06076,,11.93,0.0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9,train
504,0.10959,0.0,11.93,0.0,,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0,train


### [2]  Data Preprocessing

### Encoding of Categorical Variables

In [6]:
categorical_vars = ['CHAS', 'RAD']
non_categorical_vars = list(set(dat.columns) - set(categorical_vars))

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
# 3) Define "model"
ohe = OneHotEncoder(sparse_output = False, drop = 'first')

# 4) Train "model"
ohe.fit(dat[categorical_vars][dat['dataset'] == 'train'])

# 5) "Predict"
dat_ohe = pd.DataFrame(ohe.transform(dat[categorical_vars]))

# Optional
dat_ohe.columns = ohe.get_feature_names_out()

# Combine numerical and categorical
dat = pd.concat((dat[non_categorical_vars], dat_ohe), axis=1)

### Fill Missing Values

In [9]:
numerical_vars = list(set(dat.columns) - set(['MEDV', 'dataset']))

In [10]:
!pip install fancyimpute
from fancyimpute import IterativeImputer as MICE # pip install fancyimpute
from sklearn.tree import DecisionTreeRegressor

Collecting fancyimpute
  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting knnimpute>=0.1.0 (from fancyimpute)
  Downloading knnimpute-0.1.0.tar.gz (8.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nose (from fancyimpute)
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: fancyimpute, knnimpute
  Building wheel for fancyimpute (setup.py) ... [?25l[?25hdone
  Created wheel for fancyimpute: filename=fancyimpute-0.7.0-py3-none-any.whl size=29880 sha256=ca33552e961e89bd296c99cdb54f8c76a9d48303d23d50fe647c67d95a39a61b
  Stored in directory: /root/.cache/pip/wheels/7b/0c/d3/ee82d1fbdcc0858d96434af108608d01703505d453720c84ed
  Building wheel for knnimpute (setup.py) ... [?25l[?25hdone
  Created wheel for knnimpute: filename=knnimpute-0.1.0-py3-none-a

In [11]:
# 3) Define "model"
model = MICE(DecisionTreeRegressor(random_state = 200))

# 4) Train "model"
model.fit(dat[numerical_vars])

# 5) "Predict"
dat[numerical_vars] = model.transform(dat[numerical_vars])



### [3] Create Model

In [17]:
from sklearn.ensemble import RandomForestRegressor as model_constructor
from sklearn.metrics import mean_absolute_error as metric

In [18]:
# Random Forest
max_features_values = [20, 22]
n_estimators_values = [100, 200, 300];
max_samples_values = [354]

params_grid = {'max_features': max_features_values,
              'n_estimators': n_estimators_values,
               'max_samples': max_samples_values}



In [19]:
num_iter = 1;
grid_results = pd.DataFrame(columns = ('max_features',
                                       'n_estimators',
                                       'max_samples',
                                       'metric_train',
                                       'metric_val'))

for max_features in params_grid['max_features']:
    for n_estimators in params_grid['n_estimators']:
        for max_samples in params_grid['max_samples']:

                        # Print trace
                        print('Iteracion = ' + str(num_iter))

                        # [3] Define model
                        model = model_constructor(max_features = max_features,
                                                  n_estimators = n_estimators,
                                                  max_samples = max_samples,
                                                  random_state = 0)

                        # [4] Train model
                        model.fit(dat[dat['dataset'] == 'train'].drop(['MEDV', 'dataset'], axis = 1),
                                  dat[dat['dataset'] == 'train'].MEDV.values)


                        # [5] Predict
                        pred_train = model.predict(dat[dat['dataset'] == 'train'].drop(['MEDV', 'dataset'], axis = 1)) # predict!
                        pred_val = model.predict(dat[dat['dataset'] == 'val'].drop(['MEDV', 'dataset'], axis = 1)) # predict!

                        # [6] Compute metric
                        metric_train = metric(dat[dat['dataset'] == 'train'].MEDV.values, pred_train)
                        metric_val = metric(dat[dat['dataset'] == 'val'].MEDV.values, pred_val)

                        # print error
                        print('Metric train = %.2f - Metric validation = %.2f.'
                              % (metric_train, metric_val))

                        # Save iteration results
                        grid_results.loc[num_iter]=[ max_features,
                                                    n_estimators,
                                                    max_samples,
                                                 metric_train,
                                                 metric_val]
                        num_iter += 1



Iteracion = 1
Metric train = 0.90 - Metric validation = 2.23.
Iteracion = 2
Metric train = 0.89 - Metric validation = 2.22.
Iteracion = 3
Metric train = 0.88 - Metric validation = 2.20.
Iteracion = 4
Metric train = 0.90 - Metric validation = 2.24.
Iteracion = 5
Metric train = 0.88 - Metric validation = 2.25.
Iteracion = 6
Metric train = 0.87 - Metric validation = 2.23.


In [None]:
grid_results = grid_results.sort_values(by = ['metric_val', 'metric_train'], ascending = [True, False])
best_model = grid_results.iloc[0]
best_model

max_features     20.000000
n_estimators    300.000000
max_samples     354.000000
metric_train      0.861548
metric_val        2.299551
Name: 3, dtype: float64

In [None]:
model =  model_constructor(max_features = int(best_model['max_features']),
                                                  n_estimators = int(best_model['n_estimators']),
                                                  max_samples = int(best_model['max_samples']),
                                                  random_state = 0)

# Evaluate Model

In [None]:
# [4] Train model
# model.fit(dat[dat['dataset'] == 'train'].drop(['MEDV', 'dataset'], axis = 1), dat[dat['dataset'] == 'train'].MEDV.values)


# [5] Predict
pred_train = model.predict(dat[dat['dataset'] == 'train'].drop(['MEDV', 'dataset'], axis = 1)) # predict!
pred_val = model.predict(dat[dat['dataset'] == 'val'].drop(['MEDV', 'dataset'], axis = 1)) # predict!
pred_test = model.predict(dat[dat['dataset'] == 'test'].drop(['MEDV', 'dataset'], axis = 1)) # predict!


# [6] Compute metric
metric_train = metric(dat[dat['dataset'] == 'train'].MEDV.values, pred_train)
metric_val = metric(dat[dat['dataset'] == 'val'].MEDV.values, pred_val)
metric_test = metric(dat[dat['dataset'] == 'test'].MEDV.values, pred_test)

In [None]:
# print error
print('Metric train = %.2f - Metric val = %.2f - Metric test = %.2f'
      % (metric_train, metric_val, metric_test))

Metric train = 0.04 - Metric val = 2.02 - Metric test = 2.68
