# Kaggle Micro Course: "*Intro to Machine Learning*"

# House Pricing Prediction (Iowa)

**N.B.**: Be sure to edit and source the `.env` file!

In [1]:
# NOTE: `!source .env` would execute in a throw-away subshell, so the variables
# it defines never reach this kernel's `os.environ`. The file is therefore
# parsed in Python and its variables are exported into the current process.
import os
from pathlib import Path


def load_env_file(env_path='.env'):
    """Load ``KEY=value`` pairs from ``env_path`` into ``os.environ``.

    Blank lines, ``#`` comment lines and lines without ``=`` are skipped.
    An optional leading ``export `` is accepted and one pair of matching
    surrounding quotes is stripped from the value. Existing variables with
    the same name are overwritten.

    Parameters
    ----------
    env_path : str or pathlib.Path
        Location of the env file (defaults to ``.env`` in the working directory).

    Returns
    -------
    dict
        The variables that were set; empty if the file does not exist.
    """
    env_file = Path(env_path)
    loaded = {}
    if not env_file.is_file():
        return loaded
    for raw_line in env_file.read_text().splitlines():
        line = raw_line.strip()
        if not line or line.startswith('#') or '=' not in line:
            continue
        if line.startswith('export '):
            line = line[len('export '):].lstrip()
        key, _, value = line.partition('=')
        key, value = key.strip(), value.strip()
        # Drop one pair of matching quotes: KEY="value" -> value
        if len(value) >= 2 and value[0] == value[-1] and value[0] in '"\'':
            value = value[1:-1]
        if key:
            os.environ[key] = value
            loaded[key] = value
    return loaded


# Show only the variable NAMES: the values are secrets and must not end up
# in the saved notebook output.
print('Loaded from .env:', sorted(load_env_file()))

`comet.ml` installation

Import modules

In [2]:
# Import comet_ml in the top of your file
from comet_ml import Experiment

import os
import pandas as pd
from pathlib import Path
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

Import data

In [3]:
# Location of the data folder, relative to this notebook
data_folder_path = Path('../../input/home-data-for-ml-course')

# CSV files holding the training and the test samples
train_path = data_folder_path.joinpath('train.csv')
test_path = data_folder_path.joinpath('test.csv')

# Load both tables, using the `Id` column as the row index
train_data, test_data = (
    pd.read_csv(csv_path, index_col='Id') for csv_path in (train_path, test_path)
)

## Data Exploration

In [4]:
# Preview the first 5 rows of the training table (rendered as the cell output)
train_data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
train_data.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [6]:
# Shape of the training data, as a (num_rows, num_columns) tuple
print(train_data.shape)

(1460, 80)


---

## Data Preparation

### Preliminary steps

We want to predict the sale price of houses in Iowa, so I first drop from the training dataset the samples that have no sale price.

In [7]:
# Keep only the rows whose target (`SalePrice`) is known
train_data.dropna(subset=['SalePrice'], axis='index', inplace=True)

Then I select the target variable, which corresponds to the sale price, and save it to a new variable called `y`.

In [8]:
# The target is the sale price column; the predictors are handled separately
y = train_data['SalePrice']

Finally I create a DataFrame called `X` holding the predictive features.

In [9]:
# Predictors: every column except the target
X_full = train_data.drop(columns=['SalePrice'])

# To keep things simple, only the non-`object` (numerical) columns are used
excluded_dtypes = ['object']
X = X_full.select_dtypes(exclude=excluded_dtypes)
X_test = test_data.select_dtypes(exclude=excluded_dtypes)

### Split *training data* into *training set* and *validation set*

I use the `train_test_split` function to split up my data.

I set the `random_state` parameter for the sake of reproducibility.

In [10]:
# Hold out 20% of the training data as a validation set (seeded split)
split_options = dict(train_size=0.8, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

### Handle Missing Values with Imputation

#### Preliminary investigation

Which columns present missing values? How many missing values are there in each column?

In [11]:
# Count the missing values in each column of the training set and
# print only the columns that have at least one
missing_val_count_by_column = X_train.isnull().sum()
has_missing_values = missing_val_count_by_column > 0
print(missing_val_count_by_column[has_missing_values])

LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64


In [12]:
# The imputer is fitted on the training split only; the validation split is
# transformed with the statistics learned from the training split.
# (SimpleImputer is used with its default settings.)
my_imputer = SimpleImputer()

# fit_transform/transform return bare arrays, so the DataFrames are rebuilt with
# the original column names AND the original row index. Keeping the `Id` index
# keeps the features aligned with y_train / y_valid, which still carry it.
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train),
                               columns=X_train.columns,
                               index=X_train.index)
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid),
                               columns=X_valid.columns,
                               index=X_valid.index)

X_train = imputed_X_train
X_valid = imputed_X_valid

---

## Building and Evaluating the Model

### Experiment 1

Initialize Comet.ml experiment

In [13]:
# Comet.ml credentials and destination are read from the environment
comet_settings = {
    "api_key": os.environ.get("COMET_API_KEY"),
    "project_name": os.environ.get("COMET_PROJECT_NAME"),
    "workspace": os.environ.get("COMET_WORKSPACE"),
}
experiment = Experiment(**comet_settings)

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/swsolutions4reproducibleai/house-pricing-prediction/35c7789e3f724e55a81e1df7df184d73



#### Build the Model

In [14]:
# Seed passed to the model so that the fitted tree is reproducible
random_state = 0

# Specify the model ...
iowa_model = DecisionTreeRegressor(random_state=random_state)

# ... then fit it to the training data (the fitted estimator is the cell output)
iowa_model.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')

Log experiment details in Comet.ml

In [15]:
# Dataset: hash of the training features plus a short description
experiment.log_dataset_hash(X_train)
experiment.log_dataset_info('Numeric features with simple imputation')

# Parameters of the model trained in this experiment
params = dict(
    random_state=random_state,
    model_type="DecisionTreeRegressor",
    max_leaf_nodes=None,
)
experiment.log_parameters(params)

#### Make Predictions

Finally I make predictions with the model's `predict` method, using `X_valid` as the data.

I save the results to a variable called `val_predictions`.

In [16]:
# Predict the target for every row of the validation features
val_predictions = iowa_model.predict(X_valid)

#### Evaluate the model (using the Mean Absolute Error)

In [17]:
# MAE of the predictions against the true validation targets, printed with
# thousands separators and two decimals
mae = mean_absolute_error(y_valid, val_predictions)
print(format(mae, ',.2f'))

26,688.38


Log the result in Comet.ml and close the first experiment.

In [18]:
# Record the validation MAE of the first model under the 'MAE' metric
experiment.log_metric('MAE', mae)

# Close the first experiment before a new one is created
experiment.end()

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/swsolutions4reproducibleai/house-pricing-prediction/35c7789e3f724e55a81e1df7df184d73
COMET INFO:   Metrics:
COMET INFO:     MAE : 26688.380136986303
COMET INFO:   Others:
COMET INFO:     dataset_info : Numeric features with simple imputation
COMET INFO:   Parameters:
COMET INFO:     criterion                : mse
COMET INFO:     max_depth                : 1
COMET INFO:     max_features             : 1
COMET INFO:     max_leaf_nodes           : 1
COMET INFO:     min_impurity_decrease    : 1
COMET INFO:     min_impurity_split       : 1
COMET INFO:     min_samples_leaf         : 1
COMET INFO:     min_samples_split        : 2
COMET INFO:     min_weight_fraction_leaf : 1
COMET INFO:     model_type               : DecisionTreeRegressor
COMET INFO:     p

### Experiment 2

Initialize Comet.ml experiment

In [19]:
# New Comet.ml experiment for the tuned decision tree; settings come from the environment
experiment = Experiment(api_key=os.getenv("COMET_API_KEY"),
                        project_name=os.getenv("COMET_PROJECT_NAME"),
                        workspace=os.getenv("COMET_WORKSPACE"))

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/swsolutions4reproducibleai/house-pricing-prediction/6c33b0bfcf03455c80d0e0c9e5027303



The following function computes the MAE of the predictions made by a `DecisionTreeRegressor` for a given value of the `max_leaf_nodes` parameter.

I will use it momentarily to study how to improve the model by varying that parameter.

In [20]:
def get_mae(max_leaf_nodes, random_state, X_train, X_valid, y_train, y_valid):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    A ``DecisionTreeRegressor`` created with the given ``max_leaf_nodes`` and
    ``random_state`` is fitted on ``(X_train, y_train)``; its predictions for
    ``X_valid`` are then scored against ``y_valid`` with ``mean_absolute_error``.
    """
    tree = DecisionTreeRegressor(random_state=random_state,
                                 max_leaf_nodes=max_leaf_nodes)
    tree.fit(X_train, y_train)
    return mean_absolute_error(y_valid, tree.predict(X_valid))

In [21]:
# Tree sizes to compare
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Validation MAE obtained with each candidate tree size
random_state = 0
mae_values = [
    get_mae(tree_size, random_state, X_train, X_valid, y_train, y_valid)
    for tree_size in candidate_max_leaf_nodes
]

# Position of the (first) lowest MAE, and the tree size that produced it
index_of_minimum_mae = min(range(len(mae_values)), key=mae_values.__getitem__)
best_tree_size = candidate_max_leaf_nodes[index_of_minimum_mae]

print("Best MAE value: {:,.2f}".format(mae_values[index_of_minimum_mae]))
print("Best tree size:", best_tree_size)

Best MAE value: 25,263.71
Best tree size: 100


Time to log details about the new model in Comet.ml

In [22]:
# Dataset: hash of the training features plus a short description
experiment.log_dataset_hash(X_train)
experiment.log_dataset_info('Numeric features with simple imputation')

# Parameters of the best tree found by the search
params = dict(
    random_state=random_state,
    model_type="DecisionTreeRegressor",
    max_leaf_nodes=best_tree_size,
)
experiment.log_parameters(params)

# Metric: the lowest validation MAE among the candidates
best_mae = mae_values[index_of_minimum_mae]
experiment.log_metric('MAE', best_mae)

experiment.end()

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/swsolutions4reproducibleai/house-pricing-prediction/6c33b0bfcf03455c80d0e0c9e5027303
COMET INFO:   Metrics:
COMET INFO:     MAE : 25263.711944159804
COMET INFO:   Others:
COMET INFO:     dataset_info : Numeric features with simple imputation
COMET INFO:   Parameters [count]:
COMET INFO:     criterion                : mse
COMET INFO:     max_depth                : 1
COMET INFO:     max_features             : 1
COMET INFO:     max_leaf_nodes [7]       : 100
COMET INFO:     min_impurity_decrease    : 1
COMET INFO:     min_impurity_split       : 1
COMET INFO:     min_samples_leaf         : 1
COMET INFO:     min_samples_split        : 2
COMET INFO:     min_weight_fraction_leaf : 1
COMET INFO:     model_type               : DecisionTreeRegressor
COMET I

### Experiment 3

Let's try to train a more sophisticated model.

I will use a RandomForestRegressor.

Initialize Comet.ml experiment

In [23]:
# New Comet.ml experiment for the random forest; settings come from the environment
comet_api_key = os.environ.get("COMET_API_KEY")
comet_project_name = os.environ.get("COMET_PROJECT_NAME")
comet_workspace = os.environ.get("COMET_WORKSPACE")

experiment = Experiment(api_key=comet_api_key,
                        project_name=comet_project_name,
                        workspace=comet_workspace)

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/swsolutions4reproducibleai/house-pricing-prediction/8808d7c5f2074742b4fcc485e7e843ec



In [24]:
# Define the model. random_state is fixed (to 0, as in the previous
# experiments) so that the fitted forest is reproducible
random_state = 0
rf_model = RandomForestRegressor(random_state=random_state)

# Fit the model
rf_model.fit(X_train, y_train)

# Calculate the mean absolute error of the Random Forest model on the validation data.
# Arguments are passed as (y_true, y_pred), like every other MAE call in this notebook.
val_predictions = rf_model.predict(X_valid)
rf_val_mae = mean_absolute_error(y_valid, val_predictions)

print("Validation MAE for Random Forest Model: {:,.2f}".format(rf_val_mae))

Validation MAE for Random Forest Model: 19,255.56




Time to log details about the new model in Comet.ml

In [25]:
# Dataset: hash of the training features plus a short description
experiment.log_dataset_hash(X_train)
experiment.log_dataset_info('Numeric features with simple imputation')

# Parameters of the random forest trained in this experiment
params = dict(
    random_state=random_state,
    model_type="RandomForestRegressor",
)
experiment.log_parameters(params)

# Metric: validation MAE of the random forest
experiment.log_metric('MAE', rf_val_mae)

experiment.end()

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/swsolutions4reproducibleai/house-pricing-prediction/8808d7c5f2074742b4fcc485e7e843ec
COMET INFO:   Metrics:
COMET INFO:     MAE : 19255.558333333334
COMET INFO:   Others:
COMET INFO:     dataset_info : Numeric features with simple imputation
COMET INFO:   Parameters [count]:
COMET INFO:     bootstrap                : True
COMET INFO:     criterion                : mse
COMET INFO:     max_depth                : 1
COMET INFO:     max_features             : auto
COMET INFO:     max_leaf_nodes           : 1
COMET INFO:     min_impurity_decrease    : 1
COMET INFO:     min_impurity_split       : 1
COMET INFO:     min_samples_leaf         : 1
COMET INFO:     min_samples_split        : 2
COMET INFO:     min_weight_fraction_leaf : 1
COMET INFO:     model_t