# Decision Tree - Regression

We will predict the price (`price` column) of an AirBNB listing in Boston given a number of features about the listing.

**Therefore, our unit of analysis is an AIRBNB LISTING**

# Setup

In [1]:
# Common imports
import numpy as np
import pandas as pd

np.random.seed(42)


# Get the data

In [2]:
#We will predict the "price" value in the data set:

hills = pd.read_csv("HillsboroughCountyData.csv")
hills.head()

Unnamed: 0,PropertyType,SiteAddress,SiteCity,SiteZip,Acreage,Homestead,Neighborhood,TotalNumBuildings,TotalUnits,TotalStories,...,TotalBuildingValue,TotalExtraFeaturesValue,TotalHeatedAreaSqFt,JustValue,AssessedValue,TaxableValue,LastSaleDate,LastSalePrice,VacantImproved,Qualified
0,SINGLE FAMILY,8302 LUTZ LAKE FERN RD,ODESSA,33556,0.97,No,"E Lutz Lake Fern, W of Vets Xway",1,1,1.0,...,128296,2548,1094,264085,248933,248933,2020-02-16,100,Improved,Unqualified
1,SINGLE FAMILY,8304 LUTZ LAKE FERN RD,ODESSA,33556,1.47,Yes,"E Lutz Lake Fern, W of Vets Xway",1,1,1.0,...,263121,40759,2737,473506,268658,218658,2001-05-14,195000,Improved,Unqualified
2,SINGLE FAMILY,19146 HUCKAVALLE RD,ODESSA,33556,5.5,Yes,Northwest Corner of Hillsborough County,1,1,1.0,...,107433,45940,1555,417449,400551,350551,2018-01-25,425000,Improved,Qualified
3,SINGLE FAMILY,19108 RUSTIC WOODS TRL,ODESSA,33556,4.71,Yes,Northwest Corner of Hillsborough County,1,1,2.0,...,392657,57535,5376,680454,403161,348161,2004-11-10,395000,Improved,Qualified
4,SINGLE FAMILY,19115 HUCKAVALLE RD,ODESSA,33556,4.88,Yes,Northwest Corner of Hillsborough County,1,1,1.0,...,266616,59064,3228,573680,288391,238391,2005-03-29,500000,Improved,Unqualified


# Split the data into train and test

In [3]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(hills, test_size=0.3)

### Be careful: we haven't seperated the target column yet

## Check the missing values

In [33]:
train_set.isna().sum()

PropertyType                  0
SiteAddress                  18
SiteCity                      2
SiteZip                      10
Acreage                       0
Homestead                     0
Neighborhood                  0
TotalNumBuildings             0
TotalUnits                    0
TotalStories                  0
TotalBedrooms                 0
TotalBathrooms                0
YearBuilt                     0
TotalLandValue                0
TotalBuildingValue            0
TotalExtraFeaturesValue       0
TotalHeatedAreaSqFt           0
JustValue                     0
AssessedValue                 0
TaxableValue                  0
LastSaleDate                464
LastSalePrice                 0
VacantImproved             1839
Qualified                   445
dtype: int64

In [32]:
test_set.isna().sum()

PropertyType                 0
SiteAddress                  7
SiteCity                     1
SiteZip                      2
Acreage                      0
Homestead                    0
Neighborhood                 0
TotalNumBuildings            0
TotalUnits                   0
TotalStories                 0
TotalBedrooms                0
TotalBathrooms               0
YearBuilt                    0
TotalLandValue               0
TotalBuildingValue           0
TotalExtraFeaturesValue      0
TotalHeatedAreaSqFt          0
JustValue                    0
AssessedValue                0
TaxableValue                 0
LastSaleDate               206
LastSalePrice                0
VacantImproved             785
Qualified                  199
dtype: int64

# Data Prep

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import FunctionTransformer

## Drop the variables we can't use in this tutorial

In [9]:
# We can't use the following columns in this tutorial, because they are for classification tasks

train = train_set.drop([], axis=1)
test = test_set.drop([], axis=1)

## Separate the target variable (we don't want to transform it)

In [10]:
train_y = train[['JustValue']]
test_y = test[['JustValue']]

train_inputs = train.drop(['JustValue'], axis=1)
test_inputs = test.drop(['JustValue'], axis=1)

## Feature Engineering: Let's derive a new column

#### Formula: `reviews per day` = `number of reviews` / `number_days_btw_first_last_review`

In [None]:
def new_col(df):
    
    #Create a copy so that we don't overwrite the existing dataframe
    df1 = df.copy()

    # Use the formula, though fill in 0s when the value is 0/0 (because 0/0 generates "nan" values)
    df1['reviews_per_day'] = (df1['number_of_reviews']/df1['number_days_btw_first_last_review']).fillna(0)

    # Replace the infinity values with 1 (because a value divided by 0 generates infinity)
    df1['reviews_per_day'].replace(np.inf, 1, inplace=True)

    return df1[['reviews_per_day']]
    # You can use this to check whether the calculation is made correctly:
    #return df1
    

In [None]:
#Let's test the new function:

# Send the train set to the function we created
new_col(train_set)

##  Identify the numerical and categorical columns

In [11]:
train_inputs.dtypes

PropertyType                object
SiteAddress                 object
SiteCity                    object
SiteZip                     object
Acreage                    float64
Homestead                   object
Neighborhood                object
TotalNumBuildings            int64
TotalUnits                   int64
TotalStories               float64
TotalBedrooms              float64
TotalBathrooms             float64
YearBuilt                    int64
TotalLandValue               int64
TotalBuildingValue           int64
TotalExtraFeaturesValue      int64
TotalHeatedAreaSqFt          int64
AssessedValue                int64
TaxableValue                 int64
LastSaleDate                object
LastSalePrice                int64
VacantImproved              object
Qualified                   object
dtype: object

**At this stage, you can manually identify numeric, binary, and categorical columns as follows:**

`numeric_columns = ['latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'Number of amenities', 'guests_included', 'price_per_extra_person', 'minimum_nights', 'number_of_reviews', 'number_days_btw_first_last_review', 'review_scores_rating']`
 
 `binary_columns = ['host_is_superhost', 'host_identity_verified']`
 
 `categorical_columns = ['neighbourhood_cleansed', 'property_type', 'room_type', 'bed_type', 'cancellation_policy']`
 
<br>
 
**If you do not want to manually type these, you can do the below tricks:**

In [19]:
# Identify the numerical columns
numeric_columns = train_inputs.select_dtypes(include=[np.number]).columns.to_list()

# Identify the categorical columns
categorical_columns = train_inputs.select_dtypes('object').columns.to_list()

In [20]:
# Identify the binary columns so we can pass them through without transforming
binary_columns = ['Homestead', 'Qualified']

In [21]:
# Be careful: numerical columns already includes the binary columns,
# So, we need to remove the binary columns from numerical columns.

for col in binary_columns:
    categorical_columns.remove(col)

In [22]:
binary_columns

['Homestead', 'Qualified']

In [23]:
numeric_columns

['Acreage',
 'TotalNumBuildings',
 'TotalUnits',
 'TotalStories',
 'TotalBedrooms',
 'TotalBathrooms',
 'YearBuilt',
 'TotalLandValue',
 'TotalBuildingValue',
 'TotalExtraFeaturesValue',
 'TotalHeatedAreaSqFt',
 'AssessedValue',
 'TaxableValue',
 'LastSalePrice']

In [24]:
categorical_columns

['PropertyType',
 'SiteAddress',
 'SiteCity',
 'SiteZip',
 'Neighborhood',
 'LastSaleDate',
 'VacantImproved']

In [18]:
feat_eng_columns = ['number_of_reviews', 'number_days_btw_first_last_review']

# Pipeline

In [25]:
numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())])

In [26]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [27]:
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])

In [28]:
my_new_column = Pipeline(steps=[('my_new_column', FunctionTransformer(new_col)),
                               ('scaler', StandardScaler())])

NameError: name 'new_col' is not defined

In [30]:
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns)],
        #('trans', my_new_column, feat_eng_columns)],
        remainder='passthrough')

#passtrough is an optional step. You don't have to use it.

In [34]:
train_set.isna().sum()

PropertyType                  0
SiteAddress                  18
SiteCity                      2
SiteZip                      10
Acreage                       0
Homestead                     0
Neighborhood                  0
TotalNumBuildings             0
TotalUnits                    0
TotalStories                  0
TotalBedrooms                 0
TotalBathrooms                0
YearBuilt                     0
TotalLandValue                0
TotalBuildingValue            0
TotalExtraFeaturesValue       0
TotalHeatedAreaSqFt           0
JustValue                     0
AssessedValue                 0
TaxableValue                  0
LastSaleDate                464
LastSalePrice                 0
VacantImproved             1839
Qualified                   445
dtype: int64

# Transform: fit_transform() for TRAIN

In [31]:
#Fit and transform the train data
train_x = preprocessor.fit_transform(train_inputs)

train_x

ValueError: For a sparse output, all columns should be a numeric or convertible to a numeric.

In [None]:
train_x.shape

# Tranform: transform() for TEST

In [None]:
# Transform the test data
test_x = preprocessor.transform(test_inputs)

test_x

In [None]:
test_x.shape

# Calculate the baseline

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
#First find the average value of the target

mean_value = np.mean(train_y['price'])

mean_value

In [None]:
# Predict all values as the mean

baseline_pred = np.repeat(mean_value, len(test_y))

baseline_pred

In [None]:
baseline_mse = mean_squared_error(test_y, baseline_pred)

baseline_rmse = np.sqrt(baseline_mse)

print('Baseline RMSE: {}' .format(baseline_rmse))

In [None]:
train_y['price']

# Train the model

Do NOT train a DecisionTreeRegressor() without any parameters. It OVERFITS. 

In [None]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(min_samples_leaf = 10) # remove parameter to have un-restricted learning

tree_reg.fit(train_x, train_y)

In [None]:
#Train RMSE
train_pred = tree_reg.predict(train_x)

train_mse = mean_squared_error(train_y, train_pred)

train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}' .format(train_rmse))

In [None]:
#Test RMSE
test_pred = tree_reg.predict(test_x)

test_mse = mean_squared_error(test_y, test_pred)

test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}' .format(test_rmse))

## More regularization

In [None]:
#Let's restrict the depth as well

tree_reg2 = DecisionTreeRegressor(min_samples_leaf = 10, max_depth=5) # additional restriction on learning parameters

tree_reg2.fit(train_x, train_y)

In [None]:
#Train RMSE
train_pred = tree_reg2.predict(train_x)

train_mse = mean_squared_error(train_y, train_pred)

train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}' .format(train_rmse))

In [None]:
#Test RMSE
test_pred = tree_reg2.predict(test_x)

test_mse = mean_squared_error(test_y, test_pred)

test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}' .format(test_rmse))

In [None]:
np.round(tree_reg2.feature_importances_,2) # what columns are important? only a few...

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

plt.figure(figsize=(125,50))

tree = plot_tree(tree_reg2,
                 #feature_names=train_inputs.columns.value, # our feature names are stripped form the data set
                 #class_names=np.unique(train_y),
                 filled=True,
                 rounded=True,
                 fontsize=14)

## Avoid Overfitting

In [None]:
train_error = []
test_error = []

for x in range(1,31):
    tree_reg3 = DecisionTreeRegressor(max_depth=x)
    tree_reg3.fit(train_x, train_y)
    reg_train_predictions = tree_reg3.predict(train_x)
    reg_test_predictions = tree_reg3.predict(test_x)
    train_rmse = round(np.sqrt(mean_squared_error (train_y, reg_train_predictions)),4)
    test_rmse = round(np.sqrt(mean_squared_error (test_y, reg_test_predictions)),4)
    print('# Max depth = {}'.format(x) + "     " +'Train RMSE = {}'.format(train_rmse) + "   "
         'Test RMSE = {}'.format(test_rmse))
    
    train_error.append(train_rmse)
    test_error.append(test_rmse)


In [None]:
plt.plot(train_error, label='Train')
plt.plot(test_error, label='Test')
plt.xlabel("max Depth")
plt.ylabel("Error")
plt.legend()

# Randomized Grid Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = [
    {'min_samples_leaf': np.arange(10, 30), 
     'max_depth': np.arange(10,30)}
  ]

tree_reg = DecisionTreeRegressor()

grid_search = RandomizedSearchCV(tree_reg, param_grid, cv=5, n_iter=10,
                           scoring='neg_mean_squared_error', verbose=1,
                           return_train_score=True)

grid_search.fit(train_x, train_y)

In [None]:
cvres = grid_search.cv_results_

for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_estimator_

In [None]:
#Train RMSE
train_pred = grid_search.best_estimator_.predict(train_x)

train_mse = mean_squared_error(train_y, train_pred)

train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}' .format(train_rmse))

In [None]:
#Test RMSE
test_pred = grid_search.best_estimator_.predict(test_x)

test_mse = mean_squared_error(test_y, test_pred)

test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}' .format(test_rmse))