In [241]:
import pandas as pd
import numpy as np

In [242]:
housing_data = pd.read_csv('./melb_data.csv')

In [243]:
housing_data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [244]:
len(housing_data.columns)

21

In [245]:
cols_with_missing_values = [col for col in housing_data.columns if housing_data[col].hasnans]
cols_with_missing_values

['Car', 'BuildingArea', 'YearBuilt', 'CouncilArea']

In [246]:
housing_data.drop(columns=cols_with_missing_values, inplace=True)

In [247]:
len(housing_data.columns)

17

In [248]:
categorical_columns = [col for col in housing_data.columns if housing_data[col].dtype == "object" and housing_data[col].nunique() < 10]

categorical_columns

## Gives us the columns categorical variables

['Type', 'Method', 'Regionname']

In [249]:
numerical_types = ['int64', 'float64']

numerical_columns = [col for col in housing_data.columns if housing_data[col].dtype in numerical_types]

numerical_columns

['Rooms',
 'Price',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Landsize',
 'Lattitude',
 'Longtitude',
 'Propertycount']

In [250]:
merged_columns = categorical_columns + numerical_columns

formatted_housing_data = housing_data[merged_columns]

formatted_housing_data.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
0,h,S,Northern Metropolitan,2,1480000.0,2.5,3067.0,2.0,1.0,202.0,-37.7996,144.9984,4019.0
1,h,S,Northern Metropolitan,2,1035000.0,2.5,3067.0,2.0,1.0,156.0,-37.8079,144.9934,4019.0
2,h,SP,Northern Metropolitan,3,1465000.0,2.5,3067.0,3.0,2.0,134.0,-37.8093,144.9944,4019.0
3,h,PI,Northern Metropolitan,3,850000.0,2.5,3067.0,3.0,2.0,94.0,-37.7969,144.9969,4019.0
4,h,VB,Northern Metropolitan,4,1600000.0,2.5,3067.0,3.0,1.0,120.0,-37.8072,144.9941,4019.0


In [251]:
y = formatted_housing_data['Price']

formatted_housing_data.drop(columns=['Price'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  formatted_housing_data.drop(columns=['Price'], inplace=True)


In [252]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(formatted_housing_data, y)

In [253]:
s = (X_train.dtypes == "object")

object_cols = list(s[s].index)

object_cols

['Type', 'Method', 'Regionname']

In [254]:
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

def score_dataset(x_train, y_train, x_test, y_test):
    model = RandomForestRegressor(random_state=0)
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    return mean_absolute_error(predictions, y_test)

### Approach 1, we drop the categorical variable column

In [255]:
housing_data_categorical_drop = X_train.copy()

print(X_train.shape)
housing_data_categorical_drop.shape

(10185, 12)


(10185, 12)

In [256]:
print(housing_data.head())

housing_data_categorical_drop.drop(columns=categorical_columns, inplace=True)

       Suburb           Address  Rooms Type      Price Method SellerG  \
0  Abbotsford      85 Turner St      2    h  1480000.0      S  Biggin   
1  Abbotsford   25 Bloomburg St      2    h  1035000.0      S  Biggin   
2  Abbotsford      5 Charles St      3    h  1465000.0     SP  Biggin   
3  Abbotsford  40 Federation La      3    h   850000.0     PI  Biggin   
4  Abbotsford       55a Park St      4    h  1600000.0     VB  Nelson   

        Date  Distance  Postcode  Bedroom2  Bathroom  Landsize  Lattitude  \
0  3/12/2016       2.5    3067.0       2.0       1.0     202.0   -37.7996   
1  4/02/2016       2.5    3067.0       2.0       1.0     156.0   -37.8079   
2  4/03/2017       2.5    3067.0       3.0       2.0     134.0   -37.8093   
3  4/03/2017       2.5    3067.0       3.0       2.0      94.0   -37.7969   
4  4/06/2016       2.5    3067.0       3.0       1.0     120.0   -37.8072   

   Longtitude             Regionname  Propertycount  
0    144.9984  Northern Metropolitan        

In [257]:
cdrop_train_x, cdrop_val_x, cdrop_train_y, cdrop_val_y = train_test_split(housing_data_categorical_drop, y_train)

In [258]:
print(score_dataset(cdrop_train_x, cdrop_train_y, cdrop_val_x, cdrop_val_y))

177640.08411483164


Approach 2: Ordinal encoding

In [261]:
from sklearn.preprocessing import OrdinalEncoder

label_X_train = X_train.copy()
label_X_validation = X_test.copy()

ordinal_encoder = OrdinalEncoder()

label_X_train[categorical_columns] = ordinal_encoder.fit_transform(label_X_train[categorical_columns])
label_X_validation[categorical_columns] = ordinal_encoder.transform(label_X_validation[categorical_columns])

print(score_dataset(label_X_train, y_train, label_X_validation, y_test))

170936.25925029803


Approach 3: One-Hot Encoding

In [260]:
from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

train_ohe = one_hot_encoder.fit_transform(X_train[categorical_columns])
validation_ohe = one_hot_encoder.transform(X_test[categorical_columns])


train_ohe_df = pd.DataFrame(train_ohe)
validation_ohe_df = pd.DataFrame(validation_ohe)

train_ohe_df.index = X_train.index
validation_ohe_df.index = X_test.index

num_X_train = X_train.drop(columns=categorical_columns)
num_X_validation = X_test.drop(columns=categorical_columns)

OHE_train = pd.concat([num_X_train, train_ohe_df], axis=1)
OHE_validation = pd.concat([num_X_validation, validation_ohe_df], axis=1)

score_dataset(OHE_train, y_train, OHE_validation, y_test)




170652.522331019