In [1]:
import pandas as pd
import numpy as npm
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the Melbourne housing CSV into a DataFrame and preview its first rows.
# NOTE(review): the path below is absolute and machine-specific; adjust it
# before running this notebook on another machine.
melb_csv_path = "E:/Projects/Learning/ML/Melbourne Housing Data/melb_data.csv"
df = pd.read_csv(melb_csv_path)
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [3]:
# Separate the prediction target (the `Price` column) from the candidate features.
y = df["Price"]
X = df.drop(columns="Price")

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=1
)

# Drop every column that has a missing value in EITHER split. Inspecting only
# the training rows would let a column whose NaNs all land in the validation
# rows slip through and reach the model un-imputed.
missing_values_col = [
    col for col in X_train_full.columns
    if X_train_full[col].isna().any() or X_valid_full[col].isna().any()
]
X_train_full = X_train_full.drop(columns=missing_values_col)
X_valid_full = X_valid_full.drop(columns=missing_values_col)

In [4]:
# "Cardinality" means the number of unique values in a column
# Two groups of features are kept, both decided from the training split:
#   * object-dtype columns with fewer than 10 distinct values, and
#   * columns stored as int64 or float64.
low_cardinality_cols = []
numerical_cols = []
for col in X_train_full.columns:
    column = X_train_full[col]
    if column.nunique() < 10 and column.dtype == 'object':
        low_cardinality_cols.append(col)
    if column.dtype in ['int64', 'float64']:
        numerical_cols.append(col)

# Categorical columns are listed first, numeric columns after them.
my_cols = low_cardinality_cols + numerical_cols

# Take independent copies restricted to the selected columns.
X_train = X_train_full.loc[:, my_cols].copy()
X_valid = X_valid_full.loc[:, my_cols].copy()

In [5]:
# Boolean mask over the columns of X_train: True where the dtype is object.
ob_col = X_train.dtypes.eq('object')
# Names of the object-dtype columns, in their original column order.
object_cols = ob_col.index[ob_col].tolist()
print("Categorical Columns:", object_cols)

Categorical Columns: ['Type', 'Method', 'Regionname']


In [6]:
def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=10, random_state=1):
    """Fit a random forest on the training data and return its validation MAE.

    Parameters
    ----------
    X_train, X_valid
        Feature tables used for fitting and for prediction, respectively.
    y_train, y_valid
        Targets matching ``X_train`` and ``X_valid``.
    n_estimators : int, default 10
        Number of trees passed to ``RandomForestRegressor`` (previously hard-coded).
    random_state : int, default 1
        Seed passed to ``RandomForestRegressor`` (previously hard-coded).

    Returns
    -------
    The mean absolute error between ``y_valid`` and the predictions for ``X_valid``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
    # scikit-learn documents the argument order as (y_true, y_pred). MAE is
    # symmetric, so the value is the same as before; the order now matches the API.
    return mean_absolute_error(y_valid, predictions)

In [7]:
# Baseline approach: keep only the non-object columns, i.e. discard the
# categorical features entirely, and score the model on what is left.
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

print("MAE after dropping Categorical Variables")
drop_mae = score_dataset(drop_X_train, drop_X_valid, y_train, y_valid)
print(f"Score: {drop_mae}")

MAE after dropping Categorical Variables
Score: 187082.57548478153


### Ordinal Encoding

In [8]:
from sklearn.preprocessing import OrdinalEncoder

# Encode on copies so X_train / X_valid stay untouched for the other approaches.
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# The categories are learned from the training split only. A category that
# appears only in the validation split is encoded as -1 instead of making
# transform() raise; this mirrors the one-hot cell, which also tolerates
# unknown categories via handle_unknown='ignore'.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

print("MAE after Ordinal Encoding")
print(f"Score: {score_dataset(label_X_train, label_X_valid, y_train, y_valid)}")

MAE after Ordinal Encoding
Score: 177537.4123588611


### One Hot Encoding

In [9]:
# Print the number of distinct values in each categorical column of the
# training split, one count per line, in the order of object_cols.
for col_name in object_cols:
    print(X_train[col_name].nunique())

3
5
8


In [10]:
from sklearn.preprocessing import OneHotEncoder

# sparse_output=False requests a dense array that can be wrapped in a DataFrame;
# handle_unknown='ignore' tells the encoder to ignore categories it did not see
# during fit.
OH_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit on the training split only, then apply the same encoding to validation.
# Passing the source frame's index keeps the row labels aligned for the concat
# below. For this data the result has 16 new columns: one per distinct value
# across the categorical columns (3 + 5 + 8).
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[object_cols]), index=X_train.index
)
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(X_valid[object_cols]), index=X_valid.index
)

# Remove the original categorical columns; the one-hot columns take their place.
num_X_train = X_train.drop(columns=object_cols)
num_X_valid = X_valid.drop(columns=object_cols)

# The one-hot frames carry integer column labels. Our column names apparently
# need to be strings for the model, so map each label n to "col_<n>".
OH_cols_str_dict = {label: "col_" + str(label) for label in OH_cols_train.columns}

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1).rename(columns=OH_cols_str_dict)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1).rename(columns=OH_cols_str_dict)

print("MAE after One-Hot Encoding")
print(f"Score: {score_dataset(OH_X_train, OH_X_valid, y_train, y_valid)}")

MAE after One-Hot Encoding
Score: 174645.73129777683
