In [None]:
import pandas as pd

In [None]:
X = pd.read_csv('/Users/felix/ml/KaggleML/melb_data.csv')

In [None]:
X.shape

In [None]:
X.isnull().sum()

In [None]:
# Let's remove rows with missing data and separate the target from predictors

X.dropna(axis=0, subset=['Price'], inplace=True)
y = X.Price
X.drop(['Price'], axis=1, inplace=True)

In [None]:
# Let's drop columns with missing values

cols_with_nulls = [col for col in X.columns if X[col].isnull().any()]
X.drop(cols_with_nulls, axis=1, inplace=True)

In [None]:
# Split the data up

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing approaches
def score_dataset(X_train, X_val, y_train, y_val):
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return mean_absolute_error(y_val, preds)

In [None]:
# Approach 1: Dropping categorical variables

In [None]:
X_train_dropped = X_train.select_dtypes(exclude=['object'])
X_val_dropped = X_val.select_dtypes(exclude=['object'])

In [60]:
print(f"MAE for dropping categorical variables is: \t {score_dataset(X_train_dropped, X_val_dropped, y_train, y_val)}")

MAE for dropping categorical variables is: 	 175730.74184705777


In [61]:
# Huge error of 175730.74184705777
# Clearly not a great approach

In [62]:
cat_var = X_train.dtypes=='object'
object_cols = list(cat_var[cat_var].index)

In [63]:
# Approach 3: OHE

from sklearn.preprocessing import OneHotEncoder

OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.fit_transform(X_val[object_cols]))

# Reset the index removed by OHE
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_val.index

# Remove categorical columns to be replaced by OHE
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_val.drop(object_cols, axis=1)

# Add OHE columns for training
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)


In [64]:
print(f"MAE for OHE is: \t {score_dataset(OH_X_train, OH_X_valid, y_train, y_val)}")

ValueError: Number of features of the model must match the input. Model n_features is 11384 and input n_features is 3223 