In [178]:
# Importing all libraries needed
import pandas as pd
import eli5
from eli5.sklearn import PermutationImportance
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [175]:
# Import the housing data
housing = pd.read_csv('melb_data.csv')

# Split the target (Price) from the features. A non-mutating drop keeps the
# loaded frame intact, so the cell stays safe to reason about on re-runs.
y = housing['Price']
X = housing.drop(columns=['Price'])

# Check the size of the feature frame. Only the last expression of a cell is
# displayed, so the shape has to be printed explicitly to be seen.
print(f'Feature frame shape: {X.shape}')

# Check the target for null values (shown as the cell output)
y.isnull().sum()

np.int64(0)

In [181]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
split_options = {'test_size': 0.2, 'random_state': 1}
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

# Gradient-boosted regressor: 100 estimators with a learning rate of 0.1
model = XGBRegressor(
    learning_rate=0.1,
    n_estimators=100,
)


In [161]:
# PREPROCESSING
# Check the datatype for all the columns. A bare expression that is not the
# last statement of a cell is never displayed, so print it explicitly.
print(X.dtypes)

# Separate the numerical and object columns
num_cols = [col for col in X_train.columns if X_train[col].dtype in ['int64', 'float64']]

# Keep only object columns with fewer than 10 distinct values; these are the
# columns that get one-hot encoded, so high-cardinality ones are left out.
obj_cols = [col for col in X_train.columns
            if X_train[col].dtype in ['object'] and X_train[col].nunique() < 10]
print(f'Numerical Columns: {num_cols}, \nCategorical Columns: {obj_cols}')

Numerical Columns: ['Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount'], 
Categorical Columns: ['Type', 'Method', 'Regionname']


In [162]:
# PREPROCESSING
# Using column transformer, we impute our columns
preprocessor = ColumnTransformer(transformers=[('num', SimpleImputer(strategy='mean'), num_cols),
                                               ('obj', OneHotEncoder(handle_unknown='ignore'), obj_cols)])

##### This is the manual way to preprocess the data. The automated way, used in the cells below, is a pipeline. (Edit this cell to view the code.)
<!-- # Fit the preprocessor on the training data only, then apply the fitted transform to the validation data. This way we can avoid data leakage
X_train_transformed = preprocessor.fit_transform(X_train)
X_valid_transformed = preprocessor.transform(X_valid)

# This is to check our transformed data to see if there are any null values
# It is not necessary at all
print(X_train_transformed.shape) 

X_train_transformed_df = pd.DataFrame(
    X_train_transformed,
    columns=preprocessor.get_feature_names_out())

# Check for missing values
print(X_train_transformed_df.isnull().sum()) -->

In [187]:
# Bundle preprocessing and the regressor into a single estimator
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
my_pipeline = Pipeline(steps=pipeline_steps)

# Train the whole pipeline on the training split
my_pipeline.fit(X_train, y_train)

In [188]:
# PERMUTATION IMPORTANCE - To check for important features
# Pull the two pipeline steps out once instead of looking them up repeatedly
fitted_preprocessor = my_pipeline.named_steps['preprocessor']
fitted_model = my_pipeline.named_steps['model']

# The model step consumes preprocessed features, so transform the validation data first
X_valid_transformed = fitted_preprocessor.transform(X_valid)

# Fit the permutation importance on the transformed validation data
perm = PermutationImportance(fitted_model, random_state=1)
perm.fit(X_valid_transformed, y_valid)

# Show weights with ELI5, labelled with the preprocessor's output feature names
transformed_feature_names = fitted_preprocessor.get_feature_names_out()
eli5.show_weights(perm, feature_names=transformed_feature_names)

Weight,Feature
0.3245  ± 0.0189,num__Distance
0.2589  ± 0.0305,obj__Regionname_Southern Metropolitan
0.1321  ± 0.0070,num__Landsize
0.1109  ± 0.0071,num__Rooms
0.1021  ± 0.0138,num__Longtitude
0.0625  ± 0.0080,num__Lattitude
0.0609  ± 0.0085,obj__Type_h
0.0594  ± 0.0052,num__Bathroom
0.0487  ± 0.0146,num__BuildingArea
0.0241  ± 0.0037,num__Postcode


In [None]:
# Evaluate the model using validation data
preds = my_pipeline.predict(X_valid)

# Error metric (lower is better). scikit-learn metrics take (y_true, y_pred)
# in that order; MAE gives the same value either way, but the correct order
# keeps this line right if the metric is later swapped for an asymmetric one.
print('Mean Absolute Error', mean_absolute_error(y_valid, preds))