## Importing Packages

In [None]:
# %pip install missingno

In [None]:
import math

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import numpy as nyp  # legacy alias kept for backward compatibility; prefer `np`
import pandas as pd
import seaborn as sns
from pycaret.datasets import get_data
from pycaret.regression import *

In [None]:
# Global seaborn styling used by the plots below: a white-grid background and
# a fixed six-colour default palette.
custom_palette = [
    "#4C72B0",
    "#55A868",
    "#C44E52",
    "#8172B2",
    "#CCB974",
    "#64B5CD",
]
sns.set_palette(custom_palette)
sns.set_style("whitegrid")

## Exploratory Data Analysis

In [None]:
# Read the raw listings CSV into `df` and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='../../data/raw/01_Melbourne_Residential.csv')
df.head(5)

In [None]:
# Print column names, dtypes and non-null counts for the loaded frame.
df.info()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().transpose()

In [None]:
# For every object-dtype column, print its name, its distinct-value count
# (`nunique`) and up to ten of the values returned by `unique`.
object_columns = df.select_dtypes(include=['object']).columns
separator = "-" * 50
for col in object_columns:
    column_values = df[col]
    unique_vals = column_values.unique()
    print(f"Column: {col}")
    print(f"Number of Unique Values: {column_values.nunique()}")
    print(f"Unique Values: {unique_vals[:10]}")
    print(separator)

In [None]:
# Number of missing values per column, largest first.
df.isna().sum().sort_values(ascending=False)

In [None]:
# Visualise missingness with missingno: a bar chart and a matrix view of `df`,
# then render the pending figures.
msno.bar(df)
msno.matrix(df)

plt.show()

In [None]:
# Show rows that are exact duplicates of an earlier row.
df.loc[df.duplicated()]

In [None]:
# Flag Price outliers with the 1.5 * IQR rule and overlay them (in red) on the
# overall price histogram.
price_series = df['Price']
Q1 = price_series.quantile(0.25)
Q3 = price_series.quantile(0.75)
IQR = Q3 - Q1

# Fences: anything strictly outside [lower_bound, upper_bound] is an outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

is_outlier = price_series.lt(lower_bound) | price_series.gt(upper_bound)
outliers = df[is_outlier]
num_outliers = outliers.shape[0]
total_listings = df.shape[0]

plt.figure(figsize=(10, 6))
sns.histplot(
    price_series,
    bins=50,
    kde=True,
    color="#4C72B0",
    alpha=0.6,
    label="Price Distribution",
)
# Second histogram drawn on the same axes highlights the flagged rows.
sns.histplot(
    outliers['Price'],
    bins=40,
    color="red",
    alpha=0.6,
    label="Outliers",
)

plt.title(
    f"Distribution of Property Prices\nTotal Listings: {total_listings} | Outliers: {num_outliers}",
    fontsize=14,
    fontweight="bold",
)
plt.xlabel("Price (AUD)", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.legend()

plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
corr_matrix = df.corr(numeric_only=True)

plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
heatmap_ax.set_title("Correlation Matrix of Key Features")
plt.show()

In [None]:
# Display the numeric-column correlation matrix as a table.
df.corr(numeric_only = True)

In [None]:
# Box plot of sale price for each value of the `Method` column.
plt.figure(figsize=(12, 6))
# seaborn 0.13 deprecates `palette` without `hue`; assigning the x variable to
# `hue` and disabling the redundant legend keeps one viridis colour per box.
sns.boxplot(
    data=df,
    x="Method",
    y="Price",
    hue="Method",
    palette="viridis",
    legend=False,
)

plt.title("Price Distribution by Auction Method", fontsize=14, fontweight="bold")
plt.xlabel("Auction Method", fontsize=12)
plt.ylabel("Price (AUD)", fontsize=12)
plt.xticks(fontsize=10)
plt.show()

In [None]:
# Bar chart of the ten sellers with the highest mean sale price.
plt.figure(figsize=(15, 6))
avg_price_by_seller = df.groupby("Seller")["Price"].mean().sort_values(ascending=False).head(10)

# Plot from a tidy two-column frame so the x variable can also be passed as
# `hue`: seaborn 0.13 deprecates `palette` without `hue`, and `legend=False`
# suppresses the legend that would merely repeat the x tick labels.
top_sellers = avg_price_by_seller.reset_index()
sns.barplot(
    data=top_sellers,
    x="Seller",
    y="Price",
    hue="Seller",
    palette="viridis",
    legend=False,
)

plt.title("Average Price by Top 10 Real Estate Agent", fontsize=14, fontweight="bold")
plt.xlabel("Real Estate Agent", fontsize=12)
plt.ylabel("Average Price (AUD)", fontsize=12)
plt.xticks(rotation=45, ha="right", fontsize=10)
plt.show()

In [None]:
# Box plot of sale price per council area, one colour per area.
plt.figure(figsize=(15, 6))
# `legend=False` replaces the former `plt.legend().remove()`, which created a
# legend only to delete it again.
sns.boxplot(data=df, x="CouncilArea", y="Price", hue="CouncilArea", legend=False)

plt.xticks(rotation=45, ha="right", fontsize=10)
plt.title("Price Distribution by Council Area", fontsize=14, fontweight="bold")
plt.xlabel("Council Area", fontsize=12)
plt.ylabel("Price (AUD)", fontsize=12)
plt.show()

In [None]:
# Grid of violin plots (three per row) showing the Price distribution of each
# non-null Region value.
regions = sorted(df["Region"].dropna().unique())
num_regions = len(regions)

cols = 3
# Keep at least one row so the grid can still be created when no region exists.
rows = max(1, math.ceil(num_regions / cols))

# squeeze=False guarantees a 2-D array of axes regardless of the grid shape.
fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
fig.suptitle("Price Distribution Across Regions", fontsize=16, fontweight="bold")

axes = axes.flatten()

for ax, region in zip(axes, regions):
    sns.violinplot(data=df[df["Region"] == region], y="Price", ax=ax)
    ax.set_title(region, fontsize=12, fontweight="bold")
    ax.set_xlabel("")
    ax.set_ylabel("Price (AUD)", fontsize=10)
    ax.tick_params(axis='x', which='both', bottom=False)

# Remove the grid cells left over after the last region. Slicing by
# `num_regions` avoids depending on the loop variable, which is undefined when
# `regions` is empty.
for unused_ax in axes[num_regions:]:
    fig.delaxes(unused_ax)

# Reserve the top 4% of the figure for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

In [None]:
# Box plot of Price across four quantile-based bins of Propertycount.
plt.figure(figsize=(8, 5))

# Quartile bins of Propertycount, labelled from lowest to highest.
propertycount_group = pd.qcut(
    df['Propertycount'],
    q=4,
    labels=["Low", "Medium", "High", "Very High"],
)

# seaborn 0.13 deprecates `palette` without `hue`; the binned series is passed
# as both x and hue, and the redundant legend is disabled.
sns.boxplot(
    data=df,
    x=propertycount_group,
    y='Price',
    hue=propertycount_group,
    palette="viridis",
    legend=False,
)

plt.title("Price Distribution by Propertycount Group", fontsize=14, fontweight="bold")
plt.xlabel("Propertycount Group", fontsize=12)
plt.ylabel("Price (AUD)", fontsize=12)
plt.show()

In [None]:
# Scatter plot of Price against the Distance column.
plt.figure(figsize=(10, 6))
distance_ax = sns.scatterplot(data=df, x='Distance', y='Price')
distance_ax.set_title("Price vs. Distance to CBD")
distance_ax.set_xlabel("Distance (km)")
distance_ax.set_ylabel("Price (AUD)")
plt.show()

In [None]:
# Side-by-side view of property Type: listing counts (left) and Price
# distribution (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
count_ax, price_ax = axes

sns.countplot(x='Type', data=df, hue='Type', ax=count_ax)
count_ax.set_title("Count of Properties by Type")
count_ax.set_xlabel("Property Type")
count_ax.set_ylabel("Count")

sns.violinplot(x='Type', y='Price', data=df, hue='Type', ax=price_ax)
price_ax.set_title("Price Distribution by Property Type")
price_ax.set_xlabel("Property Type")
price_ax.set_ylabel("Price (AUD)")

plt.tight_layout()
plt.show()

In [None]:
# Violin plots of Price against the Bedroom2, Bathroom and Car columns.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (column, x-axis label, panel title) for each of the three panels.
count_features = [
    ("Bedroom2", "Number of Bedrooms", "Price vs. Number of Bedrooms"),
    ("Bathroom", "Number of Bathrooms", "Price vs. Number of Bathrooms"),
    ("Car", "Number of Car Spots", "Price vs. Number of Car Spots"),
]

for ax, (column, axis_label, panel_title) in zip(axes, count_features):
    # seaborn 0.13 deprecates `palette` without `hue`; the x variable is
    # passed as `hue` as well and the redundant legend is switched off.
    sns.violinplot(
        data=df,
        x=column,
        y="Price",
        hue=column,
        palette="viridis",
        legend=False,
        ax=ax,
    )
    ax.set_title(panel_title, fontsize=14, fontweight="bold")
    ax.set_xlabel(axis_label, fontsize=12)
    ax.set_ylabel("Price (AUD)", fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Scatter plots of Price against BuildingArea (left) and Landsize (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# (column, panel title, x-axis label) for each panel, left to right.
size_panels = (
    ("BuildingArea", "Price vs. Building Area", "Building Area (sqm)"),
    ("Landsize", "Price vs. Land Size", "Land Size"),
)

for panel_ax, (column, panel_title, axis_label) in zip(axes, size_panels):
    sns.scatterplot(data=df, x=column, y="Price", alpha=0.5, color="#4C72B0", ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=14, fontweight="bold")
    panel_ax.set_xlabel(axis_label, fontsize=12)
    panel_ax.set_ylabel("Price (AUD)", fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Parse the sale date (day/month/year text) and plot the mean Price per sale
# year next to the mean Price per YearBuilt.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

sale_year = df['Date'].dt.year
# After reset_index the grouping key becomes a regular column named 'Date'.
avg_price_yearly = df.groupby(sale_year)['Price'].mean().reset_index()
avg_price_built = df.groupby('YearBuilt')['Price'].mean().reset_index()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
fig.suptitle("Price Trends: Sale Year vs. Year Built", fontsize=16, fontweight="bold")

# (data, x column, line colour, panel title, x-axis label) per panel.
trend_panels = (
    (avg_price_yearly, 'Date', "#4C72B0", "Average Price Over Sale Years", "Year Sold"),
    (avg_price_built, 'YearBuilt', "#DD8452", "Average Price by Year Built", "Year Built"),
)

for panel_ax, (trend, x_column, line_color, panel_title, axis_label) in zip(axes, trend_panels):
    sns.lineplot(data=trend, x=x_column, y='Price', ax=panel_ax, color=line_color)
    panel_ax.set_title(panel_title, fontsize=12, fontweight="bold")
    panel_ax.set_xlabel(axis_label, fontsize=10)
    panel_ax.set_ylabel("Average Price (AUD)", fontsize=10)

# Leave the top 5% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

In [None]:
# Map-like scatter of the listings by coordinates, coloured by Price.
# The column names 'Longtitude' and 'Lattitude' are spelled as in the dataset.
plt.figure(figsize=(10, 8))
geo_ax = sns.scatterplot(
    data=df,
    x='Longtitude',
    y='Lattitude',
    hue='Price',
    palette='cool',
    alpha=0.6,
)
geo_ax.set_title("Geographical Distribution of Properties")
geo_ax.set_xlabel("Longitude")
geo_ax.set_ylabel("Latitude")
geo_ax.legend(title="Price", bbox_to_anchor=(1, 1))
plt.show()

## Data Preparation

In [None]:
# Columns passed to PyCaret as numeric features.
numerical_columns = [
    "Distance",
    "Bedroom2",
    "Bathroom",
    "Car",
    "Landsize",
    "YearBuilt",
    "Lattitude",
    "Longtitude",
]

# Ordinal features mapped to their category order (lowest to highest).
# NOTE(review): the Type and Method orderings are hand-written and the original
# author marked Type as an "example order" - confirm they are meaningful.
ordinal_columns = dict(
    # Rooms: the distinct values found in the data, in ascending order.
    Rooms=sorted(df['Rooms'].unique()),
    Type=['t', 'h', 'u'],
    Method=['PI', 'S', 'SP', 'VB', 'SA', 'SS', 'W', 'NB', 'PN', 'SN', 'N/A'],
)

In [None]:
# Configure the PyCaret regression experiment with Price as the target.
ignore_feat = ["Propertycount", "Postcode", "BuildingArea", "Address"]

reg1 = setup(
    # --- data and target -------------------------------------------------
    data=df,
    target="Price",
    ignore_features=ignore_feat,
    # --- feature typing --------------------------------------------------
    # NOTE(review): "Type" and "Method" are listed both here as categorical
    # and in `ordinal_columns` - confirm which encoding is intended.
    numeric_features=numerical_columns,
    ordinal_features=ordinal_columns,
    categorical_features=["Suburb", "Type", "Method", "Seller", "CouncilArea", "Region"],
    create_date_columns=["day", "month", "year"],
    # --- missing values --------------------------------------------------
    # NOTE(review): PyCaret's documentation describes `numeric_imputation` and
    # `categorical_imputation` as ignored when imputation_type is "iterative"
    # - confirm which strategy is wanted.
    imputation_type="iterative",
    numeric_imputation="knn",
    categorical_imputation="mode",
    # --- feature engineering and selection --------------------------------
    polynomial_features=True,
    polynomial_degree=2,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.55,  # stricter than the 0.9 default
    normalize=True,
    feature_selection=True,
    n_features_to_select=0.8,
    # --- experiment bookkeeping --------------------------------------------
    fold=10,
    session_id=123,
    log_experiment=True,
    experiment_name='house_pricing',
)

In [None]:
# Display the experiment's `dataset_transformed` attribute (the data as it
# looks after the preprocessing configured in setup).
reg1.dataset_transformed

In [None]:
# Print which columns the experiment assigned to each feature group.
# `_fxs` is a private attribute of the experiment object and may change
# between PyCaret versions.
feature_groups = reg1._fxs
print(f'Categorical features: {feature_groups["Categorical"]}')
print(f'Ordinal features: {feature_groups["Ordinal"]}')
print(f'Numeric features: {feature_groups["Numeric"]}')

## Modelling

In [None]:
# Run the experiment's model comparison and bind its result to `best`.
best = reg1.compare_models()

In [None]:
# Build the model registered under the id 'et'. This rebinds `best`, replacing
# whatever value an earlier cell assigned to it.
best = create_model('et')

## Hyperparameter Tuning

In [None]:
# Show the current hyperparameters of `best` before tuning.
best.get_params()

In [None]:
# Tune `best` with Optuna for 50 iterations, optimising RMSLE.
# With return_tuner=True, tune_model returns a (model, tuner) pair, so the
# result is unpacked; previously `tuned_model` was bound to the whole tuple.
# choose_better=True asks PyCaret to return the original model if tuning does
# not improve the optimised metric.
tuned_model, tuner = tune_model(
    best,
    n_iter=50,
    return_tuner=True,
    search_library='optuna',
    choose_better=True,
    optimize='RMSLE',
)

## Model Evaluation

In [None]:
# Generate predictions with `best`; no data is passed explicitly, so
# PyCaret's default data is used.
predict_model(best)

In [None]:
# Open PyCaret's evaluation view for `best`.
evaluate_model(best)

In [None]:
# Residuals plot for `best`; the extra options are forwarded through
# `plot_kwargs`.
residuals_options = {'observed': True, 'color': 'blue'}
plot_model(best, plot='residuals', plot_kwargs=residuals_options)

In [None]:
# Prediction-error plot for `best`, with a custom title and transparency
# forwarded through `plot_kwargs`.
error_options = {'alpha': 0.5, 'title': 'Actual vs Predicted'}
plot_model(best, plot='error', plot_kwargs=error_options)

In [None]:
# Cook's distance plot for `best`, with the threshold line requested and a red
# dashed line format forwarded through `plot_kwargs`.
cooks_options = {'draw_threshold': True, 'linefmt': 'r--'}
plot_model(best, plot='cooks', plot_kwargs=cooks_options)

In [None]:
# Feature-importance plot for `best`.
# NOTE(review): confirm that 'top_n' and 'figsize' are honoured by this plot
# type when passed through `plot_kwargs`.
feature_options = {'top_n': 15, 'figsize': (10,6)}
plot_model(best, plot='feature', plot_kwargs=feature_options)

In [None]:
# Learning curve for `best`, evaluated at ten evenly spaced training-set
# fractions from 10% to 100%. Uses the conventional `np` alias for numpy.
train_sizes = np.linspace(0.1, 1.0, 10)
plot_model(best, plot='learning', plot_kwargs={'train_sizes': train_sizes})

In [None]:
# evaluate_model can be used to browse further diagnostic charts for the model.
evaluate_model(best)

In [None]:
# Model explainability: this plot helps show how the model arrives at its
# predictions. The model trained in this notebook is `best`; the name
# `xgboost` used previously was never defined here and raised a NameError.
interpret_model(best)

## Saving Model

In [None]:
# Finalise `best` and persist the resulting pipeline to disk under the name
# 'house_pricing_pipeline'. The object saved is `final_model`, the value bound
# on the line above (the former `final_best` was undefined).
final_model = finalize_model(best)
save_model(final_model, 'house_pricing_pipeline')

In [None]:
# Reload the saved pipeline to confirm it round-trips correctly. The name must
# match the one given to save_model ('house_pricing_pipeline').
load_model_pipline = load_model('house_pricing_pipeline')
load_model_pipline