In [1]:
import numpy as np
import pandas as pd
import torch
import xgboost as xgb
import joblib
import json
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Load the data
with open('property_data.json', 'r') as file:
    data = json.load(file)

# Convert to DataFrame
df = pd.DataFrame(data)

# Display the first few rows
df.head()

# Remove rows where landsize is greater than 10,000
df = df[df['landsize'] <= 10000]

# Create static identifier so we can matchup results later
df['identifier'] = range(1, len(df) + 1)

# Create copy of the of our unsold houses with identifier saved
df_target = df[df['is_sold'] == False].copy() 

# Perform one-hot encoding on the updated house_type column
df = pd.get_dummies(df, columns=['house_type'], prefix='house_type')

# Remove rows where local_community_population is zero (aka no ABS data)
df = df[df['local_community_population'] != 0]

# Create a ratio for crime against people vs. property
df['crime_ratio'] = np.where((df['apn_crime'] != 0) & (df['ap_crime'] != 0), df['ap_crime'] / df['apn_crime'], 0)

# Summarize total crime
df['crime_per_capita'] = (df['ap_crime'] + df['apn_crime']) / df['people']

# Create affordability ratio for mortages
df['affordability_ratio'] = df['median_monthly_mortgage_repayment'] / df['median_weekly_household_income']

# Convert local_dining and local_shop to per capita values
df['dining_per_capita'] = df['local_dining'] / df['local_community_population']
df['shop_per_capita'] = df['local_shop'] / df['local_community_population']


# Interaction Features
df['landsize_population_interaction'] = df['landsize'] * df['local_community_population']
df['rooms_bathrooms_interaction'] = df['bedrooms'] * df['bathrooms']

# Polynomial Features
df['landsize_squared'] = df['landsize'] ** 2
df['distance_to_perth_cbd_squared'] = df['distance_to_perth_cbd'] ** 2

# Distance Ratios
df['distance_airport_cbd_ratio'] = df['distance_to_perth_airport'] / (df['distance_to_perth_cbd'] + 1)
df['distance_fuel_station_ratio'] = df['nearest_fuel_station'] / (df['distance_to_perth_cbd'] + 1)

# Crime Rates per Population
df['crime_rate'] = df['ap_crime'] / (df['local_community_population'] + 1)

# Density Features
df['population_density'] = df['local_community_population'] / (df['landsize'] + 1)
df['housing_density'] = df['local_community_dwellings'] / (df['landsize'] + 1)

# Ratio of median monthly mortgage repayment to median weekly household income
df['mortgage_income_ratio'] = df['median_monthly_mortgage_repayment'] / (df['median_weekly_household_income'] + 1)

# Ratio of median weekly rent to median weekly household income
df['rent_income_ratio'] = df['median_weekly_rent'] / (df['median_weekly_household_income'] + 1)

# Average number of people per house
df['people_per_house'] = df['people'] / (df['houses'] + 1)

# Apply logarithmic transformation to columns starting with 'nearest_' and 'distance_to_perth_cbd'
nearest_columns = [col for col in df.columns if col.startswith('nearest_')]
for col in nearest_columns + ['distance_to_perth_cbd']:
    df[f'log_{col}'] = np.log(df[col] + 1)
    df = df.drop(columns=[col])

# Rename columns starting with "log_" to remove the prefix
df.columns = [col[4:] if col.startswith('log_') else col for col in df.columns]

# Drop local data that is not well collected
columns_to_drop = ['local_waste_facility', 'local_sports_facility', 'local_leisure_facility',
                   'local_public_art', 'local_swimming_pool', 'local_garden', 'local_social_facility']
df = df.drop(columns=columns_to_drop)

# Drop redundant suburb level data we already collected
df = df.drop(columns=['people', 'houses'])

# Drop families/child_per_household (captured by local community values)
df = df.drop(columns=['families', 'child_per_household'])

# Drop median rent (captured by mortgage repayments)
df = df.drop(columns=['median_weekly_rent'])

# Drop offence data that we have already aggregated (apn/ap)
offense_columns = [col for col in df.columns if col.startswith('offences')]
df = df.drop(columns=offense_columns)

# Drop 'local_community_dwellings'
df = df.drop(columns=['local_community_dwellings'])

# Drop 'median_weekly_household_income'
df = df.drop(columns=['median_weekly_household_income'])

# Drop 'distance_to_perth_airport' (captured by CBD distance)
df = df.drop(columns=['distance_to_perth_airport'])

# Drop text-based fields except 'house_type'
text_fields = ['address', 'image_url', 'details_url', 'suburb']
df = df.drop(columns=text_fields)

# Remove outliers for bedrooms, bathrooms, and parking
df = df[(df['bedrooms'] <= 5) & (df['bathrooms'] <= 4) & (df['parking'] <= 5)]

# Suppress outliers using winsorization
columns_to_winsorize = ['apn_crime_inc', 'ap_crime_inc', 'crime_ratio', 'crime_per_capita', 'affordability_ratio']
for col in columns_to_winsorize:
    lower_percentile = df[col].quantile(0.05)
    upper_percentile = df[col].quantile(0.95)
    df[col] = df[col].clip(lower=lower_percentile, upper=upper_percentile)

# Convert any column with 'house_type' prefix to int
house_type_columns = [col for col in df.columns if col.startswith('house_type')]
df[house_type_columns] = df[house_type_columns].astype(int)

# Log price
df['price'] = np.log1p(df['price'])


In [2]:

# We only want to train on is_sold == True since we are predicting pre-sales
df_sold = df[df['is_sold'] == True]

# Split the data into features (X) and target variable (y), excluding 'identifier'
X = df_sold.drop(['price', 'is_sold', 'identifier'], axis=1)
y = df_sold['price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.06, random_state=42)

# Approach 1: XGBoost
print("Approach 1: XGBoost")

dtrain = xgb.DMatrix(X_train, label=y_train, nthread=-1)
dtest = xgb.DMatrix(X_test, label=y_test, nthread=-1)

xgb_params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'eval_metric': 'rmse',
    'device': 'cuda',
    'learning_rate': 0.1,
    'max_depth': 9,
    'alpha': 0.1,
}

xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=1000)
xgb_test_predictions = xgb_model.predict(dtest)
xgb_mae = mean_absolute_error(y_test, xgb_test_predictions)
xgb_rmse = mean_squared_error(y_test, xgb_test_predictions, squared=False)
print(f"XGBoost - MAE: {xgb_mae:.4f}, RMSE: {xgb_rmse:.4f}")

# Save the XGBoost model in JSON formatwwwww
xgb_model.save_model('xgb_model.json')

Approach 1: XGBoost




XGBoost - MAE: 0.1484, RMSE: 0.1988


In [3]:
# Calculate residuals
residuals = y_test - xgb_test_predictions

# Fit a polynomial regression model to the residuals
degree = 4  # You can adjust the degree for more flexibility
poly_model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
poly_model.fit(y_test.values.reshape(-1, 1), residuals)

# Predicted residuals
predicted_residuals = poly_model.predict(y_test.values.reshape(-1, 1))

# Adjust the initial predictions
adjusted_predictions = xgb_test_predictions + predicted_residuals

# Evaluate the adjusted predictions
adjusted_mae = mean_absolute_error(y_test, adjusted_predictions)
adjusted_rmse = mean_squared_error(y_test, adjusted_predictions, squared=False)
print(f"Adjusted XGBoost - MAE: {adjusted_mae:.4f}, RMSE: {adjusted_rmse:.4f}")


Adjusted XGBoost - MAE: 0.1399, RMSE: 0.1833




In [4]:

# Filter out the unsold data
df_unsold = df[df['is_sold'] == False].copy()  # Create a copy to avoid the warning

# Prepare the features for prediction, excluding unnecessary columns
X_unsold = df_unsold.drop(['price', 'is_sold', 'identifier'], axis=1)

# Load the XGBoost model
xgb_model = xgb.Booster()
xgb_model.load_model('xgb_model.json')
dunsold = xgb.DMatrix(X_unsold)
xgb_predictions = xgb_model.predict(dunsold)

# Adjust the predictions using the previously fitted polynomial regression model
unsold_actual_prices = xgb_predictions.reshape(-1, 1)
unsold_predicted_residuals = poly_model.predict(unsold_actual_prices)
adjusted_xgb_predictions = xgb_predictions + unsold_predicted_residuals

# Add adjusted predictions to the unsold data
df_unsold.loc[:, 'price_prediction'] = np.expm1(adjusted_xgb_predictions)

# Merge the predictions with df_target
df_target_pred = pd.merge(df_target, df_unsold[['identifier', 'price_prediction']], on='identifier', how='left')

# Reorder columns to place price_prediction as the third column
cols = list(df_target_pred.columns)
prediction_index = cols.index('price_prediction')
if prediction_index != 2:
    cols.insert(2, cols.pop(prediction_index))
df_target_pred = df_target_pred[cols]

# Remove rows with no predicted price
rows_before = df_target_pred.shape[0]
df_target_cleaned = df_target_pred.dropna(subset=['price_prediction'])
rows_after = df_target_cleaned.shape[0]

# Print the number of rows removed
rows_removed = rows_before - rows_after
print(f"Number of rows removed: {rows_removed}")

# Convert the cleaned DataFrame to JSON format
df_target_cleaned_json = df_target_cleaned.to_dict(orient='records')

# Save the JSON to a file
with open('property_data_unsold_predictions.json', 'w') as json_file:
    json.dump(df_target_cleaned_json, json_file, indent=4)

# Display a few entries from the JSON file
df_target_cleaned_json[:5]


Number of rows removed: 302


[{'address': '5/1149 Old Coast Road, Dawesville',
  'price': 125000,
  'price_prediction': 113395.37209334796,
  'landsize': 139.0,
  'latitude': -32.6359313,
  'longitude': 115.6397769,
  'bedrooms': 1,
  'bathrooms': 1,
  'parking': 0,
  'house_type': 'Unit',
  'image_url': 'https://imagecdn.reiwa.com.au/listing/60/4802360-01.jpg?maxwidth=800&maxheight=600&quality=80',
  'details_url': '/5-1149-old-coast-road-dawesville-4802360/',
  'is_sold': False,
  'suburb': 'Dawesville',
  'local_community_population': 4428,
  'local_community_dwellings': 2006,
  'distance_to_perth_cbd': 78.92290631231367,
  'distance_to_perth_airport': 83.43510520997272,
  'nearest_fuel_station': 0.9148762234486837,
  'nearest_bus_stop': 0.13614865395729717,
  'nearest_train_station': 15.710154472128306,
  'nearest_police_station': 14.560863214828064,
  'nearest_healthcare_facility': 9.587335742581256,
  'nearest_doctor_office': 0.2033792974260994,
  'nearest_dental_office': 11.185650304550109,
  'nearest_prima