In [1]:
# Toggles
USE_FEATURE_ENGINEERING = True
USE_BOXCOX_TRANSFORMATION = True
USE_SCALER = True
USE_WEIGHTED_RMSE = True
USE_SYNTHETIC_SAMPLING = True
USE_POLYNOMIAL_BIAS_CORRECTION = False



In [3]:
import numpy as np
import pandas as pd
import xgboost as xgb
import json
from tqdm.notebook import tqdm
from scipy.stats import boxcox
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from scipy.special import inv_boxcox
# Load the data
with open('property_data.json', 'r') as file:
    data = json.load(file)

# Convert to DataFrame
df = pd.DataFrame(data)

# Remove duplicate addresses
df = df.drop_duplicates(subset='address', keep='first')

# Remove rows where landsize is greater than 10,000
df = df[df['landsize'] <= 10000]

# Create static identifier so we can match up results later
df['identifier'] = range(1, len(df) + 1)

# Create a copy of the unsold houses with identifier saved
df_target = df[df['is_sold'] == False].copy() 

# Convert boolean pets_allowed to integer
df['pets_allowed'] = df['pets_allowed'].astype(int)

# One-hot encode categorical fields
df = pd.get_dummies(df, columns=['agency_no', 'suburb', 'house_type'])

# Remove rows where local_community_population is zero (aka no ABS data)
df = df[df['local_community_population'] != 0]

# Drop text-based fields that are not encoded
text_fields = ['address', 'image_url', 'details_url', 'agency_name']
df = df.drop(columns=text_fields)

#################################################

if USE_FEATURE_ENGINEERING:
    # Create a ratio for crime against people vs. property
    df['crime_ratio'] = np.where((df['apn_crime'] != 0) & (df['ap_crime'] != 0), df['ap_crime'] / df['apn_crime'], 0)

    # Summarize total crime
    df['crime_per_capita'] = (df['ap_crime'] + df['apn_crime']) / df['people']

    # Create affordability ratio for mortgages
    df['affordability_ratio'] = df['median_monthly_mortgage_repayment'] / df['median_weekly_household_income']

    # Convert local_dining and local_shop to per capita values
    df['dining_per_capita'] = df['local_dining'] / df['local_community_population']
    df['shop_per_capita'] = df['local_shop'] / df['local_community_population']

    # Interaction Features
    df['landsize_population_interaction'] = df['landsize'] * df['local_community_population']
    df['rooms_bathrooms_interaction'] = df['bedrooms'] * df['bathrooms']

    # Polynomial Features
    df['landsize_squared'] = df['landsize'] ** 2
    df['distance_to_perth_cbd_squared'] = df['distance_to_perth_cbd'] ** 2

    # Distance Ratios
    df['distance_airport_cbd_ratio'] = df['distance_to_perth_airport'] / (df['distance_to_perth_cbd'] + 1)
    df['distance_fuel_station_ratio'] = df['nearest_fuel_station'] / (df['distance_to_perth_cbd'] + 1)

    # Crime Rates per Population
    df['crime_rate'] = df['ap_crime'] / (df['local_community_population'] + 1)

    # Density Features
    df['population_density'] = df['local_community_population'] / (df['landsize'] + 1)
    df['housing_density'] = df['local_community_dwellings'] / (df['landsize'] + 1)

    # Ratio of median monthly mortgage repayment to median weekly household income
    df['mortgage_income_ratio'] = df['median_monthly_mortgage_repayment'] / (df['median_weekly_household_income'] + 1)

    # Ratio of median weekly rent to median weekly household income
    df['rent_income_ratio'] = df['median_weekly_rent'] / (df['median_weekly_household_income'] + 1)

    # Average number of people per house
    df['people_per_house'] = df['people'] / (df['houses'] + 1)

#################################################

# Function to identify valid Box-Cox transformation candidates
def identify_boxcox_candidates(df):
    candidates = []
    for col in df.columns:
        if col in ['price', 'identifier', 'is_sold']:
            continue  # Exclude target variables
        if df[col].nunique() <= 2:
            continue  # Exclude boolean and one-hot encoded columns
        if (df[col] <= 0).any():
            min_value = df[col].min()
            df[col] += abs(min_value) + 1  # Shift to make all values positive
        candidates.append(col)
    return candidates

# Apply Box-Cox transformation to valid candidates
def apply_boxcox_transformation(df):
    candidates = identify_boxcox_candidates(df)
    boxcox_columns = []
    for col in candidates:
        df[f'boxcox_{col}'], _ = boxcox(df[col])
        df.drop(columns=[col], inplace=True)
        boxcox_columns.append(f'boxcox_{col}')
    return df, boxcox_columns

# Apply the transformation if the toggle is on
if USE_BOXCOX_TRANSFORMATION:
    df, boxcox_columns = apply_boxcox_transformation(df)
else:
    boxcox_columns = []

# Apply scaling if the toggle is on
if USE_SCALER and boxcox_columns:
    # Initialize StandardScaler
    scaler = StandardScaler()
    # Apply StandardScaler to Box-Cox transformed columns
    df[boxcox_columns] = scaler.fit_transform(df[boxcox_columns])

# Boxcox the target separately
df['price'], price_lambda = boxcox(df['price'])


In [4]:
# We only want to train on is_sold == True since we are predicting pre-sales
df_sold = df[df['is_sold'] == True]

# Split the data into features (X) and target variable (y), excluding 'identifier'
X = df_sold.drop(['price', 'is_sold', 'identifier'], axis=1)
y = df_sold['price']

# Define weighted RMSE thresholds for low and high y values
rmse_threshold_low = np.percentile(y, 10)
rmse_threshold_high = np.percentile(y, 90)

print(rmse_threshold_low)
print(rmse_threshold_high)

# Define synthetic data thresholds for low and high y values
synth_threshold_low = np.percentile(y, 20)
synth_threshold_high = np.percentile(y, 70)

if USE_SYNTHETIC_SAMPLING:
    # Separate the low, middle, and high target value samples
    low_indices = y <= synth_threshold_low
    high_indices = y >= synth_threshold_high

    X_low = X[low_indices]
    y_low = y[low_indices]

    X_high = X[high_indices]
    y_high = y[high_indices]

    # Create synthetic samples by adding noise
    def create_synthetic_samples(X, y, n_samples, noise_level=0.01):
        synthetic_X = np.tile(X, (n_samples, 1))
        synthetic_y = np.tile(y, n_samples)
        noise = np.random.normal(0, noise_level, synthetic_X.shape)
        return synthetic_X + noise, synthetic_y

    # Create synthetic samples for low and high values
    n_synthetic_samples = 5
    X_low_synthetic, y_low_synthetic = create_synthetic_samples(X_low.values, y_low.values, n_synthetic_samples)
    X_high_synthetic, y_high_synthetic = create_synthetic_samples(X_high.values, y_high.values, n_synthetic_samples)

    # Combine original and synthetic samples
    X_resampled = np.vstack([X.values, X_low_synthetic, X_high_synthetic])
    y_resampled = np.hstack([y.values, y_low_synthetic, y_high_synthetic])
else:
    X_resampled = X.values
    y_resampled = y.values

# Split the over-sampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.06, random_state=42)

# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train, nthread=-1)
dtest = xgb.DMatrix(X_test, label=y_test, nthread=-1)

# Custom callback to update tqdm progress bar
class TqdmCallback(xgb.callback.TrainingCallback):
    def __init__(self, total_rounds, early_stopping_rounds):
        self.pbar = tqdm(total=total_rounds, desc="Training Progress")
        self.early_stopping_rounds = early_stopping_rounds
        self.best_score = float("inf")
        self.best_iteration = 0
        self.stopping_counter = 0

    def after_iteration(self, model, epoch, evals_log):
        train_rmse = evals_log['train']['rmse'][-1]
        eval_rmse = evals_log['eval']['rmse'][-1]
        
        self.pbar.set_postfix({
            'train_rmse': f'{train_rmse:.5f}', 
            'eval_rmse': f'{eval_rmse:.5f}',
            'best_iteration': self.best_iteration
        })
        self.pbar.update(1)

        # Early stopping logic
        if eval_rmse < self.best_score:
            self.best_score = eval_rmse
            self.best_iteration = epoch
            self.stopping_counter = 0
        else:
            self.stopping_counter += 1

        if self.stopping_counter >= self.early_stopping_rounds:
            self.pbar.set_postfix_str(f'Early stopping at iteration {self.best_iteration} with best score {self.best_score}')
            self.pbar.close()
            return True  # Return True to stop training
        return False

# Custom objective function for weighted RMSE
def weighted_rmse(preds, dtrain):
    y_true = dtrain.get_label()
    residuals = preds - y_true
    weights = np.where((y_true <= rmse_threshold_low) | (y_true >= rmse_threshold_high), 2.5, 2.5)
    weighted_residuals = weights * residuals
    grad = 2 * weighted_residuals
    hess = 2 * weights
    return grad, hess

# Train the model with early stopping and progress bar
total_rounds = 10000
early_stopping_rounds = 50
tqdm_callback = TqdmCallback(total_rounds, early_stopping_rounds)

evals = [(dtrain, 'train'), (dtest, 'eval')]

# Define XGBoost parameters with increased regularization and early stopping
xgb_params = {
    'tree_method': 'hist',
    'eval_metric': 'rmse',
    'device': 'cuda',
    'learning_rate': 0.1,
    'max_depth': 9,
    'alpha': 1.0,  # Regularization parameter
    'lambda': 1.0,  # Regularization parameter
    'subsample': 0.8,  # Use subsample to reduce overfitting
    'colsample_bytree': 0.8  # Use colsample_bytree to reduce overfitting
}

if USE_WEIGHTED_RMSE:
    xgb_model = xgb.train(
        params=xgb_params, 
        dtrain=dtrain, 
        num_boost_round=total_rounds, 
        evals=evals, 
        obj=weighted_rmse, 
        callbacks=[tqdm_callback, xgb.callback.EarlyStopping(rounds=early_stopping_rounds)], 
        verbose_eval=0
    )
else:
    xgb_params['objective'] = 'reg:squarederror'
    xgb_model = xgb.train(
        params=xgb_params, 
        dtrain=dtrain, 
        num_boost_round=total_rounds, 
        evals=evals, 
        callbacks=[tqdm_callback, xgb.callback.EarlyStopping(rounds=early_stopping_rounds)], 
        verbose_eval=0
    )

xgb_test_predictions = xgb_model.predict(dtest)
xgb_mae = mean_absolute_error(y_test, xgb_test_predictions)
xgb_rmse = mean_squared_error(y_test, xgb_test_predictions, squared=False)

# Print results
print(f"XGBoost - MAE: {xgb_mae:.4f}, RMSE: {xgb_rmse:.4f}")

# Save the XGBoost model in JSON format
xgb_model.save_model('xgb_model.json')

# PB: 3547/10000 [04:01<06:47, 15.84it/s, Early stopping at iteration 3496 with best score 0.4464623629559897]
# PB:XGBoost - MAE: 0.2695, RMSE: 0.4465
# PB Adjusted XGBoost - MAE: 0.2580, RMSE: 0.3801

#2.5 2.5 eval_rmse=0.47259
#2.0 2.0 eval_rmse=0.47997
#1.5 1.5 eval_rmse=0.47576,
#2.5 1.0 eval_rmse=0.46000,
#2.0 1.0 eval_rmse=0.46830,
#2.5 0.5 eval_rmse=0.45731

# 80 10 - 41579
# 70 10 - 38454,
# 60 10 - 37981
# 70 20 - 37147


Training Progress:   0%|          | 0/10000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [33]:

# Calculate residuals
residuals = y_test - xgb_test_predictions

# Fit a polynomial regression model to the residuals
degree = 4  # You can adjust the degree for more flexibility
poly_model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
poly_model.fit(y_test.reshape(-1, 1), residuals)

# Predicted residuals
predicted_residuals = poly_model.predict(y_test.reshape(-1, 1))

# Adjust the initial predictions
if USE_POLYNOMIAL_BIAS_CORRECTION:
    adjusted_predictions = xgb_test_predictions + predicted_residuals
else:
    adjusted_predictions = xgb_test_predictions

# Evaluate the adjusted predictions
adjusted_mae = mean_absolute_error(y_test, adjusted_predictions)
adjusted_rmse = mean_squared_error(y_test, adjusted_predictions, squared=False)
print(f"Adjusted XGBoost - MAE: {adjusted_mae:.4f}, RMSE: {adjusted_rmse:.4f}")


Adjusted XGBoost - MAE: 0.2536, RMSE: 0.3517




In [34]:

# Filter out the unsold data
df_unsold = df[df['is_sold'] == False].copy()  # Create a copy to avoid the warning

# Prepare the features for prediction, excluding unnecessary columns
X_unsold = df_unsold.drop(['price', 'is_sold', 'identifier'], axis=1)

# Load the XGBoost model
xgb_model = xgb.Booster()
xgb_model.load_model('xgb_model.json')
dunsold = xgb.DMatrix(X_unsold)
xgb_predictions = xgb_model.predict(dunsold)

# Adjust the predictions using the previously fitted polynomial regression model
unsold_actual_prices = xgb_predictions.reshape(-1, 1)
unsold_predicted_residuals = poly_model.predict(unsold_actual_prices)
adjusted_xgb_predictions = xgb_predictions + unsold_predicted_residuals

# Add adjusted predictions to the unsold data
df_unsold.loc[:, 'price_prediction'] = inv_boxcox(adjusted_xgb_predictions, price_lambda)

# Merge the predictions with df_target
df_target_pred = pd.merge(df_target, df_unsold[['identifier', 'price_prediction']], on='identifier', how='left')

# Reorder columns to place price_prediction as the third column
cols = list(df_target_pred.columns)
prediction_index = cols.index('price_prediction')
if prediction_index != 2:
    cols.insert(2, cols.pop(prediction_index))
df_target_pred = df_target_pred[cols]

# Remove rows with no predicted price
rows_before = df_target_pred.shape[0]
df_target_cleaned = df_target_pred.dropna(subset=['price_prediction'])
rows_after = df_target_cleaned.shape[0]

# Print the number of rows removed
rows_removed = rows_before - rows_after
print(f"Number of rows removed: {rows_removed}")

# Convert the cleaned DataFrame to JSON format
df_target_cleaned_json = df_target_cleaned.to_dict(orient='records')

# Save the JSON to a file
with open('property_data_unsold_predictions.json', 'w') as json_file:
    json.dump(df_target_cleaned_json, json_file, indent=4)

# Display a few entries from the JSON file
df_target_cleaned_json[:5]


Number of rows removed: 19


[{'address': '1/8 Alston Avenue, Como',
  'price': 1545000,
  'price_prediction': 1332048.23246084,
  'landsize': 265.0,
  'latitude': -31.9976225,
  'longitude': 115.8574697,
  'bedrooms': 4,
  'bathrooms': 3,
  'parking': 2,
  'house_type': 'House',
  'image_url': 'https://imagecdn.reiwa.com.au/listing/08/4805608-01.jpg?maxwidth=800&maxheight=600&quality=80',
  'details_url': '/1-8-alston-avenue-como-4805608/',
  'is_sold': False,
  'floor_plan_count': 0,
  'agency_name': 'Yard Property',
  'agency_no': 14610,
  'pets_allowed': False,
  'suburb': 'Como',
  'local_community_population': 12915,
  'local_community_dwellings': 6758,
  'distance_to_perth_cbd': 5.155173325833062,
  'distance_to_perth_airport': 12.262246694720282,
  'nearest_fuel_station': 0.4270645519956997,
  'nearest_bus_stop': 0.23677398167357305,
  'nearest_train_station': 1.3530084190963068,
  'nearest_police_station': 3.0775317002541063,
  'nearest_healthcare_facility': 0.7132712480994013,
  'nearest_doctor_office': 