In [1]:
import pandas as pd
import numpy as np
import os
import datetime as dt
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import math
import statsmodels.api as sm

# Data load and exploratory analysis

In [10]:
properties_df = pd.read_csv(os.getcwd() + '/DATA/properties.csv')
transactions_df = pd.read_csv(os.getcwd() + '/DATA/transactions.csv')

transactions_df['SALE_DATE'] = pd.to_datetime(transactions_df['SALE_DATE'], dayfirst=True)

df = properties_df.merge(transactions_df, how='left', on='ID')

In [11]:
pd.DataFrame({"values":{col:df[col].unique() for col in df},
              'type':{col:df[col].dtype for col in df},
              'unique values':{col:len(df[col].unique()) for col in df},
              'NA values':{col:str(round(sum(df[col].isna())/len(df),2))+'%' for col in df},
              'Duplicated Values':{col:sum(df[col].duplicated()) for col in df}
             })

Unnamed: 0,values,type,unique values,NA values,Duplicated Values
ID,"[GL-3704-XQ, JE-8988-FK, MH-4739-NW, SF-3867-V...",object,10378,0.0%,703
PROPERTYCATEGORY,"[Unit, House]",object,2,0.0%,11079
STATE,[NSW],object,1,0.0%,11080
POSTCODE,"[2042.0, 2035.0, nan]",float64,3,0.01%,11078
SUBURB,"[B, A]",object,2,0.0%,11079
STREETTYPE,"[St, Rd, Pde, Av, nan, Pl, Cct, Cr, La, Dr, Ct...",object,12,0.0%,11069
ADDRESSLATITUDE,"[-33.90505, -33.89965, -33.94226, -33.9052, -3...",float64,2968,0.02%,8113
ADDRESSLONGITUDE,"[151.17824, 151.17771, 151.26235, 151.18065, 1...",float64,3290,0.02%,7791
AREASIZE,"[4120.0, 42.0, 748.0, 72.0, 1561.0, 114.0, 559...",float64,1146,0.02%,9935
BEDROOMS,"[1.0, nan, 3.0, 2.0, 4.0]",float64,5,0.29%,11076


# Data Cleansing

### Replaceing all NaN values in 'AIRCONDITION', 'BALCONY', 'WARDROBE', 'GARDEN' with False

In [12]:
for item in ['AIRCONDITION', 'BALCONY', 'WARDROBE', 'GARDEN']:
    df[item] = df[item].replace(True, 1)
    df[item] = df[item].replace(False, 0)
    df[item].fillna(0, inplace=True)

### Estimating the number of bedrooms and bathrooms base off the nearest AREASIZE

In [13]:
bedroom_index = df[['AREASIZE', 'BEDROOMS']].dropna().reset_index()
bathroom_index = df[['AREASIZE', 'BATHS']].dropna().reset_index()
def bed_match(row):
    if math.isnan(row['AREASIZE']):
        pass
    elif math.isnan(row['BEDROOMS']):
        value = row['AREASIZE']
        index = abs(bedroom_index['AREASIZE'] - value).idxmin()
        row['BEDROOMS'] = bedroom_index['BEDROOMS'].iloc[index]
    return row

def bath_match(row):
    if math.isnan(row['AREASIZE']):
        pass
    elif math.isnan(row['BATHS']):
        value = row['AREASIZE']
        index = abs(bathroom_index['AREASIZE'] - value).idxmin()
        row['BATHS'] = bathroom_index['BATHS'].iloc[index]
    return row
df = df.apply(bed_match, axis=1)
df = df.apply(bath_match, axis=1)

### Removing na values across 'PROPERTYCATEGORY', 'SUBURB', 'ADDRESSLATITUDE', 'ADDRESSLONGITUDE', 'AREASIZE', 'BEDROOMS', 'BATHS', 'PARKING', 'AIRCONDITION', 'BALCONY', 'WARDROBE', 'GARDEN'

In [14]:
df = df.dropna(subset=['PROPERTYCATEGORY', 'SUBURB',
       'ADDRESSLATITUDE', 'ADDRESSLONGITUDE', 'AREASIZE', 'BEDROOMS', 'BATHS',
       'PARKING', 'AIRCONDITION', 'BALCONY', 'WARDROBE', 'GARDEN'])

### Dropping variables to only include those to be present in the model

In [15]:
df = df[['PROPERTYCATEGORY', 'SUBURB',
       'ADDRESSLATITUDE', 'ADDRESSLONGITUDE', 'AREASIZE', 'BEDROOMS', 'BATHS',
       'PARKING', 'AIRCONDITION', 'BALCONY', 'WARDROBE', 'GARDEN', 'SALE_PRICE']]

### Creating dummy variables for str features 

In [16]:
df = pd.get_dummies(df, columns=['PROPERTYCATEGORY', 'SUBURB'], prefix="dmy", prefix_sep="*")

### seperating df with sales_price data and without it

In [17]:
sale_df = df.loc[~df['SALE_PRICE'].isna()].dropna()
non_sale_df = df.loc[df['SALE_PRICE'].isna()]

# Regression Set Up

### Splitting the data set into features (x) and independent variables (y)

In [18]:
df_x = sale_df.drop('SALE_PRICE', axis=1)
df_y = sale_df[['SALE_PRICE']]

### Descriptive statistics on the features

In [19]:
df_x.describe()

Unnamed: 0,ADDRESSLATITUDE,ADDRESSLONGITUDE,AREASIZE,BEDROOMS,BATHS,PARKING,AIRCONDITION,BALCONY,WARDROBE,GARDEN,dmy*House,dmy*Unit,dmy*A,dmy*B
count,3691.0,3691.0,3691.0,3691.0,3691.0,3691.0,3691.0,3691.0,3691.0,3691.0,3691.0,3691.0,3691.0,3691.0
mean,-33.919273,151.20913,761.902465,2.275806,1.444866,0.519642,0.266594,0.431048,0.573286,0.592251,0.589813,0.410187,0.442698,0.557302
std,0.022343,0.032952,2586.625879,1.002872,0.675055,0.700624,0.442238,0.49529,0.494667,0.491483,0.491934,0.491934,0.496773,0.496773
min,-33.95818,151.1729,10.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-33.94225,151.179245,149.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-33.90668,151.18642,345.0,2.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
75%,-33.89844,151.242235,761.5,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,-33.89253,151.26475,138138.0,4.0,4.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Train Test Split

In [20]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.33, random_state=4)
x_train.head()

Unnamed: 0,ADDRESSLATITUDE,ADDRESSLONGITUDE,AREASIZE,BEDROOMS,BATHS,PARKING,AIRCONDITION,BALCONY,WARDROBE,GARDEN,dmy*House,dmy*Unit,dmy*A,dmy*B
7069,-33.90201,151.17667,121.0,2.0,1.0,0,0.0,0.0,1.0,1.0,1,0,0,1
4055,-33.89783,151.18168,900.0,1.0,1.0,0,0.0,1.0,1.0,1.0,0,1,0,1
7966,-33.90657,151.17981,134.0,2.0,1.0,0,0.0,0.0,0.0,1.0,1,0,0,1
3573,-33.90333,151.18057,102.0,2.0,1.0,0,0.0,1.0,0.0,1.0,1,0,0,1
2170,-33.90575,151.17753,121.0,2.0,1.0,0,0.0,0.0,0.0,1.0,1,0,0,1


### Model set up and train

In [21]:
model = sm.OLS(y_train, sm.add_constant(x_train)).fit()
y_pred = model.predict(sm.add_constant(x_test))

### Model Summary

In [24]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             SALE_PRICE   R-squared:                       0.302
Model:                            OLS   Adj. R-squared:                  0.298
Method:                 Least Squares   F-statistic:                     88.53
Date:                Mon, 09 May 2022   Prob (F-statistic):          1.06e-181
Time:                        12:02:20   Log-Likelihood:                -36750.
No. Observations:                2472   AIC:                         7.353e+04
Df Residuals:                    2459   BIC:                         7.360e+04
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const             -1.38e+09   1.66e+08  

### Predicting price data for original data set

In [17]:
df['model_predict'] = model.predict(sm.add_constant(df.drop('SALE_PRICE', axis=1)))

# Results

In [30]:
results = df.groupby("dmy*A").agg({'model_predict': ['sum']}).reset_index()

In [31]:
print(f"Suburb B has a total property market value of ${results[('model_predict', 'sum')][0]}")
print(f"Suburb A has a total property market value of ${results[('model_predict', 'sum')][1]}")

Suburb B has a total property market value of $6158235514.062412
Suburb A has a total property market value of $5959234477.402757
