In [None]:
import pandas as pd 
import numpy as np
import os
import csv
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
os.chdir('C:\\Users\\grace\\UNSW\\COMP4121\\COMP4121_MajorProject\\Data')

In [3]:
#Load datasets
raw_sales_df = pd.read_csv('raw_sales.csv', parse_dates=['datesold'])
ma_lga_df = pd.read_csv('ma_lga_12345.csv', parse_dates=['saledate'])

  ma_lga_df = pd.read_csv('ma_lga_12345.csv', parse_dates=['saledate'])


In [4]:
# Show the first few rows of each dataset to understand their structure
print("Raw Sales Data:")
print(raw_sales_df.head())
print("\nMA LGA Data:")
print(ma_lga_df.head())

Raw Sales Data:
    datesold  postcode   price propertyType  bedrooms
0 2007-02-07      2607  525000        house         4
1 2007-02-27      2906  290000        house         3
2 2007-03-07      2905  328000        house         3
3 2007-03-09      2905  380000        house         4
4 2007-03-21      2906  310000        house         3

MA LGA Data:
    saledate      MA   type  bedrooms
0 2007-09-30  441854  house         2
1 2007-12-31  441854  house         2
2 2008-03-31  441854  house         2
3 2008-06-30  441854  house         2
4 2008-09-30  451583  house         2


In [5]:
#Preprocessing 'raw_sales_df'

# Convert 'datesold' to datetime and sort by this column
raw_sales_df['datesold'] = pd.to_datetime(raw_sales_df['datesold'], errors='coerce')
raw_sales_df = raw_sales_df.sort_values('datesold')

# Handle missing values (if any)
raw_sales_df = raw_sales_df.dropna(subset=['datesold', 'price', 'postcode', 'propertyType', 'bedrooms'])

# Encode categorical variables like 'propertyType' (house/unit)
label_encoder = LabelEncoder()
raw_sales_df['propertyType'] = label_encoder.fit_transform(raw_sales_df['propertyType'])

# One-hot encode postcode as a new feature (optional)
#raw_sales_df = pd.get_dummies(raw_sales_df, columns=['postcode'], drop_first=True)


In [6]:
#Preprocessing 'ma_lga_df' & cleaning 'ma_lga_12345' DataFrame

# Convert 'saledate' to datetime and sort by this column
ma_lga_df['saledate'] = pd.to_datetime(ma_lga_df['saledate'], errors='coerce')
ma_lga_df = ma_lga_df.sort_values('saledate')

# Handle missing values (if any)
ma_lga_df = ma_lga_df.dropna(subset=['saledate', 'MA', 'type', 'bedrooms'])

# Encode categorical variables like 'type' (house/unit)
ma_lga_df['type'] = label_encoder.fit_transform(ma_lga_df['type'])

In [7]:
#Data Exploration

#Print basic info to inspect column types and missing values
print("\nRaw Sales Data Info:")
print(raw_sales_df.info())

print("\nMA LGA Data Info:")
print(ma_lga_df.info())

# Show basic statistics on the numerical columns
print("\nRaw Sales Data Description:")
print(raw_sales_df.describe())

print("\nMA LGA Data Description:")
print(ma_lga_df.describe())


Raw Sales Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 29580 entries, 0 to 24551
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   datesold      29580 non-null  datetime64[ns]
 1   postcode      29580 non-null  int64         
 2   price         29580 non-null  int64         
 3   propertyType  29580 non-null  int32         
 4   bedrooms      29580 non-null  int64         
dtypes: datetime64[ns](1), int32(1), int64(3)
memory usage: 1.2 MB
None

MA LGA Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 347 entries, 49 to 346
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   saledate  347 non-null    datetime64[ns]
 1   MA        347 non-null    int64         
 2   type      347 non-null    int32         
 3   bedrooms  347 non-null    int64         
dtypes: datetime64[ns](1), int32(1), int64(2)
memory usage

In [8]:
print(raw_sales_df[['postcode', 'propertyType']].isnull().sum())

postcode        0
propertyType    0
dtype: int64


In [9]:
print(raw_sales_df.head())
print(raw_sales_df.dtypes)


    datesold  postcode   price  propertyType  bedrooms
0 2007-02-07      2607  525000             0         4
1 2007-02-27      2906  290000             0         3
2 2007-03-07      2905  328000             0         3
3 2007-03-09      2905  380000             0         4
4 2007-03-21      2906  310000             0         3
datesold        datetime64[ns]
postcode                 int64
price                    int64
propertyType             int32
bedrooms                 int64
dtype: object


In [10]:
sequence_raw_sales_single_group = raw_sales_df.groupby('postcode').apply(
    lambda group: group[['price', 'bedrooms']].values.tolist()).reset_index(name='sequence')
print(sequence_raw_sales_single_group.head())

  sequence_raw_sales_single_group = raw_sales_df.groupby('postcode').apply(


   postcode                                           sequence
0      2600  [[327000, 1], [790000, 4], [825000, 3], [31500...
1      2601  [[380000, 1], [760000, 3], [595000, 3], [32500...
2      2602  [[900000, 4], [427500, 3], [780000, 3], [53000...
3      2603  [[1780000, 4], [1460000, 5], [760000, 3], [760...
4      2604  [[360000, 2], [479000, 3], [505000, 3], [52250...


In [11]:
label_encoder = LabelEncoder()
ma_lga_df['type'] = label_encoder.fit_transform(ma_lga_df['type'])

In [None]:
scaler = MinMaxScaler()
ma_lga_df['MA_scaled'] = scaler.fit_transform(ma_lga_df[['MA']])

ma_lga_df['MA_lag1'] = ma_lga_df['MA'].shift(1)
ma_lga_df['MA_lag2'] = ma_lga_df['MA'].shift(2)

ma_lga_df = ma_lga_df.dropna()


In [16]:
X = ma_lga_df[['MA_scaled', 'type', 'bedrooms']].values
#X = ma_lga_df[['MA_scaled', 'MA_lag1', 'MA_lag2', 'type', 'bedrooms']].values


In [None]:
#Prepare hidden states

n_states = 3  # Number of states
kmeans = KMeans(n_clusters=n_states, random_state=42)
ma_lga_df['state'] = kmeans.fit_predict(ma_lga_df[['MA_scaled']])
hidden_states = ma_lga_df['state'].values


Final Preprocessed Dataset

After processing, the dataset should have:

Sequential features (X): Prepared for HMMs.
Hidden states (hidden_states): Used for training and validation.

In [18]:
# Save n_states and X
np.savez('hmm_data.npz', n_states=n_states, X=X)

print("Variables saved to hmm_data.npz")

Variables saved to hmm_data.npz
