In [None]:
import numpy as np
import pandas as pd
import os
from hmmlearn.hmm import GaussianHMM
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import train_test_split

In [3]:
# Folder that holds hmm_data.npz and merged_df.csv (both are opened by relative name below).
# Set the HMM_DATA_DIR environment variable to run the notebook on another machine;
# when it is unset, the original project path is used.
DATA_DIR = os.environ.get(
    'HMM_DATA_DIR',
    'C:\\Users\\grace\\UNSW\\COMP4121\\COMP4121_MajorProject\\Data',
)
os.chdir(DATA_DIR)

In [4]:
# Open the .npz archive (path is relative to the current working directory);
# the individual arrays are pulled out by key in the next cell.
npz_path = 'hmm_data.npz'
data = np.load(npz_path)

In [5]:
# Unpack the archive: the array stored under 'X' and the state count stored under 'n_states'
features = data['X']
n_states = int(data['n_states'])  # stored as an array entry; convert to a plain Python int

print(
    f"Optimal number of hidden states: {n_states}",
    f"Features shape: {features.shape}",
    sep="\n",
)

Optimal number of hidden states: 5
Features shape: (29414, 3)


In [11]:
# Load the merged table. 'date_sold' is parsed to datetime at read time (read_csv
# leaves it as plain strings by default) so that later cells can use the .dt
# accessor on it and build date-indexed series from it.
merged_df = pd.read_csv('merged_df.csv', parse_dates=['date_sold'])
print(merged_df.head())

    date_sold   price  bedrooms  year_quarter        MA  normalized_price  \
0  2007-02-07  525000         4             0  548969.0              4078   
1  2007-02-27  290000         3             0  421291.0               449   
2  2007-03-07  328000         3             0  421291.0              1227   
3  2007-03-09  380000         4             0  548969.0               441   
4  2007-03-21  310000         3             0  421291.0               797   

   hidden_state  
0             2  
1             0  
2             0  
3             2  
4             0  


Create Separate Dataframes for each Hidden State

In [13]:
# Prepare one time series per regime for ARIMA
def _regime_prices(state):
    """Return the 'price' column, indexed by 'date_sold', for rows whose hidden_state equals `state`."""
    in_regime = merged_df['hidden_state'] == state
    return merged_df.loc[in_regime].set_index('date_sold')['price']


# hs_k holds the series for hidden state k-1 (state labels start at 0)
hs_1, hs_2, hs_3, hs_4, hs_5 = map(_regime_prices, range(5))

print(hs_1.head(5))

date_sold
2007-02-27    290000
2007-03-07    328000
2007-03-21    310000
2007-04-24    399000
2007-05-24    359000
Name: price, dtype: int64


ARIMA has three main parameters:

p (the number of lag observations included in the model),
d (the number of times the series is differenced),
q (the size of the moving average window).

Split Data into Train and Test Sets

In [None]:
# Column roles: the label to predict and the predictor columns.
# Note: `features` here rebinds the name that previously held the array loaded from the .npz file.
target = 'hidden_state'
features = ['MA', 'bedrooms', 'normalized_price']

# Predictor matrix and label vector
X, y = merged_df[features], merged_df[target]

# Shuffled 80/20 split; the fixed random_state makes the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows ended up on each side
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")


In [7]:
# ARIMA input: the price column indexed by sale date
price_by_date = merged_df[['date_sold', 'price']].set_index('date_sold')

# shuffle=False preserves row order, so the leading rows form the training set
# and the trailing 20% form the test set
train, test = train_test_split(price_by_date, test_size=0.2, shuffle=False)

In [None]:
# Order sales chronologically inside each quarter. Sorting on 'year_quarter' alone
# leaves the row order within a quarter unspecified (the default sort algorithm is
# not stable), so the consecutive-row pct_change below would not be reproducible.
# ISO-formatted dates (YYYY-MM-DD) sort chronologically whether stored as strings or datetimes.
merged_df = merged_df.sort_values(by=['year_quarter', 'date_sold'])

# Percentage price change between consecutive sales within the same quarter;
# the first sale of each quarter has no predecessor and gets NaN.
# NOTE(review): this is a sale-to-sale change, not a quarter-over-quarter change — confirm that is intended.
merged_df['price_change'] = (
    merged_df.groupby('year_quarter')['price'].pct_change().mul(100).round(2)
)

# Classify each change against a +/- threshold (in percent):
#   0 = decrease (< -threshold), 1 = stable (within +/- threshold), 2 = increase.
# NaN changes fail both comparisons and fall into class 2; those rows are removed by dropna() below.
threshold = 1
merged_df['price_movement'] = merged_df['price_change'].apply(
    lambda x: 0 if x < -threshold else (1 if abs(x) <= threshold else 2)
)

# 'date_sold' may still be a plain string column (read_csv does not parse dates by
# default) and the .dt accessor raises AttributeError on strings, so convert first.
merged_df['year'] = pd.to_datetime(merged_df['date_sold']).dt.year

# Keep the modelling columns and drop rows with missing values. Columns that are
# not present in merged_df are reported and skipped instead of raising KeyError.
# NOTE(review): 'year' and 'hidden_state' are not in this list and are therefore dropped — confirm.
wanted_columns = [
    'MA', 'bedrooms', 'date_sold', 'price', 'normalized_price',
    'property_type', 'year_quarter', 'price_change', 'price_movement',
]
missing_columns = [col for col in wanted_columns if col not in merged_df.columns]
if missing_columns:
    print(f"Skipping columns not present in merged_df: {missing_columns}")
merged_df = merged_df[[col for col in wanted_columns if col not in missing_columns]].dropna()

Fit ARIMA Model

Incorporate Hidden States as Features in ARIMA:

Add the hidden states as an exogenous variable to ARIMA. This allows ARIMA to condition forecasts on the regimes identified by the HMM.