In [89]:
import pandas as pd

# Load cleaned dataset
file_path = '../data/processed/cleaned_data.csv'
df = pd.read_csv(file_path)

# Display the first few rows to confirm it's loaded correctly
df.head()


# Create composite key
df['key'] = df['YEAR'].astype(str) + '-Q' + df['QUARTER'].astype(str) + '-' + df['Province']


# Generate a unique key for each row
df['Unique_Key'] = range(1, len(df) + 1)


In [45]:
# Define independent variables
independent_vars = [
    'Quarterly Avg Temp (°C)',
    'Quarterly Sum Precip (mm)',
    'CPI Average',
    'CPI value Cumulative',
    'CPI Compounded',
    'Debt_to_Asset_Ratio',
    'Mortgage_Debt_to_Real_Estate_Ratio',
    'Depression',
    'Anxiety',
    'Therapy',
    'Stress',
    'Unemployment Rate'
]

# Filter only independent variables (drop any other columns)
df_numeric = df[independent_vars]


In [47]:
smoothed_df = df_numeric.rolling(window=4, min_periods=1).mean()


## Skewness Check

In [49]:
from scipy.stats import shapiro

for col in smoothed_df.columns:
    skewness = smoothed_df[col].skew()
    stat, p = shapiro(smoothed_df[col].dropna())

    print(f"Variable: {col}")
    print(f"Skewness: {skewness:.2f}")
    if p > 0.05:
        print(f"✅ Data looks normally distributed after smoothing (p = {p:.3f})\n")
    else:
        print(f"❌ Data NOT normally distributed after smoothing (p = {p:.3f})\n")


Variable: Quarterly Avg Temp (°C)
Skewness: 0.38
❌ Data NOT normally distributed after smoothing (p = 0.031)

Variable: Quarterly Sum Precip (mm)
Skewness: -0.05
❌ Data NOT normally distributed after smoothing (p = 0.004)

Variable: CPI Average
Skewness: 0.19
✅ Data looks normally distributed after smoothing (p = 0.686)

Variable: CPI value Cumulative
Skewness: -0.06
✅ Data looks normally distributed after smoothing (p = 0.234)

Variable: CPI Compounded
Skewness: 0.19
✅ Data looks normally distributed after smoothing (p = 0.652)

Variable: Debt_to_Asset_Ratio
Skewness: 0.59
❌ Data NOT normally distributed after smoothing (p = 0.000)

Variable: Mortgage_Debt_to_Real_Estate_Ratio
Skewness: -0.10
❌ Data NOT normally distributed after smoothing (p = 0.003)

Variable: Depression
Skewness: 1.32
❌ Data NOT normally distributed after smoothing (p = 0.000)

Variable: Anxiety
Skewness: -1.10
❌ Data NOT normally distributed after smoothing (p = 0.000)

Variable: Therapy
Skewness: 0.03
✅ Data look

### Transformation to Handle Skewness

In [51]:
from sklearn.preprocessing import PowerTransformer
from scipy.stats import boxcox, shapiro
import numpy as np

# Initialize Yeo-Johnson transformer
pt = PowerTransformer(method='yeo-johnson')

# Handle each variable with targeted transformations
def handle_skewness_targeted(df):
    # Quarterly Avg Temp (Square Root)
    df['Quarterly Avg Temp (°C)'] = np.sqrt(df['Quarterly Avg Temp (°C)'])

    # Quarterly Sum Precip (Square Root)
    df['Quarterly Sum Precip (mm)'] = np.sqrt(df['Quarterly Sum Precip (mm)'])

    # Debt_to_Asset_Ratio (Log)
    if (df['Debt_to_Asset_Ratio'] > 0).all():
        df['Debt_to_Asset_Ratio'] = np.log(df['Debt_to_Asset_Ratio'])

    # Mortgage_Debt_to_Real_Estate_Ratio (Square Root)
    df['Mortgage_Debt_to_Real_Estate_Ratio'] = np.sqrt(df['Mortgage_Debt_to_Real_Estate_Ratio'])

    # Depression (Yeo-Johnson)
    df['Depression'] = pt.fit_transform(df[['Depression']])

    # Anxiety (Yeo-Johnson)
    df['Anxiety'] = pt.fit_transform(df[['Anxiety']])

    # Stress (Reciprocal)
    df['Stress'] = 1 / df['Stress']

    # Unemployment Rate (Log)
    if (df['Unemployment Rate'] > 0).all():
        df['Unemployment Rate'] = np.log(df['Unemployment Rate'])

# Apply targeted transformations
handle_skewness_targeted(smoothed_df)

# Check skewness and normality again
for col in smoothed_df.columns:
    skewness = smoothed_df[col].skew()
    stat, p = shapiro(smoothed_df[col].dropna())

    print(f"Variable: {col}")
    print(f"Skewness: {skewness:.2f}")
    if p > 0.05:
        print(f"✅ Data looks normally distributed after transformation (p = {p:.3f})\n")
    else:
        print(f"❌ Data NOT normally distributed after transformation (p = {p:.3f})\n")


Variable: Quarterly Avg Temp (°C)
Skewness: -0.12
✅ Data looks normally distributed after transformation (p = 0.260)

Variable: Quarterly Sum Precip (mm)
Skewness: -0.46
❌ Data NOT normally distributed after transformation (p = 0.000)

Variable: CPI Average
Skewness: 0.19
✅ Data looks normally distributed after transformation (p = 0.686)

Variable: CPI value Cumulative
Skewness: -0.06
✅ Data looks normally distributed after transformation (p = 0.234)

Variable: CPI Compounded
Skewness: 0.19
✅ Data looks normally distributed after transformation (p = 0.652)

Variable: Debt_to_Asset_Ratio
Skewness: 0.43
❌ Data NOT normally distributed after transformation (p = 0.001)

Variable: Mortgage_Debt_to_Real_Estate_Ratio
Skewness: -0.20
❌ Data NOT normally distributed after transformation (p = 0.003)

Variable: Depression
Skewness: -0.05
❌ Data NOT normally distributed after transformation (p = 0.010)

Variable: Anxiety
Skewness: -0.03
✅ Data looks normally distributed after transformation (p = 0

In [53]:
from scipy.stats import mstats
from sklearn.preprocessing import RobustScaler

# Final cleanup using Winsorization and Robust Scaling
def final_cleanup(df):
    # Winsorization to clip outliers
    df['Quarterly Sum Precip (mm)'] = mstats.winsorize(df['Quarterly Sum Precip (mm)'], limits=[0.01, 0.01])
    df['Debt_to_Asset_Ratio'] = mstats.winsorize(df['Debt_to_Asset_Ratio'], limits=[0.01, 0.01])
    df['Depression'] = mstats.winsorize(df['Depression'], limits=[0.01, 0.01])
    df['Stress'] = mstats.winsorize(df['Stress'], limits=[0.01, 0.01])

    # Robust scaling for ratios (handles extreme values)
    scaler = RobustScaler()
    df['Mortgage_Debt_to_Real_Estate_Ratio'] = scaler.fit_transform(df[['Mortgage_Debt_to_Real_Estate_Ratio']])

# Apply final cleanup
final_cleanup(smoothed_df)

# Re-check skewness and normality
for col in smoothed_df.columns:
    skewness = smoothed_df[col].skew()
    stat, p = shapiro(smoothed_df[col].dropna())

    print(f"Variable: {col}")
    print(f"Skewness: {skewness:.2f}")
    if p > 0.05:
        print(f"✅ Data looks normally distributed after final cleanup (p = {p:.3f})\n")
    else:
        print(f"❌ Data NOT normally distributed after final cleanup (p = {p:.3f})\n")


Variable: Quarterly Avg Temp (°C)
Skewness: -0.12
✅ Data looks normally distributed after final cleanup (p = 0.260)

Variable: Quarterly Sum Precip (mm)
Skewness: -0.47
❌ Data NOT normally distributed after final cleanup (p = 0.000)

Variable: CPI Average
Skewness: 0.19
✅ Data looks normally distributed after final cleanup (p = 0.686)

Variable: CPI value Cumulative
Skewness: -0.06
✅ Data looks normally distributed after final cleanup (p = 0.234)

Variable: CPI Compounded
Skewness: 0.19
✅ Data looks normally distributed after final cleanup (p = 0.652)

Variable: Debt_to_Asset_Ratio
Skewness: 0.43
❌ Data NOT normally distributed after final cleanup (p = 0.001)

Variable: Mortgage_Debt_to_Real_Estate_Ratio
Skewness: -0.20
❌ Data NOT normally distributed after final cleanup (p = 0.003)

Variable: Depression
Skewness: -0.01
❌ Data NOT normally distributed after final cleanup (p = 0.008)

Variable: Anxiety
Skewness: -0.03
✅ Data looks normally distributed after final cleanup (p = 0.275)

Va

In [55]:
# Most variables now have mild to moderate skewness 

In [57]:
# Save the transformed data back into the original dataframe
df_transformed = smoothed_df.copy()

# Confirm it's saved
df_transformed.head()


Unnamed: 0,Quarterly Avg Temp (°C),Quarterly Sum Precip (mm),CPI Average,CPI value Cumulative,CPI Compounded,Debt_to_Asset_Ratio,Mortgage_Debt_to_Real_Estate_Ratio,Depression,Anxiety,Therapy,Stress,Unemployment Rate
0,3.116087,8.717798,0.451939,0.812458,1.36147,-1.71779,0.957873,-0.927056,-0.076438,0.35,7.692308,2.163323
1,3.614554,10.747093,0.438221,0.572563,1.318747,-1.721978,0.854983,-1.270219,0.052714,0.35,7.692308,2.126246
2,2.865891,9.678154,0.358388,0.425713,1.078104,-1.740733,0.729089,-1.152785,0.009111,0.346667,7.5,2.084982
3,2.342541,8.888194,0.529806,0.924651,1.599613,-1.751138,0.636854,-1.270219,-0.200709,0.3525,7.272727,2.039053
4,2.265502,10.38027,0.675377,1.422174,2.041993,-1.754584,0.559478,-1.270219,-0.378332,0.3575,7.142857,1.937542


In [59]:
# Save the transformed data as a new version
df_transformed.to_csv('../data/processed/cleaned_dataset_v11.csv', index=True)


## Stationarity Check

In [61]:
from statsmodels.tsa.stattools import adfuller
import pandas as pd

# Load the transformed dataset
file_path = '../data/processed/cleaned_dataset_v11.csv'
df_transformed = pd.read_csv(file_path, index_col=0, parse_dates=True)

# Perform the Augmented Dickey-Fuller test for stationarity on each independent variable
adf_results = {}
for var in independent_vars:
    result = adfuller(df_transformed[var].dropna(), autolag='AIC')
    adf_results[var] = {
        'ADF Statistic': result[0],
        'p-value': result[1],
        'Number of Lags Used': result[2],
        'Number of Observations Used': result[3],
        'Critical Values': result[4],
        'Stationary': 'Yes' if result[1] <= 0.05 else 'No'
    }

# Convert results to a DataFrame for better visualization
adf_results_df = pd.DataFrame(adf_results).T

# Display results
print(adf_results_df)


# Generate a unique key for each row
df['Unique_Key'] = range(1, len(df) + 1)


                                   ADF Statistic   p-value  \
Quarterly Avg Temp (°C)                -2.169035  0.217675   
Quarterly Sum Precip (mm)              -2.066253  0.258299   
CPI Average                            -2.345859  0.157603   
CPI value Cumulative                   -2.379161  0.147705   
CPI Compounded                         -2.342833  0.158525   
Debt_to_Asset_Ratio                    -3.570715  0.006346   
Mortgage_Debt_to_Real_Estate_Ratio      -2.03242  0.272541   
Depression                             -2.284079  0.177154   
Anxiety                                -2.702469  0.073612   
Therapy                                -2.120934  0.236181   
Stress                                 -0.701643  0.846324   
Unemployment Rate                      -1.902007  0.331118   

                                   Number of Lags Used  \
Quarterly Avg Temp (°C)                             12   
Quarterly Sum Precip (mm)                            9   
CPI Average        

  df_transformed = pd.read_csv(file_path, index_col=0, parse_dates=True)


In [63]:
# Quarterly Avg Temp is seasonal and the other independent variables are not
# Apply different methods

In [65]:
print(df_transformed.columns)


Index(['Quarterly Avg Temp (°C)', 'Quarterly Sum Precip (mm)', 'CPI Average',
       'CPI value Cumulative', 'CPI Compounded', 'Debt_to_Asset_Ratio',
       'Mortgage_Debt_to_Real_Estate_Ratio', 'Depression', 'Anxiety',
       'Therapy', 'Stress', 'Unemployment Rate'],
      dtype='object')


### Apply First- and Second-Order Differencing

In [67]:
# Load v11
file_path = '../data/processed/cleaned_dataset_v11.csv'
df_transformed = pd.read_csv(file_path)

# First-order differencing for non-seasonal variables
non_seasonal_vars = [
    'Quarterly Sum Precip (mm)',
    'CPI Average',
    'CPI value Cumulative',
    'CPI Compounded',
    'Debt_to_Asset_Ratio',
    'Mortgage_Debt_to_Real_Estate_Ratio',
    'Depression',
    'Anxiety',
    'Therapy',
    'Stress',
    'Unemployment Rate'
]

for var in non_seasonal_vars:
    df_transformed[var + '_diff'] = df_transformed[var] - df_transformed[var].shift(1)

# Seasonal differencing for quarterly data (lag=4 for quarterly seasonality)
df_transformed['Quarterly Avg Temp (°C)_diff'] = df_transformed['Quarterly Avg Temp (°C)'] - df_transformed['Quarterly Avg Temp (°C)'].shift(4)

# Drop original columns (optional) to keep dataset clean
df_transformed = df_transformed.drop(non_seasonal_vars + ['Quarterly Avg Temp (°C)'], axis=1)

# Drop any NA values created by differencing
df_transformed = df_transformed.dropna()

# Save as v22
df_transformed.to_csv('../data/processed/cleaned_dataset_v22.csv', index=True)


In [69]:
# Result after first-order and seasonal differencing

differenced_vars = [
    'Quarterly Sum Precip (mm)_diff',
    'CPI Average_diff',
    'CPI value Cumulative_diff',
    'CPI Compounded_diff',
    'Debt_to_Asset_Ratio_diff',
    'Mortgage_Debt_to_Real_Estate_Ratio_diff',
    'Depression_diff',
    'Anxiety_diff',
    'Therapy_diff',
    'Stress_diff',
    'Unemployment Rate_diff',
    'Quarterly Avg Temp (°C)_diff'
]

from statsmodels.tsa.stattools import adfuller
import pandas as pd

# Run the ADF test on all differenced variables
adf_results = {}
for var in differenced_vars:
    result = adfuller(df_transformed[var].dropna(), autolag='AIC')
    adf_results[var] = {
        'ADF Statistic': result[0],
        'p-value': result[1],
        'Number of Lags Used': result[2],
        'Number of Observations Used': result[3],
        'Critical Values': result[4],
        'Stationary': 'Yes' if result[1] <= 0.05 else 'No'
    }

# Display results in a table format
adf_results_df = pd.DataFrame(adf_results).T
print(adf_results_df)



                                        ADF Statistic   p-value  \
Quarterly Sum Precip (mm)_diff              -3.362085  0.012321   
CPI Average_diff                            -5.750459  0.000001   
CPI value Cumulative_diff                   -3.435376   0.00981   
CPI Compounded_diff                         -5.760674  0.000001   
Debt_to_Asset_Ratio_diff                    -3.188125  0.020682   
Mortgage_Debt_to_Real_Estate_Ratio_diff     -5.446258  0.000003   
Depression_diff                             -4.775999   0.00006   
Anxiety_diff                                -3.895139  0.002071   
Therapy_diff                                -1.179536  0.682336   
Stress_diff                                 -5.267628  0.000006   
Unemployment Rate_diff                      -3.087262   0.02751   
Quarterly Avg Temp (°C)_diff                -3.028351   0.03233   

                                        Number of Lags Used  \
Quarterly Sum Precip (mm)_diff                            8   
CP

In [71]:
file_path = '../data/processed/cleaned_dataset_v22.csv'
df_test = pd.read_csv(file_path)
print(df_test.columns)


Index(['Unnamed: 0.1', 'Unnamed: 0', 'Quarterly Sum Precip (mm)_diff',
       'CPI Average_diff', 'CPI value Cumulative_diff', 'CPI Compounded_diff',
       'Debt_to_Asset_Ratio_diff', 'Mortgage_Debt_to_Real_Estate_Ratio_diff',
       'Depression_diff', 'Anxiety_diff', 'Therapy_diff', 'Stress_diff',
       'Unemployment Rate_diff', 'Quarterly Avg Temp (°C)_diff'],
      dtype='object')


In [73]:
# Apply second-order differencing for 'Therapy_diff'
df_transformed['Therapy_diff2'] = df_transformed['Therapy_diff'] - df_transformed['Therapy_diff'].shift(1)

# Drop NA values caused by differencing
df_transformed = df_transformed.dropna()

# ADF Test on second-order differenced 'Therapy_diff'
result = adfuller(df_transformed['Therapy_diff2'].dropna(), autolag='AIC')
adf_therapy_diff2 = {
    'ADF Statistic': result[0],
    'p-value': result[1],
    'Number of Lags Used': result[2],
    'Number of Observations Used': result[3],
    'Critical Values': result[4],
    'Stationary': 'Yes' if result[1] <= 0.05 else 'No'
}

# Display results
print(pd.Series(adf_therapy_diff2))


ADF Statistic                                                           -4.39367
p-value                                                                 0.000305
Number of Lags Used                                                           11
Number of Observations Used                                                   83
Critical Values                {'1%': -3.5117123057187376, '5%': -2.897047520...
Stationary                                                                   Yes
dtype: object


In [75]:
#The ADF Statistic = -4.39 is much lower than the critical value at 1% (-3.51), and the p-value = 0.000305 is well below 0.05 → Strong evidence that the series is now stationary

In [77]:
# Drop the original 'Therapy_diff' since 'Therapy_diff2' is stationary
df_transformed = df_transformed.drop(columns=['Therapy_diff'])

# Save the updated dataset to a new CSV file
df_transformed.to_csv('../data/processed/cleaned_dataset_v33.csv', index=True)

print("✅ Updated dataset saved as 'cleaned_dataset_v33.csv'")


✅ Updated dataset saved as 'cleaned_dataset_v33.csv'


## Choose the Most Suitable CPI Variable From the Three

In [79]:
# Compute correlation matrix for all independent variables
correlation_matrix = df_transformed.corr()

# Extract correlations of each CPI variable with other independent variables
cpi_corr = correlation_matrix.loc[
    ['CPI Average_diff', 'CPI value Cumulative_diff', 'CPI Compounded_diff']
].drop(['CPI Average_diff', 'CPI value Cumulative_diff', 'CPI Compounded_diff'], axis=1)

print(cpi_corr)

                           Unnamed: 0  Quarterly Sum Precip (mm)_diff  \
CPI Average_diff             0.012613                        0.115665   
CPI value Cumulative_diff    0.011890                        0.105666   
CPI Compounded_diff          0.012504                        0.115651   

                           Debt_to_Asset_Ratio_diff  \
CPI Average_diff                          -0.311005   
CPI value Cumulative_diff                 -0.349208   
CPI Compounded_diff                       -0.312888   

                           Mortgage_Debt_to_Real_Estate_Ratio_diff  \
CPI Average_diff                                         -0.276843   
CPI value Cumulative_diff                                -0.331703   
CPI Compounded_diff                                      -0.278447   

                           Depression_diff  Anxiety_diff  Stress_diff  \
CPI Average_diff                  0.253255      0.195857    -0.087640   
CPI value Cumulative_diff         0.344483      0.145114   

In [81]:
cpi_corr.mean(axis=1).abs()

CPI Average_diff             0.005015
CPI value Cumulative_diff    0.011802
CPI Compounded_diff          0.004922
dtype: float64

## Save the Dataset after All Transformation

In [None]:
# Drop the two least correlated CPI variables
df_transformed = df_transformed.drop(columns=['CPI Average_diff', 'CPI value Cumulative_diff'])

# Save the updated dataset
df_transformed.to_csv('../data/processed/cleaned_dataset_v44.csv', index=True)

print("✅ Updated dataset saved as 'cleaned_dataset_v44.csv'")


In [None]:
print(df_transformed.columns)