In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import os
import matplotlib.pyplot as plot

from utils.conn_data import save_pickle, load_pickle
from learning.memory import run_memory

In [8]:
data_factors = pd.read_csv('data/inputs/fredmd_factors_raw.csv')
transformation_codes = data_factors.iloc[0]
data_factors = data_factors.drop(0)
transformation_codes = transformation_codes.to_dict()
del transformation_codes['sasdate']

small = 1e-6
for column in data_factors.columns:
    if column in transformation_codes:
        match int(transformation_codes[column]):
            case 1:
                data_factors[column] = data_factors[column]

            case 2: # First difference: x(t)-x(t-1)
                data_factors[column] = data_factors[column].diff()

            case 3: # Second difference: (x(t)-x(t-1))-(x(t-1)-x(t-2))
                data_factors[column] = data_factors[column].diff().diff()

            case 4: # Natural log: ln(x)
                data_factors[column] = data_factors[column].apply(lambda x: np.log(x) if x > small else None)

            case 5: # First difference of natural log: ln(x)-ln(x-1)
                data_factors[column] = data_factors[column].apply(lambda x: np.log(x) if x > small else None)
                data_factors[column] = data_factors[column].diff()

            case 6: # Second difference of natural log: (ln(x)-ln(x-1))-(ln(x-1)-ln(x-2))
                data_factors[column] = data_factors[column].apply(lambda x: np.log(x) if x > small else None)
                data_factors[column] = data_factors[column].diff().diff()

            case 7: # First difference of percent change: (x(t)/x(t-1)-1)-(x(t-1)/x(t-2)-1)
                data_factors[column] = data_factors[column].pct_change()
                data_factors[column] = data_factors[column].diff()

data_factors = data_factors.drop([1, 2]).reset_index(drop=True)

data_factors = data_factors.ffill()
data_factors = data_factors.fillna(0.0)

data_factors['sasdate'] = pd.to_datetime(data_factors['sasdate'], format='%m/%d/%Y')
data_factors = data_factors.rename(columns={'sasdate': 'date'})
data_factors = data_factors.set_index('date')

#data_factors = data_factors.rolling(window=12).mean()
#data_factors = data_factors.dropna()

In [9]:
# Standardize each column (subtract its mean, divide by its sample standard
# deviation) before fitting, so the PCA is not driven by differences in scale.
df_normalized = data_factors.sub(data_factors.mean(), axis="columns").div(
    data_factors.std(), axis="columns"
)

# Request as many components as there are input columns.
pca = PCA(n_components=df_normalized.shape[1])
pca.fit(df_normalized)

In [14]:
# Standardize each column (subtract its mean, divide by its sample standard
# deviation) before fitting, so the PCA is not driven by differences in scale.
df_normalized = data_factors.sub(data_factors.mean(), axis="columns").div(
    data_factors.std(), axis="columns"
)
pca = PCA(n_components=df_normalized.shape[1])
pca.fit(df_normalized)

# Loadings table: one row per original column, one column per component.
loadings = pd.DataFrame(
    data=pca.components_.T,
    index=df_normalized.columns,
    columns=['PC%s' % k for k in range(len(df_normalized.columns))],
)
# print(loadings)

# Count leading components until their cumulative explained-variance ratio
# first reaches the target share.
DESIRE_EXPLAINED_VARIANCE = 0.95
total_explained_variance = 0.0
n_components = 0
for ratio in pca.explained_variance_ratio_:
    n_components += 1
    total_explained_variance += ratio
    if total_explained_variance >= DESIRE_EXPLAINED_VARIANCE:
        # print(f"Number of components to explain {DESIRE_EXPLAINED_VARIANCE * 100}% variance: {n_components}")
        break
# %config InlineBackend.figure_format = 'retina'
# plot.plot([DESIRE_EXPLAINED_VARIANCE] * len(pca.explained_variance_ratio_), 'r--')
# plot.plot(pca.explained_variance_ratio_.cumsum())
# plot.title('PCA Explained Variance')
# plot.ylabel('Explained Variance')
# plot.xlabel('Total # of Components')
# plot.legend(['95% Variance Explained', 'Cumulative Explained Variance'])
# plot.show()

In [16]:
# Project the standardized panel onto every principal axis, then keep only the
# first n_components score columns.
pca = PCA(n_components=df_normalized.shape[1])
pca.fit(df_normalized)

pc_labels = ['PC%s' % k for k in range(df_normalized.shape[1])]
df_transformed = pd.DataFrame(
    data=pca.transform(df_normalized),
    index=df_normalized.index,
    columns=pc_labels,
)
df_transformed = df_transformed[pc_labels[:n_components]]

# print(df_transformed)

In [17]:
# Settings for the memory build. The same constants are stored next to the
# result when it is pickled in a later cell.
FIX_START = True
ESTIMATION_WINDOW = 12 * 4  # presumably 4 years of monthly rows — TODO confirm units
K_OPT_METHOD = "elbow"
CLUSTERING_METHOD = "kmeans"

# Input to the clustering: the retained principal-component scores.
#data = factors_done
data = df_transformed
#data = data_all

# Collect the keyword arguments first so the call itself stays on one line.
memory_kwargs = {
    "data": data,
    "fix_start": FIX_START,
    "estimation_window": ESTIMATION_WINDOW,
    "k_opt_method": K_OPT_METHOD,
    "clustering_method": CLUSTERING_METHOD,
}
memory, all_centroids, all_probs = run_memory(**memory_kwargs)
memory

Building memory using window: 738: 100%|██████████| 739/739 [01:05<00:00, 11.31it/s]


Unnamed: 0_level_0,cluster_step0,cluster_step1,cluster_step2,cluster_step3,cluster_step4,cluster_step5,cluster_step6,cluster_step7,cluster_step8,cluster_step9,...,cluster_step729,cluster_step730,cluster_step731,cluster_step732,cluster_step733,cluster_step734,cluster_step735,cluster_step736,cluster_step737,cluster_step738
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1959-03-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1959-04-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1959-05-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1959-06-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1959-07-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-04-01,,,,,,,,,,,...,,,,,,3.0,3.0,3.0,4.0,3.0
2024-05-01,,,,,,,,,,,...,,,,,,,2.0,4.0,2.0,2.0
2024-06-01,,,,,,,,,,,...,,,,,,,,3.0,4.0,4.0
2024-07-01,,,,,,,,,,,...,,,,,,,,,3.0,3.0


In [18]:
# Bundle the memory table with the settings that produced it, so the pickle
# records how it was generated.
results = {
    "memory": memory,
    "estimation_window": ESTIMATION_WINDOW,
    "fix_start": FIX_START,
    "k_opt_method": K_OPT_METHOD,
    "clustering_method": CLUSTERING_METHOD
}

outputs_path = 'data/inputs/memory'

# Create the per-method results folder if needed. exist_ok=True replaces the
# separate os.path.exists() check, which could race with a concurrent creator.
method_dir = os.path.join(outputs_path, CLUSTERING_METHOD)
os.makedirs(method_dir, exist_ok=True)

# save results
# NOTE(review): the file name hard-codes "kmeans" while the folder follows
# CLUSTERING_METHOD — confirm this is intended before switching methods.
save_path = os.path.join(method_dir,
                         f"results_manual_3_kmeans_{K_OPT_METHOD}.pkl")
print(save_path)
save_pickle(path=save_path, obj=results)

data/inputs/memory/kmeans/results_manual_3_kmeans_elbow.pkl


In [22]:
# Read back the file at save_path (set in the save cell) to check the round trip.
# NOTE(review): "loaded_resuts" is a misspelling of "loaded_results"; it is kept
# because later cells reference this exact name.
loaded_resuts = load_pickle(save_path)

In [23]:
# Display the reloaded results dict as the cell output.
loaded_resuts

{'memory':             cluster_step0  cluster_step1  cluster_step2  cluster_step3  \
 date                                                                     
 1959-03-01            0.0            0.0            0.0            0.0   
 1959-04-01            0.0            0.0            0.0            0.0   
 1959-05-01            0.0            0.0            0.0            0.0   
 1959-06-01            0.0            0.0            0.0            0.0   
 1959-07-01            0.0            0.0            0.0            0.0   
 ...                   ...            ...            ...            ...   
 2024-04-01            NaN            NaN            NaN            NaN   
 2024-05-01            NaN            NaN            NaN            NaN   
 2024-06-01            NaN            NaN            NaN            NaN   
 2024-07-01            NaN            NaN            NaN            NaN   
 2024-08-01            NaN            NaN            NaN            NaN   
 
             

In [21]:
# Write the reloaded object back to the same save_path.
# NOTE(review): this cell's execution count (21) is lower than that of the load
# cell (22), so it was last run out of order; presumably redundant with the
# earlier save — confirm whether it is still needed.
save_pickle(path=save_path, obj=loaded_resuts)