In [1]:
base_dir = './'
fig_dir = './figures/'
model_dir = './results/trained_encoder/'

In [2]:
import os
import sys

sys.path.insert(0, os.getcwd() + '\\src')

In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from tqdm.notebook import trange, tqdm

---

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [5]:
print('CUDA available :', torch.cuda.is_available())
# device = torch.device('cpu')
device = torch.device('cuda')

CUDA available : True


In [6]:
n_timestamps = 5
lag = 1

data_dir = f'./data/n_timestamps_{n_timestamps}__lag_{lag}/' + 'data_split.pt'

data_split = torch.load(data_dir)
train_data, train_pos_enc, train_dates = data_split['train'].values()
valid_data, valid_pos_enc, valid_dates = data_split['valid'].values()
test_data, test_pos_enc, test_dates = data_split['test'].values()

# Concat Date Positional Encoding
X_train = torch.cat((train_data, train_pos_enc), dim=-1)
X_valid = torch.cat((valid_data, valid_pos_enc), dim=-1)
X_test = torch.cat((test_data, test_pos_enc), dim=-1)

# Get currency names
currency_names = pd.read_csv(f'./data/n_timestamps_{n_timestamps}__lag_{lag}/cleansed_data.csv').columns.tolist()

del data_split

In [7]:
N = X_train.shape[0] + X_valid.shape[0] + X_test.shape[0]
_, T, D = X_train.shape
print(f'Data shape : (*, {T}, {D})')
print(f'    - Train : {train_dates[0]} ~ {train_dates[-1]} ({X_train.__len__():>4d}, {X_train.__len__()/N*100:.2f}%)')
print(f'    - Valid : {valid_dates[0]} ~ {valid_dates[-1]} ({X_valid.__len__():>4d}, {X_valid.__len__()/N*100:.2f}%)')
print(f'    - Test  : {test_dates[0]} ~ {test_dates[-1]} ({X_test.__len__():>4d}, {X_test.__len__()/N*100:.2f}%)')

Data shape : (*, 5, 36)
    - Train : 2002-01-08 ~ 2015-05-07 (3412, 63.99%)
    - Valid : 2015-05-08 ~ 2018-09-03 ( 853, 16.00%)
    - Test  : 2018-09-04 ~ 2022-10-31 (1067, 20.01%)


- Raw data for the visualization

In [8]:
from preprocess import *

raw_data = load_data('./data/USD2FX_BIS_daily_avg.csv', 
                     start_date = '2002-01-01', 
                     end_date = '2022-10-31'
                    )
raw_data = cleanse_data(raw_data)

---

In [9]:
from encoder import Encoder
from itertools import product

In [10]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

In [11]:
manifold_names = {'euclidean':'Euclidean', 
                  'p_plane':'Poincare-Halfplane', 
                  'sphere':'Hypersphere'}

- Training

In [12]:
dim_data = (T, D)
layer_configs = {}

candidates = list(product(['sphere', 'p_plane', 'euclidean'], 
                          list(range(16, 37))))

data = torch.cat((X_train, X_valid, X_test)) 

N, _, _ = data.shape

In [13]:
def compute_pairwise_distance(X, encoder):
    dist = torch.zeros((N, N))
    for i in range(N):
        repeated = emb[[i]].repeat(N, 1)
        dist_i = model.manifold.distance(repeated, emb)
        dist[i] = dist_i
    
    return dist

In [14]:
try:
    # Load clustering result
    clst_results = pd.read_csv('./results/clustering/clustering_results_by_model.csv')
except:
    save_dir = './results/clustering/computed_distance_by_model.h5'
    try:
        # Load precomputed distance by model
        dist_by_models = torch.load(save_dir)
    except:
        dist_by_models = {}
        
        # Compute distance
        for target_manifold, dim_embedding in tqdm(candidates):
            model_name = '_'.join([manifold_names[target_manifold], f'dim-{dim_embedding}'])

            best_ckpt = torch.load(model_dir +'best/' + model_name + '_best.pt')

            model = Encoder(dim_data, dim_embedding, target_manifold=target_manifold,
                            layer_configs=layer_configs, device=device)
            model.load_state_dict(best_ckpt['state_dict'])
            model.eval()
            emb = model(data).detach().cpu()

            dist = compute_pairwise_distance(data, model)

            dist_by_models[model_name] = dist.numpy()
        
        # Save the computed distance
        torch.save(dist_by_models, save_dir)
    
    model_names = list(dist_by_models.keys())
    clst_results = pd.DataFrame(np.zeros((dist_by_models.keys().__len__(), 5)), 
                            columns=['Manifold', 'Dimension', 'Opt_EPS', 'n_clusters', 'Sil_Coef']
                           )
    
    for i in trange(dist_by_models.__len__()):
        model_name = model_names[i]
        manifold, dim = model_name.split('_')
        dim = int(dim.split('-')[-1])
        dist = dist_by_models[model_name]
        for eps in np.linspace(.5, 1e-5, 100):
            dbscan = DBSCAN(eps=eps, metric='precomputed', n_jobs=-1)
            clst_labels = dbscan.fit_predict(dist)
            n_labels = np.unique(dbscan.labels_).__len__()
            if n_labels == 5:
                sil_coef = silhouette_score(dist, clst_labels)
                break
        clst_results.iloc[i] = [manifold, dim, eps, n_labels, sil_coef]
    
    clst_results.to_csv('./results/clustering/clustering_results_by_model.csv', index=False)

  0%|          | 0/63 [00:00<?, ?it/s]

In [15]:
clst_results

Unnamed: 0,Manifold,Dimension,Opt_EPS,n_clusters,Sil_Coef
0,Hypersphere,16.0,0.363639,5.0,-0.206433
1,Hypersphere,17.0,0.323236,5.0,-0.225058
2,Hypersphere,18.0,0.277782,5.0,-0.284016
3,Hypersphere,19.0,0.267681,5.0,-0.203608
4,Hypersphere,20.0,0.328286,5.0,-0.173610
...,...,...,...,...,...
58,Euclidean,32.0,0.282833,5.0,-0.190898
59,Euclidean,33.0,0.000010,1.0,-0.190898
60,Euclidean,34.0,0.247480,5.0,0.293817
61,Euclidean,35.0,0.000010,1.0,0.293817


In [21]:
clst_results.groupby(['n_clusters', 'Manifold']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Dimension,Opt_EPS,Sil_Coef
n_clusters,Manifold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,Euclidean,6,6,6
1.0,Hypersphere,1,1,1
1.0,Poincare-Halfplane,14,14,14
5.0,Euclidean,15,15,15
5.0,Hypersphere,20,20,20
5.0,Poincare-Halfplane,7,7,7


In [24]:
clst_results[clst_results.Sil_Coef > 0].sort_values('Sil_Coef', ascending=False)

Unnamed: 0,Manifold,Dimension,Opt_EPS,n_clusters,Sil_Coef
45,Euclidean,19.0,0.171724,5.0,0.558571
46,Euclidean,20.0,1e-05,1.0,0.558571
53,Euclidean,27.0,0.297984,5.0,0.381625
54,Euclidean,28.0,0.318185,5.0,0.306429
60,Euclidean,34.0,0.24748,5.0,0.293817
61,Euclidean,35.0,1e-05,1.0,0.293817
6,Hypersphere,22.0,0.287883,5.0,0.191469
49,Euclidean,23.0,0.242429,5.0,0.137339


In [None]:
transition_dates = []
transition_idx = []
for i, clst in enumerate(clst_labels):
    if not i:
        prev_clst = clst
        continue
    if clst != prev_clst:
        transition_dates.append(dates[i])
        transition_idx.append(i)
    
    prev_clst = clst

In [None]:
raw_data.columns

In [None]:
recent_idx = raw_data.index.tolist().index('2020-01-02')

fig, ax = plt.subplots(figsize=(20, 10))

selected = raw_data[['KRW', 'JPY', 'CNY', 'HKD', 'EUR', 'GBP', 'CHF', 'RUB']].copy()
selected /= selected.values[recent_idx]
for idx in transition_idx:
    plt.axvline(idx, c='grey', linewidth=.5, linestyle=':')
selected.plot(ax=ax)
plt.xlim([0, selected.shape[0]])
plt.show()