## Imports

In [25]:
import numpy as np
import pandas as pd
from numpy import linalg as LA
import networkx as nx
import community
from sklearn.preprocessing import StandardScaler
import json
from tqdm import tqdm
import matplotlib.pyplot as plt

## Data preparation
### Loading of the data

In [4]:
# Load the human-readable coin names so the price columns can be labelled with them.
json_file_path_1 = '../data/all_coins_by_mc_1.json'
json_file_path_2 = '../data/all_coins_by_mc_2.json'

# Concatenate the records of both JSON files, in file order.
data = []
for path in (json_file_path_1, json_file_path_2):
    with open(path, 'r') as file:
        data = data + json.load(file)

# Keep only the "name" field of every record.
names_list = [stock["name"] for stock in data]

### Load price data and preprocessing

In [5]:
# Read the price table; its first column is renamed to 'date' and used as the index.
prices = pd.read_csv("../data/processed/prices.csv", parse_dates=['date'])
prices = prices.rename(columns={prices.columns[0]: 'date'}).set_index("date")
# NOTE(review): assumes names_list is in the same order as the CSV columns - confirm.
prices.columns = names_list

# Restrict the sample to 2021-01-01 .. 2022-12-31.
prices = prices.loc['2021-01-01':'2022-12-31']

# Drop columns that are entirely null, then columns with fewer than 95% non-null
# observations (i.e. more than 5% missing values).
min_observations = 0.95 * len(prices.index)
prices = prices.dropna(axis=1, how='all').dropna(axis=1, thresh=min_observations)

# Forward-fill the remaining gaps, then drop the rows that still contain a null
# (rows before an asset's first observation cannot be forward-filled).
data = prices.ffill().dropna()

### Compute and normalize Hourly log-return

In [6]:
# Simple returns between consecutive rows; the first row has no predecessor and is dropped.
hourly_returns_pct = data.pct_change().iloc[1:]
hourly_returns_pct.to_csv("../data/processed/hourly_return.csv")

# Log-returns derived from the simple returns: log(1 + r).
hourly_log_returns = np.log(1 + hourly_returns_pct)

In [7]:
# Standardize every column of log-returns with scikit-learn's StandardScaler,
# then wrap the resulting array back into a DataFrame with the original labels.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(hourly_log_returns)
hourly_log_returns = pd.DataFrame(
    scaled_values,
    index=hourly_log_returns.index,
    columns=hourly_log_returns.columns,
)
hourly_log_returns.to_csv("../data/processed/normalized_log_ret.csv")

## Network creation methods

### Eigenvalue clipping

In [8]:
def compute_C_minus_C0(lambdas, v, lambda_plus, removeMarketMode=False):
    """Rebuild a matrix from the eigenmodes whose eigenvalue exceeds ``lambda_plus``.

    The result is ``sum_i lambdas[i] * v[:, i] v[:, i]^T`` over every eigenpair
    with ``lambdas[i] > lambda_plus``.

    Args:
        lambdas: 1-D array of N eigenvalues (any order).
        v: (N, N) array whose column ``i`` is the eigenvector of ``lambdas[i]``.
        lambda_plus: Threshold; modes with an eigenvalue at or below it are discarded.
        removeMarketMode: When True, the mode with the largest eigenvalue
            (the "market mode") is discarded as well.

    Returns:
        An (N, N) ``numpy.ndarray``; all zeros when no mode is kept.
    """
    lambdas = np.asarray(lambdas)
    v = np.asarray(v)

    # Sort the eigenpairs by ascending eigenvalue so the largest one is last.
    order = np.argsort(lambdas)
    lambdas, v = lambdas[order], v[:, order]

    keep = lambdas > lambda_plus
    if removeMarketMode and keep.size:
        # The market mode is the LAST entry after an ascending sort. The previous
        # implementation skipped index 0, i.e. the smallest eigenvalue.
        keep[-1] = False

    kept_vectors = v[:, keep]
    # Equivalent to summing the rank-one terms lambda_i * v_i v_i^T one by one.
    # The leading zeros keep the result a float (N, N) ndarray even when no mode is kept.
    return np.zeros((len(lambdas), len(lambdas))) + np.dot(kept_vectors * lambdas[keep], kept_vectors.T)

### From C (as in class)

In [9]:
def baseline_network(R, mst=False):
    """Build a graph weighted by the absolute eigenvalue-clipped correlations.

    Args:
        R: Return matrix as a DataFrame (rows are observations, columns are assets).
        mst: Unused; kept so existing calls keep working.

    Returns:
        The networkx graph built from the absolute value of the cleaned
        correlation matrix, with nodes labelled by column position.
    """
    n_obs, n_assets = R.shape[0], R.shape[1]

    # Upper edge (1 + sqrt(q))^2 with q = N / T, used as the clipping threshold.
    upper_edge = (1. + np.sqrt(n_assets * 1. / n_obs)) ** 2

    eigenvalues, eigenvectors = LA.eigh(R.corr())
    cleaned = compute_C_minus_C0(eigenvalues, eigenvectors, upper_edge)

    return nx.from_numpy_array(np.abs(cleaned))


### Minimum Spanning Tree

In [10]:
def mst_network(R):
    """Build the minimum spanning tree of the correlation-distance graph.

    Args:
        R: Return matrix as a DataFrame (rows are observations, columns are assets).

    Returns:
        The networkx minimum spanning tree of the graph whose edge weights are
        the distances ``sqrt(2 * (1 - c))`` computed from the eigenvalue-clipped
        correlation matrix.
    """
    n_obs, n_assets = R.shape[0], R.shape[1]

    # Clipping threshold (1 + sqrt(q))^2 with q = N / T.
    upper_edge = (1. + np.sqrt(n_assets * 1. / n_obs)) ** 2

    eigenvalues, eigenvectors = LA.eigh(R.corr())
    cleaned = compute_C_minus_C0(eigenvalues, eigenvectors, upper_edge)

    # Turn correlations into distances, then keep only the spanning tree.
    distances = np.sqrt(2 * (1 - cleaned))
    full_graph = nx.from_numpy_array(distances)
    return nx.minimum_spanning_tree(full_graph)

### Threshold network
- Compute C and D
- Create G from D
- Remove all edges with weight greater than or equal to *threshold* (i.e. keep only the edges whose distance is below it)
- Louvain clustering

In [63]:
def threshold_network(R, threshold=1.41):
    """Build the correlation-distance graph and keep only the short edges.

    Args:
        R: Return matrix as a DataFrame (rows are observations, columns are assets).
        threshold: Edges whose distance is greater than or equal to this value
            are removed. The default 1.41 is roughly sqrt(2), the distance that
            ``sqrt(2 * (1 - c))`` gives for a correlation of zero.

    Returns:
        The networkx graph with every edge of weight >= ``threshold`` removed.
    """
    n_obs, n_assets = R.shape[0], R.shape[1]

    # Clipping threshold (1 + sqrt(q))^2 with q = N / T.
    upper_edge = (1. + np.sqrt(n_assets * 1. / n_obs)) ** 2

    eigenvalues, eigenvectors = LA.eigh(R.corr())
    cleaned = compute_C_minus_C0(eigenvalues, eigenvectors, upper_edge)

    # Distance matrix derived from the cleaned correlations.
    distances = np.sqrt(2 * (1 - cleaned))

    G = nx.from_numpy_array(distances)
    too_long = [(u, w) for u, w, attrs in G.edges(data=True) if attrs['weight'] >= threshold]
    G.remove_edges_from(too_long)
    return G

### Planar Maximally Filtered Graph

In [72]:
def sort_graph_edges(G):
    """Return the edges of ``G`` as dicts, ordered by increasing weight.

    Args:
        G: Graph whose ``edges(data=True)`` yields ``(source, dest, attrs)``
            triples with a ``'weight'`` entry in ``attrs``.

    Returns:
        A list of ``{'source', 'dest', 'weight'}`` dicts, lightest edge first.
    """
    ranked = sorted(G.edges(data=True), key=lambda edge: edge[2]['weight'])
    return [
        {'source': u, 'dest': w, 'weight': attrs['weight']}
        for u, w, attrs in ranked
    ]

def compute_PMFG(sorted_edges, nb_nodes):
    """Greedily build a Planar Maximally Filtered Graph from ranked edges.

    Edges are tried in the given order; an edge is kept only if the graph is
    still planar after adding it. The loop stops once ``3 * (nb_nodes - 2)``
    edges are kept, the maximum a simple planar graph on ``nb_nodes`` nodes can hold.

    Args:
        sorted_edges: Iterable of dicts with ``'source'`` and ``'dest'`` keys,
            in the order in which the edges should be tried.
        nb_nodes: Number of nodes of the underlying graph.

    Returns:
        An unweighted ``networkx.Graph`` containing the retained edges.
    """
    max_edges = 3 * (nb_nodes - 2)
    PMFG = nx.Graph()
    for edge in sorted_edges:
        source, dest = edge['source'], edge['dest']
        # The 3 * (n - 2) bound assumes a simple graph: a self-loop (e.g. from a
        # non-zero diagonal of a distance matrix) must not use up the edge budget.
        if source == dest:
            continue

        PMFG.add_edge(source, dest)
        if not nx.is_planar(PMFG):
            PMFG.remove_edge(source, dest)

        if PMFG.number_of_edges() == max_edges:
            break

    return PMFG


def pmfg_network(R):
    """Build the Planar Maximally Filtered Graph of the correlation distances.

    Args:
        R: Return matrix as a DataFrame (rows are observations, columns are assets).

    Returns:
        The graph returned by ``compute_PMFG`` for the edges of the distance
        graph, tried from the smallest distance to the largest.
    """
    n_obs, n_assets = R.shape[0], R.shape[1]

    # Clipping threshold (1 + sqrt(q))^2 with q = N / T.
    upper_edge = (1. + np.sqrt(n_assets * 1. / n_obs)) ** 2

    eigenvalues, eigenvectors = LA.eigh(R.corr())
    cleaned = compute_C_minus_C0(eigenvalues, eigenvectors, upper_edge)

    # Distance matrix derived from the cleaned correlations.
    distances = np.sqrt(2 * (1 - cleaned))

    # Rank the edges of the full distance graph, then filter them for planarity.
    ranked_edges = sort_graph_edges(nx.from_numpy_array(distances))
    return compute_PMFG(ranked_edges, n_assets)

## Louvain clustering

In [36]:
def louvain_clustering(G, resolution=1.05, random_state=3):
    """Partition the nodes of ``G`` with the Louvain method (python-louvain).

    Args:
        G: networkx graph to cluster.
        resolution: Resolution passed to ``best_partition``. Defaults to 1.05,
            the value previously hard-coded here.
        random_state: Seed passed to ``best_partition`` so repeated runs give
            the same partition. Defaults to 3, the value previously hard-coded here.

    Returns:
        The result of ``best_partition``, indexed by node in the rest of this notebook.
    """
    return community.community_louvain.best_partition(
        G, resolution=resolution, random_state=random_state
    )

## Rolling window

In [37]:
# Reload the normalized log-returns that were written to disk earlier in the notebook.
# NOTE(review): no index_col is given, so the "date" index saved by to_csv comes back
# as an ordinary column - confirm it is set as index / dropped before df is passed
# to the clustering code (R.corr() and the column-to-cluster mapping expect asset columns only).
df = pd.read_csv("../data/processed/normalized_log_ret.csv")

In [65]:
def rename_asset(R, partition):
    """Re-key a partition from column positions to column names.

    Args:
        R: DataFrame whose columns give the asset names.
        partition: Mapping from column position (0, 1, ...) to cluster label.

    Returns:
        A dict mapping every column name of ``R`` to ``partition[position]``.

    Raises:
        KeyError: If a column position is missing from ``partition``.
    """
    return {name: partition[position] for position, name in enumerate(R.columns)}

def clustering(R: pd.DataFrame, period: int=720, interval: int = 24, method:str='baseline') -> dict:
    """Compute the clusters in a rolling-window manner for the return matrix R.

    For every window of ``period`` consecutive rows, taken every ``interval``
    rows, a network is built with the chosen method, clustered with
    ``louvain_clustering`` and re-keyed by column name.

    Args:
        R (pd.DataFrame): Normalized log-return matrix (rows are observations,
            columns are assets).
        period (int, optional): Window length in rows. Defaults to 720
            (30 days of hourly data).
        interval (int, optional): Step between two window starts, in rows.
            Defaults to 24 (1 day of hourly data).
        method (str, optional): Network construction method. One of
            'baseline', 'mst', 'threshold', 'pmf' (or its alias 'pmfg').
            Defaults to 'baseline'.

    Returns:
        dict: Maps ``(t0, t0 + period)`` row positions to a
        ``{column name: cluster label}`` dict for that window. Empty when R
        has fewer than ``period`` rows.

    Raises:
        ValueError: If ``method`` is not recognized. Raised before any window
            is processed, even when there is no window at all.
    """
    # Resolve the builder once, up front, so a typo in `method` fails immediately
    # instead of only when the first window is processed.
    if method == 'baseline':
        build_network = baseline_network
    elif method == 'mst':
        build_network = mst_network
    elif method == 'threshold':
        build_network = threshold_network
    elif method in ('pmf', 'pmfg'):
        build_network = pmfg_network
    else:
        raise ValueError("Method not recognized")

    cluster_dict = {}
    # "+ 1" keeps the last full window, the one starting at len(R) - period,
    # which a bare `len(R) - period` upper bound would exclude.
    last_start = len(R.index) - period
    for t0 in tqdm(range(0, last_start + 1, interval)):
        R_tmp = R.iloc[t0:t0 + period]
        G = build_network(R_tmp)
        cluster_dict[(t0, t0 + period)] = rename_asset(R_tmp, louvain_clustering(G))

    return cluster_dict