In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

import sys 
sys.path.append("/scratch/izar/kapps/DEX-Cyclic-Arbitrage/")
from config.get import cfg

# import standard packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [None]:
# Load the train/test feature arrays stored under the `encoded_*_features`
# config keys (train first, then test).
X_train, X_test = (
    np.load(cfg["files"][key])
    for key in ("encoded_train_features", "encoded_test_features")
)
# Identifiers saved alongside the training set, cast to integers.
train_ids = np.load(cfg["files"]["train_ids"]).astype(int)

In [None]:
# Standardize features: the scaler's statistics are learned on the training
# matrix only, then the same transformation is applied to both splits.
scaler = StandardScaler().fit(X_train)
tX_train = scaler.transform(X_train)
tX_test = scaler.transform(X_test)

In [None]:
k_max = 12

# Sweep candidate cluster counts and record two selection criteria per k:
# the silhouette score and the within-cluster SSE (KMeans.inertia_).
#
# The sweep runs on the standardized features (tX_train), i.e. the same
# matrix the final KMeans model is fitted on, so that the k read off these
# curves is valid for that model. (The sweep previously used the raw
# X_train; the chosen k should be re-checked against the new curves.)
silhouettes = []
sse = []
for k in range(2, k_max):  # k = 2 .. k_max - 1 (upper bound is exclusive)
    print(k, end="\r")  # lightweight progress indicator, overwritten in place
    # Cluster the data and assign the labels
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(tX_train)
    # Silhouette score of this partition, computed in the same feature space
    score = silhouette_score(tX_train, labels)
    silhouettes.append({"k": k, "score": score})
    sse.append({"k": k, "sse": kmeans.inertia_})

# Convert the per-k records to dataframes (columns: k + metric)
silhouettes = pd.DataFrame(silhouettes)
sse = pd.DataFrame(sse)

In [None]:
# Two panels side by side: silhouette score (left) and SSE (right), both against k.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Silhouette method", "SSE method"),
)
fig.add_trace(go.Scatter(x=silhouettes["k"], y=silhouettes["score"]), row=1, col=1)
fig.add_trace(go.Scatter(x=sse["k"], y=sse["sse"]), row=1, col=2)

# No row/col selector: the same title and range are applied to both x-axes.
fig.update_xaxes(title_text="k", range=(2, k_max))

fig.update_layout(
    title_text="K-means: evaluation metrics for different k",
    height=500,
    width=1000,
    showlegend=False,
)
# Saved as an HTML fragment without an embedded copy of plotly.js.
fig.write_html(
    f"{cfg['fig_dir']['clustering']}kmeans_k_metrics.html",
    full_html=False,
    include_plotlyjs=False,
)
fig.show()

In [None]:
# Number of clusters retained: appears to be the best k value
k = 4

# Fit on the standardized training features; the training labels are the
# assignments found during fitting, the test labels come from assigning each
# test point to the fitted model's clusters.
kmeans = KMeans(n_clusters=k, random_state=42).fit(tX_train)
train_labels = kmeans.labels_
test_labels = kmeans.predict(tX_test)

In [None]:
# Read the train/test feature tables and discard the 'Unnamed: 0' column
# (presumably a saved index written by an earlier to_csv — verify).
fX_train = pd.read_csv(cfg["files"]["features_train"]).drop("Unnamed: 0", axis=1)
fX_test = pd.read_csv(cfg["files"]["features_test"]).drop("Unnamed: 0", axis=1)
# Display the training table as the cell output.
fX_train

In [None]:
# Attach the cluster assignment to each row. Labels are stored as strings so
# plotly treats `cluster` as a discrete category (makes the plots look nicer).
fX_train["cluster"] = pd.Series(train_labels, index=fX_train.index).map(str)
fX_test["cluster"] = pd.Series(test_labels, index=fX_test.index).map(str)

## Clustering validation


In [None]:
def bar_char_cluster(data, col, title, callback = lambda fig: 0, train=True):
    """Draw, save and show a one-bar-per-cluster chart.

    Parameters
    ----------
    data : pandas object indexed by cluster
        Its index is turned into a regular column with `reset_index()`; the
        result must contain a `cluster` column and the column `col`.
    col : column label
        Column plotted on the y-axis.
    title : str
        Figure title; also used (spaces replaced by underscores) in the
        output file name.
    callback : callable, optional
        Called with the figure before it is saved, to customise it.
        The default does nothing.
    train : bool, optional
        Selects the 'train' or 'test' tag used in the title and file name.
    """
    split = 'train' if train else 'test'
    frame = data.reset_index()
    fig = px.bar(frame, x='cluster', y=col, color='cluster', title=f"{title} ({split} set)")

    # Let the caller tweak the figure before it is written out.
    callback(fig)

    out_path = f"{cfg['fig_dir']['clustering']}{title.replace(' ', '_')}_{split}_small.html"
    fig.write_html(out_path, full_html=False, include_plotlyjs=False)
    fig.show()

In [None]:

def nb_cycles_clusters(grouper, **kwargs):
    """Bar-plot the per-cluster count of `cycle_id` values.

    `grouper` is a grouped frame; extra keyword arguments are forwarded to
    `bar_char_cluster` (e.g. `train=False`).
    """
    counts = grouper.count()
    bar_char_cluster(counts, 'cycle_id', "Number of cycles per cluster", **kwargs)
    
def profits_clusters(grouper, **kwargs):
    """Bar-plot the median of the `profits` column for each cluster.

    Only the `profits` column is aggregated. Taking the median of the whole
    grouped frame would also try to aggregate every other column, which
    raises on non-numeric columns in recent pandas versions and does
    needless work otherwise.

    Parameters
    ----------
    grouper : pandas DataFrameGroupBy
        Grouped frame exposing a `profits` column.
    **kwargs
        Forwarded to `bar_char_cluster` (e.g. `train=False`).
    """
    median_profits = grouper['profits'].median()
    bar_char_cluster(median_profits, 'profits', "Profit per cluster", **kwargs)
    
def profitability_clusters(grouper, **kwargs):
    """Bar-plot the mean of the `profitability` column for each cluster.

    Only the `profitability` column is aggregated. Taking the mean of the
    whole grouped frame would also try to aggregate every other column, which
    raises on non-numeric columns in recent pandas versions and does
    needless work otherwise.

    Parameters
    ----------
    grouper : pandas DataFrameGroupBy
        Grouped frame exposing a `profitability` column.
    **kwargs
        Forwarded to `bar_char_cluster` (e.g. `train=False`).
    """
    mean_profitability = grouper['profitability'].mean()
    bar_char_cluster(
        mean_profitability,
        'profitability', "Profitability of each cluster",
        # Restrict the y-axis to [0.9, 1] before the figure is saved.
        lambda fig: fig.update_yaxes(range=(0.9, 1)), **kwargs)

def median_token_clusters(grouper, **kwargs):
    """Bar-plot, per cluster, the median occurrence count of its tokens.

    `grouper` is a grouped frame with `token1`, `token2` and `token3`
    columns; extra keyword arguments are forwarded to `bar_char_cluster`.
    """
    def median_token_count(group):
        # Pool the three token columns, count each distinct token, and take
        # the median of those counts.
        pooled = pd.concat([group.token1, group.token2, group.token3])
        return pooled.value_counts().median()

    per_cluster = grouper.apply(median_token_count)
    # The aggregated values are addressed by the column label 0 after
    # bar_char_cluster resets the index.
    bar_char_cluster(per_cluster, 0, "Median of token distribution within each cluster", **kwargs)
                     
from scipy.stats import entropy

def entropy_clusters(grouper, **kwargs):
    """Bar-plot, per cluster, the entropy of its token occurrence counts.

    `grouper` is a grouped frame with `token1`, `token2` and `token3`
    columns; extra keyword arguments are forwarded to `bar_char_cluster`.
    """
    def token_entropy(group):
        # Pool the three token columns and count each distinct token; the
        # counts are handed to scipy's `entropy`.
        pooled = pd.concat([group.token1, group.token2, group.token3])
        return entropy(pooled.value_counts())

    per_cluster = grouper.apply(token_entropy)
    # The aggregated values are addressed by the column label 0 after
    # bar_char_cluster resets the index.
    bar_char_cluster(per_cluster, 0, "Entropy of token distribution within each cluster", **kwargs)

In [None]:
# Validation plots produced for each split, in display order.
pipeline_metrics = [
    nb_cycles_clusters,
    profits_clusters,
    profitability_clusters,
    median_token_clusters,
    entropy_clusters,
]

train_grouper = fX_train.groupby('cluster')
# Second name for the same grouped frame, kept so any cell that refers to
# `cluster_profits` keeps working.
cluster_profits = train_grouper

# Plain loop rather than a list comprehension: the metric functions are called
# for their side effects (plots), so there is no result list to build or echo
# as the cell output.
for agg in pipeline_metrics:
    agg(train_grouper)

## Validation on test set

In [None]:
test_grouper = fX_test.groupby('cluster')

# Same validation plots as for the training set, tagged as 'test'.
# Plain loop rather than a list comprehension: the metric functions are called
# for their side effects (plots), so there is no result list to build or echo
# as the cell output.
for agg in pipeline_metrics:
    agg(test_grouper, train=False)

In [None]:
# NOTE(review): `a` is not referenced by any other visible cell — this looks
# like a leftover scratch cell; consider deleting it.
a=2