In [1]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

import sys 
sys.path.append("/scratch/izar/kapps/DEX-Cyclic-Arbitrage/")
from config.get import cfg

# import standard packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [2]:
# Load the encoded feature matrices for both splits from the paths listed in
# the project configuration.
X_train = np.load(cfg["files"]["encoded_train_features"])
X_test = np.load(cfg["files"]["encoded_test_features"])
# Identifiers of the training rows, cast to int.
# NOTE(review): not referenced again in the visible cells; confirm it is still needed.
train_ids = np.load(cfg['files']['train_ids']).astype(int)

In [3]:
# Standardise both splits using statistics learned from the training split
# only, so the test split is transformed with the training parameters.
scaler = StandardScaler().fit(X_train)
tX_train, tX_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Model-selection sweep: fit K-means for every candidate k and record the
# silhouette score and the within-cluster sum of squares (inertia).
k_max = 12

silhouettes = []
sse = []
# NOTE: range() excludes its stop value, so the largest k tried is k_max - 1.
for k in range(2, k_max):
    print(k, end="\r")  # in-place progress indicator
    # Cluster the standardised features: the final model is fit on tX_train,
    # so the sweep has to evaluate k in that same feature space (the raw
    # X_train used previously is on a different scale).
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(tX_train)
    # The silhouette is scored on the very matrix that was clustered.
    score = silhouette_score(tX_train, labels)
    silhouettes.append({"k": k, "score": score})

    sse.append({"k": k, "sse": kmeans.inertia_})

# Convert the collected records to dataframes (one row per k)
silhouettes = pd.DataFrame(silhouettes)
sse = pd.DataFrame(sse)

In [None]:
# Two panels side by side: silhouette score (left) and SSE (right) against k.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Silhouette method", "SSE method"),
)

fig.add_trace(go.Scatter(x=silhouettes["k"], y=silhouettes["score"]), row=1, col=1)
fig.add_trace(go.Scatter(x=sse["k"], y=sse["sse"]), row=1, col=2)

# Without row/col the update is applied to every x axis, i.e. both panels.
fig.update_xaxes(title_text="k", range=(2, k_max))

fig.update_layout(
    title_text="K-means: evaluation metrics for different k",
    width=1000,
    height=500,
    showlegend=False,
)
# Export an embeddable fragment (no <html> wrapper, plotly.js not bundled).
fig.write_html(
    f"{cfg['fig_dir']['clustering']}kmeans_k_metrics.html",
    full_html=False,
    include_plotlyjs=False,
)
fig.show()

In [32]:
# Number of clusters retained after inspecting the curves above
# (judged to be the best k value).
k = 4

# Fit on the standardised training features, then assign every test row to
# its nearest learned centroid. The tuple is evaluated left to right, so the
# model is fitted before predict() is called.
kmeans = KMeans(random_state=42, n_clusters=k)
train_labels, test_labels = kmeans.fit_predict(tX_train), kmeans.predict(tX_test)

In [33]:
# Read the tabular features for both splits and discard the 'Unnamed: 0'
# column (presumably a stray index column written with the CSV; confirm).
fX_train = pd.read_csv(cfg["files"]["features_train"]).drop('Unnamed: 0', axis='columns')
fX_test = pd.read_csv(cfg["files"]["features_test"]).drop('Unnamed: 0', axis='columns')
fX_train

Unnamed: 0,cycle_id,revenues,costs,token1,token2,token3,profits,profitability
0,51068,1.488251e+17,4.598268e+16,0xC02aaA39b223FE8D0A0e5C4F27eAD9083C756Cc2,0xbf2179859fc6D5BEE9Bf9158632Dc51678a4100e,0xC28E27870558cF22ADD83540d2126da2e4b464c2,1.028424e+17,True
1,46217,5.474499e+16,2.786625e+16,0xC02aaA39b223FE8D0A0e5C4F27eAD9083C756Cc2,0x9676EE2eDCc830baf858dad8E56ae1d251783Eb3,0xdAC17F958D2ee523a2206206994597C13D831ec7,2.687874e+16,True
2,44104,8.136130e+16,5.425344e+16,0xC02aaA39b223FE8D0A0e5C4F27eAD9083C756Cc2,0xf29e46887FFAE92f1ff87DfE39713875Da541373,0x9fBFed658919A896B5Dc7b00456Ce22D780f9B65,2.710785e+16,True
3,50240,3.060971e+17,1.233920e+17,0xC02aaA39b223FE8D0A0e5C4F27eAD9083C756Cc2,0xC28E27870558cF22ADD83540d2126da2e4b464c2,0xbf2179859fc6D5BEE9Bf9158632Dc51678a4100e,1.827051e+17,True
4,44997,1.484209e+17,8.918950e+16,0xC02aaA39b223FE8D0A0e5C4F27eAD9083C756Cc2,0xa93D5Cfaa41193b13321c035b4bDD2B534172762,0x6B3595068778DD592e39A122f4f5a5cF09C90fE2,5.923137e+16,True
...,...,...,...,...,...,...,...,...
9328,43218,1.555328e+16,2.399796e+16,0xC02aaA39b223FE8D0A0e5C4F27eAD9083C756Cc2,0xdAC17F958D2ee523a2206206994597C13D831ec7,0xa93D5Cfaa41193b13321c035b4bDD2B534172762,-8.444685e+15,False
9329,50252,9.261110e+17,7.046000e+16,0xC02aaA39b223FE8D0A0e5C4F27eAD9083C756Cc2,0xbf2179859fc6D5BEE9Bf9158632Dc51678a4100e,0xC28E27870558cF22ADD83540d2126da2e4b464c2,8.556510e+17,True
9330,2346,2.322861e+16,1.083954e+16,0xC02aaA39b223FE8D0A0e5C4F27eAD9083C756Cc2,0xF80D589b3Dbe130c270a69F1a69D050f268786Df,0x469eDA64aEd3A3Ad6f868c44564291aA415cB1d9,1.238907e+16,True
9331,49646,1.518578e+17,5.859790e+16,0xC02aaA39b223FE8D0A0e5C4F27eAD9083C756Cc2,0xf3A2ace8e48751c965eA0A1D064303AcA53842b9,0x2b591e99afE9f32eAA6214f7B7629768c40Eeb39,9.325991e+16,True


In [34]:
# Store each row's cluster assignment as a string label; the original note
# says string labels make the plots look nicer.
fX_train['cluster'] = list(map(str, train_labels))
fX_test['cluster'] = list(map(str, test_labels))

## Clustering validation


In [35]:
def bar_char_cluster(data, col, title, callback = lambda fig: 0, train=True):
    """Draw, export and display a per-cluster bar chart.

    Parameters
    ----------
    data : pandas object whose ``reset_index()`` yields a 'cluster' column
        together with the column named by ``col``.
    col : column label plotted on the y axis.
    title : chart title; also used (spaces replaced by underscores) as the
        stem of the exported HTML file name.
    callback : callable invoked with the figure before it is written, for
        extra tweaks. The default does nothing.
    train : True labels the chart and file as 'train', False as 'test'.
    """
    set_name = 'train' if train else 'test'
    chart_title = f"{title} ({set_name} set)"
    fig = px.bar(data.reset_index(), x='cluster', y=col, color='cluster', title=chart_title)
    callback(fig)
    # Export an embeddable fragment next to the other clustering figures.
    file_stem = title.replace(' ', '_')
    out_path = f"{cfg['fig_dir']['clustering']}{file_stem}_{set_name}_small.html"
    fig.write_html(out_path, full_html=False, include_plotlyjs=False)
    fig.show()

In [36]:

def nb_cycles_clusters(grouper, **kwargs):
    """Bar chart of the number of non-null ``cycle_id`` entries per cluster.

    ``grouper`` is a groupby object keyed on 'cluster'; ``kwargs`` are
    forwarded to ``bar_char_cluster``.
    """
    counts_per_cluster = grouper.count()
    bar_char_cluster(counts_per_cluster, 'cycle_id', "Number of cycles per cluster", **kwargs)
    
def profits_clusters(grouper, **kwargs):
    """Bar chart of the median ``profits`` value within each cluster.

    ``grouper`` is a groupby object keyed on 'cluster'; ``kwargs`` are
    forwarded to ``bar_char_cluster``.
    """
    # Aggregate only the plotted column: a frame-wide median() would also try
    # the string token columns, which raises TypeError on pandas >= 2.0.
    median_profits = grouper['profits'].median()
    bar_char_cluster(median_profits,
        'profits',"Profit per cluster", **kwargs)
    
def profitability_clusters(grouper, **kwargs):
    """Bar chart of the mean ``profitability`` value within each cluster.

    ``grouper`` is a groupby object keyed on 'cluster'; ``kwargs`` are
    forwarded to ``bar_char_cluster``. The y axis is restricted to the
    range 0.9-1.
    """
    # Aggregate only the plotted column: a frame-wide mean() would also try
    # the string token columns, which raises TypeError on pandas >= 2.0.
    mean_profitability = grouper['profitability'].mean()
    bar_char_cluster(mean_profitability,
        'profitability',"Profitability of each cluster",
         lambda fig: fig.update_yaxes(range=(0.9, 1)), **kwargs)

def median_token_clusters(grouper, **kwargs):
    """Bar chart of the median token occurrence count within each cluster.

    For every cluster the three token columns are pooled, occurrences of
    each distinct token are counted, and the median of those counts is
    plotted. ``kwargs`` are forwarded to ``bar_char_cluster``.
    """
    def median_token_count(group):
        pooled_tokens = pd.concat([group.token1, group.token2, group.token3])
        return pooled_tokens.value_counts().median()

    # The unnamed result becomes column 0 once the index is reset.
    per_cluster = grouper.apply(median_token_count)
    bar_char_cluster(per_cluster, 0, "Median of token distribution within each cluster", **kwargs)
                     
from scipy.stats import entropy

def entropy_clusters(grouper, **kwargs):
    """Bar chart of the entropy of the token distribution within each cluster.

    For every cluster the three token columns are pooled, occurrences of
    each distinct token are counted, and those counts are passed to
    ``scipy.stats.entropy``. ``kwargs`` are forwarded to ``bar_char_cluster``.
    """
    def token_entropy(group):
        pooled_tokens = pd.concat([group.token1, group.token2, group.token3])
        return entropy(pooled_tokens.value_counts())

    # The unnamed result becomes column 0 once the index is reset.
    per_cluster = grouper.apply(token_entropy)
    bar_char_cluster(per_cluster, 0, "Entropy of token distribution within each cluster", **kwargs)



In [37]:
# Per-cluster diagnostics, in display order. Each entry takes a groupby
# object (plus optional keyword arguments) and draws one bar chart.
pipeline_metrics= [
    nb_cycles_clusters,
    profits_clusters,
    profitability_clusters,
    median_token_clusters,
    entropy_clusters
]

train_grouper = cluster_profits = fX_train.groupby('cluster')

# The plotting helpers are called for their side effects only; a plain loop
# avoids building a list of None values that would be echoed as cell output.
for plot_metric in pipeline_metrics:
    plot_metric(train_grouper)

[None, None, None, None, None]

In [38]:
def token_df(X):
    """Stack the three token columns of ``X`` into one long table.

    Returns a frame with columns 'token' and 'cluster' holding the rows of
    token1, then token2, then token3 (original index labels kept), with each
    token value cut to its first 7 characters. ``X`` itself is not modified.
    """
    pieces = []
    for position in (1, 2, 3):
        source_col = f"token{position}"
        piece = X[[source_col, 'cluster']].rename(columns={source_col: 'token'})
        piece['token'] = piece['token'].str[:7]  # keep a short prefix only
        pieces.append(piece)
    return pd.concat(pieces, axis=0)

def token_distribution(df):
    """Show a histogram of token occurrences, one facet row per cluster.

    ``df`` needs a 'token' and a 'cluster' column. Counts are drawn on a
    logarithmic y axis.
    """
    # Derive the facet order from the data instead of hard-coding four
    # clusters, so the plot stays correct when k changes. Labels are ordered
    # numerically when they look like integers, lexicographically otherwise.
    clusters = list(df["cluster"].unique())
    try:
        cluster_order = sorted(clusters, key=int)
    except (TypeError, ValueError):
        cluster_order = sorted(clusters)
    # facet_col_wrap is dropped: plotly ignores it whenever facet_row is set.
    fig = px.histogram(df, x="token", facet_row="cluster", log_y=True, category_orders={"cluster": cluster_order})
    fig.show()
    
# Long-format token table for the training split, then its per-cluster histogram.
tdf = token_df(fX_train) 
token_distribution(tdf)

## Validation on test set

In [39]:
# Repeat the per-cluster diagnostics on the test split. train=False labels
# the charts and the exported files as 'test'.
test_grouper = fX_test.groupby('cluster')
# The helpers are called for their side effects only; a plain loop avoids
# echoing a list of None values as cell output.
for plot_metric in pipeline_metrics:
    plot_metric(test_grouper, train=False)

[None, None, None, None, None]

In [40]:
    
# Long-format token table for the test split, then its per-cluster histogram.
test_tdf = token_df(fX_test) 
token_distribution(test_tdf)