# Clustering of NFT collections
This notebook groups NFT collections into clusters based on a summary statistic of their APR (mean, median, or standard deviation) and visualizes the resulting clusters.


The method `run_manual_clustering` assigns each NFT collection to a cluster by looking up which user-defined APR range (bucket) its value falls into. This is a simple binary search over the bucket edges, not clustering in the machine-learning sense.

The assignment is driven by a grouping key, which selects the statistic used for clustering: the `mean` APR, the `median` APR, or the `std` (standard deviation) of the APR.

In [1]:
from typing import Optional
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import copy

# Notebook-wide setup: silence FutureWarning noise and configure how pandas renders frames.
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

pd.options.display.max_columns = None  # never truncate columns when displaying a frame
pd.options.display.max_rows = None  # never truncate rows when displaying a frame
pd.options.display.float_format = "{:,.2f}".format  # thousands separator, two decimals


def usd_to_str(usd_value, round_value: Optional[int] = None):
    """Format a USD amount as ``"$ <amount>"`` using apostrophes as thousands separators.

    Args:
        usd_value: Numeric amount. NaN is passed through unrounded and rendered as ``nan``.
        round_value: Number of decimals handed to ``round``; ``None`` rounds to an integer.

    Returns:
        The formatted string, e.g. ``"$ 1'234.57"``.
    """
    if np.isnan(usd_value):
        amount = usd_value  # round(nan) with no digits raises, so NaN is left untouched
    else:
        amount = round(usd_value, round_value)
    grouped = format(amount, ",")
    return ("$ " + grouped).replace(",", "'")


def run_manual_clustering(grouping_key='mean', bucket_apr_ranges=[0, 10, 20, 30, 40, 60], include_unassigned_collections_to_last_bucket=True, autosize=False, save_table_as_csv=True):
    """Assign NFT collections to APR buckets ("clusters"), then tabulate and plot them.

    Reads ``raw_data.csv`` from the working directory, bins the ``grouping_key``
    column with ``pd.cut`` and numbers the buckets 1..N from the lowest to the
    highest range. Buckets use the ``pd.cut`` defaults (right-closed, lowest edge
    excluded), so a value equal to the first edge, or outside every bucket, gets
    no cluster. Rows without a cluster are not part of the returned table because
    the per-cluster aggregates are attached with inner merges.

    Args:
        grouping_key: Column to bin on. The notebook uses 'mean', 'median' or 'std'.
        bucket_apr_ranges: Ascending bucket edges. The argument is never modified;
            the function works on a copy.
        include_unassigned_collections_to_last_bucket: If True, ``np.inf`` is
            appended as a final edge so that values above the last given edge
            fall into one extra, open-ended bucket. If False, only the given
            edges are used and values above the last edge get no cluster.
        autosize: If False, both figures are given fixed pixel dimensions;
            if True, plotly chooses the size.
        save_table_as_csv: If True, the resulting table is written to
            ``nft_collection_apr_clusters.csv`` in the working directory.

    Returns:
        pandas.DataFrame sorted by ``grouping_key`` with one row per clustered
        collection: the raw columns, its ``cluster``, and the per-cluster totals
        ``borrow_volume_per_cluster``, ``loan_count_per_cluster`` and
        ``nb_collection_per_cluster``.
    """
    # Always build the local edge list, so the flag being False no longer leaves
    # the name undefined. Copying keeps the caller's list (and the default) intact.
    local_bucket_apr_ranges = list(bucket_apr_ranges)
    if include_unassigned_collections_to_last_bucket:
        local_bucket_apr_ranges.append(np.inf)

    # Human-readable name of the statistic, used for axis labels and titles so
    # that plots for 'median' / 'std' are no longer mislabelled as "Mean APR".
    apr_label = {'mean': 'Mean APR', 'median': 'Median APR', 'std': 'Std APR'}.get(grouping_key, f'{grouping_key} APR')
    plot_title = f'Clustering of Collections based on {apr_label}'

    df = pd.read_csv('raw_data.csv')
    display(f"Number of collections in raw data: [{df.shape[0]}]")

    # Define clusters based on APR ranges: N edges delimit N-1 buckets, labelled 1..N-1
    labels = list(range(1, len(local_bucket_apr_ranges)))
    df['cluster'] = pd.cut(df[grouping_key], bins=local_bucket_apr_ranges, labels=labels)

    # Create table with name, address, loan count, dollar volume, mean APR, median APR, std APR, and cluster number
    table = df[['collection_name', 'address', 'count', 'borrow_volume', 'mean', 'median', 'std', 'cluster']].sort_values(by=grouping_key)

    # Per-cluster totals. Only the two numeric columns of interest are summed
    # (the string columns are deliberately left out of the aggregation).
    sum_per_cluster_df = (
        table.groupby('cluster')[['borrow_volume', 'count']]
        .sum()
        .rename(columns={'borrow_volume': 'borrow_volume_per_cluster', 'count': 'loan_count_per_cluster'})
        .reset_index()
    )
    # Inner merge: rows whose cluster is missing have no match and drop out here.
    table = pd.merge(left=table, right=sum_per_cluster_df, on='cluster')

    # Number of collections per cluster, as a Series indexed by cluster.
    nft_count_per_cluster_df = table.groupby('cluster')['collection_name'].count().rename('nb_collection_per_cluster')
    table = pd.merge(left=table, right=nft_count_per_cluster_df, on='cluster')
    if save_table_as_csv:
        table.to_csv('nft_collection_apr_clusters.csv')

    display(table.head())
    display("Count of NFT collections per cluster:", pd.DataFrame(nft_count_per_cluster_df))

    # Create box plot to visualize clusters
    fig = px.box(df, x='cluster', y=grouping_key, color='cluster', labels={'cluster': 'Cluster Number', grouping_key: apr_label},
                 category_orders={'cluster': labels}, hover_data=['collection_name', 'address', 'count', 'borrow_volume', 'mean', 'median', 'std'])
    fig.update_layout(title=plot_title)
    if not autosize:
        fig.update_layout(
            width=1300,  # Set the width
            height=850,  # Set the height
        )
    fig.update_layout(paper_bgcolor='white')  # Set paper background color

    # Create scatter plot layer: one marker per collection, coloured by its APR statistic
    scatter = go.Scatter(x=df['cluster'], y=df[grouping_key], mode='markers', text=df['collection_name'],
                         marker=dict(color=df[grouping_key], colorscale='Viridis', opacity=0.7))

    # Add scatter plot layer to box plot figure
    fig.add_trace(scatter)
    # Update x-axis label
    fig.update_xaxes(title_text='Cluster Number')

    # Update y-axis label
    fig.update_yaxes(title_text=apr_label)

    fig.show()

    # Second figure: one row per collection, positioned by its APR statistic and sized by borrow volume
    scatter_fig = px.scatter(table, x=grouping_key, y='collection_name', color='cluster', hover_data=['address', 'collection_name'], size='borrow_volume')
    scatter_fig.update_layout(title=plot_title)
    if not autosize:
        scatter_fig.update_layout(
            width=1200,  # Set the width
            height=600,  # Set the height
        )
    scatter_fig.update_layout(paper_bgcolor='white')  # Set paper background color
    scatter_fig.show()
    return table


In [2]:
# grouping_key:
# Determines the feature used to group collections based on APR.
# Valid values are: "mean"; "median"; "std" (standard deviation)
grouping_key = 'mean'

# bucket_apr_ranges:
# Ascending bucket edges, expressed in the units of the grouping_key column.
# Each pair of consecutive edges delimits one bucket (cluster), e.g. with
# grouping_key = 'mean' the edges [10, 20, ...] put collections whose mean APR
# is above 10 and at most 20 into cluster 1.
bucket_apr_ranges = [10, 20, 33, 40, 60]

# include_unassigned_collections_to_last_bucket:
# If set to True, an extra open-ended bucket is added after the last edge, so all
# collections whose APR (depending on grouping_key) is beyond the last edge still
# get a cluster. For instance, with 60 as the last edge, collections in the 80%
# mean APR range land in the final "above 60" bucket.
# If False, collections above 60% are meant to be left unassigned.
include_unassigned_collections_to_last_bucket = True

# autosize:
# If set to True, lets the plots size themselves to the user's screen.
# Else keeps consistent, fixed dimensions.
# Defaults to False for best viewing of boxplots.
autosize = False

# save_table_as_csv:
# If set to True, saves the resulting table as a .csv file locally.
save_table_as_csv = True


# Every parameter configured above is forwarded explicitly, including
# include_unassigned_collections_to_last_bucket, so changing it takes effect.
table = run_manual_clustering(
    grouping_key=grouping_key,
    bucket_apr_ranges=bucket_apr_ranges,
    include_unassigned_collections_to_last_bucket=include_unassigned_collections_to_last_bucket,
    autosize=autosize,
    save_table_as_csv=save_table_as_csv,
)

'Number of collections in raw data: [70]'

Unnamed: 0,collection_name,address,count,borrow_volume,mean,median,std,cluster,borrow_volume_per_cluster,loan_count_per_cluster,nb_collection_per_cluster
0,Art Blocks,0x059edd72cd353df5106d2b9cc5ab83a52287ac3a,147,1899193.43,11.74,9.0,7.15,1,8488097.52,292,3
1,Autoglyphs,0xd4e4078ca3495de5b1d4db434bebc5a986197782,20,4243883.08,13.33,9.0,9.69,1,8488097.52,292,3
2,Azuki,0xed5af388653567af2f388e6224dc7c4b3241c544,125,2345021.0,17.93,17.0,10.61,1,8488097.52,292,3
3,Terraforms,0x4e1f41613c9084fdb9e34e11fae9412427480e56,59,108922.53,26.42,24.0,10.68,2,7446065.28,579,9
4,mfer,0x79fcdef22feed20eddacbb2587640e45491b757f,36,44932.24,27.85,26.0,16.74,2,7446065.28,579,9


'Count of NFT collections per cluster:'

Unnamed: 0_level_0,nb_collection_per_cluster
cluster,Unnamed: 1_level_1
1,3
2,9
3,8
4,36
5,12


In [3]:
# Use the cluster number as the row index, so every displayed row is labelled with its cluster.
table.set_index(keys='cluster')

Unnamed: 0_level_0,collection_name,address,count,borrow_volume,mean,median,std,borrow_volume_per_cluster,loan_count_per_cluster,nb_collection_per_cluster
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Art Blocks,0x059edd72cd353df5106d2b9cc5ab83a52287ac3a,147,1899193.43,11.74,9.0,7.15,8488097.52,292,3
1,Autoglyphs,0xd4e4078ca3495de5b1d4db434bebc5a986197782,20,4243883.08,13.33,9.0,9.69,8488097.52,292,3
1,Azuki,0xed5af388653567af2f388e6224dc7c4b3241c544,125,2345021.0,17.93,17.0,10.61,8488097.52,292,3
2,Terraforms,0x4e1f41613c9084fdb9e34e11fae9412427480e56,59,108922.53,26.42,24.0,10.68,7446065.28,579,9
2,mfer,0x79fcdef22feed20eddacbb2587640e45491b757f,36,44932.24,27.85,26.0,16.74,7446065.28,579,9
2,Bored Ape Yacht Club,0xbc4ca0eda7647a8ab7c2061c2e118a18a936f13d,80,5385201.64,30.33,15.0,49.39,7446065.28,579,9
2,The Captainz,0x769272677fab02575e84945f03eca517acc544cc,142,1084539.07,32.12,25.0,31.0,7446065.28,579,9
2,Otherside Koda,0xe012baf811cf9c05c408e879c399960d1f305903,34,356943.02,32.29,30.0,16.05,7446065.28,579,9
2,VeeFriends,0xa3aee8bce55beea1951ef834b99f3ac60d1abeeb,40,155660.17,32.78,30.0,16.49,7446065.28,579,9
2,Ethereum Name Service (ENS),0x57f1887a8bf19b14fc0df6fd9b2acc9af147ea85,57,36764.69,32.84,25.0,29.27,7446065.28,579,9


## Testing the above parameters with grouping_key='median':

In [4]:
# grouping_key:
# Determines the feature used to group collections based on APR.
# Valid values are: "mean"; "median"; "std" (standard deviation)
# Only the grouping key changes for this run; the remaining parameters keep the
# values assigned in the earlier parameter cell.
grouping_key = 'median'

# Forward include_unassigned_collections_to_last_bucket as well, so the
# configured value is actually used by the clustering.
table = run_manual_clustering(
    grouping_key=grouping_key,
    bucket_apr_ranges=bucket_apr_ranges,
    include_unassigned_collections_to_last_bucket=include_unassigned_collections_to_last_bucket,
    autosize=autosize,
    save_table_as_csv=save_table_as_csv,
)

'Number of collections in raw data: [70]'

Unnamed: 0,collection_name,address,count,borrow_volume,mean,median,std,cluster,borrow_volume_per_cluster,loan_count_per_cluster,nb_collection_per_cluster
0,Bored Ape Yacht Club,0xbc4ca0eda7647a8ab7c2061c2e118a18a936f13d,80,5385201.64,30.33,15.0,49.39,1,8043334.93,983,3
1,Azuki,0xed5af388653567af2f388e6224dc7c4b3241c544,125,2345021.0,17.93,17.0,10.61,1,8043334.93,983,3
2,Friendship Bracelets by Alexis André,0x942bc2d3e7a589fe5bd4a5c6ef9727dfd82f5c8a,778,313112.29,38.49,19.0,27.18,1,8043334.93,983,3
3,MutantApeYachtClub,0x60e4d786628fea6478f785a6d7e704777c86a7c6,186,3017176.47,33.94,23.0,38.0,2,5607474.11,820,12
4,Terraforms,0x4e1f41613c9084fdb9e34e11fae9412427480e56,59,108922.53,26.42,24.0,10.68,2,5607474.11,820,12


'Count of NFT collections per cluster:'

Unnamed: 0_level_0,nb_collection_per_cluster
cluster,Unnamed: 1_level_1
1,3
2,12
3,15
4,24
5,12


In [5]:
# Use the cluster number as the row index, so every displayed row is labelled with its cluster.
table.set_index(keys='cluster')

Unnamed: 0_level_0,collection_name,address,count,borrow_volume,mean,median,std,borrow_volume_per_cluster,loan_count_per_cluster,nb_collection_per_cluster
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Bored Ape Yacht Club,0xbc4ca0eda7647a8ab7c2061c2e118a18a936f13d,80,5385201.64,30.33,15.0,49.39,8043334.93,983,3
1,Azuki,0xed5af388653567af2f388e6224dc7c4b3241c544,125,2345021.0,17.93,17.0,10.61,8043334.93,983,3
1,Friendship Bracelets by Alexis André,0x942bc2d3e7a589fe5bd4a5c6ef9727dfd82f5c8a,778,313112.29,38.49,19.0,27.18,8043334.93,983,3
2,MutantApeYachtClub,0x60e4d786628fea6478f785a6d7e704777c86a7c6,186,3017176.47,33.94,23.0,38.0,5607474.11,820,12
2,Terraforms,0x4e1f41613c9084fdb9e34e11fae9412427480e56,59,108922.53,26.42,24.0,10.68,5607474.11,820,12
2,Ethereum Name Service (ENS),0x57f1887a8bf19b14fc0df6fd9b2acc9af147ea85,57,36764.69,32.84,25.0,29.27,5607474.11,820,12
2,The Captainz,0x769272677fab02575e84945f03eca517acc544cc,142,1084539.07,32.12,25.0,31.0,5607474.11,820,12
2,mfer,0x79fcdef22feed20eddacbb2587640e45491b757f,36,44932.24,27.85,26.0,16.74,5607474.11,820,12
2,Milady,0x5af0d9827e0c53e4799bb226655a1de152a425a5,29,78295.1,42.71,29.0,24.05,5607474.11,820,12
2,Cool Cats,0x1a92f7381b9f03921564a437210bb9396471050c,24,33709.18,32.97,29.0,24.15,5607474.11,820,12
