# Clustering Crypto

In [2]:
# Initial imports
!pip install -U altair

In [3]:
import pandas as pd
from pathlib import Path
#import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Data Preprocessing

In [116]:
# Load the cryptocurrency data; the first (unnamed) CSV column becomes the index
file_path = Path("Resources/crypto_data.csv")
crypto_df = pd.read_csv(
    file_path,
    index_col=0,
)
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [117]:
# Keep only the cryptocurrencies that are currently being traded
is_trading = crypto_df['IsTrading'].eq(True)
crypto_df = crypto_df.loc[is_trading]
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [118]:
# Keep only cryptocurrencies with a working algorithm (non-empty Algorithm value)
# NOTE(review): this only removes rows whose Algorithm is the empty string; if the
# CSV marks a missing algorithm with another placeholder, those rows are kept — confirm.
has_algorithm = crypto_df['Algorithm'].ne('')
crypto_df = crypto_df.loc[has_algorithm]
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [119]:
# Remove the "IsTrading" column.
# Reassign instead of using inplace=True: crypto_df was produced by .loc filtering,
# and mutating such a frame in place can raise SettingWithCopyWarning.
crypto_df = crypto_df.drop(columns='IsTrading')
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,PoW/PoS,,0
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000


In [120]:
# Remove rows with at least 1 null value.
# dropna() returns a new frame; reassigning avoids in-place mutation of the
# frame shown by earlier cells.
crypto_df = crypto_df.dropna()
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000
DASH,Dash,X11,PoW/PoS,9031294.0,22000000
XMR,Monero,CryptoNight-V7,PoW,17201140.0,0
ETC,Ethereum Classic,Ethash,PoW,113359700.0,210000000


In [121]:
# Drop cryptocurrencies for which no coins have been mined (TotalCoinsMined <= 0)
has_mined_coins = crypto_df['TotalCoinsMined'].gt(0)
crypto_df = crypto_df.loc[has_mined_coins]
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000
DASH,Dash,X11,PoW/PoS,9031294.0,22000000
XMR,Monero,CryptoNight-V7,PoW,17201140.0,0
ETC,Ethereum Classic,Ethash,PoW,113359700.0,210000000
ZEC,ZCash,Equihash,PoW,7383056.0,21000000


In [122]:
# Save the cryptocurrency names (as a one-column DataFrame sharing crypto_df's
# index) before the column is dropped from crypto_df
coins_name = crypto_df[['CoinName']].copy()
coins_name.head(10)

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
LTC,Litecoin
DASH,Dash
XMR,Monero
ETC,Ethereum Classic
ZEC,ZCash


In [123]:
# Remove the cryptocurrency name since it's not going to be used by the clustering algorithm.
# Reassign rather than mutate in place: crypto_df originates from .loc filtering,
# where inplace operations can raise SettingWithCopyWarning.
crypto_df = crypto_df.drop(columns='CoinName')
crypto_df.head(10)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0
LTC,Scrypt,PoW,63039240.0,84000000
DASH,X11,PoW/PoS,9031294.0,22000000
XMR,CryptoNight-V7,PoW,17201140.0,0
ETC,Ethash,PoW,113359700.0,210000000
ZEC,Equihash,PoW,7383056.0,21000000


In [12]:
# One-hot encode the text features (one indicator column per Algorithm / ProofType value)
text_features = crypto_df[['Algorithm', 'ProofType']]
X = pd.get_dummies(text_features)

In [13]:
X.head()

Unnamed: 0,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,Algorithm_Counterparty,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Append the mined-coin totals to the feature matrix (aligned on the shared index)
X = X.assign(TotalCoinsMined=crypto_df['TotalCoinsMined'])

In [15]:
# TotalCoinSupply is read from the CSV as an object (string) column; convert it to a
# numeric dtype explicitly so the scaler does not have to coerce it implicitly.
X['TotalCoinSupply'] = pd.to_numeric(crypto_df['TotalCoinSupply'])

In [16]:
X.head()

Unnamed: 0,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,Algorithm_Counterparty,...,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW,TotalCoinsMined,TotalCoinSupply
42,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,41.99995,42
404,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1055185000.0,532000000
1337,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,29279420000.0,314159265359
BTC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,17927180.0,21000000
ETH,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,107684200.0,0


In [17]:
# Standardize every feature column with StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


Data with input dtype uint8, float64, object were all converted to float64 by StandardScaler.


Data with input dtype uint8, float64, object were all converted to float64 by StandardScaler.



### Reducing Dimensions Using PCA

In [18]:
# Project the standardized features onto 3 principal components
pca = PCA(
    n_components=3,
)
X_pca = pca.fit_transform(X_scaled)

In [19]:
# Wrap the principal components in a DataFrame indexed like crypto_df
pc_columns = ["PC 1", "PC 2", "PC 3"]
pcs_df = pd.DataFrame(X_pca, columns=pc_columns, index=crypto_df.index)
pcs_df.head(10)

Unnamed: 0,PC 1,PC 2,PC 3
42,-0.335384,1.044678,-0.469765
404,-0.318722,1.044872,-0.470068
1337,2.290495,1.699492,-0.56507
BTC,-0.15112,-1.276584,0.156573
ETH,-0.153772,-2.014962,0.328401
LTC,-0.160739,-1.123454,-0.002347
DASH,-0.399598,1.186173,-0.500135
XMR,-0.14523,-2.251925,0.318105
ETC,-0.152214,-2.015056,0.32839
ZEC,-0.162758,-2.0621,0.442027


### Clustering Cryptocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [21]:
# Candidate numbers of clusters to evaluate
k = list(range(1, 11))

# Fit K-Means once per candidate k and record the resulting inertia
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(pcs_df).inertia_
    for n_clusters in k
]

# Tabulate k against inertia; this frame is the data source for the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

In [26]:
import altair as alt

In [137]:
# Line chart of inertia against k (the elbow curve)
elbow_encodings = dict(
    x='k',
    y='inertia',
    tooltip=['k', 'inertia'],
)
alt.Chart(df_elbow).mark_line().properties(
    title='Elbow Curve',
    width=600,
).encode(**elbow_encodings)

Running K-Means with `k=4`

In [138]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)

# Fit the model and get each cryptocurrency's cluster label in one pass
# (fit_predict returns the labels of the fitted model, so no second predict call is needed)
predictions = model.fit_predict(pcs_df)

# Create a new DataFrame including predicted clusters and cryptocurrencies features
clustered_df = pd.concat([coins_name, crypto_df, pcs_df], axis=1)
clustered_df['Class'] = predictions
clustered_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,Class
42,42 Coin,Scrypt,PoW/PoS,41.99995,42,-0.335384,1.044678,-0.469765,0
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000,-0.318722,1.044872,-0.470068,0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359,2.290495,1.699492,-0.56507,0
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000,-0.15112,-1.276584,0.156573,3
ETH,Ethereum,Ethash,PoW,107684200.0,0,-0.153772,-2.014962,0.328401,3
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000,-0.160739,-1.123454,-0.002347,3
DASH,Dash,X11,PoW/PoS,9031294.0,22000000,-0.399598,1.186173,-0.500135,0
XMR,Monero,CryptoNight-V7,PoW,17201140.0,0,-0.14523,-2.251925,0.318105,3
ETC,Ethereum Classic,Ethash,PoW,113359700.0,210000000,-0.152214,-2.015056,0.32839,3
ZEC,ZCash,Equihash,PoW,7383056.0,21000000,-0.162758,-2.0621,0.442027,3


### Visualizing Results

#### 2D Scatter (PC 1 vs. PC 2) with Clusters

In [157]:
# Create a 2D scatter of the first two principal components, colored and shaped by cluster.
# "PC 3" is not drawn on an axis, so it is exposed through the tooltip instead.
alt.Chart(clustered_df).mark_point().encode(
    x='PC 1',
    y='PC 2',
    color='Class:N',
    shape='Class:N',
    tooltip=["CoinName", "Algorithm",
             "TotalCoinsMined", "TotalCoinSupply", "PC 3"],
).properties(
    title='PCA Data'
)

#### Table of Tradable Cryptocurrencies

In [140]:
# Print how many tradable cryptocurrencies remain after preprocessing (row count)
tradable_count = clustered_df.shape[0]
print(tradable_count)

532


#### Scatter Plot with Tradable Cryptocurrencies

In [141]:
# Min-max scale the supply and mined totals for the scatter plot.
# Column order matters downstream: position 0 is TotalCoinSupply, position 1 is TotalCoinsMined.
coin_totals = clustered_df[['TotalCoinSupply', 'TotalCoinsMined']]
clustered_scaled = MinMaxScaler().fit_transform(coin_totals)



Data with input dtype float64, object were all converted to float64 by MinMaxScaler.



In [142]:
# Wrap the scaled values in a DataFrame indexed like clustered_df; the columns keep
# their default integer labels (0 and 1)
clustered_scaled = pd.DataFrame(clustered_scaled, index=clustered_df.index)
clustered_scaled.head(10)

Unnamed: 0,0,1
42,4.2e-11,0.0
404,0.000532,0.001066
1337,0.3141593,0.029576
BTC,2.1e-05,1.8e-05
ETH,0.0,0.000109
LTC,8.4e-05,6.4e-05
DASH,2.2e-05,9e-06
XMR,0.0,1.7e-05
ETC,0.00021,0.000115
ZEC,2.1e-05,7e-06


In [143]:
# Start from the clustered data without the unscaled totals
unscaled_columns = ['TotalCoinsMined', 'TotalCoinSupply']
clustered_scaled_df = clustered_df.drop(columns=unscaled_columns)

In [144]:
# Attach the scaled totals. The scaler input was ordered
# ['TotalCoinSupply', 'TotalCoinsMined'], so column 0 is the supply and column 1 the mined total.
clustered_scaled_df = clustered_scaled_df.assign(
    TotalCoinsMined=clustered_scaled[1],
    TotalCoinSupply=clustered_scaled[0],
)

In [145]:
clustered_scaled_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,PC 1,PC 2,PC 3,Class,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,-0.335384,1.044678,-0.469765,0,0.0,4.2e-11
404,404Coin,Scrypt,PoW/PoS,-0.318722,1.044872,-0.470068,0,0.001066,0.000532
1337,EliteCoin,X13,PoW/PoS,2.290495,1.699492,-0.56507,0,0.029576,0.3141593
BTC,Bitcoin,SHA-256,PoW,-0.15112,-1.276584,0.156573,3,1.8e-05,2.1e-05
ETH,Ethereum,Ethash,PoW,-0.153772,-2.014962,0.328401,3,0.000109,0.0
LTC,Litecoin,Scrypt,PoW,-0.160739,-1.123454,-0.002347,3,6.4e-05,8.4e-05
DASH,Dash,X11,PoW/PoS,-0.399598,1.186173,-0.500135,0,9e-06,2.2e-05
XMR,Monero,CryptoNight-V7,PoW,-0.14523,-2.251925,0.318105,3,1.7e-05,0.0
ETC,Ethereum Classic,Ethash,PoW,-0.152214,-2.015056,0.32839,3,0.000115,0.00021
ZEC,ZCash,Equihash,PoW,-0.162758,-2.0621,0.442027,3,7e-06,2.1e-05


In [146]:
# Scatter of the scaled totals: x="TotalCoinsMined", y="TotalCoinSupply", marked by cluster
scaled_encodings = dict(
    x='TotalCoinsMined',
    y='TotalCoinSupply',
    color='Class:N',
    shape='Class:N',
    tooltip=["CoinName", "Algorithm"],
)
alt.Chart(clustered_scaled_df).mark_point().properties(
    title='Scaled Data'
).encode(**scaled_encodings)

In [147]:
# Create two dataframes in order to separate the outliers
def remove_outliers(df, column_1, column_2):
    """Split ``df`` into non-outlier rows and outlier rows.

    A value counts as an outlier when it is greater than or equal to the
    standard deviation of its column, computed over the whole of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    column_1, column_2 : str
        Names of the two numeric columns to test.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(inliers, outliers)``. Inliers are below the threshold in both
        columns; outliers reach the threshold in at least one column. Both
        keep the row order of ``df``. Rows with a missing value in a tested
        column fail every comparison and may appear in neither frame.
    """
    # Both thresholds come from the full frame. Deriving the second one from an
    # already-filtered frame would make the inlier and outlier cut-offs disagree,
    # so some rows would end up in neither result.
    threshold_1 = df[column_1].std()
    threshold_2 = df[column_2].std()

    below_both = (df[column_1] < threshold_1) & (df[column_2] < threshold_2)
    # A single boolean mask lists each outlier row exactly once, with no need for
    # value-based de-duplication (which could merge distinct but identical rows).
    above_either = (df[column_1] >= threshold_1) | (df[column_2] >= threshold_2)

    return df.loc[below_both], df.loc[above_either]

In [148]:
# Split the scaled data into regular points and outliers on the two total columns
no_outliers_df, outliers_df = remove_outliers(
    clustered_scaled_df,
    'TotalCoinsMined',
    'TotalCoinSupply',
)

In [149]:
# Scatter of the scaled totals for the non-outlier rows only
no_outliers_encodings = dict(
    x='TotalCoinsMined',
    y='TotalCoinSupply',
    color='Class:N',
    shape='Class:N',
    tooltip=["CoinName", "Algorithm"],
)
no_outliers_plot = (
    alt.Chart(no_outliers_df)
    .mark_point()
    .properties(title='Scaled Data without Outliers')
    .encode(**no_outliers_encodings)
)

no_outliers_plot

In [152]:
# Scatter of the scaled totals for the outlier rows only
outliers_encodings = dict(
    x='TotalCoinsMined',
    y='TotalCoinSupply',
    color='Class:N',
    shape='Class:N',
    tooltip=["CoinName", "Algorithm"],
)
outliers_plot = (
    alt.Chart(outliers_df)
    .mark_point()
    .properties(title='Scaled Outlier Data')
    .encode(**outliers_encodings)
)

outliers_plot