In [69]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import hvplot.pandas
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

## Data Preprocessing

In [70]:
# Read the raw cryptocurrency dataset from the CSV file into a DataFrame
# and preview its first rows.
file_path = "Resources/crypto_data.csv"
crypto_df = pd.read_csv(file_path)
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [71]:
crypto_df.dtypes

Unnamed: 0          object
CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [72]:
#Find the number of rows in the DataFrame
crypto_df.shape

(1252, 7)

In [73]:
# Keep only the cryptocurrencies that are flagged as currently trading
is_trading = crypto_df["IsTrading"] == 1
crypto_df = crypto_df.loc[is_trading]

In [74]:
#find the new number of rows in the filtered DataFrame
crypto_df.shape

(1144, 7)

In [75]:
# Drop rows whose Algorithm is the literal string "Multiple", i.e. coins
# without a single defined algorithm. Rows where Algorithm is missing (NaN)
# pass this comparison and are not removed here.
crypto_df = crypto_df[crypto_df["Algorithm"]!="Multiple"]

In [76]:
#Find the new number of rows in the filtered DataFrame
crypto_df.shape

(1126, 7)

In [77]:
# Discard every row that contains at least one missing value
crypto_df = crypto_df.dropna(axis="index", how="any")

In [78]:
#Find the new number of rows in the filtered DataFrame
crypto_df.shape

(674, 7)

In [79]:
# Keep only cryptocurrencies with a strictly positive mined supply.
# `> 0` (rather than `!= 0`) also discards any negative TotalCoinsMined
# value, which is not a meaningful "coins mined" amount.
crypto_df = crypto_df[crypto_df["TotalCoinsMined"] > 0]

In [80]:
#Find the new number of rows in the filtered DataFrame
crypto_df.shape

(524, 7)

In [81]:
#Print the name of columns in the DataFrame
crypto_df.columns

Index(['Unnamed: 0', 'CoinName', 'Algorithm', 'IsTrading', 'ProofType',
       'TotalCoinsMined', 'TotalCoinSupply'],
      dtype='object')

In [82]:
# Keep the coin names aside for later use. Selecting a single column with
# df["col"] yields a pandas Series (not a DataFrame).
coins_name = crypto_df["CoinName"]

In [83]:
# Preview the first few entries of the coin-name Series
coins_name.head()

0      42 Coin
2      404Coin
5    EliteCoin
7      Bitcoin
8     Ethereum
Name: CoinName, dtype: object

In [84]:
# Snapshot the cleaned frame (CoinName and IsTrading still present) so the
# original attributes remain available after columns are dropped below
crypto_df_copy = crypto_df.copy(deep=True)

In [85]:
# Drop the CoinName column (free-text label, not a clustering feature).
# Reassigning instead of inplace=True avoids mutating a frame that was
# produced by boolean filtering, and errors="ignore" lets the cell be
# re-run without raising KeyError once the column is already gone.
crypto_df = crypto_df.drop(columns="CoinName", errors="ignore")

In [86]:
# Drop the IsTrading column; it carries no information once only trading
# coins remain. Reassigning instead of inplace=True avoids mutating a frame
# that was produced by boolean filtering, and errors="ignore" lets the cell
# be re-run without raising KeyError once the column is already gone.
crypto_df = crypto_df.drop(columns="IsTrading", errors="ignore")

In [87]:
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,41.99995,42
2,404,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethash,PoW,107684200.0,0


In [88]:
crypto_df.shape

(524, 5)

In [89]:
# One-hot encode the Algorithm column: one indicator column per distinct
# algorithm value. (Despite its name, X_test is not a train/test split.)
X_test = pd.get_dummies(crypto_df["Algorithm"])

In [90]:
X_test.head()

Unnamed: 0,1GB AES Pattern Search,536,Argon2d,BLAKE256,Blake,Blake2S,Blake2b,C11,Cloverhash,Counterparty,...,Tribus,VBFT,VeChainThor Authority,X11,X11GOST,X13,X14,X15,X16R,XEVAN
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
# Append the Algorithm indicator columns to the right of the coin
# attributes; axis=1 aligns the two frames on their row index.
X_test_df=pd.concat([crypto_df,X_test],axis=1)

In [92]:
X_test_df.head()

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,1GB AES Pattern Search,536,Argon2d,BLAKE256,Blake,...,Tribus,VBFT,VeChainThor Authority,X11,X11GOST,X13,X14,X15,X16R,XEVAN
0,42,Scrypt,PoW/PoS,41.99995,42,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,404,Scrypt,PoW/PoS,1055185000.0,532000000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1337,X13,PoW/PoS,29279420000.0,314159265359,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,BTC,SHA-256,PoW,17927180.0,21000000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,ETH,Ethash,PoW,107684200.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
# One-hot encode the ProofType column: one indicator column per distinct
# proof type value.
X_II  = pd.get_dummies(crypto_df["ProofType"])

In [94]:
X_II.head()

Unnamed: 0,DPOS,DPoC,DPoS,HPoW,LPoS,POBh,PoA,PoC,PoS,PoS/LPoS,...,PoW/PoS,PoW/PoS.1,PoW/PoW,PoW/nPoS,Pos,Proof of Authority,Proof of Trust,TPoS,Zero-Knowledge Proof,dPoW/PoW
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [95]:
# Build the full feature frame: coin attributes + Algorithm indicators +
# ProofType indicators, aligned on the row index.
X=pd.concat([X_test_df,X_II],axis=1)

In [96]:
# The raw categorical columns are now represented by their indicator
# columns, so remove them from the feature frame
X = X.drop(columns=["Algorithm", "ProofType"])

In [97]:
X.head()

Unnamed: 0.1,Unnamed: 0,TotalCoinsMined,TotalCoinSupply,1GB AES Pattern Search,536,Argon2d,BLAKE256,Blake,Blake2S,Blake2b,...,PoW/PoS,PoW/PoS.1,PoW/PoW,PoW/nPoS,Pos,Proof of Authority,Proof of Trust,TPoS,Zero-Knowledge Proof,dPoW/PoW
0,42,41.99995,42,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,404,1055185000.0,532000000,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,1337,29279420000.0,314159265359,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,BTC,17927180.0,21000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,ETH,107684200.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
# Remove the "Unnamed: 0" identifier column so only feature columns remain
X = X.drop(columns=["Unnamed: 0"])

In [99]:
# Create the scaler used to standardise the feature columns before PCA
scaler = StandardScaler()

In [100]:
# Fit the scaler on the feature frame and replace X with the scaled values.
# Note that X is rebound here from the feature DataFrame to the scaler's output.
# NOTE(review): TotalCoinSupply is reported as object dtype earlier in the
# notebook, so this call presumably relies on those strings being coerced
# to float — consider converting the column explicitly beforehand.
X = scaler.fit_transform(X)

## PCA

In [101]:
# Initialise a PCA model that reduces the scaled features to 3 components.
# random_state is fixed because scikit-learn's automatic solver selection
# may pick the randomized SVD for a matrix of this size, which would make
# the components (and every downstream cluster) vary between runs.
pcs = PCA(n_components=3, random_state=0)

In [102]:
# Fit PCA on the scaled features and project them onto the 3 components
X_pcs = pcs.fit_transform(X)

In [103]:
# Wrap the projected values in a DataFrame with one labelled column per
# principal component (rows get a default 0..n-1 index)
pc_labels = ["PC 1", "PC 2", "PC 3"]
pcs_df_pre_merge = pd.DataFrame(X_pcs, columns=pc_labels)

In [104]:
pcs_df_pre_merge.head()

Unnamed: 0,PC 1,PC 2,PC 3
0,-0.335966,1.031222,-0.540236
1,-0.31944,1.031218,-0.540735
2,2.288745,1.605273,-0.679396
3,-0.137501,-1.325989,0.223103
4,-0.144662,-2.08035,0.385651


In [105]:
crypto_df.shape

(524, 5)

In [106]:
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,41.99995,42
2,404,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethash,PoW,107684200.0,0


In [107]:
# Renumber the rows 0..n-1 so they line up positionally with the PCA frame;
# the previous row labels are kept in a new "index" column.
crypto_df_merge = crypto_df.reset_index()

In [108]:
crypto_df_merge

Unnamed: 0.1,index,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,0,42,Scrypt,PoW/PoS,4.199995e+01,42
1,2,404,Scrypt,PoW/PoS,1.055185e+09,532000000
2,5,1337,X13,PoW/PoS,2.927942e+10,314159265359
3,7,BTC,SHA-256,PoW,1.792718e+07,21000000
4,8,ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...,...
519,1238,ZEPH,SHA-256,DPoS,2.000000e+09,2000000000
520,1242,GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
521,1245,BDX,CryptoNight,PoW,9.802226e+08,1400222610
522,1246,ZEN,Equihash,PoW,7.296538e+06,21000000


In [109]:
pcs_df_pre_merge

Unnamed: 0,PC 1,PC 2,PC 3
0,-0.335966,1.031222,-0.540236
1,-0.319440,1.031218,-0.540735
2,2.288745,1.605273,-0.679396
3,-0.137501,-1.325989,0.223103
4,-0.144662,-2.080350,0.385651
...,...,...,...
519,2.462909,0.902454,-0.042397
520,-0.334028,1.031082,-0.540271
521,0.328778,-2.369170,0.398621
522,-0.147790,-2.025569,0.458245


In [110]:
# Place the principal components next to the coin attributes. Both frames
# carry a 0..n-1 row index at this point, so axis=1 pairs them row by row.
pcs_new_df = pd.concat([pcs_df_pre_merge,crypto_df_merge],axis=1)

In [111]:
pcs_new_df

Unnamed: 0.1,PC 1,PC 2,PC 3,index,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,-0.335966,1.031222,-0.540236,0,42,Scrypt,PoW/PoS,4.199995e+01,42
1,-0.319440,1.031218,-0.540735,2,404,Scrypt,PoW/PoS,1.055185e+09,532000000
2,2.288745,1.605273,-0.679396,5,1337,X13,PoW/PoS,2.927942e+10,314159265359
3,-0.137501,-1.325989,0.223103,7,BTC,SHA-256,PoW,1.792718e+07,21000000
4,-0.144662,-2.080350,0.385651,8,ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...,...,...,...,...
519,2.462909,0.902454,-0.042397,1238,ZEPH,SHA-256,DPoS,2.000000e+09,2000000000
520,-0.334028,1.031082,-0.540271,1242,GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
521,0.328778,-2.369170,0.398621,1245,BDX,CryptoNight,PoW,9.802226e+08,1400222610
522,-0.147790,-2.025569,0.458245,1246,ZEN,Equihash,PoW,7.296538e+06,21000000


In [112]:
# Use the "Unnamed: 0" column as the row index
# (presumably the coin ticker/identifier, e.g. BTC, ETH — confirm).
pcs_df = pcs_new_df.set_index(["Unnamed: 0"])

In [113]:
# Keep an independent snapshot of the merged PCA frame. A plain assignment
# would only alias pcs_df, so later in-place edits to pcs_df (column drops,
# the added "class" column) would silently change this "copy" as well.
pcs_copy_df = pcs_df.copy()

In [114]:
pcs_df.head()

Unnamed: 0_level_0,PC 1,PC 2,PC 3,index,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
42,-0.335966,1.031222,-0.540236,0,Scrypt,PoW/PoS,41.99995,42
404,-0.31944,1.031218,-0.540735,2,Scrypt,PoW/PoS,1055185000.0,532000000
1337,2.288745,1.605273,-0.679396,5,X13,PoW/PoS,29279420000.0,314159265359
BTC,-0.137501,-1.325989,0.223103,7,SHA-256,PoW,17927180.0,21000000
ETH,-0.144662,-2.08035,0.385651,8,Ethash,PoW,107684200.0,0


In [115]:
# Keep only the principal-component columns for clustering.
# Reassigning (instead of inplace=True) leaves any other reference to the
# merged frame untouched, and errors="ignore" lets the cell be re-run
# without raising KeyError once the columns are already gone.
pcs_df = pcs_df.drop(
    columns=["index", "Algorithm", "ProofType", "TotalCoinsMined", "TotalCoinSupply"],
    errors="ignore",
)

In [116]:
pcs_df.head()

Unnamed: 0_level_0,PC 1,PC 2,PC 3
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,-0.335966,1.031222,-0.540236
404,-0.31944,1.031218,-0.540735
1337,2.288745,1.605273,-0.679396
BTC,-0.137501,-1.325989,0.223103
ETH,-0.144662,-2.08035,0.385651


## K-means

In [117]:
# Empty inertia accumulator and the candidate cluster counts 1..10.
# The following cell initialises the same two names again before its loop.
inertia, k = [], [*range(1, 11)]

In [118]:
# Candidate cluster counts and the inertia recorded for each of them
k = list(range(1, 11))
inertia = []
# Fit one K-Means model per candidate k on the PCA frame and keep its inertia
for i in k:
    km = KMeans(n_clusters=i, random_state=0).fit(pcs_df)
    inertia.append(km.inertia_)

In [119]:
# Tabulate inertia against k and draw the elbow curve with hvPlot
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(
    x="k",
    y="inertia",
    xticks=k,
    title="Elbow Curve",
)

In [120]:
# Create the final K-Means model with 4 clusters and a fixed seed.
# NOTE(review): K = 4 was presumably read off the elbow curve above — confirm.
model = KMeans(n_clusters=4, random_state=0)
model

KMeans(n_clusters=4, random_state=0)

In [121]:
# Fit the K-Means model on the principal-component frame
model.fit(pcs_df)

KMeans(n_clusters=4, random_state=0)

In [122]:
# Predict a cluster for every row of the frame the model was fitted on.
# NOTE(review): `predictions` is not referenced again in the visible cells;
# the "class" column below is filled from model.labels_ instead.
predictions = model.predict(pcs_df)

In [123]:
# Store each row's cluster label from the fitted model in a new "class" column
pcs_df["class"]=model.labels_

In [124]:
pcs_df.head()

Unnamed: 0_level_0,PC 1,PC 2,PC 3,class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,-0.335966,1.031222,-0.540236,3
404,-0.31944,1.031218,-0.540735,3
1337,2.288745,1.605273,-0.679396,3
BTC,-0.137501,-1.325989,0.223103,0
ETH,-0.144662,-2.08035,0.385651,0


In [125]:
crypto_df_copy

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
5,1337,EliteCoin,X13,True,PoW/PoS,2.927942e+10,314159265359
7,BTC,Bitcoin,SHA-256,True,PoW,1.792718e+07,21000000
8,ETH,Ethereum,Ethash,True,PoW,1.076842e+08,0
...,...,...,...,...,...,...,...
1238,ZEPH,ZEPHYR,SHA-256,True,DPoS,2.000000e+09,2000000000
1242,GAP,Gapcoin,Scrypt,True,PoW/PoS,1.493105e+07,250000000
1245,BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [126]:
# Index the saved copy by the "Unnamed: 0" column so it shares row labels
# with pcs_df and the two frames can be aligned.
crypto_copy_new = crypto_df_copy.set_index("Unnamed: 0")

In [127]:
crypto_copy_new.head()

Unnamed: 0_level_0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0


In [128]:
# Combine the PCA/cluster columns with the original coin attributes;
# axis=1 aligns the rows of both frames on their shared index.
clustered_df = pd.concat([pcs_df,crypto_copy_new],axis=1)

In [129]:
clustered_df.head()

Unnamed: 0_level_0,PC 1,PC 2,PC 3,class,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
42,-0.335966,1.031222,-0.540236,3,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
404,-0.31944,1.031218,-0.540735,3,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
1337,2.288745,1.605273,-0.679396,3,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
BTC,-0.137501,-1.325989,0.223103,0,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,-0.144662,-2.08035,0.385651,0,Ethereum,Ethash,True,PoW,107684200.0,0


In [130]:
# IsTrading is not needed in the final clustered frame
clustered_df = clustered_df.drop(columns="IsTrading")

In [131]:
# Desired column order for the final clustered frame:
# coin attributes first, then the principal components, name and cluster label
cols = ["Algorithm","ProofType","TotalCoinsMined","TotalCoinSupply","PC 1","PC 2","PC 3","CoinName","class"]

In [132]:
# Reorder (and restrict) the columns to the layout listed in `cols`
clustered_df = clustered_df.loc[:, cols]

In [133]:
clustered_df.head()

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,Scrypt,PoW/PoS,41.99995,42,-0.335966,1.031222,-0.540236,42 Coin,3
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.31944,1.031218,-0.540735,404Coin,3
1337,X13,PoW/PoS,29279420000.0,314159265359,2.288745,1.605273,-0.679396,EliteCoin,3
BTC,SHA-256,PoW,17927180.0,21000000,-0.137501,-1.325989,0.223103,Bitcoin,0
ETH,Ethash,PoW,107684200.0,0,-0.144662,-2.08035,0.385651,Ethereum,0


## Visualizing Results

In [134]:
# Plot the clusters in the space of the three principal components.
# The integer cluster label is cast to string so Plotly Express treats it as
# a discrete category (one colour and legend entry per cluster) instead of a
# continuous colour scale; the legend positioning below relies on that.
plot_df = clustered_df.assign(**{"class": clustered_df["class"].astype(str)})
fig = px.scatter_3d(
    plot_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="class",
    hover_name="CoinName",
    hover_data=["Algorithm"],
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [135]:
# Interactive, sortable table of the tradable coins and their cluster
clustered_df.hvplot.table(
    columns=[
        'CoinName',
        'Algorithm',
        'ProofType',
        'TotalCoinSupply',
        'TotalCoinsMined',
        'class',
    ],
    sortable=True,
    selectable=True,
)

In [136]:
# Scatter of mined coins against total supply, one colour per cluster.
# TotalCoinSupply is loaded as object dtype (strings), so it is converted to
# float on a plotting copy; otherwise the y axis is not a numeric scale.
scatter_df = clustered_df.assign(
    TotalCoinSupply=clustered_df["TotalCoinSupply"].astype(float)
)
scatter_df.hvplot.scatter(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    hover_cols=["CoinName"],
    by="class",
)