# Clustering Crypto

In [32]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [33]:
from pathlib import Path

### Fetching Cryptocurrency Data

In [34]:
# Use the following endpoint to fetch json data
#url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [35]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.

In [36]:
# Alternatively, use the provided csv file:
file_path = Path("Resources/crypto_data.csv")

# Create a DataFrame
cryptoDf = pd.read_csv(file_path)

In [37]:
cryptoDf

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...,...
1247,XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
1248,DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
1249,GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


### Data Preprocessing

In [38]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
cryptoDf = cryptoDf.drop(["Unnamed: 0"], axis = 1)

In [39]:
# Keep only cryptocurrencies that are trading
cryptoDf = cryptoDf[cryptoDf["IsTrading"]==True]

In [40]:
# Keep only cryptocurrencies with a working algorithm
cryptoDf.dropna(inplace=True)

In [41]:
# Remove the "IsTrading" column
cryptoDf.drop(columns=['IsTrading'], inplace=True)

In [42]:
# Remove rows with at least 1 null value
cryptoDf.dropna(axis=0, inplace=True)

In [43]:
# Remove rows with cryptocurrencies having no coins mined
cryptoDfCleaned = cryptoDf[cryptoDf['TotalCoinsMined'] != 0]

In [44]:
# Drop rows where there are 'N/A' text values
for i in cryptoDfCleaned.columns:
    cryptoDfCleaned = cryptoDfCleaned[cryptoDfCleaned[i] != "N/A"]

cryptoDfCleaned

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
5,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
7,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
8,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
1238,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
1242,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,PoW,7.296538e+06,21000000


In [45]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coinNameDf = cryptoDf["CoinName"]

In [46]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
cryptoDfCleaned.drop(columns=["CoinName"], inplace=True)

In [47]:
cryptoDfCleaned = cryptoDfCleaned.dropna()
cryptoDfCleaned

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,4.199995e+01,42
2,Scrypt,PoW/PoS,1.055185e+09,532000000
5,X13,PoW/PoS,2.927942e+10,314159265359
7,SHA-256,PoW,1.792718e+07,21000000
8,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
1238,SHA-256,DPoS,2.000000e+09,2000000000
1242,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,CryptoNight,PoW,9.802226e+08,1400222610
1246,Equihash,PoW,7.296538e+06,21000000


In [48]:
# Create dummy variables for text features
#cryptoDummy = pd.get_dummies(cryptoDf['Algorithm'], cryptodf['ProofType'])
cryptoDummy = pd.get_dummies(cryptoDfCleaned, columns=["Algorithm", "ProofType"])
cryptoDummy

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1242,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1245,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1246,7.296538e+06,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Standardize data
standardizedData = pd.DataFrame()
# define standard scaler
scaler = StandardScaler()
  
# transform data
standardizedData = scaler.fit_transform(cryptoDummy)

In [50]:
standardizedData

array([[-0.11674788, -0.15286468, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.09358885, -0.14499604, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [ 0.52587231,  4.4937636 , -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       ...,
       [-0.09523411, -0.13215444, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.11658774, -0.15255408, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.11674507, -0.15284989, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ]])

### Reducing Dimensions Using PCA

In [51]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components=3)

In [52]:
standardizedData = pca.fit_transform(standardizedData)

In [53]:
standardizedData[:5]

array([[-0.33661823,  0.93422627, -0.57239226],
       [-0.31992511,  0.93429835, -0.57273687],
       [ 2.30620278,  1.62485948, -0.63540542],
       [-0.14716113, -1.24158667,  0.13363909],
       [-0.14736387, -1.98572594,  0.41417082]])

In [54]:
#I still want to keep PCA explained variance ratio stored somewhere.
pcaEVarianceRatio = pca.explained_variance_ratio_
pcaEVarianceRatio 

array([0.02737271, 0.02094718, 0.02006935])

In [55]:
# Create a DataFrame with the principal components data
principalCompDf = pd.DataFrame(standardizedData, columns=["PCA1", "PCA2", "PCA3"]).dropna()
principalCompDf

Unnamed: 0,PCA1,PCA2,PCA3
0,-0.336618,0.934226,-0.572392
1,-0.319925,0.934298,-0.572737
2,2.306203,1.624859,-0.635405
3,-0.147161,-1.241587,0.133639
4,-0.147364,-1.985726,0.414171
...,...,...,...
528,2.463036,0.911719,0.030989
529,-0.334661,0.934108,-0.572420
530,0.325273,-2.213841,0.355860
531,-0.148144,-2.150272,0.610978


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [56]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    kModel = KMeans(n_clusters=i, random_state=0)
    kModel.fit(principalCompDf)
    inertia.append(kModel.inertia_)

elbowData = {"k": k, "inertia": inertia}

elbowDf = pd.DataFrame(elbowData)

elbowDf.hvplot.line(
    x= "k",
    y= "inertia",
    title= "Elbow Curve",
    xticks=k
)


  f"KMeans is known to have a memory leak on Windows "


Running K-Means with `k=<your best value for k here>`

In [57]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)
# Fit the model
model.fit(principalCompDf)
# Predict clusters
clusterPredict = model.predict(principalCompDf)
clusterPredict = pd.DataFrame(clusterPredict)
# Create a new DataFrame including predicted clusters and cryptocurrencies features
cryptoWithPredicted = pd.concat([cryptoDfCleaned, principalCompDf, coinNameDf, clusterPredict], axis=1)

In [58]:
#dropping the n/a and renaming the 0 
cryptoWithPredicted = pd.DataFrame(cryptoWithPredicted.dropna())
cryptoWithPredicted.rename(columns={0:'Predictions'}, inplace=True)

### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [59]:
# Scale data to create the scatter plot


In [60]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
cryptoWithPredicted.hvplot.scatter(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    by= "Predictions"
)

#### Table of Tradable Cryptocurrencies

In [61]:
# Table with tradable cryptos
cryptoWithPredicted

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PCA1,PCA2,PCA3,CoinName,Predictions
0,Scrypt,PoW/PoS,4.199995e+01,42,-0.336618,0.934226,-0.572392,42 Coin,0.0
2,Scrypt,PoW/PoS,1.055185e+09,532000000,2.306203,1.624859,-0.635405,404Coin,0.0
5,X13,PoW/PoS,2.927942e+10,314159265359,-0.164105,-1.187157,0.001882,EliteCoin,1.0
7,SHA-256,PoW,1.792718e+07,21000000,-0.148144,-2.163900,0.440512,Bitcoin,1.0
8,Ethash,PoW,1.076842e+08,0,-0.145804,-1.985827,0.414149,Ethereum,1.0
...,...,...,...,...,...,...,...,...,...
516,X13,PoS,2.500124e+06,2500124,-0.549965,3.677848,15.511138,RoyalCoin,3.0
522,X14,PoW/PoS,1.000000e+08,100000000,3.718555,1.970833,0.181366,GanjaCoin V2,0.0
523,PoS,PoS,1.781868e+07,301000000,-0.248462,0.689989,-0.163967,TeamUP,0.0
525,SHA-256D,PoW/PoS,1.082163e+09,7506000000,-0.330677,1.616972,-0.135912,LanaCoin,0.0


In [62]:
# Print the total number of tradable cryptocurrencies
