# Clustering Crypto

In [28]:
# Initial imports
import pandas as pd, numpy as np
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

### Deliverable 1: Preprocessing the Data for PCA

In [29]:
# Load the crypto_data.csv dataset.
# YOUR CODE HERE
file_path = "crypto_data.csv"
crypto_df = pd.read_csv(file_path, index_col = 0)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [30]:
# Keep all the cryptocurrencies that are being traded.
# YOUR CODE HERE
crypto_df = crypto_df[crypto_df['IsTrading']==True]
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [31]:
# Keep all the cryptocurrencies that have a working algorithm.
# YOUR CODE HERE
pd.isna(crypto_df['Algorithm'])

42      False
365     False
404     False
611     False
808     False
        ...  
SERO    False
UOS     False
BDX     False
ZEN     False
XBC     False
Name: Algorithm, Length: 1144, dtype: bool

In [32]:
# Remove the "IsTrading" column. 
# YOUR CODE HERE
crypto_df.drop(columns=["IsTrading"], inplace=True)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0


In [33]:
# Remove rows that have at least 1 null value.
# YOUR CODE HERE
crypto_df=crypto_df.dropna()
crypto_df.count()

CoinName           685
Algorithm          685
ProofType          685
TotalCoinsMined    685
TotalCoinSupply    685
dtype: int64

In [34]:
# Keep the rows where coins are mined.
# YOUR CODE HERE
crypto_df = crypto_df[crypto_df.TotalCoinsMined > 0]
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [35]:
# Create a new DataFrame that holds only the cryptocurrencies names.
# YOUR CODE HERE
coin_name_df = crypto_df[['CoinName']].set_index([crypto_df.index])
coin_name_df

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
ZEPH,ZEPHYR
GAP,Gapcoin
BDX,Beldex
ZEN,Horizen


In [36]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
# YOUR CODE HERE
crypto_df.drop(columns=["CoinName"], inplace=True)
crypto_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,4.199995e+01,42
404,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,X13,PoW/PoS,2.927942e+10,314159265359
BTC,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Equihash,PoW,7.296538e+06,21000000


In [37]:
# Use get_dummies() to create variables for text features.
# YOUR CODE HERE
crypto = pd.get_dummies(crypto_df['Algorithm'])
dummy = pd.get_dummies(crypto_df['ProofType'])
combined = pd.concat([crypto,dummy], axis= 1)
df = crypto_df.merge(combined,left_index = True,right_index = True)
df = df.drop(['Algorithm','ProofType'], axis =1)
df

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,1GB AES Pattern Search,536,Argon2d,BLAKE256,Blake,Blake2S,Blake2b,C11,...,PoW/PoS,PoW/PoS.1,PoW/PoW,PoW/nPoS,Pos,Proof of Authority,Proof of Trust,TPoS,Zero-Knowledge Proof,dPoW/PoW
42,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZEPH,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GAP,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BDX,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEN,7.296538e+06,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Standardize the data with StandardScaler().
# YOUR CODE HERE
df_scaled = StandardScaler().fit_transform(df)
print(df_scaled)

[[-0.11710817 -0.1528703  -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.09396955 -0.145009   -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [ 0.52494561  4.48942416 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 ...
 [-0.09561336 -0.13217937 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.11694817 -0.15255998 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.11710536 -0.15285552 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]]


### Deliverable 2: Reducing Data Dimensions Using PCA

In [39]:
# Using PCA to reduce dimension to three principal components.
# YOUR CODE HERE
pca = PCA(n_components=3)
df_pca = pca.fit_transform(df_scaled)
df_pca

array([[-0.33600876,  1.01755112, -0.55252945],
       [-0.31934401,  1.01785658, -0.55271896],
       [ 2.29327941,  1.7361333 , -0.63687758],
       ...,
       [ 0.32162806, -2.313605  ,  0.36990902],
       [-0.16307962, -2.03311417,  0.3635631 ],
       [-0.28314038,  0.80834823, -0.23408205]])

In [40]:
# Create a DataFrame with the three principal components.
# YOUR CODE HERE
pcs_df = pd.DataFrame(
    data = df_pca, columns = ['PC1', 'PC2', 'PC3'], index = crypto_df.index)

pcs_df

Unnamed: 0,PC1,PC2,PC3
42,-0.336009,1.017551,-0.552529
404,-0.319344,1.017857,-0.552719
1337,2.293279,1.736133,-0.636878
BTC,-0.139994,-1.340185,0.177835
ETH,-0.144831,-2.046274,0.379615
...,...,...,...
ZEPH,2.478218,0.697841,0.016341
GAP,-0.334054,1.017450,-0.552542
BDX,0.321628,-2.313605,0.369909
ZEN,-0.163080,-2.033114,0.363563


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [41]:
# Create an elbow curve to find the best value for K.
# YOUR CODE HERE
inertia = []
k = list(range(1, 11))
# Calculate the inertia for the range of K values
for i in k:
   km = KMeans(n_clusters=i, random_state=0)
   km.fit(pcs_df)
   inertia.append(km.inertia_)

elbow_data = {"k":k,"inertia":inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k",y="inertia",xticks=k,title="Elbow Curve")

  "KMeans is known to have a memory leak on Windows "


Running K-Means with `k=4`

In [42]:
# Initialize the K-Means model.
# YOUR CODE HERE
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
# YOUR CODE HERE
model.fit(pcs_df)

# Predict clusters
# YOUR CODE HERE
predictions = model.fit_predict(pcs_df)

In [55]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies.
# YOUR CODE HERE

clustered_df = pd.concat([crypto_df,pcs_df,coin_name_df],axis = 1)

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
# YOUR CODE HERE

clustered_df["Class"]= model.labels_

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(532, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42,-0.336009,1.017551,-0.552529,42 Coin,0
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.319344,1.017857,-0.552719,404Coin,0
1337,X13,PoW/PoS,29279420000.0,314159265359,2.293279,1.736133,-0.636878,EliteCoin,0
BTC,SHA-256,PoW,17927180.0,21000000,-0.139994,-1.340185,0.177835,Bitcoin,3
ETH,Ethash,PoW,107684200.0,0,-0.144831,-2.046274,0.379615,Ethereum,3
LTC,Scrypt,PoW,63039240.0,84000000,-0.163655,-1.149095,-0.01306,Litecoin,3
DASH,X11,PoW/PoS,9031294.0,22000000,-0.401747,1.30657,-0.496752,Dash,0
XMR,CryptoNight-V7,PoW,17201140.0,0,-0.139758,-2.254926,0.442767,Monero,3
ETC,Ethash,PoW,113359700.0,210000000,-0.143273,-2.046362,0.379606,Ethereum Classic,3
ZEC,Equihash,PoW,7383056.0,21000000,-0.163079,-2.033114,0.363563,ZCash,3


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [56]:
# Creating a 3D-Scatter with the PCA data and the clusters
# YOUR CODE HERE
fig = px.scatter_3d(
    clustered_df,
    x="PC1",
    y="PC2",
    z="PC3",
    color="Class",
    symbol="Class",
    width=800,
    hover_name="CoinName",
    hover_data=["Algorithm"],
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [57]:
# Create a table with tradable cryptocurrencies.
# YOUR CODE HERE
clustered_df.hvplot.table(sortable=True)

In [58]:
# Print the total number of tradable cryptocurrencies.
# YOUR CODE HERE
print(clustered_df.count) 

<bound method DataFrame.count of         Algorithm ProofType  TotalCoinsMined TotalCoinSupply       PC1  \
42         Scrypt   PoW/PoS     4.199995e+01              42 -0.336009   
404        Scrypt   PoW/PoS     1.055185e+09       532000000 -0.319344   
1337          X13   PoW/PoS     2.927942e+10    314159265359  2.293279   
BTC       SHA-256       PoW     1.792718e+07        21000000 -0.139994   
ETH        Ethash       PoW     1.076842e+08               0 -0.144831   
...           ...       ...              ...             ...       ...   
ZEPH      SHA-256      DPoS     2.000000e+09      2000000000  2.478218   
GAP        Scrypt   PoW/PoS     1.493105e+07       250000000 -0.334054   
BDX   CryptoNight       PoW     9.802226e+08      1400222610  0.321628   
ZEN      Equihash       PoW     7.296538e+06        21000000 -0.163080   
XBC        Scrypt       PoS     1.283270e+05         1000000 -0.283140   

           PC2       PC3     CoinName  Class  
42    1.017551 -0.552529      4

In [59]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
# YOUR CODE HERE
scaled = MinMaxScaler().fit_transform(clustered_df[["TotalCoinSupply","TotalCoinsMined"]])
print(scaled)

[[4.20000000e-11 0.00000000e+00]
 [5.32000000e-04 1.06585544e-03]
 [3.14159265e-01 2.95755135e-02]
 ...
 [1.40022261e-03 9.90135079e-04]
 [2.10000000e-05 7.37028150e-06]
 [1.00000000e-06 1.29582282e-07]]


In [60]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
# YOUR CODE HERE

new_df = pd.DataFrame(    
    data = scaled,columns = ["TotalCoinSuppy","TotalCoinsMined"], index = clustered_df.index
)

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
# YOUR CODE HERE

new_df = pd.concat([new_df,clustered_df['CoinName']],axis = 1)


# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
# YOUR CODE HERE

plot_df = pd.concat([new_df,clustered_df["Class"]],axis = 1)

plot_df.head(10)

Unnamed: 0,TotalCoinSuppy,TotalCoinsMined,CoinName,Class
42,4.2e-11,0.0,42 Coin,0
404,0.000532,0.001066,404Coin,0
1337,0.3141593,0.029576,EliteCoin,0
BTC,2.1e-05,1.8e-05,Bitcoin,3
ETH,0.0,0.000109,Ethereum,3
LTC,8.4e-05,6.4e-05,Litecoin,3
DASH,2.2e-05,9e-06,Dash,0
XMR,0.0,1.7e-05,Monero,3
ETC,0.00021,0.000115,Ethereum Classic,3
ZEC,2.1e-05,7e-06,ZCash,3


In [61]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
# YOUR CODE HERE

plot_df.hvplot.scatter(
    x="TotalCoinsMined", 
    y="TotalCoinSuppy",
    hover_cols=['CoinName'],
    by='Class', 
    title= "ScaledCoinsValue")
