# Clustering Crypto

In [140]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Fetching Cryptocurrency Data

In [167]:
# Read in CSV and create a DataFrame
crypto_data = pd.read_csv("C:/Users/erahm/unit13-challenge/ClusteringCrypto/crypto_data.csv")

# Keep only the necessary columns
crypto_data.columns = ["Index", "CoinName", "Algorithm", "IsTrading", "ProofType", "TotalCoinsMined","TotalCoinSupply"]
crypto_data.set_index("Index")

Unnamed: 0_level_0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [168]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_data
coins_name = crypto_data[['CoinName']].copy()
coins_name

Unnamed: 0,CoinName
0,42 Coin
1,365Coin
2,404Coin
3,SixEleven
4,808
...,...
1247,BitcoinPlus
1248,DivotyCoin
1249,Giotto Coin
1250,OpenSourceCoin


### Data Preprocessing

In [172]:
# Identify the cryptos that are not trading
index_names = crypto_data[ (crypto_data['IsTrading'] == 'FALSE')].index
  
# Drop any cryptos that are not trading
crypto_data.drop(index_names, inplace = True)

# Print dataframe 
crypto_data.set_index("Index")

Unnamed: 0_level_0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [174]:
# Remove the "IsTrading" column
crypto_df = crypto_data.drop('IsTrading', axis=1)
crypto_df.set_index("Index")

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...
XBC,BitcoinPlus,Scrypt,PoS,1.283270e+05,1000000
DVTC,DivotyCoin,Scrypt,PoW/PoS,2.149121e+07,100000000
GIOT,Giotto Coin,Scrypt,PoW/PoS,,233100000
OPSC,OpenSourceCoin,SHA-256,PoW/PoS,,21000000


In [175]:
# Part 1: Keep only the cryptocurrencies with a working algorithm

# Check for any null values in the 'Algorithm' column
crypto_df.isnull().sum()

Index                0
CoinName             0
Algorithm            0
ProofType            0
TotalCoinsMined    508
TotalCoinSupply      0
dtype: int64

In [176]:
# Part 2: Keep only the cryptocurrencies with a working algorithm 

# View all working algorithms -- all cryptos appear to have a working algorithm
crypto_df['Algorithm'].value_counts(normalize=True).to_string()

'Scrypt                    0.338658\nX11                       0.157348\nSHA-256                   0.104633\nX13                       0.045527\nPoS                       0.035144\nCryptoNight               0.031949\nQuark                     0.023962\nEquihash                  0.020767\nNeoScrypt                 0.016773\nNIST5                     0.015974\nDPoS                      0.015176\nEthash                    0.014377\nSHA-256D                  0.014377\nMultiple                  0.014377\nX15                       0.011182\nLyra2REv2                 0.007188\nBlake                     0.006390\nXEVAN                     0.005591\nLyra2Z                    0.005591\nLyra2RE                   0.004792\nSkein                     0.004792\nQuBit                     0.004792\nCryptoNight-V7            0.004792\nPHI1612                   0.004792\nScrypt-n                  0.004792\nSHA-512                   0.003994\nGroestl                   0.003994\nDagger                    0

In [178]:
# Drop all rows where there are 'N/A' text values
crypto_df.dropna().set_index("Index")

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,3.14159E+11
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000
XBC,BitcoinPlus,Scrypt,PoS,1.283270e+05,1000000


In [179]:
# Drop all null values (we know from above that only the TotalCoinsMined column had null values)
crypto_df.dropna(subset=['TotalCoinsMined'], inplace=True)
crypto_df.isnull().sum()

Index              0
CoinName           0
Algorithm          0
ProofType          0
TotalCoinsMined    0
TotalCoinSupply    0
dtype: int64

In [181]:
# Remove rows with cryptocurrencies having no coins mined
crypto_df.drop(crypto_df[crypto_df['TotalCoinsMined'] == '0'].index, inplace = True)
crypto_df.set_index("Index")

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,3.14159E+11
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000
XBC,BitcoinPlus,Scrypt,PoS,1.283270e+05,1000000


In [182]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coins_name = crypto_df[['CoinName']].copy()
coins_name

Unnamed: 0,CoinName
0,42 Coin
2,404Coin
4,808
5,EliteCoin
7,Bitcoin
...,...
1242,Gapcoin
1245,Beldex
1246,Horizen
1247,BitcoinPlus


In [183]:
# Remove the "CoinName" column
final_crypto_df = crypto_df.drop('CoinName', axis=1)
final_crypto_df.set_index("Index")

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,4.199995e+01,42
404,Scrypt,PoW/PoS,1.055185e+09,532000000
808,SHA-256,PoW/PoS,0.000000e+00,0
1337,X13,PoW/PoS,2.927942e+10,3.14159E+11
BTC,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Equihash,PoW,7.296538e+06,21000000
XBC,Scrypt,PoS,1.283270e+05,1000000


In [184]:
# Create new dataframe to create dummy variables for text features
dummy_variables = final_crypto_df[['Algorithm', 'ProofType']].copy()
dummy_variables

Unnamed: 0,Algorithm,ProofType
0,Scrypt,PoW/PoS
2,Scrypt,PoW/PoS
4,SHA-256,PoW/PoS
5,X13,PoW/PoS
7,SHA-256,PoW
...,...,...
1242,Scrypt,PoW/PoS
1245,CryptoNight,PoW
1246,Equihash,PoW
1247,Scrypt,PoS


In [185]:
# Create dummy variables for text features
X = pd.get_dummies(dummy_variables)

# View dummy variables for text features
dummy_variables_csv_ouput = pd.get_dummies(dummy_variables).to_csv(index=False)
dummy_variables_csv_ouput

'Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,Algorithm_Counterparty,Algorithm_CryptoNight,Algorithm_CryptoNight Heavy,Algorithm_CryptoNight Heavy X,Algorithm_CryptoNight-Lite,Algorithm_CryptoNight-V7,Algorithm_Cryptonight-GPU,Algorithm_DPoS,Algorithm_Dagger,Algorithm_Dagger-Hashimoto,Algorithm_ECC 256K1,Algorithm_Equihash,Algorithm_Equihash+Scrypt,Algorithm_Equihash1927,Algorithm_Ethash,Algorithm_Exosis,Algorithm_Green Protocol,Algorithm_Groestl,Algorithm_HMQ1725,Algorithm_HybridScryptHash256,Algorithm_IMesh,Algorithm_Jump Consistent Hash,Algorithm_Keccak,Algorithm_Leased POS,Algorithm_Lyra2RE,Algorithm_Lyra2REv2,Algorithm_Lyra2Z,Algorithm_M7 POW,Algorithm_Momentum,Algorithm_Multiple,Algorithm_NIST5,Algorithm_NeoScrypt,Algorithm_Ouroboros,Algorithm_PHI1612,Algorithm_PHI2,Algorithm_POS 2.0,Algorithm_POS 3.0,Algorithm_PoS,Algorithm_Proof-of-Autho

In [186]:
# Standardize data with StandardScaler
crypto_scaler = StandardScaler().fit_transform(X)
print(crypto_scaler[0:5])

[[-0.03668644 -0.03668644 -0.03668644 -0.03668644 -0.05191741 -0.09016696
  -0.03668644 -0.05191741 -0.05191741 -0.03668644 -0.03668644 -0.18257419
  -0.05191741 -0.03668644 -0.03668644 -0.08225509 -0.03668644 -0.0974575
  -0.06362848 -0.03668644 -0.03668644 -0.16620562 -0.03668644 -0.03668644
  -0.13848495 -0.03668644 -0.03668644 -0.07352146 -0.05191741 -0.03668644
  -0.03668644 -0.03668644 -0.06362848 -0.03668644 -0.07352146 -0.09016696
  -0.09016696 -0.03668644 -0.03668644 -0.12250233 -0.12803688 -0.15291752
  -0.03668644 -0.08225509 -0.03668644 -0.03668644 -0.06362848 -0.16188544
  -0.03668644 -0.03668644 -0.03668644 -0.07352146 -0.1786061  -0.32732684
  -0.03668644 -0.0974575  -0.08225509 -0.05191741 -0.03668644  1.42714214
  -0.06362848 -0.03668644 -0.03668644 -0.03668644 -0.08225509 -0.06362848
  -0.03668644 -0.03668644 -0.03668644 -0.03668644 -0.05191741 -0.03668644
  -0.03668644 -0.40984739 -0.03668644 -0.18257419 -0.03668644 -0.10425721
  -0.07352146 -0.03668644 -0.0974575  -

### Reducing Dimensions Using PCA

In [187]:
# Use PCA to reduce dimensions to 3 principal components:

# Initialize PCA model
pca = PCA(n_components=3)

# Get 3 principal components for the crypto data
crypto_pca = pca.fit_transform(crypto_scaler)

In [188]:
# Create a DataFrame with the principal components data
pcs_df = pd.DataFrame(data=crypto_pca, columns=["PC 1", "PC 2", "PC 3"])
pcs_df.head()

Unnamed: 0,PC 1,PC 2,PC 3
0,1.2368,-0.508706,0.011694
1,1.2368,-0.508706,0.011694
2,0.786454,-0.295457,-0.010037
3,2.036448,-0.534103,-0.016735
4,-1.50875,0.148868,-0.021265


In [189]:
# Merge Coin Name and PCS dataframes
pcs_final_df = pcs_df.merge(coins_name, left_index=True, right_index=True)
pcs_final_df.set_index('CoinName')

Unnamed: 0_level_0,PC 1,PC 2,PC 3
CoinName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42 Coin,1.236800,-0.508706,0.011694
404Coin,0.786454,-0.295457,-0.010037
808,-1.508750,0.148868,-0.021265
EliteCoin,-2.063012,0.274247,-0.014731
Bitcoin,1.437534,-0.447962,0.026176
...,...,...,...
Degas Coin,2.370576,16.222562,-8.192286
ZSEcoin,1.262581,-0.078382,-0.023223
HTML5 Coin,1.699798,-0.052474,-0.221435
Ultimate Secure Cash,1.236800,-0.508706,0.011694


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [190]:
# Finding the best value for k
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Creating the Elbow Curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

  f"KMeans is known to have a memory leak on Windows "


Running K-Means with `k=4`

In [191]:
# Predicting clusters with k=4

# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
predictions = model.predict(pcs_df)

# Add the predicted class columns
pcs_df["class"] = model.labels_
pcs_df

Unnamed: 0,PC 1,PC 2,PC 3,class
0,1.236800,-0.508706,0.011694,3
1,1.236800,-0.508706,0.011694,3
2,0.786454,-0.295457,-0.010037,3
3,2.036448,-0.534103,-0.016735,3
4,-1.508750,0.148868,-0.021265,0
...,...,...,...,...
739,1.236800,-0.508706,0.011694,3
740,-2.280004,0.306556,-0.019165,0
741,-2.081297,0.307006,-0.017022,0
742,0.725953,-0.238083,0.005468,3


In [193]:
# Create a new DataFrame named clustered_df
clustered_df = crypto_df.merge(pcs_df, left_index=True, right_index=True)
clustered_df.set_index("Index")

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,class
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42,1.236800,-0.508706,0.011694,3
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000,0.786454,-0.295457,-0.010037,3
808,808,SHA-256,PoW/PoS,0.000000e+00,0,-1.508750,0.148868,-0.021265,0
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,3.14159E+11,-2.063012,0.274247,-0.014731,0
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000,1.437534,-0.447962,0.026176,3
...,...,...,...,...,...,...,...,...,...
DEA,Degas Coin,Scrypt,PoW/PoS,2.135876e+07,105000000,2.370576,16.222562,-8.192286,1
ZSE,ZSEcoin,X11,PoW/PoS,0.000000e+00,2093500000,1.262581,-0.078382,-0.023223,3
HTML5,HTML5 Coin,X15,PoW/PoS,4.065902e+10,90000000000,1.699798,-0.052474,-0.221435,3
USC,Ultimate Secure Cash,SHA-256,PoS,1.034311e+07,200084200,1.236800,-0.508706,0.011694,3


### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [207]:
# Plot the PC clusters
pcs_df.hvplot.scatter(
    x="PC 1",
    y="PC 2",
    hover_cols=["class"],
    by="class",
)

In [201]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
clustered_df.hvplot.scatter(
   x="TotalCoinsMined",
   y="TotalCoinSupply",
   by="class",
   hover_cols=["CoinName"]
)

#### Table of Tradable Cryptocurrencies

In [196]:
# Table with tradable cryptos
clustered_df.hvplot.table()

In [213]:
# Print the total number of tradable cryptocurrencies
# The following code shows that there are 374 unique Coins or tradable cryptocurrencies:
clustered_df['CoinName'].nunique()

374