# Clustering Crypto

In [100]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import urllib.request, json 
from pathlib import Path

from returns.result import Result, safe

In [101]:
import sys
if "../lib" not in sys.path:
    print("Adding '../lib' to the path")
    sys.path.append("../lib")
import pandasPalmer as pp
import NLTK.fns as nl
import Classification.fns as cls


### Fetching Cryptocurrency Data

In [102]:
# Use the following endpoint to fetch json data
url_site = "https://min-api.cryptocompare.com/data/all/coinlist"

with urllib.request.urlopen(url_site) as url:
    data = json.loads(url.read().decode())

    if data["Response"]=="Success":
        all_data = data
    else:
        print("NOT ABLE TO READ THE COIN DATA!!!")

In [103]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.

df = pd.DataFrame(data=all_data["Data"]).T
df.head(2)

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,MaxSupply,MktCapPenalty,IsUsedInDefi,IsUsedInNft,PlatformType,BuiltOn,SmartContractAddress,DecimalPoints,Difficulty,CirculatingSupply
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Everything about 42 coin is 42 - apart from th...,,...,0,0,0,0,,,,,,
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),300 token is an ERC20 token. This Token was cr...,,...,300,0,0,0,token,ETH,0xaec98a708810414878c3bcdf46aad31ded4a4557,18.0,,


In [104]:
# Alternatively, use the provided csv file:
file_path = Path("./Resources/crypto_data.csv")

# Create a DataFrame
df_csv = pd.read_csv(file_path)
df_csv.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


### Data Preprocessing

In [105]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
df = df_csv[['CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply']]  #,'TotalCoinSupply'
df.head(2)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.999954,42
1,365Coin,X11,True,PoW/PoS,,2300000000


In [106]:
# Keep only cryptocurrencies that are trading
trading_status = df.IsTrading == True
df = df[trading_status]
df.head(2)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.999954,42
1,365Coin,X11,True,PoW/PoS,,2300000000


In [107]:
# Keep only cryptocurrencies with a working algorithm
working_algo = df.Algorithm != "N/A"
df = df[working_algo]
df.head(2)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.999954,42
1,365Coin,X11,True,PoW/PoS,,2300000000


In [108]:
# Remove the 'IsTrading' column
if ("IsTrading" in df.columns):
    df.drop(axis='columns', columns='IsTrading', inplace=True)
else:
    print("The 'IsTrading' Column is already deleted")
df.head(2)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.999954,42
1,365Coin,X11,PoW/PoS,,2300000000


In [109]:
# Remove rows with at least 1 null value
df.dropna(inplace=True)
df.isnull().sum()
_ = [print(f"Column \t'{col}'   \thas {df[col].isnull().sum()} null values") for col in df.columns]

Column 	'CoinName'   	has 0 null values
Column 	'Algorithm'   	has 0 null values
Column 	'ProofType'   	has 0 null values
Column 	'TotalCoinsMined'   	has 0 null values
Column 	'TotalCoinSupply'   	has 0 null values


In [110]:
# Remove rows with cryptocurrencies having no coins mined
df = df[df.TotalCoinsMined>0]
df.head(2)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000


In [111]:
# Drop rows where there are 'N/A' text values
Xnavalues = (df.CoinName!="N/A") & (df.Algorithm!="N/A") & (df.ProofType!="N/A")
df = df[Xnavalues]
df.head(2)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000


In [112]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df 
crypto_df = df.copy(deep=True)
coin_name_df = pd.DataFrame(data=df.CoinName)
coin_name_df.head(2)

Unnamed: 0,CoinName
0,42 Coin
2,404Coin


In [113]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
if ("CoinName" in crypto_df.columns):
    crypto_df.drop(axis="columns",columns="CoinName",inplace=True)
else:
    print("CoinName already deleted")
crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,41.99995,42
2,Scrypt,PoW/PoS,1055185000.0,532000000
5,X13,PoW/PoS,29279420000.0,314159265359
7,SHA-256,PoW,17927180.0,21000000
8,Ethash,PoW,107684200.0,0


In [114]:
# Create dummy variables for text features
crypto_df_enc = pd.get_dummies(data=crypto_df, columns=['Algorithm', 'ProofType'])
crypto_df_enc.head()


Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [115]:
# Standardize data
from sklearn.preprocessing import StandardScaler
crypto_scaled = StandardScaler().fit_transform(crypto_df_enc)

print("Scaled", crypto_scaled[0:5])


Scaled [[-0.11710817 -0.1528703  -0.0433963  -0.0433963  -0.0433963  -0.06142951
  -0.07530656 -0.0433963  -0.06142951 -0.06142951 -0.0433963  -0.0433963
  -0.19245009 -0.06142951 -0.09740465 -0.0433963  -0.11547005 -0.07530656
  -0.0433963  -0.0433963  -0.15191091 -0.0433963  -0.13118084 -0.0433963
  -0.0433963  -0.08703883 -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.06142951 -0.0433963  -0.08703883 -0.08703883 -0.08703883 -0.0433963
  -0.13118084 -0.13840913 -0.13840913 -0.0433963  -0.06142951 -0.0433963
  -0.07530656 -0.18168574 -0.0433963  -0.0433963  -0.0433963  -0.07530656
  -0.15826614 -0.31491833 -0.0433963  -0.08703883 -0.07530656 -0.06142951
   1.38675049 -0.0433963  -0.0433963  -0.06142951 -0.0433963  -0.0433963
  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.39879994 -0.0433963  -0.18168574 -0.0433963  -0.08703883 -0.08703883
  -0.10680283 -0.0433963  -0.13118084 -0.0433963  -0.0433963  -0.0433963
  -0.0433963  -0.07530656 -0.43911856 -

### Reducing Dimensions Using PCA

In [116]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components=3)

crypto_pca = pca.fit_transform(crypto_scaled)

In [117]:
# Create a DataFrame with the principal components data
df_crypto_pca = pd.DataFrame(
    data = crypto_pca
    , columns=[f"Principal Component {x}" for x in range(1,4)]
)
df_crypto_pca.head()

Unnamed: 0,Principal Component 1,Principal Component 2,Principal Component 3
0,-0.316153,1.03032,-0.573882
1,-0.299451,1.030535,-0.574254
2,2.305465,1.652208,-0.619858
3,-0.152958,-1.322305,0.207718
4,-0.165774,-2.022585,0.40412


### Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [118]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    x = KMeans(n_clusters=i, random_state=0)
    x.fit(df_crypto_pca)
    inertia.append(x.inertia_)



# Create the Elbow Curve using hvPlot
lbow_data = {"k":k, "inertia":inertia}
df_lbow = pd.DataFrame(lbow_data)
df_lbow.hvplot(
    x="k"
    , y="inertia"
    , xticks=k
    , title="Cryto Currency Elbow Curce"
)


Running K-Means with `k=<your best value for k here>`

In [119]:
# Initialize the K-Means model
mdl = KMeans(n_clusters=4, random_state=0)

# Fit the model
mdl.fit(df_crypto_pca)

# Predict clusters
predict = mdl.predict(df_crypto_pca)

# Create a new DataFrame including predicted clusters and cryptocurrencies features
df_crypto_pca["class"] = mdl.labels_
df_result = coin_name_df.join(crypto_df).join(df_crypto_pca)
df_result
# df_result.to_csv("./CheckTheFile.csv")
# coin_name_df.shape,crypto_df.shape,df_crypto_pca.shape
# df_crypto_pca


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,Principal Component 1,Principal Component 2,Principal Component 3,class
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42,-0.316153,1.030320,-0.573882,0.0
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000,2.305465,1.652208,-0.619858,0.0
5,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359,-0.148425,-1.120938,-0.029950,2.0
7,Bitcoin,SHA-256,PoW,1.792718e+07,21000000,-0.158365,-2.280507,0.402612,2.0
8,Ethereum,Ethash,PoW,1.076842e+08,0,-0.164213,-2.022680,0.404103,2.0
...,...,...,...,...,...,...,...,...,...
1238,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000,,,,
1242,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000,,,,
1245,Beldex,CryptoNight,PoW,9.802226e+08,1400222610,,,,
1246,Horizen,Equihash,PoW,7.296538e+06,21000000,,,,


### Visualizing Results

#### 3D-Scatter with Clusters

In [120]:
# Create a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    df_result.dropna()
    , x="Principal Component 1"
    , y="Principal Component 2"
    , z="Principal Component 3"
    , symbol="class", color="class"
)
fig.show()


#### Table of Tradable Cryptocurrencies

In [121]:
# Table with tradable cryptos
#.loc[:,'CoinName':'TotalCoinSupply']
col = [x for x in df_result.columns[0:5]]
tbl = df_result.dropna().hvplot.table(col)
tbl

In [122]:
# Print the total number of tradable cryptocurrencies
print(f"There are {len(coin_name_df)} tradable currencies")

There are 532 tradable currencies


#### Scatter Plot with Tradable Cryptocurrencies

In [124]:
# Scale data to create the scatter plot
df_result.TotalCoinsMined = (df_result.TotalCoinsMined.astype(float)) * 1.0e-8
df_result.TotalCoinSupply = (df_result.TotalCoinSupply.astype(float)) * 1.0e-8


In [127]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
df_result.hvplot.scatter(
    x="TotalCoinsMined"
    , y = "TotalCoinSupply"
    , colormap = "Algorithm"
)
