# Clustering Crypto

In [2]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup
import scipy
#import xarray
import seaborn as sns
import panel as pn
#pn.extension('plotly')
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
from pandas_profiling import ProfileReport
import sweetviz
import dtale
%matplotlib inline
import plotly.express as px
import datetime as dt

### Fetching Cryptocurrency Data

In [3]:
# Use the following endpoint to fetch json data
#url = "https://min-api.cryptocompare.com/data/all/coinlist"
#url_data = requests.get(url)
#url_data.json()['Data']
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.
#crypto_df=pd.DataFrame.from_dict(url_data.json()['Data'], orient='index')
#crypto_df.head()
#crypto_df.columns

# The API response does not contain a TotalCoinSupply column, so the supplied
# CSV file is used for the rest of the analysis.

In [24]:
# Load the coin list from the CSV file that ships with the notebook
# (used instead of the live API endpoint).
file_path = Path("./Data/crypto_data.csv")

# Read the file into a DataFrame and preview the first rows
crypto_df = pd.read_csv(filepath_or_buffer=file_path)
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


### Data Preprocessing

In [25]:
# Restrict the frame to the columns used by the analysis; .copy() gives an
# independent frame so later edits do not touch crypto_df.
crypto_df_c = crypto_df.loc[
    :,
    [
        'CoinName',
        'Algorithm',
        'IsTrading',
        'ProofType',
        'TotalCoinsMined',
        'TotalCoinSupply',
    ],
].copy()
crypto_df_c.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.0,0


In [53]:
# Data type check: list each column's dtype and non-null count
crypto_df_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1252 entries, 0 to 1251
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CoinName         1252 non-null   object 
 1   Algorithm        1252 non-null   object 
 2   IsTrading        1252 non-null   bool   
 3   ProofType        1252 non-null   object 
 4   TotalCoinsMined  744 non-null    float64
 5   TotalCoinSupply  1252 non-null   object 
dtypes: bool(1), float64(1), object(4)
memory usage: 50.3+ KB


In [None]:
# TotalCoinSupply is loaded as text (object dtype); cast it to float so it
# can be used as a numeric feature alongside TotalCoinsMined.
crypto_df_c = crypto_df_c.astype({'TotalCoinSupply': float})

In [30]:
# Keep only cryptocurrencies that are trading.
# IsTrading is a boolean column, so it can be used directly as the row mask.
# .copy() makes crypto_df_t an independent frame: later cells drop columns
# from it, and doing that on a bare boolean-mask slice is the pattern that
# triggers pandas' SettingWithCopyWarning.
crypto_df_t = crypto_df_c.loc[crypto_df_c['IsTrading']].copy()


In [31]:
# List the distinct Algorithm values among the trading cryptocurrencies
crypto_df_t['Algorithm'].unique()

array(['Scrypt', 'X11', 'SHA-256', 'X13', 'Ethash', 'CryptoNight-V7',
       'Equihash', 'SHA-512', 'Multiple', 'X15', 'NIST5', 'Quark',
       'Groestl', 'PoS', 'NeoScrypt', 'SHA3', 'HybridScryptHash256',
       'Scrypt-n', 'PHI1612', 'Lyra2REv2', 'CryptoNight', 'Shabal256',
       'Counterparty', 'Blake', 'Momentum', 'Stanford Folding', 'QuBit',
       'XG Hash', 'M7 POW', 'Curve25519', 'Lyra2RE', 'QUAIT', 'vDPOS',
       'Blake2b', 'BLAKE256', '1GB AES Pattern Search', 'Dagger',
       'CryptoNight-Lite', 'X11GOST', 'SHA-256D', 'POS 3.0',
       'Progressive-n', 'DPoS', 'Lyra2Z', 'X14', 'Time Travel', 'Argon2',
       'Keccak', 'Blake2S', 'Dagger-Hashimoto', '536', 'Argon2d',
       'Cloverhash', 'Skein', 'SkunkHash v2 Raptor',
       'VeChainThor Authority', 'Ouroboros', 'POS 2.0', 'SkunkHash',
       'C11', 'Proof-of-BibleHash', 'SHA-256 + Hive',
       'Proof-of-Authority', 'XEVAN', 'VBFT', 'YescryptR16', 'IMesh',
       'Green Protocol', 'Semux BFT consensus', 'X16R', 'Tribus',


In [None]:
# Keep only cryptocurrencies with a working algorithm
# NOTE: this cell only lists the distinct Algorithm values for manual
# inspection; it does not remove any rows from crypto_df_t.
crypto_df_t['Algorithm'].unique()
# Each entry seems to have a valid Algorithm

In [32]:
# Remove the "IsTrading" column.
# Rebind the name instead of using inplace=True, so the drop never mutates a
# frame derived from a filter. errors='ignore' keeps the cell safe to re-run
# after the column is already gone.
crypto_df_t = crypto_df_t.drop(columns='IsTrading', errors='ignore')

In [37]:
# Drop every row that has a missing value in any column, showing the
# per-column null counts before and after, then the resulting shape.
display(crypto_df_t.isna().sum())
crypto_df_n = crypto_df_t.dropna()
display(crypto_df_n.isna().sum())
display(crypto_df_n.shape)

CoinName             0
Algorithm            0
ProofType            0
TotalCoinsMined    459
TotalCoinSupply      0
dtype: int64

CoinName           0
Algorithm          0
ProofType          0
TotalCoinsMined    0
TotalCoinSupply    0
dtype: int64

In [41]:
# Keep only cryptocurrencies with a strictly positive number of mined coins
has_mined_coins = crypto_df_n['TotalCoinsMined'].gt(0)
crypto_df_m = crypto_df_n[has_mined_coins]

In [49]:
# Drop rows where any column holds the literal text 'N/A'
contains_na_text = crypto_df_m.isin(['N/A']).any(axis=1)
crypto_df_na = crypto_df_m[~contains_na_text]



In [50]:
# Set the coin names aside (as a Series sharing the feature rows' index)
# before 'CoinName' is dropped from the feature frame
crypto_df_coinname = crypto_df_na.loc[:, 'CoinName'].copy()
crypto_df_coinname.head()

0      42 Coin
2      404Coin
5    EliteCoin
7      Bitcoin
8     Ethereum
Name: CoinName, dtype: object

In [51]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
# crypto_df_na comes from a boolean-mask filter, so rebind the name rather
# than dropping in place (avoids mutating a slice). errors='ignore' keeps the
# cell safe to re-run once the column has been removed.
crypto_df_na = crypto_df_na.drop(columns='CoinName', errors='ignore')

In [57]:
# Create dummy variables for text features
#
# Every remaining object-dtype column is one-hot encoded in a single
# pd.get_dummies call. With `columns=...` the non-encoded columns keep their
# position, the dummy columns are appended in column order and are named
# "<column>_<value>", and the source text columns are removed.
# This replaces a per-column merge-on-index loop, which would multiply rows
# if index labels ever repeated and left a throwaway frame in the namespace.
categorical_columns = crypto_df_na.select_dtypes(["object"]).columns

# Nothing to encode when the cell is re-run on an already-encoded frame.
if len(categorical_columns) > 0:
    crypto_df_na = pd.get_dummies(crypto_df_na, columns=list(categorical_columns))

In [58]:
# Display the feature frame after the text columns have been replaced by dummy columns
crypto_df_na

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,4.199995e+01,4.200000e+01,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1.055185e+09,5.320000e+08,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,2.927942e+10,3.141593e+11,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1.792718e+07,2.100000e+07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1.076842e+08,0.000000e+00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,2.000000e+09,2.000000e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1242,1.493105e+07,2.500000e+08,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1245,9.802226e+08,1.400223e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1246,7.296538e+06,2.100000e+07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Standardize data
# Creating the scaler alone does not transform anything: fit it on the
# encoded feature frame and keep the standardized values for the PCA step.
# fit_transform returns an array by default, not a DataFrame.
scale = StandardScaler()
crypto_scaled = scale.fit_transform(crypto_df_na)


### Reducing Dimensions Using PCA

In [None]:
# Use PCA to reduce dimensions to 3 principal components


In [None]:
# Create a DataFrame with the principal components data


### Clustering Cryptocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [None]:
# Empty accumulator for the inertia values, and the candidate cluster
# counts (1 through 10) to evaluate for the elbow curve.
inertia = list()
k = [n_clusters for n_clusters in range(1, 11)]

# Calculate the inertia for the range of k values


# Create the Elbow Curve using hvPlot


Running K-Means with `k=<your best value for k here>`

In [None]:
# Initialize the K-Means model

# Fit the model

# Predict clusters

# Create a new DataFrame including predicted clusters and cryptocurrencies features


### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [None]:
# Scale data to create the scatter plot


In [None]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"


#### Table of Tradable Cryptocurrencies

In [None]:
# Table with tradable cryptos


In [None]:
# Print the total number of tradable cryptocurrencies
