# Clustering Crypto

In [3]:
# Initial imports
from pathlib import Path

import requests
import pandas as pd
#!pip install matplotlib
import matplotlib.pyplot as plt
#!pip install hvplot
import hvplot.pandas
import plotly.express as px
#!pip install -U altair
import altair as alt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans



### Fetching Cryptocurrency Data

In [77]:
# CryptoCompare endpoint that returns the coin list as JSON
url = (
    "https://min-api.cryptocompare.com"
    "/data/all/coinlist"
)

In [78]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.

In [5]:
# Alternatively, use the provided csv file.
# The path is relative to the notebook's working directory.
file_path = Path("Data/crypto_data.csv")

# Create a DataFrame
crypto_df = pd.read_csv(file_path)
crypto_df.head(5)


Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


### Data Preprocessing

In [6]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
# The only extra column is 'Unnamed: 0'. Rebinding (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run: a second run
# no longer raises KeyError because the column is already gone.
crypto_df = crypto_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
# Keep only the rows whose IsTrading flag is True
is_trading = crypto_df['IsTrading'].eq(True)
t_crypto_df = crypto_df[is_trading]
# Show how many rows remain per flag value
t_crypto_df['IsTrading'].value_counts()

True    1144
Name: IsTrading, dtype: int64

In [8]:
# Keep only cryptocurrencies with a working algorithm.
# pandas represents a missing cell as NaN, which never compares equal to the
# string 'NaN', so missing values are detected with notna(). The literal-text
# comparison is kept as well in case the value is spelled out as text.
algorithm = t_crypto_df['Algorithm']
has_algorithm = algorithm.notna() & algorithm.ne('NaN')
# .copy() yields an independent frame so later cells never modify a slice of t_crypto_df
at_crypto_df = t_crypto_df.loc[has_algorithm].copy()

In [9]:
# Remove the "IsTrading" column.
# Rebinding with errors='ignore' (rather than inplace=True) makes the cell
# idempotent: re-running it does not raise KeyError on the missing column.
at_crypto_df = at_crypto_df.drop(columns=['IsTrading'], errors='ignore')

In [10]:
# Remove rows with at least 1 null value.
# A new frame is bound to the name instead of mutating the existing one in place.
at_crypto_df = at_crypto_df.dropna()

In [11]:
# Remove rows with cryptocurrencies having no coins mined.
# The mask is built from at_crypto_df itself; taking it from the unfiltered
# crypto_df only worked through index alignment between two different frames.
has_mined_coins = at_crypto_df['TotalCoinsMined'] > 0
at_crypto_df = at_crypto_df.loc[has_mined_coins]

In [12]:
# Drop rows where there are 'N/A' text values.
# Start with "no row flagged" and OR in the hits from each text column.
has_na_text = pd.Series(False, index=at_crypto_df.index)
for columnname in at_crypto_df.columns:
    column = at_crypto_df[columnname]
    # Only text columns can contain the literal 'N/A'; the .str accessor
    # raises on numeric or boolean columns, so those are skipped.
    if not (pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)):
        continue
    # regex=False: plain substring match; na=False: a non-text cell is not a hit
    has_na_text |= column.str.contains("N/A", regex=False, na=False)
at_crypto_df = at_crypto_df[~has_na_text]

# Snapshot of the cleaned table (still including CoinName) for later cells
at_crypto_df_copy = at_crypto_df.copy()
at_crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,Ethereum,Ethash,PoW,107684200.0,0


In [13]:
# Keep the coin names in their own Series before 'CoinName' is removed from the feature table
coinname_df = at_crypto_df.loc[:, 'CoinName']


In [14]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
# drop(..., inplace=True) returns None, so assigning its result left `features`
# bound to None. Drop into a new frame and rebind instead.
# at_crypto_df itself still loses the column, because the following cell
# one-hot encodes at_crypto_df; errors='ignore' keeps the cell re-runnable.
at_crypto_df = at_crypto_df.drop(columns=['CoinName'], errors='ignore')
features = at_crypto_df.copy()


In [15]:
# One-hot encode the two categorical text columns
text_features = ['Algorithm', 'ProofType']
features = pd.get_dummies(data=at_crypto_df, columns=text_features)

In [16]:
# Standardize data
# Convert TotalCoinSupply to a numeric (float) column before scaling
supply_numeric = pd.to_numeric(features['TotalCoinSupply'], downcast='float')
features['TotalCoinSupply'] = supply_numeric

# Fit the scaler and transform every feature column in one step
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)
print(features_scaled[:2])


[[-0.11710817 -0.1528703  -0.0433963  -0.0433963  -0.0433963  -0.06142951
  -0.07530656 -0.0433963  -0.06142951 -0.06142951 -0.0433963  -0.0433963
  -0.19245009 -0.06142951 -0.09740465 -0.0433963  -0.11547005 -0.07530656
  -0.0433963  -0.0433963  -0.15191091 -0.0433963  -0.13118084 -0.0433963
  -0.0433963  -0.08703883 -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.06142951 -0.0433963  -0.08703883 -0.08703883 -0.08703883 -0.0433963
  -0.13118084 -0.13840913 -0.13840913 -0.0433963  -0.06142951 -0.0433963
  -0.07530656 -0.18168574 -0.0433963  -0.0433963  -0.0433963  -0.07530656
  -0.15826614 -0.31491833 -0.0433963  -0.08703883 -0.07530656 -0.06142951
   1.38675049 -0.0433963  -0.0433963  -0.06142951 -0.0433963  -0.0433963
  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.39879994 -0.0433963  -0.18168574 -0.0433963  -0.08703883 -0.08703883
  -0.10680283 -0.0433963  -0.13118084 -0.0433963  -0.0433963  -0.0433963
  -0.0433963  -0.07530656 -0.43911856 -0.04339

### Reducing Dimensions Using PCA

In [17]:
# Project the standardized features onto 3 principal components
pca = PCA(n_components=3)
features_pca = pca.fit_transform(X=features_scaled)

In [18]:
# Create a DataFrame with the principal components data.
# The projection is recomputed from the fitted `pca` rather than wrapping
# whatever `features_pca` currently holds: this cell rebinds `features_pca`,
# so on a re-run the old code would wrap the previous DataFrame and reindex
# it against the new labels, producing NaN values.
pc_columns = ["principal component 1", "principal component 2", "principal component 3"]
features_pca = pd.DataFrame(
    data=pca.transform(features_scaled),
    columns=pc_columns,
    index=coinname_df,  # rows are labelled by coin name
)
features_pca.head()

Unnamed: 0_level_0,principal component 1,principal component 2,principal component 3
CoinName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42 Coin,-0.333823,1.017566,-0.509029
404Coin,-0.317156,1.017748,-0.509478
EliteCoin,2.294032,1.615351,-0.532582
Bitcoin,-0.147575,-1.312309,0.179119
Ethereum,-0.148726,-1.992069,0.376441


### Clustering Cryptocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [20]:
# Candidate cluster counts to evaluate
inertia = []
k = list(range(1, 11))

# Cluster on the three principal-component columns only, so a "Class" column
# added to features_pca by a later cell cannot leak in when this cell is re-run.
pc_values = features_pca.iloc[:, :3]

# Calculate the inertia for the range of k values.
# n_init is pinned to 10 so the number of restarts does not depend on the
# installed scikit-learn version's default.
for i in k:
    km = KMeans(n_clusters=i, random_state=0, n_init=10)
    km.fit(pc_values)
    inertia.append(km.inertia_)

# Create the Elbow Curve using Altair
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
alt.Chart(df_elbow, title="Elbow curve").mark_line(point=True).encode(
    x=alt.X("k", title="Number of clusters (k)"),
    y=alt.Y("inertia", title="Inertia"),
)

Running K-Means with `k=4`

In [21]:
# Initialize the K-Means model.
# n_init is pinned to 10 so the number of restarts does not depend on the
# installed scikit-learn version's default.
model = KMeans(n_clusters=4, random_state=0, copy_x=True, n_init=10)

# Cluster on the three principal-component columns only. This cell appends a
# "Class" column to features_pca, so fitting on the whole frame would include
# the previous labels as a feature whenever the cell is re-run.
pc_values = features_pca.iloc[:, :3]

# Fit the model and predict clusters in a single pass
predictions = model.fit_predict(pc_values)

# Create a new DataFrame including predicted clusters and cryptocurrencies features
features_pca["Class"] = predictions
Cols = ["PC 1", "PC 2", "PC 3", "Class"]
features_pca.columns = Cols

# at_crypto_df_copy and features_pca hold the same rows in the same order
# (features_pca is indexed by the coin names taken from the same cleaned table),
# so they are combined by position. Unlike a merge on 'CoinName', this cannot
# multiply rows if two coins share a name.
assert len(at_crypto_df_copy) == len(features_pca), "row counts must match"
merged_features_pca = pd.concat(
    [
        at_crypto_df_copy.reset_index(drop=True),
        features_pca.reset_index(drop=True),
    ],
    axis=1,
)
merged_features_pca.head(10)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,Class
0,42 Coin,Scrypt,PoW/PoS,41.99995,42,-0.333823,1.017566,-0.509029,3
1,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000,-0.317156,1.017748,-0.509478,3
2,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359,2.294032,1.615351,-0.532582,3
3,Bitcoin,SHA-256,PoW,17927180.0,21000000,-0.147575,-1.312309,0.179119,0
4,Ethereum,Ethash,PoW,107684200.0,0,-0.148726,-1.992069,0.376441,0
5,Litecoin,Scrypt,PoW,63039240.0,84000000,-0.157876,-1.119721,-0.026493,0
6,Dash,X11,PoW/PoS,9031294.0,22000000,-0.404161,1.245875,-0.379015,3
7,Monero,CryptoNight-V7,PoW,17201140.0,0,-0.150762,-2.162464,0.393696,0
8,Ethereum Classic,Ethash,PoW,113359700.0,210000000,-0.147168,-1.992167,0.376425,0
9,ZCash,Equihash,PoW,7383056.0,21000000,-0.164403,-2.192057,0.375528,0


### Visualizing Results

#### 2D Altair Scatter with Clusters

In [22]:
# Create a 2D scatter of the first two principal components, coloured by cluster.
# Altair only reads DataFrame columns, not the index, so reset_index() turns
# the CoinName index into a column that the tooltip can show.
alt.Chart(features_pca.reset_index()).mark_point().encode(
    x="PC 1",
    y="PC 2",
    color=alt.Color("Class:N", title="Cluster"),  # :N = treat labels as categories
    tooltip=["CoinName", "Class:N"],
)

#### Table of Tradable Cryptocurrencies

In [23]:
# Preview the first rows of the cleaned table of tradable cryptocurrencies
at_crypto_df_copy.head(n=5)



Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,Ethereum,Ethash,PoW,107684200.0,0


In [24]:
# Count the rows of t_crypto_df by their IsTrading value
trading_flags = t_crypto_df.loc[:, 'IsTrading']
trading_flags.value_counts()

True    1144
Name: IsTrading, dtype: int64

#### Scatter Plot with Tradable Cryptocurrencies

In [25]:
# Scale data to create the scatter plot.
# The two count columns are selected into a new frame instead of dropping the
# other columns from at_crypto_df_copy in place. The snapshot stays intact, so
# this cell and the cells that read the snapshot can be re-run.
numeric_columns = ["TotalCoinsMined", "TotalCoinSupply"]
# Convert explicitly to numbers rather than relying on the scaler to coerce text
coin_amounts = at_crypto_df_copy[numeric_columns].apply(pd.to_numeric)

t_crypto_scaled = StandardScaler().fit_transform(coin_amounts)
print(t_crypto_scaled[0:10])

# Label the scaled values with the coin names
t_crypto_scaled_df = pd.DataFrame(
    data=t_crypto_scaled,
    columns=numeric_columns,
    index=coinname_df,
)

t_crypto_scaled_df.head()

[[-0.11710817 -0.1528703 ]
 [-0.09396955 -0.145009  ]
 [ 0.52494561  4.48942416]
 [-0.11671506 -0.15255998]
 [-0.11474682 -0.1528703 ]
 [-0.11572582 -0.15162904]
 [-0.11691013 -0.15254521]
 [-0.11673098 -0.1528703 ]
 [-0.11462236 -0.14976715]
 [-0.11694627 -0.15255998]]


Unnamed: 0_level_0,TotalCoinsMined,TotalCoinSupply
CoinName,Unnamed: 1_level_1,Unnamed: 2_level_1
42 Coin,-0.117108,-0.15287
404Coin,-0.09397,-0.145009
EliteCoin,0.524946,4.489424
Bitcoin,-0.116715,-0.15256
Ethereum,-0.114747,-0.15287


In [27]:
# Scatter of the standardized values: coins mined on x, total coin supply on y
scatter_encoding = {"x": "TotalCoinsMined", "y": "TotalCoinSupply"}
alt.Chart(t_crypto_scaled_df).mark_point().encode(**scatter_encoding)
