In [35]:
import pandas as pd 

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.cluster import KMeans

import hvplot.pandas
import plotly.express as px 


 # Data Preprocessing

In [36]:
# data load
crypto_df = pd.read_csv('./Resources/crypto_data.csv')
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [37]:
crypto_df.dtypes

Unnamed: 0          object
CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [38]:
# Remove all cryptocurrencies that aren’t trading
active_crypto_df = crypto_df[crypto_df['IsTrading'] == True]
active_crypto_df.head(3)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000


In [39]:
# Remove all cryptocurrencies that don’t have an algorithm defined
active_crypto_df['Algorithm'].isnull().sum()


0

In [40]:
# Remove the IsTrading column
active_crypto_df = active_crypto_df.drop(columns = ['IsTrading'])


In [41]:
# Remove all cryptocurrencies with at least one null value

active_crypto_df.isnull().sum()

Unnamed: 0           0
CoinName             0
Algorithm            0
ProofType            0
TotalCoinsMined    459
TotalCoinSupply      0
dtype: int64

In [42]:
notnull_crypto_df = active_crypto_df.dropna()
notnull_crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,808,SHA-256,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [43]:
# Remove all cryptocurrencies without coins mined(TotalCoinsMined = 0)
cleaned_crypto_df = notnull_crypto_df[notnull_crypto_df['TotalCoinsMined'] != 0]
cleaned_crypto_df.head()


Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,PoW,107684200.0,0


In [44]:
# Store the names of all cryptocurrencies on a DataFrame and use the original df index as the index for it
coins_name = pd.DataFrame(cleaned_crypto_df[['Unnamed: 0','CoinName']])
coins_name.set_index('Unnamed: 0', drop = True, inplace = True)
coins_name.head()

Unnamed: 0_level_0,CoinName
Unnamed: 0,Unnamed: 1_level_1
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [45]:
# Remove the CoinName column
cleaned_crypto_df = cleaned_crypto_df.drop(columns = ['CoinName'])
cleaned_crypto_df.head()

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,41.99995,42
2,404,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethash,PoW,107684200.0,0


In [46]:
cleaned_crypto_df.dtypes


Unnamed: 0          object
Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [47]:
cleaned_crypto_df['TotalCoinSupply'] = cleaned_crypto_df['TotalCoinSupply'].astype('float')

In [48]:
# Create dummies variables for all of the text features, and store the resulting data on a DataFrame
X = pd.get_dummies(cleaned_crypto_df[['Algorithm','ProofType']])

# duummies extend to 98 features

In [49]:
# standardize all of the data from the X
scale_model = StandardScaler()
scaled_X = scale_model.fit_transform(X)   #ndarray


 # PCA

In [50]:
# Reducing X DataFrame Dimensions Using PCA to 3 features
pca = PCA(n_components=3, random_state = 1)
X_pca = pca.fit_transform(scaled_X)
print(f'The pca ratio is {pca.explained_variance_ratio_}')


The pca ratio is [0.02125276 0.02051845 0.02044463]


In [51]:
pca.explained_variance_

array([2.08668583, 2.0145875 , 2.00734028])

In [52]:
pcs_df = pd.DataFrame(X_pca, index=cleaned_crypto_df['Unnamed: 0'], columns=['PC 1','PC 2','PC 3'])
pcs_df.head(10)


Unnamed: 0_level_0,PC 1,PC 2,PC 3
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,1.025955,-0.747712,0.00667
404,1.025955,-0.747712,0.00667
1337,1.795243,-0.806301,0.010988
BTC,-1.293124,0.249233,0.002442
ETH,-1.931951,0.473894,0.0036
LTC,-1.06256,-0.018322,-0.000646
DASH,1.208534,-0.63388,-0.006957
XMR,-2.117172,0.567854,-0.007968
ETC,-1.931951,0.473894,0.0036
ZEC,-1.91816,0.37875,-0.005581


 # Clustering by KMeans

In [53]:
# Create an elbow curve to find the best value for K, X-axis is K, y-axis is inertia
inertia_list = list()
k_value = list(range(1,11))

for k in k_value:
    k_model = KMeans(n_clusters=k, random_state=1)
    k_model.fit(pcs_df)
    inertia_list.append(k_model.inertia_)
# build a dataframe for plotting
elbow_df = pd.DataFrame({'K': k_value, 'Inertia': inertia_list})


In [61]:
# elbow curve
elbow_df.hvplot.line(x = 'K', y = 'Inertia', xticks = k_value)



 Based on the elbow curve, at the point 4, the line shifts to a strong horizontal line.
 As a result, I chosed K=4 as the best estimate number of cluster in KMeans model.

In [55]:
# run the K-means algorithm to predict the K clusters for the cryptocurrencies’ data
model = KMeans(n_clusters=4, random_state=1)
predictions = model.fit_predict(pcs_df)



In [56]:
# combine all information with predicted cluster into a new DataFrame
clustered_df = cleaned_crypto_df.merge(pcs_df, on = 'Unnamed: 0')
clustered_df = clustered_df.merge(coins_name, on = 'Unnamed: 0')

clustered_df['Class'] = model.labels_

clustered_df.set_index('Unnamed: 0', drop = True, inplace = True)
clustered_df.head(10)

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,Scrypt,PoW/PoS,41.99995,42.0,1.025955,-0.747712,0.00667,42 Coin,1
404,Scrypt,PoW/PoS,1055185000.0,532000000.0,1.025955,-0.747712,0.00667,404Coin,1
1337,X13,PoW/PoS,29279420000.0,314159300000.0,1.795243,-0.806301,0.010988,EliteCoin,1
BTC,SHA-256,PoW,17927180.0,21000000.0,-1.293124,0.249233,0.002442,Bitcoin,0
ETH,Ethash,PoW,107684200.0,0.0,-1.931951,0.473894,0.0036,Ethereum,0
LTC,Scrypt,PoW,63039240.0,84000000.0,-1.06256,-0.018322,-0.000646,Litecoin,0
DASH,X11,PoW/PoS,9031294.0,22000000.0,1.208534,-0.63388,-0.006957,Dash,1
XMR,CryptoNight-V7,PoW,17201140.0,0.0,-2.117172,0.567854,-0.007968,Monero,0
ETC,Ethash,PoW,113359700.0,210000000.0,-1.931951,0.473894,0.0036,Ethereum Classic,0
ZEC,Equihash,PoW,7383056.0,21000000.0,-1.91816,0.37875,-0.005581,ZCash,0


 # Visualizing Results

In [57]:
# 3D scatter plot 


fig = px.scatter_3d(clustered_df, x= 'PC 1', y='PC 2',z='PC 3',
                    color='Class', symbol='Class', hover_name='CoinName',
                    hover_data=['Algorithm'])
fig.update_layout(legend = {'x':0,'y':1})
fig.show()



In [58]:
# create a hvplot table for all the current tradable cryptocurrencies
obj_table = clustered_df.hvplot.table(columns = ['CoinName', 'Algorithm', 
                                    'ProofType', 'TotalCoinSupply', 
                                    'TotalCoinsMined', 'Class'], width =500)

hvplot.show(obj_table)



In [60]:
# create a scatter plot to present the clustered data about cryptocurrencies 
clustered_df.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply",
                                by = 'Class', hover_cols = ['CoinName'])


