In [42]:
# Import dependencies
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans

In [6]:
# Read the crypto CSV into a DataFrame, using its first column as the index,
# then preview the first ten rows.
file = "../Resources/crypto_data.csv"
crypto_df = pd.read_csv(filepath_or_buffer=file, index_col=0)
crypto_df.head(n=10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


### Clean the data

In [7]:
# Keep only the rows whose IsTrading flag equals True
crypto_df = crypto_df.loc[crypto_df['IsTrading'].eq(True)]

In [8]:
# See if there are any undefined Algorithms.
# The null count is captured and printed explicitly: as a bare expression on a
# non-final line its value was silently discarded, and the groupby below cannot
# reveal missing algorithms because NaN keys are excluded from groups by default.
missing_algorithms = crypto_df['Algorithm'].isnull().sum()
print(f"Rows with an undefined Algorithm: {missing_algorithms}")

# Non-null value counts per column for each algorithm
crypto_df.groupby('Algorithm').count()

Unnamed: 0_level_0,CoinName,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1GB AES Pattern Search,1,1,1,1,1
536,2,2,2,1,2
Argon2,2,2,2,1,2
Argon2d,1,1,1,1,1
BLAKE256,2,2,2,2,2
...,...,...,...,...,...
XEVAN,6,6,6,6,6
XG Hash,1,1,1,0,1
YescryptR16,1,1,1,0,1
Zhash,1,1,1,1,1


In [9]:
# Drop the IsTrading column and display the resulting frame
crypto_df = crypto_df.drop(labels='IsTrading', axis='columns')
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...
SERO,Super Zero,Ethash,PoW,,1000000000
UOS,UOS,SHA-256,DPoI,,1000000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [10]:
# Discard every row that has a missing value in any column
crypto_df = crypto_df.dropna(axis='index', how='any')

In [11]:
# Keep only the rows whose TotalCoinsMined differs from zero, then display them
crypto_df = crypto_df.loc[crypto_df['TotalCoinsMined'].ne(0)]
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [12]:
# Single-column DataFrame of coin names that shares crypto_df's index
coins_name = crypto_df[['CoinName']].copy()

In [13]:
# Drop the CoinName column from crypto_df
crypto_df = crypto_df.drop(labels='CoinName', axis='columns')

In [None]:
# Create dummy (one-hot) columns for the text columns Algorithm and ProofType.
# The other columns are passed through unchanged.
crypto_df_dummies = pd.get_dummies(crypto_df, columns=['Algorithm', 'ProofType'])

# Show the encoded frame (not the un-encoded crypto_df) so the result of this
# step can actually be inspected; head() keeps the output short.
crypto_df_dummies.head()

### Scale and Reduce Data

In [16]:
# Standardize the encoded features with StandardScaler and display the array
scaler = StandardScaler()
crypto_scaled = scaler.fit_transform(X=crypto_df_dummies)
crypto_scaled

array([[-0.11674788, -0.15286468, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.09358885, -0.14499604, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [ 0.52587231,  4.4937636 , -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       ...,
       [-0.09523411, -0.13215444, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.11658774, -0.15255408, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.11674507, -0.15284989, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ]])

In [17]:
# Project the scaled features onto three principal components
pca = PCA(
    n_components=3,
)
crypto_pca = pca.fit_transform(X=crypto_scaled)

In [18]:
# Wrap the three principal components in a DataFrame indexed like crypto_df
pcs_df = pd.DataFrame(
    data=crypto_pca,
    index=crypto_df.index,
    columns=['PC1', 'PC2', 'PC3'],
)

### Cluster Model

In [19]:
# Candidate cluster counts 1 through 10, plus an empty list for inertia values
k = [*range(1, 11)]
inertia = list()

In [20]:
# Find the best k value: fit K-Means once per candidate k and record the
# model's inertia for the elbow curve.
# The list is reset here so the cell is idempotent. Without the reset, running
# this cell a second time keeps appending to the existing list, leaving
# `inertia` longer than `k` and breaking the elbow DataFrame construction.
inertia = []
for i in k:
    km = KMeans(n_clusters=i, random_state=0)  # fixed seed for repeatable fits
    km.fit(pcs_df)
    inertia.append(km.inertia_)

In [21]:
# Pair each k with its inertia in a DataFrame and draw the elbow curve with hvPlot
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)
df_elbow.hvplot.line(
    x="k",
    y="inertia",
    xticks=k,
    title="Elbow Curve",
)

In [22]:
# Fit K-Means with k=4 on the principal components, then predict a cluster
# label for every row.
# NOTE(review): `prediciton` is misspelled; the name is kept unchanged in case
# other cells reference it.
km = KMeans(n_clusters=4, random_state=0).fit(pcs_df)
prediciton = km.predict(pcs_df)

In [23]:
# Gather the columns for the combined frame into one dict, in display order:
# the original feature columns, the three principal components, the coin name,
# and the K-Means cluster label.
clustered = {
    column: crypto_df[column]
    for column in ('Algorithm', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply')
}
clustered.update({
    'PC 1': pcs_df['PC1'],
    'PC 2': pcs_df['PC2'],
    'PC 3': pcs_df['PC3'],
})
clustered['CoinName'] = coins_name['CoinName']
clustered['Class'] = km.labels_

In [24]:
# Build the combined DataFrame from the dict of columns
clustered_df = pd.DataFrame(data=clustered)

In [25]:
# Display the combined DataFrame (features, principal components, name, class)
clustered_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,4.199995e+01,42,-0.326327,1.007793,-0.582900,42 Coin,0
404,Scrypt,PoW/PoS,1.055185e+09,532000000,-0.309615,1.007967,-0.583273,404Coin,0
1337,X13,PoW/PoS,2.927942e+10,314159265359,2.318858,1.603048,-0.711036,EliteCoin,0
BTC,SHA-256,PoW,1.792718e+07,21000000,-0.147135,-1.341043,0.199692,Bitcoin,1
ETH,Ethash,PoW,1.076842e+08,0,-0.157457,-2.020302,0.415423,Ethereum,1
...,...,...,...,...,...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000,2.452395,0.764394,-0.017884,ZEPHYR,0
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000,-0.324367,1.007685,-0.582918,Gapcoin,0
BDX,CryptoNight,PoW,9.802226e+08,1400222610,0.319782,-2.305237,0.477720,Beldex,1
ZEN,Equihash,PoW,7.296538e+06,21000000,-0.143217,-2.004632,0.422631,Horizen,1


### Visualization

In [28]:
# Plot a 3D scatter of the three principal components (x="PC 1", y="PC 2",
# z="PC 3"), with colour and marker symbol taken from the cluster class.
# Hovering over a point shows the coin name and its algorithm.
fig = px.scatter_3d(
    data_frame=clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    hover_name="CoinName",
    hover_data=['Algorithm'],
)
# Anchor the legend at the top-left corner of the figure
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [34]:
# Build an hvPlot table restricted to the descriptive columns and the class label
table_columns = ["CoinName", "Algorithm", "ProofType", "TotalCoinsMined", "TotalCoinSupply", "Class"]
table = clustered_df.hvplot.table(columns=table_columns)

In [35]:
# Render the hvPlot table built in the previous cell
table

In [40]:
# Scatter plot of total coins mined (x) against total coin supply (y);
# the coin name is shown on hover.
clustered_df.hvplot.scatter(
    x='TotalCoinsMined',
    y='TotalCoinSupply',
    hover_cols=['CoinName'],
)