In [70]:
# Import libraries/dependencies
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas
import plotly.express as px

In [28]:
# Load the data
file_path = "Resources/crypto_data.csv"
crypto_df = pd.read_csv(file_path, index_col= 0)
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [29]:
# Drop all cryptocurrencies that aren't trading
crypto_df.drop(crypto_df[crypto_df["IsTrading"] == False].index, inplace=True)

In [30]:
# Drop all cryptocurrencies that don't have an algorithm defined
crypto_df["Algorithm"].dropna()

42           Scrypt
365             X11
404          Scrypt
611         SHA-256
808         SHA-256
           ...     
SERO         Ethash
UOS         SHA-256
BDX     CryptoNight
ZEN        Equihash
XBC          Scrypt
Name: Algorithm, Length: 1144, dtype: object

In [31]:
# Drop the IsTrading column
crypto_df = crypto_df.drop(["IsTrading"], axis = 1)

In [32]:
# Drop all cryptocurrencies with at least one null value
crypto_df = crypto_df.dropna()
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [33]:
# Drop all cryptocurrencies without coins mined
crypto_df.drop(crypto_df[crypto_df["TotalCoinsMined"] <= 0].index, inplace=True)
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [34]:
# Create a new DataFrame to store CoinName column and use the crypto_df index
coins_name = pd.DataFrame(crypto_df["CoinName"], index = crypto_df.index)
coins_name

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
ZEPH,ZEPHYR
GAP,Gapcoin
BDX,Beldex
ZEN,Horizen


In [35]:
# Drop the CoinName column from crypto_df
crypto_df = crypto_df.drop(["CoinName"], axis = 1)
crypto_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,4.199995e+01,42
404,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,X13,PoW/PoS,2.927942e+10,314159265359
BTC,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Equihash,PoW,7.296538e+06,21000000


In [36]:
# Use get_dummies and store data in new DataFrame
X = pd.get_dummies(crypto_df)
X

Unnamed: 0,TotalCoinsMined,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,...,TotalCoinSupply_91388946,TotalCoinSupply_92000000000,TotalCoinSupply_9354000,TotalCoinSupply_9507271,TotalCoinSupply_9736000,TotalCoinSupply_98000000,TotalCoinSupply_98100000000,TotalCoinSupply_990000000000,TotalCoinSupply_999481516,TotalCoinSupply_9999999
42,4.199995e+01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
404,1.055185e+09,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1337,2.927942e+10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BTC,1.792718e+07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZEPH,2.000000e+09,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GAP,1.493105e+07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BDX,9.802226e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEN,7.296538e+06,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Standardize the data in the X DataFrame
X_scaled = StandardScaler().fit_transform(X)
X_scaled

array([[-0.11710817, -0.0433963 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.09396955, -0.0433963 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [ 0.52494561, -0.0433963 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       ...,
       [-0.09561336, -0.0433963 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11694817, -0.0433963 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11710536, -0.0433963 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ]])

In [38]:
### PCA ###

In [39]:
# Initialize the PCA model
pca = PCA(n_components = 3)

In [40]:
# Get three principal components for the X data
crypto_pca = pca.fit_transform(X_scaled)

In [41]:
# Transform PCA data into a DataFrame
pcs_df = pd.DataFrame(
    data = crypto_pca,
    columns = ["PC 1", "PC 2", "PC 3"],
    index = crypto_df.index)

pcs_df.head(10)

Unnamed: 0,PC 1,PC 2,PC 3
42,-0.245445,-0.095772,0.01052
404,-0.228353,-0.097799,0.001741
1337,0.422547,0.038275,0.029192
BTC,-0.234857,-0.240023,0.001087
ETH,-0.141039,-0.133835,-0.020612
LTC,-0.302098,-0.250859,0.056961
DASH,-0.290635,-0.017263,-0.048379
XMR,-0.107278,-0.147401,0.079124
ETC,-0.305107,-0.098075,-0.025603
ZEC,-0.260662,-0.223748,0.067942


In [42]:
### K-MEANS ###

In [43]:
# Loop through 10 values for K and determine the inertia
inertia = []
k = list(range(1, 11))
for i in k:
    km = KMeans(n_clusters=i, random_state=7)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

In [44]:
# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [48]:
# Run the K-Means algorithm to predict K clusters
model = KMeans(n_clusters=4, random_state=7)
model.fit(pcs_df)
predictions = model.predict(pcs_df)

In [49]:
# Add a new class column to the DataFrame
pcs_df["Class"] = model.labels_
pcs_df

Unnamed: 0,PC 1,PC 2,PC 3,Class
42,-0.245445,-0.095772,0.010520,0
404,-0.228353,-0.097799,0.001741,0
1337,0.422547,0.038275,0.029192,0
BTC,-0.234857,-0.240023,0.001087,0
ETH,-0.141039,-0.133835,-0.020612,0
...,...,...,...,...
ZEPH,2.634613,-0.331053,0.014610,0
GAP,-0.247596,-0.146104,0.020315,0
BDX,-0.013737,-0.332753,-0.039563,0
ZEN,-0.260663,-0.223748,0.067942,0


In [68]:
### Create a new Dataframe callled "clustered_df"

# Merge crypto_df with pcs_df
crypto_pcs_df = pd.merge(crypto_df, pcs_df, right_index=True, left_index=True)

# Merge crypto_pcs_df with coins_name
clustered_df = pd.merge(crypto_pcs_df, coins_name, right_index=True, left_index=True)

# Reorder the columns
clustered_df = clustered_df[[
    "Algorithm",
    "ProofType",
    "TotalCoinsMined",
    "TotalCoinSupply",
    "PC 1",
    "PC 2",
    "PC 3",
    "CoinName",
    "Class"
]]

clustered_df.head(10)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42,-0.245445,-0.095772,0.01052,42 Coin,0
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.228353,-0.097799,0.001741,404Coin,0
1337,X13,PoW/PoS,29279420000.0,314159265359,0.422547,0.038275,0.029192,EliteCoin,0
BTC,SHA-256,PoW,17927180.0,21000000,-0.234857,-0.240023,0.001087,Bitcoin,0
ETH,Ethash,PoW,107684200.0,0,-0.141039,-0.133835,-0.020612,Ethereum,0
LTC,Scrypt,PoW,63039240.0,84000000,-0.302098,-0.250859,0.056961,Litecoin,0
DASH,X11,PoW/PoS,9031294.0,22000000,-0.290635,-0.017263,-0.048379,Dash,0
XMR,CryptoNight-V7,PoW,17201140.0,0,-0.107278,-0.147401,0.079124,Monero,0
ETC,Ethash,PoW,113359700.0,210000000,-0.305107,-0.098075,-0.025603,Ethereum Classic,0
ZEC,Equihash,PoW,7383056.0,21000000,-0.260662,-0.223748,0.067942,ZCash,0


In [71]:
### Visualizations ###

In [79]:
# Plot a 3D scatter plot using clustered_df
fig = px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    hover_name="CoinName",
    hover_data=["Algorithm"],
    width=800
)

fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [83]:
# Create an hvplot table with all the current tradable cryptocurrencies
crypto_table = clustered_df[["CoinName", "Algorithm", "ProofType", "TotalCoinSupply", "TotalCoinsMined", "Class"]]
crypto_table.hvplot.table()

In [87]:
# Check the data types of clustered_df
clustered_df.dtypes

Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
PC 1               float64
PC 2               float64
PC 3               float64
CoinName            object
Class                int32
dtype: object

In [90]:
# Convert TotalCoinSupply column to numerical
clustered_df["TotalCoinSupply"].astype(float)

42      4.200000e+01
404     5.320000e+08
1337    3.141593e+11
BTC     2.100000e+07
ETH     0.000000e+00
            ...     
ZEPH    2.000000e+09
GAP     2.500000e+08
BDX     1.400223e+09
ZEN     2.100000e+07
XBC     1.000000e+06
Name: TotalCoinSupply, Length: 532, dtype: float64

In [None]:
>> df['purchase'].astype(str).astype(int)

In [85]:
# Check the data types of clustered_df

# Create a scatter plot using hvplot and the clustered data
clustered_df.hvplot.scatter(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    hover_cols=["CoinName"],
    by="Class")

In [86]:
clustered_df.dtypes

Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
PC 1               float64
PC 2               float64
PC 3               float64
CoinName            object
Class                int32
dtype: object