In [35]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas
import plotly.express as px 


####################### Data Preprocessing ######################

In [36]:
# Load data
crypto_df = pd.read_csv("Resources/crypto_data.csv")
crypto_df.head(10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
6,2015,2015 coin,X11,True,PoW/PoS,,0
7,BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,True,PoW,107684200.0,0
9,LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [37]:
# Number of data
len(crypto_df)


1252

In [38]:
# Column type
crypto_df.dtypes


Unnamed: 0          object
CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [39]:
# Removing crypto that are not trading
trading_crypto_df = crypto_df[crypto_df['IsTrading'] == True]
trading_crypto_df.head(10)


Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
6,2015,2015 coin,X11,True,PoW/PoS,,0
7,BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,True,PoW,107684200.0,0
9,LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [40]:
# Number of currency that are trading
len(trading_crypto_df)


1144

In [41]:
# Counting crypto that don’t have algorithm defined
trading_crypto_df['Algorithm'].isnull().sum()


0

In [42]:
# No crypto has a null value
trading_wAlgorithm_df = trading_crypto_df
trading_wAlgorithm_df.head(5)


Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [43]:
# Remove the IsTrading column
trading_wAlgorithm_df = trading_wAlgorithm_df.drop(columns = ['IsTrading'])
trading_wAlgorithm_df.head(5)


Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.0,0


In [44]:
# Find null values
for column in trading_wAlgorithm_df:
    print(f"Column {column} has {trading_wAlgorithm_df[column].isnull().sum()} null values")


Column Unnamed: 0 has 0 null values
Column CoinName has 0 null values
Column Algorithm has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 459 null values
Column TotalCoinSupply has 0 null values


In [45]:
# Removing cryptoc with at least one null value
notnull_crypto_df = trading_wAlgorithm_df.dropna()
print(f"{len(notnull_crypto_df)} cryptocurrency with no null values")


685 cryptocurrency with no null values


In [46]:
# Crypto with no coines mined
notnull_crypto_df[notnull_crypto_df['TotalCoinsMined']== 0]


Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
4,808,808,SHA-256,PoW/PoS,0.0,0
18,XBS,Bitstake,X11,PoW/PoS,0.0,1300000
29,ACOIN,ACoin,SHA-256,PoW,0.0,1600000
30,AERO,Aero Coin,X13,PoS,0.0,7000000
35,APEX,ApexCoin,X13,PoW/PoS,0.0,6000000
...,...,...,...,...,...,...
1180,QCN,Quazar Coin,CryptoNight,PoW,0.0,18446744
1183,PKB,ParkByte,SHA-256,PoW/PoS,0.0,25000000
1197,DOT,Dotcoin,Scrypt,PoW,0.0,890000000
1199,THC,The Hempcoin,Scrypt,PoW/PoS,0.0,300000000


In [47]:
# Removing crypto with no coins mined
ableToMine_crypto_df = notnull_crypto_df[notnull_crypto_df['TotalCoinsMined'] != 0]
len(ableToMine_crypto_df)
print(f"There are {len(ableToMine_crypto_df)} instances of crypto mining")



There are 533 instances of crypto mining


In [48]:
# Store names of crypto on a DataFrame, using "Unnamed: 0" as index
coins_name = pd.DataFrame(ableToMine_crypto_df[['Unnamed: 0','CoinName']])
coins_name.set_index('Unnamed: 0', drop = True, inplace = True)
# coins_name.index.names = ['']
coins_name.head()


Unnamed: 0_level_0,CoinName
Unnamed: 0,Unnamed: 1_level_1
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [49]:
# Making sure we didnt lose any data...
print(f"We have {len(coins_name)} number of data")


We have 533 number of data


In [50]:
# Removing CoinName column
clean_crypto_df = ableToMine_crypto_df.drop(columns = ['CoinName'])
clean_crypto_df.head()


Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,41.99995,42
2,404,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethash,PoW,107684200.0,0


In [51]:
# Inspecting data types
clean_crypto_df.dtypes


Unnamed: 0          object
Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [52]:
# Changing data type for TotalCoinSupply
clean_crypto_df['TotalCoinSupply'] = clean_crypto_df['TotalCoinSupply'].astype('float')


In [53]:
# Double check if changes applied
# clean_crypto_df.dtypes


In [54]:
# Create dummies variables for text features, and store results to DataFrame
X = pd.get_dummies(clean_crypto_df[['Algorithm','ProofType']])
X.head()


Unnamed: 0,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,Algorithm_Counterparty,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# Standardize the data from X
scale_model = StandardScaler()
scaled_X = scale_model.fit_transform(X)
scaled_X


array([[-0.0433555, -0.0433555, -0.0433555, ..., -0.0433555, -0.0433555,
        -0.0433555],
       [-0.0433555, -0.0433555, -0.0433555, ..., -0.0433555, -0.0433555,
        -0.0433555],
       [-0.0433555, -0.0433555, -0.0433555, ..., -0.0433555, -0.0433555,
        -0.0433555],
       ...,
       [-0.0433555, -0.0433555, -0.0433555, ..., -0.0433555, -0.0433555,
        -0.0433555],
       [-0.0433555, -0.0433555, -0.0433555, ..., -0.0433555, -0.0433555,
        -0.0433555],
       [-0.0433555, -0.0433555, -0.0433555, ..., -0.0433555, -0.0433555,
        -0.0433555]])

####################   PCA   #######################


In [56]:
# Reducing X DataFrame Dimensions Using PCA to 3 features
pca = PCA(n_components=3)
X_pca = pca.fit_transform(scaled_X)
print(f'pca ratio - {pca.explained_variance_ratio_}')


pca ratio - [0.02132079 0.02051782 0.0204434 ]


In [57]:
# Explained variance
pca.explained_variance_


array([2.09336495, 2.01452583, 2.00721862])

In [58]:
pcs_df = pd.DataFrame(X_pca, index=clean_crypto_df["Unnamed: 0"], columns=['PC 1','PC 2','PC 3'])
# pcs_df.index.names = ['']
pcs_df.head(10)


Unnamed: 0_level_0,PC 1,PC 2,PC 3
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,1.093781,-0.650718,-0.006617
404,1.093781,-0.650718,-0.006617
1337,1.701221,-0.754176,-0.011909
BTC,-1.366572,0.213556,0.019995
ETH,-2.053793,0.441257,0.010919
LTC,-1.035281,-0.012345,0.005709
DASH,1.114848,-0.656943,-0.037417
XMR,-2.262082,0.447518,-0.016661
ETC,-2.053793,0.441257,0.010919
ZEC,-2.058618,0.411544,0.026244


################### Clustering Using K-means  ###############


In [59]:
# Graph elbow curve to find best value for K,
#   X-axis is K, y-axis is inertia
inertia_list = list()
k_value = list(range(1,11))

for k in k_value:
    k_model = KMeans(n_clusters=k, random_state=1)
    k_model.fit(pcs_df)
    inertia_list.append(k_model.inertia_)

# DataFrame for plotting
elbow_df = pd.DataFrame({'K': k_value, 'Inertia': inertia_list})


In [60]:
# Graph the Elbow curve
elbow_curve = elbow_df.hvplot.line(x = 'K', y = 'Inertia', xticks = k_value, title='Elbow Curve')
# hvplot.show(obj)
elbow_curve

In [61]:
# From our graph, the elbow is more prominent at K=4. We will set our cluster=4 for our KMeans


In [62]:
# KMeans algorithm
model = KMeans(n_clusters=4, random_state=1)
predictions = model.fit_predict(pcs_df)



In [63]:
# Combining clean_crypto_df, pcs_df, and coins_name
clustered_df = clean_crypto_df.merge(pcs_df, on = 'Unnamed: 0')
clustered_df = clustered_df.merge(coins_name, on = 'Unnamed: 0')

# Integrate algorithm in the DataFrame
clustered_df['Class'] = model.labels_

clustered_df.set_index('Unnamed: 0', drop = True, inplace = True)
clustered_df.index.names = ['']
clustered_df.head(10)



Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
,,,,,,,,,
42,Scrypt,PoW/PoS,41.99995,42.0,1.093781,-0.650718,-0.006617,42 Coin,1.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0,1.093781,-0.650718,-0.006617,404Coin,1.0
1337,X13,PoW/PoS,29279420000.0,314159300000.0,1.701221,-0.754176,-0.011909,EliteCoin,1.0
BTC,SHA-256,PoW,17927180.0,21000000.0,-1.366572,0.213556,0.019995,Bitcoin,0.0
ETH,Ethash,PoW,107684200.0,0.0,-2.053793,0.441257,0.010919,Ethereum,0.0
LTC,Scrypt,PoW,63039240.0,84000000.0,-1.035281,-0.012345,0.005709,Litecoin,0.0
DASH,X11,PoW/PoS,9031294.0,22000000.0,1.114848,-0.656943,-0.037417,Dash,1.0
XMR,CryptoNight-V7,PoW,17201140.0,0.0,-2.262082,0.447518,-0.016661,Monero,0.0
ETC,Ethash,PoW,113359700.0,210000000.0,-2.053793,0.441257,0.010919,Ethereum Classic,0.0


#################### Visualizing Results #####################


In [64]:
# Scatter plot 3D
fig_c4_3d = px.scatter_3d(
    clustered_df,
    x= 'PC 1',
    y='PC 2',
    z='PC 3',
    color='Class',
    symbol='Class',
    hover_name='CoinName',
    hover_data=['Algorithm'])

# fig_c4_3d.update_layout(legend = {'x':0,'y':1})
fig_c4_3d.update_layout(legend=dict(x=0, y=1))
fig_c4_3d



In [65]:
# hvplot Table
crypto_table = clustered_df.hvplot.table(columns = ['CoinName', 'Algorithm', 
                                    'ProofType', 'TotalCoinSupply', 
                                    'TotalCoinsMined', 'Class'], width =500)

crypto_table



In [66]:
# Scatter plot 2D
fig_c4_scatter = clustered_df.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply",
                                by = 'Class', hover_cols = ['CoinName'])

fig_c4_scatter
