Cryptocurrency

In [16]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [17]:
# Loading the preprocessed iris dataset
file_path = Path("Resources/crypto_data.csv")
import os
print (os.getcwd())

def Fix_TotalCoinSupply(value):
    try:
        return float(value.replace(" ", ""))
    except: # of we get am exception
        return None
crypto_df = pd.read_csv(file_path,index_col=0,converters = {'TotalCoinSupply':Fix_TotalCoinSupply})

crypto_df.head()

C:\GitHub2\Unsupervised-Machine-Learning-Cryptocurrency-Clusters


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42.0
365,365Coin,X11,True,PoW/PoS,,2300000000.0
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000.0
611,SixEleven,SHA-256,True,PoW,,611000.0
808,808,SHA-256,True,PoW/PoS,0.0,0.0


In [18]:
crypto_df.shape

(1252, 6)

In [19]:
#Discard all cryptocurrencies that are not being traded. 
#Filter for currencies that are currently being traded
crypto_df = crypto_df[crypto_df.IsTrading == True] 
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42.0
365,365Coin,X11,True,PoW/PoS,,2300000000.0
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000.0
611,SixEleven,SHA-256,True,PoW,,611000.0
808,808,SHA-256,True,PoW/PoS,0.0,0.0


In [20]:
#drop the IsTrading column from the dataframe.
cleaned_df = crypto_df.drop(columns="IsTrading")
cleaned_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42.0
365,365Coin,X11,PoW/PoS,,2300000000.0
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000.0
611,SixEleven,SHA-256,PoW,,611000.0
808,808,SHA-256,PoW/PoS,0.0,0.0


In [21]:
# check null values
cleaned_df.isnull().sum()

CoinName             0
Algorithm            0
ProofType            0
TotalCoinsMined    459
TotalCoinSupply      2
dtype: int64

In [22]:
#Remove all rows that have at least one null value
#click between parenthesis, then shift/tab.
cleaned_df = cleaned_df.dropna()
cleaned_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42.0
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000.0
808,808,SHA-256,PoW/PoS,0.0,0.0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159300000.0
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000.0


In [23]:
#data frame to hold coinname
#unname
coinname_df = pd.DataFrame(cleaned_df["CoinName"],index=cleaned_df.index) 
coinname_df

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
808,808
1337,EliteCoin
BTC,Bitcoin
...,...
ZEPH,ZEPHYR
GAP,Gapcoin
BDX,Beldex
ZEN,Horizen


In [24]:
#Filter for cryptocurrencies that have been mined. 
#That is, the total coins mined should be greater than zero.
cleaned_df = cleaned_df[cleaned_df.TotalCoinsMined > 0] 
cleaned_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42.0
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000.0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159300000.0
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000.0
ETH,Ethereum,Ethash,PoW,107684200.0,0.0


In [25]:
#drop the CoinName column from the dataframe.
cleaned_df = crypto_df.drop(columns="CoinName",axis=1)
cleaned_df.head()

Unnamed: 0,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,True,PoW/PoS,41.99995,42.0
365,X11,True,PoW/PoS,,2300000000.0
404,Scrypt,True,PoW/PoS,1055185000.0,532000000.0
611,SHA-256,True,PoW,,611000.0
808,SHA-256,True,PoW/PoS,0.0,0.0


In [26]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1144 entries, 42 to XBC
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Algorithm        1144 non-null   object 
 1   IsTrading        1144 non-null   bool   
 2   ProofType        1144 non-null   object 
 3   TotalCoinsMined  685 non-null    float64
 4   TotalCoinSupply  1142 non-null   float64
dtypes: bool(1), float64(2), object(2)
memory usage: 45.8+ KB


In [27]:
#In order for your dataset to be comprehensible to a machine learning algorithm, 
#its data should be numeric
#get_dummies to convert categorical to numeric

# TotalcoinSupply has one non-numeric value cause data type to a string
# not a number

cleaned_df = pd.get_dummies(cleaned_df)
cleaned_df.head()

Unnamed: 0,IsTrading,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,...,ProofType_PoW/PoS/PoC,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Stake,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,True,41.99995,42.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
365,True,,2300000000.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
404,True,1055185000.0,532000000.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
611,True,,611000.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
808,True,0.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Standarize data with StandarScaler
iris_scaled = StandardScaler().fit_transform(df_iris)
print(iris_scaled[0:5])

NameError: name 'df_iris' is not defined

In [None]:
# Applying PCA to reduce dimensions from 4 to 2

# Initialize PCA model
pca = PCA(n_components=.8)

# Get two principal components for the iris data.
iris_pca = pca.fit_transform(iris_scaled)

In [None]:
iris_pca

In [None]:
# Transform PCA data to a DataFrame
df_iris_pca = pd.DataFrame(
    data=iris_pca, columns=["principal component 1", "principal component 2"]
)
df_iris_pca.head()

In [None]:
# Fetch the explained variance
pca.explained_variance_ratio_

## Running KMeans with PCA Data

In [None]:
# Finding the best value for k
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_iris_pca)
    
    inertia.append(km.inertia_)

# Creating the Elbow Curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

plt.plot(df_elbow['k'], df_elbow['inertia'])
plt.xticks(list(range(11)))
plt.title('Elbow Curve')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Predicting clusters with k=3

# Initialize the k-means model
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(df_iris_pca)

# Predict clusters
predictions = model.predict(df_iris_pca)

# Add the predicted class column to the dataframe
df_iris_pca["class"] = model.labels_
df_iris_pca.head()

In [None]:
# Visualize the clusters
plt.scatter(x=df_iris_pca['principal component 1'], y=df_iris_pca['principal component 2'], c=df_iris_pca['class'])
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.title('Iris clusters')
plt.show()