## Data Preparation

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the cryptocurrency dataset. The CSV's first (unnamed) column -- noted by
# the author as just another representation of CoinName -- is used as the row
# index instead of being kept as a regular column.
train_df = pd.read_csv(
    Path('Resources/crypto_data.csv'),
    index_col=0,
)

In [3]:
# Dimensions of the freshly loaded dataset as (rows, columns)
train_df.shape

(1252, 6)

In [4]:
# Restrict the dataset to cryptocurrencies that are currently trading. After
# the filter the 'IsTrading' flag carries no information, so it is dropped.
only_trading = train_df['IsTrading'].eq(True)
train_df = train_df[only_trading].drop('IsTrading', axis=1)
train_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0


In [5]:
# Discard every row containing at least one missing value
# (dropna's defaults are row-wise with how='any').
train_df = train_df.dropna()
train_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [6]:
# Retain only cryptocurrencies whose mined total is strictly positive.
mined_crypto = train_df['TotalCoinsMined'].gt(0)
train_df = train_df[mined_crypto]
train_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0


In [7]:
# The coin's name does not contribute to the analysis, so remove that column.
train_df = train_df.drop('CoinName', axis=1)
train_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [8]:
# List the dtype of each remaining column to spot non-numeric features that
# still need conversion or encoding before fitting.
train_df.dtypes

Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [10]:
# TotalCoinSupply is stored with dtype object (see the dtypes listing above);
# cast it to float64 so it can serve as a numeric input to the ML algorithms.
train_df['TotalCoinSupply'] = train_df['TotalCoinSupply'].astype('float64')

In [11]:
# Re-check the data types after the cast: TotalCoinSupply should now be float64
train_df.dtypes

Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply    float64
dtype: object

In [16]:
# Count the distinct values of the two categorical features
# ('Algorithm' first, then 'ProofType'); printed space-separated on one line.
print(*(train_df[col].nunique() for col in ('Algorithm', 'ProofType')))

71 25


In [17]:
# One-hot encode the categorical features 'Algorithm' and 'ProofType' so that
# every column is numeric for the fitting.
# Expected width: 71 Algorithm dummies + 25 ProofType dummies + the 2 numeric
# columns that remain from the original 4 = 98 features.
X_train_df = pd.get_dummies(data=train_df)
print("X_train_df shape: ", X_train_df.shape)
X_train_df.columns

X_train_df shape:  (532, 98)


Index(['TotalCoinsMined', 'TotalCoinSupply',
       'Algorithm_1GB AES Pattern Search', 'Algorithm_536',
       'Algorithm_Argon2d', 'Algorithm_BLAKE256', 'Algorithm_Blake',
       'Algorithm_Blake2S', 'Algorithm_Blake2b', 'Algorithm_C11',
       'Algorithm_Cloverhash', 'Algorithm_Counterparty',
       'Algorithm_CryptoNight', 'Algorithm_CryptoNight Heavy',
       'Algorithm_CryptoNight-V7', 'Algorithm_Cryptonight-GPU',
       'Algorithm_DPoS', 'Algorithm_Dagger', 'Algorithm_Dagger-Hashimoto',
       'Algorithm_ECC 256K1', 'Algorithm_Equihash',
       'Algorithm_Equihash+Scrypt', 'Algorithm_Ethash', 'Algorithm_Exosis',
       'Algorithm_Green Protocol', 'Algorithm_Groestl', 'Algorithm_HMQ1725',
       'Algorithm_HybridScryptHash256', 'Algorithm_IMesh',
       'Algorithm_Jump Consistent Hash', 'Algorithm_Keccak',
       'Algorithm_Leased POS', 'Algorithm_Lyra2RE', 'Algorithm_Lyra2REv2',
       'Algorithm_Lyra2Z', 'Algorithm_M7 POW', 'Algorithm_Multiple',
       'Algorithm_NIST5', 'Algor

In [18]:
# Standardize the dataset with StandardScaler() to reduce the impact of
# features with large numbers (each column is centred and scaled to unit variance).
# StandardScaler is imported in the imports cell at the top of the notebook so
# that all dependencies are declared in one place.
# fit_transform learns the per-column statistics and applies them in one call;
# the fitted `scaler` object stays available under the same name.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_df)
X_train_scaled

array([[-0.11710817, -0.1528703 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.09396955, -0.145009  , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [ 0.52494561,  4.48942416, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       ...,
       [-0.09561336, -0.13217937, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11694817, -0.15255998, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11710536, -0.15285552, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ]])

# Dimensionality Reduction with PCA and t-SNE

In [None]:
# Use PCA to do initial dimensionality reduction
