In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw cryptocurrency dataset from disk and preview its first rows
csv_path = Path('data/crypto_data.csv')
crypto_df = pd.read_csv(csv_path)
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


## Data Preparation

In [3]:
# Pre-processing overview of the raw frame: distribution of the trading flag,
# number of missing values per column, and the overall dimensions.
# A single print call joins the pieces with newlines, one item per line.
print(
    'Value Counts of Column IsTrading:',
    crypto_df['IsTrading'].value_counts(),
    '----------',
    'Count of Nulls:',
    crypto_df.isna().sum(),
    '----------',
    'Shape of Data Frame:',
    crypto_df.shape,
    sep='\n',
)

Value Counts of Column IsTrading:
True     1144
False     108
Name: IsTrading, dtype: int64
----------
Count of Nulls:
Unnamed: 0           0
CoinName             0
Algorithm            0
IsTrading            0
ProofType            0
TotalCoinsMined    508
TotalCoinSupply      0
dtype: int64
----------
Shape of Data Frame:
(1252, 7)


In [4]:
# Keep only coins that are currently traded AND have a positive mined amount.
# Rows whose TotalCoinsMined is missing fail the `> 0` comparison, so they are
# filtered out here as well.
is_traded = crypto_df['IsTrading'].eq(True)
has_mined = crypto_df['TotalCoinsMined'].gt(0)
crypto_active = crypto_df[is_traded & has_mined]

In [5]:
# IsTrading has served its purpose as a filter, so remove the column, then
# discard every row that still has a missing value in any remaining column
crypto_active = (
    crypto_active
    .drop(columns='IsTrading')
    .dropna()
)

In [6]:
# Sanity check: 'IsTrading' should no longer be listed among the columns and
# every per-column null count should be zero
print(
    crypto_active.columns,
    '----------',
    crypto_active.isna().sum(),
    sep='\n',
)

Index(['Unnamed: 0', 'CoinName', 'Algorithm', 'ProofType', 'TotalCoinsMined',
       'TotalCoinSupply'],
      dtype='object')
----------
Unnamed: 0         0
CoinName           0
Algorithm          0
ProofType          0
TotalCoinsMined    0
TotalCoinSupply    0
dtype: int64


In [7]:
# The machine learning algorithm needs numeric features only.
# 'CoinName' and 'Unnamed: 0' (which appears to be the shortened form of the
# coin name) identify a coin rather than describe it, so both are removed.
identifier_columns = ['Unnamed: 0', 'CoinName']
crypto_active = crypto_active.drop(columns=identifier_columns)

In [8]:
# Look at the distinct labels in 'Algorithm' and 'ProofType' (and the current
# frame dimensions) to decide how to convert these text columns to numbers
print(
    crypto_active['Algorithm'].value_counts(),
    '----------',
    crypto_active['ProofType'].value_counts(),
    '----------',
    'Shape of Data Frame:',
    crypto_active.shape,
    sep='\n',
)

Scrypt                 182
X11                     73
SHA-256                 48
CryptoNight             19
PoS                     17
                      ... 
Leased POS               1
QUAIT                    1
Time Travel              1
SkunkHash v2 Raptor      1
M7 POW                   1
Name: Algorithm, Length: 71, dtype: int64
----------
PoW                     237
PoW/PoS                 176
PoS                      86
DPoS                      9
PoC                       3
PoS/PoW                   2
HPoW                      1
PoW/PoS                   1
LPoS                      1
POBh                      1
PoW and PoS               1
dPoW/PoW                  1
Proof of Trust            1
Proof of Authority        1
PoS/PoW/PoT               1
PoS/LPoS                  1
DPOS                      1
PoW + Hive                1
TPoS                      1
PoA                       1
PoST                      1
PoW/PoW                   1
Zero-Knowledge Proof      1
Pos   

In [9]:
# Turn Algorithm and ProofType into numerical columns.
# pd.get_dummies would add one column per distinct label (71 algorithms alone),
# so each label is replaced by an integer code instead and the number of
# columns stays the same.
# Codes are assigned in order of first appearance; Series.unique() preserves
# that order, so no manual de-duplication loop is needed.
# NOTE(review): integer codes imply an ordering/distance between labels that
# scaling and distance-based models will treat as meaningful - confirm this is acceptable.
# NOTE(review): the ProofType value counts list 'PoW/PoS' twice, so some labels
# seemingly differ only by whitespace; consider normalising before encoding.

algorithm_list = crypto_active['Algorithm'].unique().tolist()
algorithm_dict = {algorithm: index for index, algorithm in enumerate(algorithm_list)}

prooftype_list = crypto_active['ProofType'].unique().tolist()
prooftype_dict = {prooftype: index for index, prooftype in enumerate(prooftype_list)}
    

In [10]:
# Replace each text label with its integer code from the matching dictionary.
# Run this cell once per kernel session: mapping already-encoded integers
# through the string-keyed dictionaries would yield NaN.
for column, codes in (('Algorithm', algorithm_dict), ('ProofType', prooftype_dict)):
    crypto_active[column] = crypto_active[column].map(codes)

In [11]:
# Re-check both columns after encoding: the counts per code should mirror the
# counts per label seen before the mapping, and the frame shape is shown again
print(
    crypto_active['Algorithm'].value_counts(),
    '----------',
    crypto_active['ProofType'].value_counts(),
    '----------',
    'Shape of Data Frame:',
    crypto_active.shape,
    sep='\n',
)

0     182
4      73
2      48
18     19
1      17
     ... 
43      1
45      1
46      1
47      1
70      1
Name: Algorithm, Length: 71, dtype: int64
----------
1     237
0     176
2      86
10      9
3       3
7       2
15      1
23      1
22      1
21      1
20      1
19      1
18      1
17      1
16      1
12      1
14      1
13      1
11      1
9       1
8       1
6       1
5       1
4       1
24      1
Name: ProofType, dtype: int64
----------
Shape of Data Frame:
(532, 4)


In [12]:
# Check the column dtypes before scaling; every feature needs to be numeric
crypto_active.dtypes
# NOTE: TotalCoinSupply is reported as object (not numeric) in the output below.
# Its raw values can be inspected with crypto_active['TotalCoinSupply'].value_counts()

Algorithm            int64
ProofType            int64
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [14]:
# Scale the dataset
# The dtype check reported TotalCoinSupply as an object column, so cast it to
# float explicitly instead of relying on scikit-learn's implicit conversion;
# a non-numeric entry then fails here with a clear pandas error.
# The cast is a no-op if the column is already float64.
crypto_numeric = crypto_active.astype({'TotalCoinSupply': 'float64'})

# Standardise every feature to zero mean and unit variance.
# `scaler` is left fitted so it can be reused (e.g. inverse_transform) later.
scaler = StandardScaler()
crypto_scaled = scaler.fit_transform(crypto_numeric)

In [15]:
# Preview the scaled feature matrix (NumPy abbreviates the middle rows with '...')
crypto_scaled

array([[-0.6516109 , -0.48303965, -0.11710817, -0.1528703 ],
       [-0.6516109 , -0.48303965, -0.09396955, -0.145009  ],
       [-0.59091103, -0.48303965,  0.52494561,  4.48942416],
       ...,
       [ 0.44098689, -0.16141625, -0.09561336, -0.13217937],
       [-0.28741164, -0.16141625, -0.11694817, -0.15255998],
       [-0.6516109 ,  0.16020714, -0.11710536, -0.15285552]])