In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

Data Preparation

In [12]:
# read in crypocurrency csv dataset
file = os.path.join("crypto_data.csv")

df = pd.read_csv(file)

df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...,...
1247,XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
1248,DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
1249,GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [13]:
df.columns

Index(['Unnamed: 0', 'CoinName', 'Algorithm', 'IsTrading', 'ProofType',
       'TotalCoinsMined', 'TotalCoinSupply'],
      dtype='object')

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1252 entries, 0 to 1251
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       1252 non-null   object 
 1   CoinName         1252 non-null   object 
 2   Algorithm        1252 non-null   object 
 3   IsTrading        1252 non-null   bool   
 4   ProofType        1252 non-null   object 
 5   TotalCoinsMined  744 non-null    float64
 6   TotalCoinSupply  1252 non-null   object 
dtypes: bool(1), float64(1), object(5)
memory usage: 60.0+ KB


In [15]:
#filter for currencies that are currently being traded
df = df[df.IsTrading == True]

In [16]:
#drop the IsTrading column from the dataframe
df.drop('IsTrading', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [17]:
df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
1243,SERO,Super Zero,Ethash,PoW,,1000000000
1244,UOS,UOS,SHA-256,DPoI,,1000000000
1245,BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [18]:
#Remove all rows that have at least one null value
df.dropna(inplace=True)
df.isnull().sum().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0

In [9]:
#Filter for cryptocurrencies that have been mined. That is, the total coins mined should be greater than zero.
df = df[df.TotalCoinsMined > 0]



In [19]:
#delete the CoinName from the original dataframe
df_dropped_names = df.drop(['Unnamed: 0', 'CoinName'], axis = 1)

In [20]:
df_dropped_names['TotalCoinSupply'] = df_dropped_names['TotalCoinSupply'].apply(lambda x: float(x))
df_dropped_names.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 685 entries, 0 to 1247
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Algorithm        685 non-null    object 
 1   ProofType        685 non-null    object 
 2   TotalCoinsMined  685 non-null    float64
 3   TotalCoinSupply  685 non-null    float64
dtypes: float64(2), object(2)
memory usage: 26.8+ KB


In [21]:
#convert the remaining features with text values, Algorithm and ProofType, into numerical data
#use Pandas to create dummy variables
df_dummies = pd.get_dummies(df_dropped_names)
df_dummies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 685 entries, 0 to 1247
Columns: 109 entries, TotalCoinsMined to ProofType_dPoW/PoW
dtypes: float64(2), uint8(107)
memory usage: 87.6 KB


In [23]:
#Standardize your dataset so that columns that contain larger values do not unduly influence the outcome

scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_dummies)

In [29]:
scaled_data[:1]

array([[-0.10282804, -0.03823841, -0.03823596, -0.03823596, -0.03823596,
        -0.03823596, -0.05411338, -0.07664017, -0.03823596, -0.05411338,
        -0.05411338, -0.03823596, -0.03823596, -0.18216065, -0.05411338,
        -0.03823596, -0.03823596, -0.08574929, -0.03823596, -0.10160947,
        -0.06632365, -0.03823596, -0.03823596, -0.1642757 , -0.03823596,
        -0.03823596, -0.13908716, -0.03823596, -0.03823596, -0.07664017,
        -0.03823596, -0.03823596, -0.03823596, -0.03823596, -0.06632365,
        -0.03823596, -0.07664017, -0.08574929, -0.07664017, -0.03823596,
        -0.03823596, -0.12775161, -0.1335313 , -0.13908716, -0.03823596,
        -0.05411338, -0.03823596, -0.06632365, -0.1689039 , -0.03823596,
        -0.03823596, -0.03823596, -0.07664017, -0.17342199, -0.33468341,
        -0.03823596, -0.08574929, -0.06632365, -0.05411338, -0.03823596,
         1.42042992, -0.06632365, -0.03823596, -0.03823596, -0.06632365,
        -0.06632365, -0.03823596, -0.03823596, -0.0

Dimensionality Reduction

In [24]:
#Perform dimensionality reduction with PCA
#preserve 90% of the explained variance in dimensionality reduction PCA(n_components=0.90)

pca = PCA(n_components=0.90)
pca_transformed_data = pca.fit_transform(scaled_data)

In [25]:
#further reduce the dataset dimensions with t-SNE and visually inspect the results
#run t-SNE on the principal components: the output of the PCA transformation

tsne = TSNE(learning_rate=100)
transformed_features = tsne.fit_transform(pca_transformed_data)


In [27]:
#create a scatter plot of the t-SNE output

df['x'] = transformed_features[:,0]
df['y'] = transformed_features[:,1]
df['class'] = df['IsTrading']

plt.scatter(df['x'], df2['y'])
plt.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


KeyError: 'IsTrading'

In [28]:
plt.scatter(df['x'], df['y'], c=df['class'])
plt.show()

KeyError: 'class'

Cluster Analysis with k-Means

In [None]:
#Create an elbow plot to identify the best number of clusters.
#Use a for-loop to determine the inertia for each k between 1 through 10.
#Determine, if possible, where the elbow of the plot is, and at which value of k it appears.

Recommendation

Based on your findings, make a brief (1-2 sentences) recommendation to your clients. Can the cryptocurrencies be clustered together? If so, into how many clusters?