# Clustering Crypto

In [None]:
import os
import io
import json
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler,LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans as KM
!pip install -U altair
import altair as alt


# Amazon SageMaker and related imports
import sagemaker
from sagemaker import get_execution_role
import sagemaker.amazon.common as smac
from sagemaker.predictor import csv_serializer, json_deserializer
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
import boto3  # AWS Python sdk

%matplotlib inline

### Data Preprocessing

In [None]:
# Load the cryptocurrency dataset; the first CSV column is used as the index
file_path = Path("data/crypto_data.csv")
crypto_df = pd.read_csv(filepath_or_buffer=file_path, index_col=0)
crypto_df.head(n=10)

In [None]:
# Tally how many rows carry each IsTrading value before filtering
crypto_df.loc[:, 'IsTrading'].value_counts()

In [None]:
# Restrict the dataset to rows whose IsTrading flag equals True
is_trading_mask = crypto_df['IsTrading'].eq(True)
crypto_df = crypto_df.loc[is_trading_mask]
crypto_df.head()

In [None]:
# Re-count IsTrading values to confirm the filter took effect
crypto_df.loc[:, 'IsTrading'].value_counts()

In [None]:
# Inspect which proof types are present (this cell only counts; it does not filter)
crypto_df.loc[:, 'ProofType'].value_counts()

In [None]:
# Drop, in place, every row whose ProofType is one of the excluded values.
# NOTE(review): the trailing space in the second literal is kept verbatim;
# presumably it matches the raw value in the CSV -- confirm against the data.
excluded_proof_types = ['Zero-Knowledge Proof', 'Limited Confidence Proof-of-Activity ']
excluded_mask = crypto_df['ProofType'].isin(excluded_proof_types)
# Removal is by index label, so any other row sharing a label is removed too.
excluded_labels = crypto_df.loc[excluded_mask].index
crypto_df.drop(index=excluded_labels, inplace=True)



In [None]:
# Re-count proof types to verify the excluded values are gone
crypto_df.loc[:, 'ProofType'].value_counts()

In [None]:
# The IsTrading flag is no longer needed once the rows are filtered; drop it in place
crypto_df.drop('IsTrading', axis='columns', inplace=True)
crypto_df.head()

In [None]:
# Discard, in place, any row that contains one or more missing values
crypto_df.dropna(axis='index', how='any', inplace=True)
crypto_df.head()

In [None]:
# Keep only the rows where TotalCoinsMined is strictly positive
has_mined_coins = crypto_df['TotalCoinsMined'].gt(0)
crypto_df = crypto_df[has_mined_coins]
crypto_df.head()

In [None]:
# Keep an independent copy of the coin names before the column is dropped from crypto_df
coin_name = crypto_df.loc[:, 'CoinName'].copy()

In [None]:
# CoinName is not a clustering feature, so remove it from crypto_df in place
crypto_df.drop('CoinName', axis='columns', inplace=True)

In [None]:
# Encode the two text features as integer labels on a copy of crypto_df.
# NOTE(review): this is label encoding (one integer per category), not
# one-hot "dummy" columns, so the integer codes carry an arbitrary ordering.
cdf = crypto_df.copy()
label_encoder = LabelEncoder()
# The same encoder instance is refit for each column; after this cell it
# remains fitted on "Algorithm".
cdf["proof_type_le"] = label_encoder.fit_transform(cdf["ProofType"])
cdf["Algo_le"] = label_encoder.fit_transform(cdf["Algorithm"])
# The original text columns are replaced by their encoded counterparts.
cdf.drop(['Algorithm', 'ProofType'], axis='columns', inplace=True)
cdf.head()

In [None]:
# Standardize the two coin-quantity columns and write them back into cdf
supply_columns = ['TotalCoinsMined', 'TotalCoinSupply']
data_scaler = StandardScaler()
data_scaler.fit(cdf[supply_columns])
cdf[supply_columns] = data_scaler.transform(cdf[supply_columns])
cdf.head()

### Reducing Dimensions Using PCA

In [None]:
# Standardize every column of cdf, then project onto 3 principal components
pca = PCA(n_components=3)
crypto_scaled = StandardScaler().fit(cdf).transform(cdf)
pca.fit(crypto_scaled)
crypto_pca = pca.transform(crypto_scaled)

In [None]:
# Wrap the principal components in a DataFrame that shares crypto_df's index
pca_columns = ["pc1", "pc2", 'pc3']
df_crypto_pca = pd.DataFrame(crypto_pca, index=crypto_df.index, columns=pca_columns)
df_crypto_pca.head()

In [None]:
# Convert the PCA frame to a float32 NumPy array for use as training data
train_data = df_crypto_pca.to_numpy().astype(np.float32)
train_data

In [None]:
# Fit scikit-learn K-Means (imported as KM) for k = 1..10 and record the
# inertia of each fit so the elbow can be read from the chart.
k = list(range(1, 11))
inertia = []

for i in k:
    # random_state is fixed so repeated runs give the same curve
    km = KM(n_clusters=i, random_state=0).fit(train_data)
    inertia.append(km.inertia_)

elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)

# Line chart of inertia against the number of clusters
elbow_chart = alt.Chart(df_elbow).mark_line()
elbow_chart.encode(x='k', y='inertia')




### Amazon SageMaker Setup and Training Data Upload

In [None]:
# IAM role the notebook runs under, as reported by SageMaker
role = get_execution_role()

# S3 bucket and key prefix under which training data and output are stored
bucket = "aws-hw-202006-01"
prefix = "cypto_mining"

In [None]:
# Serialize the training matrix into an in-memory buffer using SageMaker's
# dense-tensor record writer, then rewind so the upload starts at byte 0.
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, train_data)
buf.seek(0)

# Upload encoded training data to Amazon S3.
# The object key is built once and reused for both the upload and the
# advertised URI; previously the upload went to ".../ktrain/..." while
# s3_train_data pointed at ".../train/...", so the printed location did not
# contain the uploaded object. S3 keys always use "/" regardless of the host
# OS, so the key is formatted explicitly rather than via os.path.join.
key = "kmeans"
train_object_key = "{}/train/{}".format(prefix, key)
boto3.resource("s3").Bucket(bucket).Object(train_object_key).upload_fileobj(buf)
s3_train_data = "s3://{}/{}".format(bucket, train_object_key)
print("Training data uploaded to: {}".format(s3_train_data))








In [None]:
# Show the S3 URI that later cells use as the training data location
print(s3_train_data)

### Clustering Cryptocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [None]:
# Train the SageMaker built-in K-Means estimator on the PCA features, deploy
# it to a temporary endpoint, and fetch a cluster assignment for every row.
from sagemaker import KMeans
sess = sagemaker.Session()

# Number of clusters to fit
num_clusters = 4

kmeans = KMeans(role=role,
                train_instance_count=1,
                init_method='random',
                train_instance_type='ml.c4.xlarge',
                output_path="s3://{}/{}/output".format(bucket, prefix),
                k=num_clusters,
                data_location=s3_train_data)

# record_set() wraps the NumPy array in the record format the estimator expects
kmeans.fit(kmeans.record_set(train_data))

kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                 instance_type='ml.t2.medium')
try:
    # `result` holds one record per input row; later cells read
    # label['closest_cluster'] from each record.
    result = kmeans_predictor.predict(train_data)
finally:
    # The endpoint is only needed for this one batch of predictions. Delete
    # it even if predict() fails, so each run does not leave a billed
    # endpoint behind.
    kmeans_predictor.delete_endpoint()
print('done')

Running K-Means with `k=4`

In [None]:
# Pull the assigned cluster id out of each prediction record
cluster_labels = [
    record.label['closest_cluster'].float32_tensor.values[0]
    for record in result
]

# Join the cleaned features, principal components and coin names on their
# shared index, then attach the cluster id as the "class" column.
results = pd.concat(
    [crypto_df, df_crypto_pca, coin_name], axis=1, join='inner'
).assign(**{"class": cluster_labels})
results.head()

### Visualizing Results

#### 3D-Scatter with Clusters

In [None]:
# Interactive 2D scatter of the first two principal components (pc1 vs pc2),
# coloured by cluster. The cluster id is a category, not a magnitude, so it
# is encoded as nominal (":N") to get a discrete colour per cluster instead
# of the continuous gradient Altair infers for numeric columns.
alt.Chart(results).mark_circle(size=60).encode(
    x='pc1',
    y='pc2',
    color='class:N',
    tooltip=['Algorithm','TotalCoinsMined','TotalCoinSupply','CoinName']
).interactive()

#### Table of Tradable Cryptocurrencies

In [None]:
# Render the full results table (features, principal components, coin name and class)
display(results)

#### Scatter Plot with Tradable Cryptocurrencies

In [None]:
# Rescale the two coin-quantity columns of `results` with MinMaxScaler so
# they are comparable on the scatter plot.
# Note: this rebinds data_scaler, replacing the StandardScaler used earlier.
plot_columns = ['TotalCoinsMined', 'TotalCoinSupply']
data_scaler = MinMaxScaler()
data_scaler.fit(results[plot_columns])
results[plot_columns] = data_scaler.transform(results[plot_columns])

In [None]:
# Interactive scatter with x="TotalCoinsMined" and y="TotalCoinSupply",
# coloured by cluster. The cluster id is encoded as nominal (":N") so each
# cluster gets its own discrete colour rather than a shade on the continuous
# scale Altair infers for numeric columns.
alt.Chart(results).mark_circle(size=60).encode(
    x='TotalCoinsMined',
    y='TotalCoinSupply',
    color='class:N',
    tooltip=['Algorithm','ProofType','CoinName']
).interactive()