# Clustering Crypto

In [None]:
# Initial imports
import sys
import os
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hvplot.pandas
from pathlib import Path
import dataframe_image as dfi
from IPython.display import Image
from matplotlib import pyplot as plt
import pydotplus
import seaborn as sns
import mxnet as mx

%matplotlib inline

# SK Learn
from sklearn import tree
from sklearn.svm import LinearSVC
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance
from sklearn import preprocessing
from sklearn import utils

# Imbalanced Learn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import ClusterCentroids
from imblearn.metrics import classification_report_imbalanced
from imblearn import over_sampling as os
from imblearn import pipeline as pl
from imblearn.metrics import (geometric_mean_score, make_index_balanced_accuracy)
from imblearn.ensemble import EasyEnsembleClassifier

# AWS Sagemaker 
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker.predictor import csv_serializer, json_deserializer
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker import PCA
from sagemaker import KMeans
import boto3  
import io

In [None]:
# Resolve the execution role first; it is echoed as the cell output below.
role = get_execution_role()

# Open the SageMaker session and remember its default S3 bucket,
# which later cells use as the output location for model artifacts.
sess = sagemaker.Session()
bucket = sess.default_bucket()

role

## Read in AWS S3 Data

In [None]:
# Name of the S3 bucket that holds the input data set
data_bucket_name = "unit-13-challenge"

# boto3 client used for the S3 listing and download in the cells below
s3_client = boto3.client("s3")

# Look the execution role up again and show it as the cell output
role = get_execution_role()
role

In [None]:
# List the bucket, then build the key list: it starts with the expected
# data file name and is followed by every key the listing reports.
obj_list = s3_client.list_objects(Bucket=data_bucket_name)
file = ['crypto_data.csv']
file.extend(entry["Key"] for entry in obj_list["Contents"])
print(file)

In [None]:
# Take the first entry of the key list; the list was seeded with
# 'crypto_data.csv' before the bucket keys were appended, so that is the value.
file_data = file[0]

## Data Exploration & Analysis

In [None]:
# Download the selected object from the S3 bucket and read its payload into memory
response = s3_client.get_object(Key=file_data, Bucket=data_bucket_name)
response_body = response["Body"].read()

# Parse the downloaded bytes as a comma-separated file whose first row is the header
csv_buffer = io.BytesIO(response_body)
crypto_data = pd.read_csv(csv_buffer, sep=",", header=0, low_memory=False)
crypto_data.head()

In [None]:
# Drop every column whose name contains "unnamed" (case-insensitive).
# Columns listed as necessary:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
unnamed_mask = crypto_data.columns.str.contains('unnamed', case=False)
unnamed_columns = crypto_data.columns[unnamed_mask]
crypto_data.drop(unnamed_columns, axis=1, inplace=True)
crypto_data.head()

In [None]:
# Drop the rows whose IsTrading value equals the string 'False'.
# NOTE(review): a later cell compares this column with the boolean True; if the
# column is boolean, the string comparison here matches nothing - confirm the dtype.
not_trading = crypto_data.IsTrading == 'False'
rows_to_drop = crypto_data[not_trading].index
crypto_data.drop(rows_to_drop, inplace=True)
crypto_data.head()

In [None]:
# Keep only the rows where IsTrading is True
is_trading = crypto_data['IsTrading'] == True
crypto_data = crypto_data.loc[is_trading]
crypto_data.head()

In [None]:
# The IsTrading flag has served its purpose as a row filter; drop the column
crypto_data.drop(columns=['IsTrading'], inplace=True)
crypto_data.head()

In [None]:
# Discard every row that has at least one missing value
crypto_data.dropna(axis=0, how='any', inplace=True)
crypto_data.head()

In [None]:
# Remove rows with cryptocurrencies having no coins mined.
# The filtered frame is bound back to `crypto_data` (not a differently spelled
# name) so that the following cells actually work on the reduced data set.
crypto_data = crypto_data.loc[crypto_data['TotalCoinsMined'] != 0]
crypto_data.head()

In [None]:
# dropna() returns a new frame unless inplace=True is passed, so keep the result
# instead of discarding it.
crypto_data = crypto_data.dropna()
crypto_data.head()

In [None]:
# Count the remaining missing values per column
crypto_data.isna().sum()

In [None]:
# Use the coin names as row labels (the CoinName column itself stays in place here)
crypto_data.index = pd.Index(crypto_data["CoinName"])
crypto_data.head()

In [None]:
# Remove the column(s) whose name contains "CoinName" (case-insensitive)
coin_name_mask = crypto_data.columns.str.contains('CoinName', case=False)
coin_name_columns = crypto_data.columns[coin_name_mask]
crypto_data.drop(coin_name_columns, axis=1, inplace=True)
crypto_data.head()

In [None]:
# Plot the distribution of each of the two coin-count columns in its own figure
for a in ["TotalCoinsMined", "TotalCoinSupply"]:
    # plt.subplots returns a (figure, axes) pair; unpack it and draw on that
    # axes explicitly instead of rebinding `ax` twice.
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.distplot(crypto_data[a], ax=ax)
    title = "Histogram of " + a
    ax.set_title(title, fontsize=12)
    plt.show()
    
# for a in ["CoinName", "TotalCoinsMined", "TotalCoinSupply"]:

## Feature Engineering

**Data scaling:** Distance-based analytical methods compare distances across feature columns, so the numerical columns must first be put on a common scale. The cells below one-hot encode the text features and then apply scikit-learn's `StandardScaler`, which rescales each column to zero mean and unit variance. (A `MinMaxScaler`, which maps each column into the range 0 to 1, would be an alternative.)

In [None]:
# One-hot encode the two text feature columns
categorical_columns = ['Algorithm', 'ProofType']
df_variables = pd.get_dummies(crypto_data, columns=categorical_columns)
df_variables.head()

In [None]:
# Standardize the encoded features and print the first scaled row
scaler = StandardScaler()
scaled = scaler.fit_transform(df_variables)
print(scaled[:1])

In [None]:
# Rebuild the one-hot encoded frame (the same encoding as the earlier dummy-variable cell)
df_variables = pd.get_dummies(data=crypto_data, columns=['Algorithm', 'ProofType'])
df_variables.head()

In [None]:
# Inspect new dataframe with dummy features:
# describe() shows summary statistics for the encoded frame.
df_variables.describe()

## Data Modelling 
* The AWS Sagemaker algorithm for principal component analysis (PCA) was used to reduce the dimensionality of cryptocurrency data. This method decomposes the data matrix into features that are orthogonal with each other. 
* The resultant orthogonal features are linear combinations of the original feature set. 
* This method involves taking many features and combining similar or redundant features together to form a new, smaller feature set.

In [None]:
# Number of principal components the PCA estimator is asked to compute
num_components = 33

# Estimator settings: one ml.c4.xlarge instance, artifacts written to the
# session's default bucket.
pca_settings = dict(
    num_components=num_components,
    role=role,
    instance_type="ml.c4.xlarge",
    instance_count=1,
    output_path="s3://" + bucket,
)
pca_SM = PCA(**pca_settings)

In [None]:
# Convert the encoded feature table to a float32 array for the record set.
# NOTE(review): the standardized array `scaled` computed earlier is never used;
# confirm whether PCA was meant to be trained on it instead of the raw values.
train_data = df_variables.values.astype("float32")

# Train the PCA model. Without this step the estimator has no training job, so
# the later cells that read pca_SM.latest_training_job and call pca_SM.deploy()
# cannot work. This mirrors how the KMeans estimator is trained further down.
pca_SM.fit(pca_SM.record_set(train_data))

train_data

## Model Attributes for Principal Components Analysis (PCA)
* Model artifacts are stored in AWS S3 after completing training in previous step. 
* The model artifact is stored as an ND array. 
* The model resides in the `<training_job_name>/output/model.tar.gz` file, which is a TAR archive compressed with GNU zip (gzip).

In [None]:
import tarfile

# Build the S3 key of the artifact written by the most recent PCA training job
job_name = pca_SM.latest_training_job.name
model_key = job_name + "/output/model.tar.gz"

# Download the archive next to the notebook
boto3.resource("s3").Bucket(bucket).download_file(model_key, "model.tar.gz")

# Unpack with tarfile rather than os.system: the imports cell rebinds the name
# `os` to imblearn.over_sampling, so os.system is not available at this point.
with tarfile.open("model.tar.gz", "r:gz") as archive:
    archive.extractall()

# Load ND array via MXNet
pca_model_params = mx.ndarray.load("model_algo-1")

In [None]:
# Examine the makeup of each PCA component based on the weightings of the original features that are included in the component
component_num = 3
top_n = 5  # number of trailing components kept for inspection

# `v_5` is not defined anywhere else in the notebook, so derive it here from
# the loaded model parameters.
# NOTE(review): assumes pca_model_params["v"] holds one column per component,
# ordered so that the strongest components come last - confirm against the
# SageMaker PCA model-artifact documentation.
v = pd.DataFrame(pca_model_params["v"].asnumpy())
v_5 = v.iloc[:, -top_n:].copy()
v_5.columns = list(range(top_n))

# Column (top_n - component_num) of the trailing block is the requested component
first_comp = v_5[top_n - component_num]
comps = pd.DataFrame(list(zip(first_comp, df_variables.columns)), columns=['weights', 'features'])
comps['abs_weights'] = comps['weights'].abs()

# Show the ten features with the largest absolute weight
top_features = comps.sort_values('abs_weights', ascending=False).head(10)
ax = sns.barplot(data=top_features, x="weights", y="features", palette="Blues_d")
ax.set_title("PCA Component Makeup: #" + str(component_num))
plt.show()

## PCA Model Deployment

In [None]:
%%time
# Deploy the PCA model to a single ml.c4.xlarge endpoint; the returned
# predictor is used for inference in the following cells.
pca_predictor = pca_SM.deploy(initial_instance_count=1, instance_type="ml.c4.xlarge") 

In [None]:
# Column labels assigned to the PCA-projected data frame in the next cell
PCA_list = ["principal component 1", "principal component 2", "principal component 3"]

In [None]:
# Pass original dataset to model
result = pca_predictor.predict(train_data)

# Collect one projection vector per returned record and build the frame once.
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
projections = [
    list(record.label["projection"].float32_tensor.values) for record in result
]

# Label the rows with the index of df_variables, whose rows were passed to the
# model in the same order (the original self-assignment of the index did nothing).
crypto_data_transformed = pd.DataFrame(projections, index=df_variables.index)

# Keep exactly as many trailing components as there are names in PCA_list; the
# fixed slice `28:` kept 5 of the 33 columns and could not take 3 labels.
crypto_data_transformed = crypto_data_transformed.iloc[:, -len(PCA_list):]
crypto_data_transformed.columns = PCA_list

In [None]:
# Inspect dataframe: show the first 15 rows of the PCA-projected data
crypto_data_transformed.head(15)

## KMeans Algorithm (Unsupervised Clustering)

In [None]:
# Rebind train_data to the PCA-projected features (as float32); this is the
# array the KMeans cells below train on and score.
train_data = crypto_data_transformed.values.astype("float32")

In [None]:
# Hyperparameter: the number of clusters KMeans is asked to identify
num_clusters = 3

# Configure the estimator: one ml.c4.xlarge instance, artifacts written under
# the "counties/" prefix of the default bucket.
kmeans_output_path = "s3://" + bucket + "/counties/"
kmeans = KMeans(
    k=num_clusters,
    role=role,
    instance_type="ml.c4.xlarge",
    instance_count=1,
    output_path=kmeans_output_path,
)
kmeans

* The `record_set` function of the Amazon SageMaker KMeans estimator converts a NumPy array into the record-set format that the algorithm requires for its training input.
* Using this data format is one of the reasons why training models within Amazon SageMaker can run faster than other implementations such as sklearn.

In [None]:
# Convert the training array to a record set, then train KMeans on it
kmeans_records = kmeans.record_set(train_data)
kmeans.fit(kmeans_records)

In [None]:
# Deploy the KMeans model to a single ml.c4.xlarge endpoint; the returned
# predictor is used in the next cell to score the training set.
kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type="ml.c4.xlarge")

In [None]:
# Score the PCA-projected training set; the cluster label is read from each
# returned record in the next cell.
result = kmeans_predictor.predict(train_data)

In [None]:
# Pull the closest-cluster value out of every prediction record
cluster_labels = [
    record.label["closest_cluster"].float32_tensor.values[0]
    for record in result
]

In [None]:
# Count how many rows were assigned to each cluster label
pd.DataFrame(cluster_labels)[0].value_counts()

In [None]:
# Visualize cluster counts.
# plt.subplots returns a (figure, axes) pair; unpack it and draw on that axes
# explicitly instead of rebinding `ax` twice.
fig, ax = plt.subplots(figsize=(6, 3))
sns.distplot(cluster_labels, kde=False, ax=ax)
title = "Histogram of Cluster Counts"
ax.set_title(title, fontsize=12)
plt.show()

## Model Attributes for KMeans Algorithm

In [None]:
import tarfile

# Build the S3 key of the artifact written by the most recent KMeans training job.
# The estimator's output_path ends in "/counties/", so the key needs that prefix,
# and the artifact is named model.tar.gz (only the local copy is renamed).
job_name = kmeans.latest_training_job.name
model_key = "counties/" + job_name + "/output/model.tar.gz"

boto3.resource("s3").Bucket(bucket).download_file(model_key, "model_kmeans.tar.gz")

# Unpack with tarfile rather than os.system: the imports cell rebinds the name
# `os` to imblearn.over_sampling, so os.system is not available at this point.
with tarfile.open("model_kmeans.tar.gz", "r:gz") as archive:
    archive.extractall()

In [None]:
# Load the KMeans parameters as an ND array via MXNet.
# The extracted artifact file is named "model_algo-1", the same file name the
# PCA cell loads; a "model_algo-2" file is not produced.
# NOTE(review): confirm the extracted file name against the unpacked archive.
Kmeans_model_params = mx.ndarray.load("model_algo-1")

In [None]:
# Cluster Centroids Locations.
# Store the centroids under their own name: the following cells read
# `cluster_centroids`, and `crypto_data_transformed` must keep the projected
# rows because the cluster labels are attached to it further down.
cluster_centroids = pd.DataFrame(Kmeans_model_params[0].asnumpy())

# Label the centroid coordinates with the component names of the projected data
cluster_centroids.columns = crypto_data_transformed.columns

In [None]:
# Display the centroid table
cluster_centroids

In [None]:
# Heatmap of the centroid table, transposed before plotting
heatmap_data = cluster_centroids.T

plt.figure(figsize=(16, 6))
ax = sns.heatmap(heatmap_data, cmap="YlGnBu")
ax.set_title("Attribute Value by Centroid")
ax.set_xlabel("Cluster")
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.show()

In [None]:
# Attach the cluster assignment of every row as an integer "labels" column
integer_labels = [int(label) for label in cluster_labels]
crypto_data_transformed["labels"] = integer_labels
crypto_data_transformed.head()

In [None]:
# Select the rows assigned to cluster 1.
# The projected frame only has the principal-component columns plus "labels";
# it has no "TotalCoinsMined" column, so the filter must use "labels".
cluster = crypto_data_transformed[crypto_data_transformed["labels"] == 1]
cluster.head(5)

## Delete Endpoint

In [None]:
# PCA Predictor: delete the endpoint created by pca_SM.deploy()
pca_predictor.delete_endpoint()

In [None]:
# KMeans Predictor: delete the endpoint created by kmeans.deploy()
kmeans_predictor.delete_endpoint()