In [1]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the crypto market data (crypto_market_data.csv) into a DataFrame,
# keying every row by its `coin_id`
market_csv_path = "Resources/crypto_market_data.csv"
df_market_data = pd.read_csv(market_csv_path, index_col="coin_id")

# Preview the first ten rows
df_market_data.head(10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384
binancecoin,2.10423,12.85511,6.80688,0.05865,36.33486,155.61937,69.69195
chainlink,-0.23935,20.69459,9.30098,-11.21747,-43.69522,403.22917,325.13186
cardano,0.00322,13.99302,5.55476,10.10553,-22.84776,264.51418,156.09756
litecoin,-0.06341,6.60221,7.28931,1.21662,-17.2396,27.49919,-12.66408
bitcoin-cash-sv,0.9253,3.29641,-1.86656,2.88926,-24.87434,7.42562,93.73082


In [3]:
# Check the dimensions of the loaded data as (rows, columns)
df_market_data.shape


(41, 7)

In [4]:
# Generate summary statistics (count, mean, std, min, quartiles, max) for
# every price-change column to see how differently the columns are scaled
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.694793,6.375218,8.376939,26.344218,47.365803,435.225304,1247.842884
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [5]:
# Plot every price-change column as a line across the coins (the DataFrame
# index) for a first visual check of the raw data.
# NOTE(review): `rot=90` is presumably there to rotate the coin-name tick
# labels so they do not overlap - confirm against the rendered plot.
df_market_data.hvplot.line(
    width=800,
    height=400,
    rot=90
)

---

### Prepare the Data

In [6]:
# Use scikit-learn's `StandardScaler` to standardize the data from the CSV file.
# This cell only fits the scaler; the data itself is transformed in the next cell.
scaler = StandardScaler()
scaler.fit(df_market_data)

In [7]:
# Apply the fitted scaler. The result is a plain array without the coin_id
# index or column names; those are re-attached when `scaled_df` is built.
X_scaled = scaler.transform(df_market_data)
X_scaled

array([[ 5.08529366e-01,  4.93193071e-01,  7.72200433e-01,
         2.35459633e-01, -6.74950963e-02, -3.55953481e-01,
        -2.51636882e-01],
       [ 1.85445894e-01,  9.34445040e-01,  5.58692121e-01,
        -5.43409317e-02, -2.73482725e-01, -1.15759474e-01,
        -1.99352110e-01],
       [ 2.17739616e-02, -7.06336853e-01, -2.16804207e-02,
        -6.10301536e-02,  8.00452481e-03, -5.50246924e-01,
        -2.82060506e-01],
       [-4.07643829e-02, -8.10928066e-01,  2.49457974e-01,
        -5.03879651e-02, -3.73164019e-01, -4.58258816e-01,
        -2.95546142e-01],
       [ 1.19303608e+00,  2.00095907e+00,  1.76061001e+00,
         5.45842065e-01, -2.91202870e-01, -4.99847761e-01,
        -2.70316950e-01],
       [ 8.91870708e-01,  1.32729453e+00,  8.00214184e-01,
        -5.71478992e-02,  7.78653106e-01, -1.88231917e-01,
        -2.25532605e-01],
       [ 1.13972400e-02,  2.57225091e+00,  1.10164693e+00,
        -4.90495415e-01, -9.31954023e-01,  3.87758986e-01,
        -1.8284399

In [8]:
# Capture the row labels (coin ids) and the column labels of the original
# data as plain lists so they can be re-attached to the scaled array
index_list = df_market_data.index.tolist()
columns_list = df_market_data.columns.tolist()

print(index_list)
print(columns_list)

['bitcoin', 'ethereum', 'tether', 'ripple', 'bitcoin-cash', 'binancecoin', 'chainlink', 'cardano', 'litecoin', 'bitcoin-cash-sv', 'crypto-com-chain', 'usd-coin', 'eos', 'monero', 'tron', 'tezos', 'okb', 'stellar', 'cosmos', 'cdai', 'neo', 'wrapped-bitcoin', 'leo-token', 'huobi-token', 'nem', 'binance-usd', 'iota', 'vechain', 'zcash', 'theta-token', 'dash', 'ethereum-classic', 'ethlend', 'maker', 'havven', 'omisego', 'celsius-degree-token', 'ontology', 'ftx-token', 'true-usd', 'digibyte']
['price_change_percentage_24h', 'price_change_percentage_7d', 'price_change_percentage_14d', 'price_change_percentage_30d', 'price_change_percentage_60d', 'price_change_percentage_200d', 'price_change_percentage_1y']


In [9]:
# Create a DataFrame with the scaled data.
# The crypto names from the original data become the index, which is named
# "coin_id" at construction time, so no separate in-place rename of the
# index is needed and re-running the cell always yields the same frame.
scaled_df = pd.DataFrame(
    X_scaled,
    index=pd.Index(index_list, name="coin_id"),
    columns=columns_list,
)

# Display sample data
scaled_df.head(5)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317


---

### Find the Best Value for k Using the Original Data.

In [10]:
# Create a list with the k-values to try: 1 through 10
# (`range` excludes its stop value, so 11 itself is not included)
k_values = list(range(1,11))

In [11]:
# Compute the inertia for each candidate value of k:
#   1. create a KMeans model with `n_clusters` set to the candidate k
#   2. fit the model to the scaled data in `scaled_df`
#   3. read the fitted model's `inertia_`
# The values are collected in the same order as `k_values`.
inertia_values = [
    KMeans(n_clusters=k, random_state=1).fit(scaled_df).inertia_
    for k in k_values
]

print(inertia_values)

[287.0, 212.1233420748626, 165.1367523344681, 79.02243535120977, 66.41305075939437, 61.634088110682214, 46.074467441030215, 43.39269411734827, 32.91322265971525, 30.469111293270352]


In [12]:
# Pair every tested k with its inertia; these are the two columns
# needed to plot the Elbow curve
elbow_data = dict(k=k_values, inertia=inertia_values)

# Turn the mapping into a DataFrame (one row per k) and show it
elbow_df = pd.DataFrame.from_dict(elbow_data)
elbow_df

Unnamed: 0,k,inertia
0,1,287.0
1,2,212.123342
2,3,165.136752
3,4,79.022435
4,5,66.413051
5,6,61.634088
6,7,46.074467
7,8,43.392694
8,9,32.913223
9,10,30.469111


In [13]:
# Plot a line chart with all the inertia values computed with
# the different values of k to visually identify the optimal value for k.
# The tested k-values are used as the x ticks, and the plot is kept in
# `plot_km` so it can be combined with the PCA elbow curve later on.
plot_km = elbow_df.hvplot.line(
    x = "k",
    y = "inertia",
    title = "Elbow Curve Graph - KMeans",
    xticks = k_values
)
plot_km

#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** *According to the elbow curve for the KMeans model, the ideal number of clusters is 4. There appear to be diminishing returns for any number of clusters above 4.*


---

### Cluster Cryptocurrencies with K-means Using the Original Data

In [14]:
# Initialize the K-Means model using the best value for k (4, as read from
# the elbow curve); `random_state` is fixed at 1, the same value used for
# the elbow-curve models
model_k = KMeans(n_clusters = 4, random_state = 1)

In [15]:
# Fit the K-Means model using the scaled data (`scaled_df`), not the raw price changes
model_k.fit(scaled_df)

In [16]:
# Predict the clusters to group the cryptocurrencies using the scaled data.
# The result is an array of cluster labels, one per row of `scaled_df`.
predict_k = model_k.predict(scaled_df)

# Print the resulting array of cluster values.
predict_k

array([2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 2, 0, 0, 1, 0, 0, 0, 0],
      dtype=int32)

In [17]:
# Create a copy of the original (unscaled) DataFrame so the cluster labels
# can be added without modifying `df_market_data`
df_market_data_with_predictions = df_market_data.copy()

In [18]:
# Keep the predictions as a standalone frame too, in case they need to be
# saved or referenced later. It carries the coin_id index so each label
# stays attached to its cryptocurrency instead of a bare 0..n-1 position.
predict_k_df = pd.DataFrame(
    predict_k,
    index=df_market_data.index,
    columns=["KMeans_cluster_prediction"],
)

# Add a new column to the DataFrame with the predicted clusters.
# `predict_k` is positional, in the same row order as the data it was
# predicted from, so it lines up with the copied frame row by row.
df_market_data_with_predictions['KMeans_cluster_prediction'] = predict_k

# Display the data together with the new cluster column
df_market_data_with_predictions

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y,KMeans_cluster_prediction
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761,2
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023,2
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954,0
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193,0
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384,2
binancecoin,2.10423,12.85511,6.80688,0.05865,36.33486,155.61937,69.69195,2
chainlink,-0.23935,20.69459,9.30098,-11.21747,-43.69522,403.22917,325.13186,2
cardano,0.00322,13.99302,5.55476,10.10553,-22.84776,264.51418,156.09756,2
litecoin,-0.06341,6.60221,7.28931,1.21662,-17.2396,27.49919,-12.66408,2
bitcoin-cash-sv,0.9253,3.29641,-1.86656,2.88926,-24.87434,7.42562,93.73082,0


In [19]:
# Create a scatter plot using hvPlot by setting
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`.
# The plotted values come from the original (unscaled) data.
# Points are grouped by the K-Means cluster label (`by=`), and
# `coin_id` is added to `hover_cols` to identify
# the cryptocurrency represented by each data point.
# The plot is kept in `km_plot` so it can be compared with the PCA version later.

km_plot = df_market_data_with_predictions.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    title="KMeans Clustered for Price Changes - 24hour & 7day",
    hover_cols=['coin_id'],
    hover_color="magenta",
    by="KMeans_cluster_prediction",
    grid=True
)
km_plot

---

### Optimize Clusters with Principal Component Analysis.

In [20]:
# Create a PCA model instance that keeps three components (`n_components=3`).
model_pca = PCA(n_components=3)

In [21]:
# Use the PCA model with `fit_transform` to reduce the scaled data to
# three principal components.
pca_data = model_pca.fit_transform(scaled_df)
# View the first five rows of the resulting array (one column per component).
pca_data[:5]

array([[-0.60066733,  0.84276006,  0.46159457],
       [-0.45826071,  0.45846566,  0.95287678],
       [-0.43306981, -0.16812638, -0.64175193],
       [-0.47183495, -0.22266008, -0.47905316],
       [-1.15779997,  2.04120919,  1.85971527]])

In [22]:
# Create a DataFrame with the PCA data: one column per principal component,
# reusing the coin_id index of `scaled_df` so rows stay labelled by coin
pca_df = pd.DataFrame(pca_data, columns = ['PC1', 'PC2', 'PC3'], index=scaled_df.index)
pca_df.head(5)

Unnamed: 0_level_0,PC1,PC2,PC3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595
ethereum,-0.458261,0.458466,0.952877
tether,-0.43307,-0.168126,-0.641752
ripple,-0.471835,-0.22266,-0.479053
bitcoin-cash,-1.1578,2.041209,1.859715


In [23]:
# Retrieve the explained variance ratio to determine how much information
# can be attributed to each principal component, then report the three
# individual shares followed by their combined total.
evr = model_pca.explained_variance_ratio_

# The report text is assembled line by line; its wording is kept exactly as
# originally printed.
report_lines = ["Explained Variance Ratio of the PCA"]
for position in range(3):
    report_lines.append(f"PC{position + 1} = {evr[position]}")
report_lines.append("")
report_lines.append(f"Total varaince exaplained by PCA: {sum(evr)}")

print("\n".join(report_lines))

Explained Variance Ratio of the PCA
PC1 = 0.3719856032745437
PC2 = 0.34700812751851623
PC3 = 0.17603792623792408

Total varaince exaplained by PCA: 0.895031657030984


#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** *The variance attributed to each principal component is listed above, and the combined variance of the data explained by the three components is about 89.5%.*

In [24]:
# The DataFrame with the PCA data (`pca_df`) was already created in an
# earlier cell, with the coin_id index taken from `scaled_df`,
# so this cell only needs to display sample data.
pca_df.head(5)

Unnamed: 0_level_0,PC1,PC2,PC3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595
ethereum,-0.458261,0.458466,0.952877
tether,-0.43307,-0.168126,-0.641752
ripple,-0.471835,-0.22266,-0.479053
bitcoin-cash,-1.1578,2.041209,1.859715


---

### Find the Best Value for k Using the PCA Data

In [25]:
# Create a list with the k-values to try: 1 through 10
# (`range` excludes its stop value, so 11 itself is not included)
k_values = list(range(1, 11))

In [26]:
# Compute the inertia for each candidate value of k on the PCA data:
#   1. create a KMeans model with `n_clusters` set to the candidate k
#   2. fit the model to the principal components in `pca_df`
#   3. read the fitted model's `inertia_`
# The values are collected in the same order as `k_values`.
inertia_values_pca = [
    KMeans(n_clusters=k, random_state=1).fit(pca_df).inertia_
    for k in k_values
]

print(inertia_values_pca)


[256.8740855678923, 182.3395300777564, 135.44240762454203, 49.665496651797326, 38.67258217591795, 34.50757653758153, 23.728364781274763, 18.574523440183192, 15.572672210433335, 11.407666572096915]


In [27]:
# Pair every tested k with its PCA-based inertia; these are the two
# columns needed to plot the Elbow curve
elbow_data_pca = dict(k=k_values, inertia=inertia_values_pca)

# Turn the mapping into a DataFrame (one row per k) and show it
elbow_df_pca = pd.DataFrame.from_dict(elbow_data_pca)
elbow_df_pca


Unnamed: 0,k,inertia
0,1,256.874086
1,2,182.33953
2,3,135.442408
3,4,49.665497
4,5,38.672582
5,6,34.507577
6,7,23.728365
7,8,18.574523
8,9,15.572672
9,10,11.407667


In [28]:
# Plot a line chart with all the PCA-based inertia values computed with
# the different values of k to visually identify the optimal value for k.
# The tested k-values are used as the x ticks, and the plot is kept in
# `plot_km_pca` so it can be combined with the original elbow curve.
plot_km_pca = elbow_df_pca.hvplot.line(
    x = "k",
    y = "inertia",
    title = "Elbow Curve Graph - KMeans for PCA",
    xticks = k_values
)
plot_km_pca

In [29]:
# Combine both elbow curves (scaled data vs PCA data) into one plot to compare them directly
plot_km * plot_km_pca

#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** *Looking at the elbow graph for the KMeans clusters using the PCA data, the ideal number of clusters for the PCA data appears to be 4.*


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:**  *This is the same best-k-value that was found using the original scaled data, before running PCA*

### Cluster Cryptocurrencies with K-means Using the PCA Data

In [30]:
# Quick look at the PCA feature columns before clustering on them
pca_df.head(1)

Unnamed: 0_level_0,PC1,PC2,PC3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595


In [31]:
# Initialize the K-Means model using the best value for k (4, as read from
# the PCA elbow curve); `random_state` is fixed at 1, matching the earlier models
model_k_pca = KMeans(n_clusters = 4, random_state = 1)

In [32]:
# Fit the K-Means model using the PCA data (the three components in `pca_df`)
model_k_pca.fit(pca_df)

In [33]:
# Predict the clusters to group the cryptocurrencies using the PCA data.
# The result is an array of cluster labels, one per row of `pca_df`.
predict_k_pca = model_k_pca.predict(pca_df)
# Print the resulting array of cluster values.
predict_k_pca

array([2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 2, 0, 0, 1, 0, 0, 0, 0],
      dtype=int32)

In [34]:
# Keep the PCA-based predictions as a standalone frame too, in case they
# need to be saved or referenced later. It carries the coin_id index so each
# label stays attached to its cryptocurrency instead of a bare 0..n-1 position.
predict_k_pca_df = pd.DataFrame(
    predict_k_pca,
    index=pca_df.index,
    columns=["KMeans_cluster_prediction_PCA"],
)

# Build a new DataFrame from the PCA data with the predicted clusters added
# as an extra column. `assign` returns a new frame, so `pca_df` itself is
# left untouched and re-running this cell always gives the same result.
pca_df_with_predictions = pca_df.assign(
    KMeans_cluster_prediction_PCA=predict_k_pca
)

# Display the data together with the new cluster column
pca_df_with_predictions

Unnamed: 0_level_0,PC1,PC2,PC3,KMeans_cluster_prediction_PCA
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bitcoin,-0.600667,0.84276,0.461595,2
ethereum,-0.458261,0.458466,0.952877,2
tether,-0.43307,-0.168126,-0.641752,0
ripple,-0.471835,-0.22266,-0.479053,0
bitcoin-cash,-1.1578,2.041209,1.859715,2
binancecoin,-0.516534,1.388377,0.804071,2
chainlink,-0.450711,0.517699,2.846143,2
cardano,-0.3456,0.729439,1.478013,2
litecoin,-0.649468,0.432165,0.600303,2
bitcoin-cash-sv,-0.759014,-0.2012,-0.217653,0


In [35]:
# Create a scatter plot using hvPlot by setting
# `x="PC1"` and `y="PC2"`.
# Points are grouped by the K-Means cluster label found on the PCA data
# (`by=`), and `coin_id` is added to `hover_cols` to identify
# the cryptocurrency represented by each data point.
# The plot is kept in `km_pca_plot` so it can be compared with `km_plot` later.


km_pca_plot = pca_df_with_predictions.hvplot.scatter(
    x="PC1",
    y="PC2",
    title="KMeans Clustered for PCA",
    hover_cols=['coin_id'],
    hover_color="magenta",
    by="KMeans_cluster_prediction_PCA",
    grid=True
)
km_pca_plot

### Visualize and Compare the Results

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [36]:
# Composite plot to contrast the Elbow curves **From the Requirements**
# (`+` combines the scaled-data curve and the PCA curve into one composite)
plot_km + plot_km_pca

In [37]:
# Composite plot to contrast the Elbow curves, this time combined with `*`
# instead of `+` (same composite as the earlier `plot_km * plot_km_pca` cell)
plot_km * plot_km_pca

In [38]:
# Composite plot to contrast the clusters **From the Requirements**
# (`+` combines the original-data scatter and the PCA scatter into one composite)
km_plot + km_pca_plot

In [39]:
# Composite plot to contrast the clusters, this time combined with `*` instead of `+`.
# NOTE(review): `km_plot` uses the 24h/7d price-change columns while
# `km_pca_plot` uses PC1/PC2, so combining them this way puts unrelated
# quantities together - confirm this comparison is intended.
km_plot * km_pca_plot

#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** 
  *It looks like using fewer features to cluster the data had a few effects on this small dataset.*
  
  *First, the PCA clusters appear to be tighter. The potential outliers in the PCA graph look further from the rest, but comparing the scales of the x-axes shows otherwise: in the standard KMeans plot there is a data point at x = -13.5 (approximate x-range of 19), whereas in the PCA KMeans plot the point that seems furthest away has an x-value of 4.8 (approximate x-range of 9). The same holds when comparing the ranges of the y-values: about 27 for the KMeans plot versus about 10 for the PCA KMeans plot.*
  
  *Second, even with only three features (the principal components) used to fit KMeans, about 90% of the variance was explained by these three components. I believe this is why we see the tighter clusters. By ignoring a number of features that have minimal effect on the clustering, we reduce noise, and therefore the clusters are tighter. This also highlights any potential outliers, or data points that we would like to study next to gain specific insight.*