In [1]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
#Correctly read in the 'adult_cleaned.csv' file to correctly maintain variable type integrity
adult_cleaned_df = pd.read_csv("Resources/adult_cleaned.csv", dtype={'Income over 50k? 0=no 1=yes':str})
adult_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30718 entries, 0 to 30717
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   age                          30718 non-null  int64 
 1   workclass                    30718 non-null  object
 2   education                    30718 non-null  object
 3   marital.status               30718 non-null  object
 4   occupation                   30718 non-null  object
 5   relationship                 30718 non-null  object
 6   race                         30718 non-null  object
 7   sex                          30718 non-null  object
 8   hours.per.week               30718 non-null  int64 
 9   Income over 50k? 0=no 1=yes  30718 non-null  object
dtypes: int64(2), object(8)
memory usage: 2.3+ MB


In [3]:
#display the dataset in its entirety 
adult_cleaned_df

Unnamed: 0,age,workclass,education,marital.status,occupation,relationship,race,sex,hours.per.week,Income over 50k? 0=no 1=yes
0,82,Private,HS-grad,Widowed,Exec-managerial,Not-in-family,White,Female,18,0
1,54,Private,7th-8th,Divorced,Machine-op-inspct,Unmarried,White,Female,40,0
2,41,Private,Some-college,Separated,Prof-specialty,Own-child,White,Female,40,0
3,34,Private,HS-grad,Divorced,Other-service,Unmarried,White,Female,45,0
4,38,Private,10th,Separated,Adm-clerical,Unmarried,White,Male,40,0
...,...,...,...,...,...,...,...,...,...,...
30713,22,Private,Some-college,Never-married,Protective-serv,Not-in-family,White,Male,40,0
30714,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,38,0
30715,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,1
30716,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,40,0


#### Base Model with KMeans Clustering

In [4]:
# Separate the features and the target variable
X = adult_cleaned_df.drop(columns=['Income over 50k? 0=no 1=yes'])
y = adult_cleaned_df['Income over 50k? 0=no 1=yes']

# Identify categorical and numerical columns
categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Step 1: Encode Categorical Data using OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
encoded_categorical_data = encoder.fit_transform(X[categorical_columns])

# Convert the encoded features to a DataFrame
encoded_categorical_df = pd.DataFrame(encoded_categorical_data, columns=encoder.get_feature_names_out(categorical_columns))

# Combine the encoded categorical data with the numerical data
X_encoded = pd.concat([X[numerical_columns].reset_index(drop=True), encoded_categorical_df.reset_index(drop=True)], axis=1)

# Step 2: Standardize the Data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# Create a list with the number of k-values from 1 to 11
k = list(range(1,11))
# Create an empty list to store the inertia values
inertia = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `X_encoded`
# 3. Append the model.inertia_ to the inertia list
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(X_encoded)
    inertia.append(k_model.inertia_)
                
# Create a dictionary with the data to plot the Elbow curve
elbow_data = {'k': k, 'inertia': inertia}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(elbow_data)

# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
df_elbow.hvplot.line(
    x='k',
    y='inertia',
    title = 'Elbow Curve for Market Data Scaled',
    xticks = k
)

#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** In my professional opinion, the best answer for k is 2. Having 2 clusters will help us visualize 
data spread in a manner that can provide meaningful insights.

In [5]:
# Initialize the K-Means model using the best value for k
model = KMeans(n_clusters = 2)
# Fit the K-Means model using the scaled data
model.fit(X_encoded)
# Predict the clusters to group the instances using the scaled data
clusters = model.predict(X_encoded)

# Print the resulting array of cluster values.
print(clusters)

[1 1 1 ... 1 1 0]


In [6]:
# Create a copy of the DataFrame
X_encoded_final = X_encoded.copy()
# Add a new column to the DataFrame with the predicted clusters
X_encoded_final['clusters'] = clusters
# Display sample data
X_encoded_final

Unnamed: 0,age,hours.per.week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_10th,...,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,clusters
0,82,18,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1
1,54,40,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1
2,41,40,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1
3,34,45,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0
4,38,40,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30713,22,40,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
30714,27,38,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0
30715,40,40,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1
30716,58,40,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1


In [7]:
# Create a scatter plot using hvPlot by setting 
# `x="age"` and `y="hours.per.week"`. 
X_encoded_final.hvplot.scatter(
    x='age',
    y='hours.per.week',
    by='clusters',
    title = 'Scatter Plot by Cluster - k=2'
)

In [8]:
# Print the distribution of cluster labels
print(X_encoded_final['clusters'].value_counts())

0    16688
1    14030
Name: clusters, dtype: int64


#### PCA Model with KMeans Clustering

In [9]:
# Create a PCA model instance and set `n_components=2`.
pca = PCA(n_components=2)
# Use the PCA model with `fit_transform` to reduce to 
# two principal components.
pca_data = pca.fit_transform(X_encoded_final)
# View the first five rows of the DataFrame. 
pca_data[:5]

array([[ 30.30388355, -38.79430057],
       [ 13.79441621,  -7.25995339],
       [  1.92952917,  -1.93212617],
       [ -2.41716911,   5.49987367],
       [ -0.81043143,  -0.68358904]])

In [10]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.
pca.explained_variance_ratio_

array([0.56102362, 0.42623542])

#### Answer the following question: 

**Question:** What is the total explained variance of the two principal components?

**Answer:** 98.72%

In [11]:
# Create a new DataFrame with the PCA data.
pca_df = pd.DataFrame(pca_data, columns = ['PCA1', 'PCA2'])

# Copy the 'ID_number' from the original data
pca_df['ID_number'] = adult_cleaned_df.index

# Set the 'ID_number' column as index
pca_df = pca_df.set_index('ID_number')

# Display sample data
pca_df.head()

Unnamed: 0_level_0,PCA1,PCA2
ID_number,Unnamed: 1_level_1,Unnamed: 2_level_1
0,30.303884,-38.794301
1,13.794416,-7.259953
2,1.929529,-1.932126
3,-2.417169,5.499874
4,-0.810431,-0.683589


In [12]:
# Create a list with the number of k-values from 1 to 11
k = list(range(1,11))
# Create an empty list to store the inertia values
inertia = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `pca_df`
# 3. Append the model.inertia_ to the inertia list
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(pca_df)
    inertia.append(k_model.inertia_)

In [13]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data_pca = {'k': k, 'inertia': inertia}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow_pca = pd.DataFrame(elbow_data_pca)

In [14]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
df_elbow_pca.hvplot.line(
    x='k',
    y='inertia',
    title = 'Elbow Curve for Adult Income Data Scaled Using PCA',
)

#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** 2


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** No, the original best k-value was also 2.

In [15]:
# Initialize the K-Means model using the best value for k
model = KMeans(n_clusters=2, random_state=1)
# Fit the K-Means model using the PCA data
model.fit(pca_df)
# Predict the clusters to group the cryptocurrencies using the PCA data
k_2_pca = model.predict(pca_df)
# Print the resulting array of cluster values.
print(k_2_pca)

[1 1 1 ... 0 1 0]


In [16]:
# Create a copy of the DataFrame with the PCA data
pca_df_final = pca_df.copy()

# Add a new column to the DataFrame with the predicted clusters
pca_df_final['PCA_clusters'] = k_2_pca

# Display sample data
pca_df_final

Unnamed: 0_level_0,PCA1,PCA2,PCA_clusters
ID_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,30.303884,-38.794301,1
1,13.794416,-7.259953,1
2,1.929529,-1.932126,1
3,-2.417169,5.499874,0
4,-0.810431,-0.683589,0
...,...,...,...
30713,-15.412208,5.878863,0
30714,-11.649971,1.991401,0
30715,1.070289,-1.498931,0
30716,17.436367,-8.901950,1


In [17]:
# Create a scatter plot using hvPlot by setting 
# `x="PC1"` and `y="PC2"`. 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.
pca_df_final.hvplot.scatter(
    x='PCA1',
    y='PCA2',
    by='PCA_clusters',
    title = 'Scatter Plot By PCA Cluster - k=2'
)