In [1]:
# Required imports
import pandas as pd
import numpy as np
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# Read in the CSV file as a Pandas Dataframe
adult_census_transformed_df = pd.read_csv(
    Path("AdultCensusUpdated.csv")
)

adult_census_transformed_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,State
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K,Arkansas
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K,Maryland
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K,Michigan
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K,Idaho
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K,Florida


In [3]:
from sklearn.preprocessing import LabelEncoder

# Identify categorical columns
categorical_columns = adult_census_transformed_df.select_dtypes(include=['object']).columns

# Convert categorical columns to numeric using LabelEncoder
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    adult_census_transformed_df[column] = le.fit_transform(adult_census_transformed_df[column])
    label_encoders[column] = le  # Save the encoder for potential inverse transformation later

# Display the first few rows of the transformed DataFrame
adult_census_transformed_df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,State
0,90,0,77053,11,9,6,0,1,4,0,0,4356,40,39,0,3
1,82,4,132870,11,9,6,4,1,4,0,0,4356,18,39,0,19
2,66,0,186061,15,10,6,0,4,2,0,0,4356,40,39,0,21
3,54,4,140359,5,4,0,7,4,4,0,0,3900,40,39,0,11
4,41,4,264663,15,10,5,10,3,4,0,0,3900,40,39,0,8


In [4]:
# Replace '?' with NaN for easier handling
adult_census_transformed_df.replace('?', pd.NA, inplace=True)

#Drop rows with missing values
adult_census_transformed_df.dropna(inplace=True)
adult_census_transformed_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,State
0,90,0,77053,11,9,6,0,1,4,0,0,4356,40,39,0,3
1,82,4,132870,11,9,6,4,1,4,0,0,4356,18,39,0,19
2,66,0,186061,15,10,6,0,4,2,0,0,4356,40,39,0,21
3,54,4,140359,5,4,0,7,4,4,0,0,3900,40,39,0,11
4,41,4,264663,15,10,5,10,3,4,0,0,3900,40,39,0,8


In [5]:
# Import the PCA module
from sklearn.decomposition import PCA

In [6]:
# Instantiate the PCA instance and declare the number of PCA variables
pca=PCA(n_components=2)

In [7]:
# Fit PCA to the transformed data
adult_census_pca = pca.fit_transform(adult_census_transformed_df)

# Display the first few rows of the transformed data
print(adult_census_pca[:5])

[[-112725.56667675   -1081.59387818]
 [ -56908.56662445   -1083.29650214]
 [  -3717.56658684   -1084.91473114]
 [ -49419.54854666   -1082.73699415]
 [  74884.45140805   -1086.52148188]]


In [8]:
# Calculate and display the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance_ratio)

# Calculate cumulative explained variance
cumulative_variance = explained_variance_ratio.cumsum()
print("Cumulative Explained Variance:", cumulative_variance)


Explained Variance Ratio: [0.99511361 0.00487184]
Cumulative Explained Variance: [0.99511361 0.99998545]


In [9]:
# Create the PCA DataFrame
adult_census_pca_df = pd.DataFrame(
    adult_census_pca,
    columns=["PCA1", "PCA2"]
)

# Review the PCA DataFrame
adult_census_pca_df.head()

Unnamed: 0,PCA1,PCA2
0,-112725.566677,-1081.593878
1,-56908.566624,-1083.296502
2,-3717.566587,-1084.914731
3,-49419.548547,-1082.736994
4,74884.451408,-1086.521482


In [10]:
# Create a a list to store inertia values and the values of k
inertia = []
k = list(range(1, 11))

In [11]:
# Create a for-loop where each value of k is evaluated using the K-means algorithm
# Fit the model using the service_ratings DataFrame
# Append the value of the computed inertia from the `inertia_` attribute of the KMeans model instance
for i in k:
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(adult_census_pca_df)
    inertia.append(k_model.inertia_)

In [12]:
# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {"k": k, "inertia": inertia}

# Create the DataFrame from the elbow data
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow.head()

Unnamed: 0,k,inertia
0,1,364520300000000.0
1,2,151152300000000.0
2,3,81732830000000.0
3,4,53971270000000.0
4,5,39968110000000.0


In [13]:
# Plot the DataFrame
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

In [14]:
# Define the model Kmeans model using the optimal value of k for the number of clusters.
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(adult_census_pca_df)

# Make predictions
k_3 = model.predict(adult_census_pca_df)

# Create a copy of the adult_census_df DataFrame
adult_census_pca_prediction_df = adult_census_pca_df.copy()

# Add a class column with the labels
adult_census_pca_prediction_df["adult_census_segments"] = k_3

In [15]:
# Plot the clusters
adult_census_pca_prediction_df.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="adult_census_segments"
)

In [16]:
# Define the model Kmeans model using k=3 clusters
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(adult_census_transformed_df)

# Make predictions
k_3 = model.predict(adult_census_transformed_df)

# Create a copy of the adult_census_transformed_predictions_df DataFrame
adult_census_transformed_predictions_df = adult_census_transformed_df.copy()

# Add a class column with the labels
adult_census_transformed_predictions_df["adult_census_segments"] = k_3

In [20]:
# Add the PCA components to the original DataFrame
adult_census_transformed_predictions_df['PCA1'] = adult_census_pca_df['PCA1']
adult_census_transformed_predictions_df['PCA2'] = adult_census_pca_df['PCA2']

# Plot the clusters
adult_census_transformed_predictions_df.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="adult_census_segments",
)
