In [1]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd

# Download the UCI "Adult" (Census Income) dataset; 2 is its repository id.
adult = fetch_ucirepo(id=2)

# Features and target as pandas DataFrames.
# X is taken as an independent copy: the cleaning cells below write into X
# with .loc, and doing that on the frame owned by `adult` would mutate the
# fetched object and can raise SettingWithCopyWarning if that frame is a
# slice of a larger table.
X = adult.data.features.copy()
y = adult.data.targets

# Dataset-level metadata
print(adult.metadata)

# Per-variable information
print(adult.variables)

{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the fol

In [2]:
# Preview the first rows of the feature frame
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [3]:
# Preview the first rows of the target frame (single `income` column)
y.head()

Unnamed: 0,income
0,<=50K
1,<=50K
2,<=50K
3,<=50K
4,<=50K


# Data cleaning

### Handling Missing Values and '?'

In [4]:
import numpy as np
# Columns in which the literal "?" is treated as a missing-value marker
placeholder_cols = ['workclass', 'occupation', 'native-country']

for col in placeholder_cols:
    # Turn the "?" marker into a real NaN ...
    is_placeholder = X[col] == "?"
    X.loc[is_placeholder, col] = np.nan

    # ... then fill every NaN in the column with its most frequent value (mode)
    most_frequent = X[col].mode()[0]
    X.loc[:, col] = X[col].fillna(most_frequent)

In [5]:
# Lower-case the `native-country` values so that names differing only by
# letter case collapse into one category before the column is encoded
X.loc[:, 'native-country'] = X['native-country'].str.lower()

In [6]:
# Count the missing values left in each feature column
# (after the imputation above every count is expected to be zero)
X.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
dtype: int64

### One-hot Encoding

In [7]:
# One-hot encode every categorical feature. drop_first=True omits the first
# level of each feature, so a row with all dummies False belongs to that level.
categorical_cols = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

### Ensuring Correct Data Types

In [8]:
# Cast the numeric feature columns to 64-bit integers.
# The previous form, `X.loc[:, col] = X[col].astype(int)`, writes the values
# back into the existing column and (in pandas >= 2.0) keeps that column's
# current dtype, so it cannot actually change a dtype. DataFrame.astype
# returns a frame whose listed columns really carry the requested dtype.
integer_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
X = X.astype({name: 'int64' for name in integer_cols})


In [9]:
# Confirm the dtype of every column (numeric features and one-hot dummies)
X.dtypes

age                               int64
fnlwgt                            int64
education-num                     int64
capital-gain                      int64
capital-loss                      int64
                                  ...  
native-country_thailand            bool
native-country_trinadad&tobago     bool
native-country_united-states       bool
native-country_vietnam             bool
native-country_yugoslavia          bool
Length: 97, dtype: object

### Feature Engineering

In [10]:
# Collapse the two capital columns into one signed feature: gain minus loss
net_capital = X['capital-gain'] - X['capital-loss']
X['net-capital-gain'] = net_capital

# Remove the two source columns now that the combined feature replaces them
X.drop(columns=['capital-gain', 'capital-loss'], inplace=True)

In [11]:
# Remove `education-num`: it appears to duplicate the information in the
# `education` category that has already been one-hot encoded
X.drop(columns='education-num', inplace=True)

In [12]:
# Preview the first ten rows after encoding and feature engineering
X.head(10)

Unnamed: 0,age,fnlwgt,hours-per-week,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,...,native-country_puerto-rico,native-country_scotland,native-country_south,native-country_taiwan,native-country_thailand,native-country_trinadad&tobago,native-country_united-states,native-country_vietnam,native-country_yugoslavia,net-capital-gain
0,39,77516,40,False,False,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,2174
1,50,83311,13,False,False,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,0
2,38,215646,40,False,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,0
3,53,234721,40,False,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,0
4,28,338409,40,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
5,37,284582,40,False,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,0
6,49,160187,16,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
7,52,209642,45,False,False,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,0
8,31,45781,50,False,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,14084
9,42,159449,40,False,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,5178


In [13]:
# Print (rows, columns) for the features and then for the target
for frame in (X, y):
    print(frame.shape)

(48842, 95)
(48842, 1)


In [14]:
# Attach the target column to the encoded features (DataFrame.join aligns rows on the index)

merged_df = X.join(y)
merged_df.head(10)

Unnamed: 0,age,fnlwgt,hours-per-week,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,...,native-country_scotland,native-country_south,native-country_taiwan,native-country_thailand,native-country_trinadad&tobago,native-country_united-states,native-country_vietnam,native-country_yugoslavia,net-capital-gain,income
0,39,77516,40,False,False,False,False,False,True,False,...,False,False,False,False,False,True,False,False,2174,<=50K
1,50,83311,13,False,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,0,<=50K
2,38,215646,40,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,0,<=50K
3,53,234721,40,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,0,<=50K
4,28,338409,40,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,0,<=50K
5,37,284582,40,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,0,<=50K
6,49,160187,16,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,0,<=50K
7,52,209642,45,False,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,0,>50K
8,31,45781,50,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,14084,>50K
9,42,159449,40,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,5178,>50K


In [15]:
# List the distinct raw labels present in the target column
merged_df['income'].unique()

array(['<=50K', '>50K', '<=50K.', '>50K.'], dtype=object)

In [16]:
# Convert the labels to pandas' string dtype
merged_df['income'] = merged_df['income'].astype('string')

In [17]:
# Clean the 'income' column: remove every '.' character
# (regex=False makes '.' a literal dot rather than a wildcard)
merged_df['income'] = merged_df['income'].str.replace('.', '', regex=False)

# Print the cleaned DataFrame
print("Cleaned DataFrame:")
print(merged_df)

Cleaned DataFrame:
       age  fnlwgt  hours-per-week  workclass_Local-gov  \
0       39   77516              40                False   
1       50   83311              13                False   
2       38  215646              40                False   
3       53  234721              40                False   
4       28  338409              40                False   
...    ...     ...             ...                  ...   
48837   39  215419              36                False   
48838   64  321403              40                False   
48839   38  374983              50                False   
48840   44   83891              40                False   
48841   35  182148              60                False   

       workclass_Never-worked  workclass_Private  workclass_Self-emp-inc  \
0                       False              False                   False   
1                       False              False                   False   
2                       False               

In [18]:
# Re-check the distinct labels after the cleanup
merged_df['income'].unique()

<StringArray>
['<=50K', '>50K']
Length: 2, dtype: string

In [19]:
# Encode the target as a binary indicator: '>50K' -> 1, '<=50K' -> 0.
# Series.map yields NaN for any value that is not a key of the dict.
merged_df['income'] = merged_df['income'].map({'>50K': 1, '<=50K': 0})

# Preview the first ten rows after the conversion
print("DataFrame after conversion:")
merged_df.head(10)

DataFrame after conversion:


Unnamed: 0,age,fnlwgt,hours-per-week,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,...,native-country_scotland,native-country_south,native-country_taiwan,native-country_thailand,native-country_trinadad&tobago,native-country_united-states,native-country_vietnam,native-country_yugoslavia,net-capital-gain,income
0,39,77516,40,False,False,False,False,False,True,False,...,False,False,False,False,False,True,False,False,2174,0
1,50,83311,13,False,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,0,0
2,38,215646,40,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,0,0
3,53,234721,40,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,0,0
4,28,338409,40,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,0,0
5,37,284582,40,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,0,0
6,49,160187,16,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,0,0
7,52,209642,45,False,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,0,1
8,31,45781,50,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,14084,1
9,42,159449,40,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,5178,1


In [20]:
# Work on an independent copy so merged_df itself is left unchanged
full_encoded_df = merged_df.copy()

In [21]:
# Store the 0/1 target as an integer column
full_encoded_df['income'] = full_encoded_df['income'].astype('int')

In [22]:
# Confirm the dtypes of the fully encoded frame
full_encoded_df.dtypes

age                             int64
fnlwgt                          int64
hours-per-week                  int64
workclass_Local-gov              bool
workclass_Never-worked           bool
                                ...  
native-country_united-states     bool
native-country_vietnam           bool
native-country_yugoslavia        bool
net-capital-gain                int64
income                          int64
Length: 96, dtype: object

In [28]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import hvplot.pandas

# Accumulator for the inertia of each fitted K-Means model
inertia = []
# Candidate cluster counts: 1 through 10
k = range(1, 11)

In [29]:
# Fit K-Means once per candidate k and record each model's inertia.
# The list is rebuilt here so that re-running this cell does not keep
# appending to the results of an earlier run; that would leave `inertia`
# longer than `k` and break the DataFrame built from the two.
# NOTE(review): the model is fit on every column of full_encoded_df as-is,
# i.e. without scaling and including the `income` target - confirm this is intended.
inertia = []

for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(full_encoded_df)
    inertia.append(k_model.inertia_)





In [30]:
# Pair each candidate k with the inertia measured for it
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)

# Show the first rows of the elbow table
df_elbow.head()

Unnamed: 0,k,inertia
0,1,547414500000000.0
1,2,226707300000000.0
2,3,122754000000000.0
3,4,81046760000000.0
4,5,58350240000000.0


In [31]:
# Plot inertia against k as a line chart, with one x tick per candidate k
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

# K-Means Clustering

In [32]:
# Define the model with 3 clusters
model = KMeans(n_clusters=3, random_state=1)

# Fit the model on the fully encoded frame
model.fit(full_encoded_df)

# Predict the cluster label of every row
k_3 = model.predict(full_encoded_df)

# Create a copy of the DataFrame so the labels are not added to full_encoded_df
adult_predictions_df = full_encoded_df.copy()

# Add a `segments` column holding each row's cluster label
adult_predictions_df['segments'] = k_3



In [33]:
# Scatter hours-per-week against income, grouped by cluster label
adult_predictions_df.hvplot.scatter(
    x="hours-per-week",
    y="income",
    by="segments"
)

In [34]:
# Import the PCA module
from sklearn.decomposition import PCA

In [35]:
# Instantiate PCA, keeping two principal components
pca = PCA(n_components=2)

In [36]:
# Fit the PCA model on the encoded adult census DataFrame and project it onto the two components
adult_pca = pca.fit_transform(full_encoded_df)

# Review the first 5 rows of the resulting array
adult_pca[:5]

array([[-112148.42221285,    1154.8331039 ],
       [-106352.88740158,   -1017.74257785],
       [  25982.10866078,    -985.17084623],
       [  45057.10793334,    -980.47413883],
       [ 148745.10503635,    -954.95817565]])

In [37]:
# Show the share of the total variance explained by each of the two components
# NOTE(review): PCA was fit on unstandardized columns, so the ratio is presumably
# driven by the largest-magnitude feature - confirm before interpreting it.
pca.explained_variance_ratio_

array([0.995014  , 0.00498597])

In [38]:
# Wrap the projected array in a DataFrame with named component columns
pca_column_names = ["PCA1", "PCA2"]
adult_pca_df = pd.DataFrame(data=adult_pca, columns=pca_column_names)

# Show the first rows of the PCA DataFrame
adult_pca_df.head()

Unnamed: 0,PCA1,PCA2
0,-112148.422213,1154.833104
1,-106352.887402,-1017.742578
2,25982.108661,-985.170846
3,45057.107933,-980.474139
4,148745.105036,-954.958176


In [39]:
# Candidate cluster counts (1 through 10) and a fresh list for their inertia values
k = list(range(1, 11))
inertia = []

# Fit one K-Means model per candidate k on the PCA data and record
# the value of its `inertia_` attribute
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(adult_pca_df)
    inertia.append(k_model.inertia_)

# Pair each k with its inertia in a DataFrame
second_elbow_data = dict(k=k, inertia=inertia)
df_second_elbow = pd.DataFrame(data=second_elbow_data)

# Show the first rows of the elbow table
df_second_elbow.head()



Unnamed: 0,k,inertia
0,1,547414500000000.0
1,2,226710900000000.0
2,3,122754100000000.0
3,4,81053550000000.0
4,5,58360570000000.0


In [40]:
# Plot the elbow curve for the PCA data: inertia against k, one x tick per candidate k
df_second_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

In [41]:
# Start from a copy of the PCA DataFrame so the label column is not added to adult_pca_df
adult_pca_predictions_df = adult_pca_df.copy()

# K-Means with 3 clusters, fitted on the two PCA columns
model = KMeans(n_clusters=3, random_state=0)
model.fit(adult_pca_df)

# Predict each row's cluster label and store it as `census_segments`
k_3 = model.predict(adult_pca_df)
adult_pca_predictions_df["census_segments"] = k_3



In [42]:
# Scatter the two PCA components against each other, grouped by cluster label
adult_pca_predictions_df.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="census_segments"
)