In [47]:
import pandas as pd
import numpy as np
import hvplot.pandas
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from warnings import simplefilter
simplefilter(action='ignore')

In [4]:
df = pd.read_csv("../Data/AdultCensusUpdated.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,State
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K,Arkansas
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K,Maryland
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K,Michigan
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K,Idaho
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K,Florida


In [6]:
# Clean excessive column
df = df.drop(columns =["education"])
# Replace null values
df = df.replace("?", np.nan)
df = df.fillna(0)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,State
0,90,0,77053,9,Widowed,0,Not-in-family,White,Female,0,4356,40,United-States,<=50K,Arkansas
1,82,Private,132870,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K,Maryland
2,66,0,186061,10,Widowed,0,Unmarried,Black,Female,0,4356,40,United-States,<=50K,Michigan
3,54,Private,140359,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K,Idaho
4,41,Private,264663,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K,Florida


In [8]:
# Manual binary encoding for specified columns
df["income"] = df["income"].map({"<=50K": 0, ">50K": 1})
df["sex"] = df["sex"].map({"Male": 0, "Female": 1})
df.head()

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,State
0,90,0,77053,9,Widowed,0,Not-in-family,White,1,0,4356,40,United-States,0,Arkansas
1,82,Private,132870,9,Widowed,Exec-managerial,Not-in-family,White,1,0,4356,18,United-States,0,Maryland
2,66,0,186061,10,Widowed,0,Unmarried,Black,1,0,4356,40,United-States,0,Michigan
3,54,Private,140359,4,Divorced,Machine-op-inspct,Unmarried,White,1,0,3900,40,United-States,0,Idaho
4,41,Private,264663,10,Separated,Prof-specialty,Own-child,White,1,0,3900,40,United-States,0,Florida


In [10]:
# Create a LabelEncoder instance
label_encoder = LabelEncoder()

# Apply Label Encoding to each non-numeric column
for col in df.select_dtypes(include=["object"]).columns:
    df[col] = label_encoder.fit_transform(df[col].astype(str))
df.head()

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,State
0,90,0,77053,9,6,0,1,4,1,0,4356,40,39,0,3
1,82,4,132870,9,6,4,1,4,1,0,4356,18,39,0,19
2,66,0,186061,10,6,0,4,2,1,0,4356,40,39,0,21
3,54,4,140359,4,0,7,4,4,1,0,3900,40,39,0,11
4,41,4,264663,10,5,10,3,4,1,0,3900,40,39,0,8


In [12]:
df.dtypes

age               int64
workclass         int32
fnlwgt            int64
education.num     int64
marital.status    int32
occupation        int32
relationship      int32
race              int32
sex               int64
capital.gain      int64
capital.loss      int64
hours.per.week    int64
native.country    int32
income            int64
State             int32
dtype: object

## PART 2: Creating the Elbow Curve

In [14]:
y = df['income']
X = df.drop(columns='income')

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [37]:
# Create a a list to store inertia values
inertia = []

# Create a a list to store the values of k
k = list(range(1, 11))

# Create a for loop where each value of k is evaluated using the K-means algorithm
# Fit the model using the home_sales_df DataFrame
# Append the value of the computed inertia from the `inertia_` attribute of the K-means model instance
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(X)
    inertia.append(k_model.inertia_)

# Create a dictionary that holds the list values for k and inertia
elbow_data = {"k": k, "inertia": inertia}

# Create a DataFrame using the elbow_data dictionary
df_elbow = pd.DataFrame(elbow_data)

In [43]:
# Plot the DataFrame
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

## Part 3: Instantiate an K Nearest Neighbor Classifier instance

In [21]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

In [25]:
# Instantiate the KNeighborsClassifier model with n_neighbors = 3 
knn = KNeighborsClassifier(n_neighbors=3)

In [27]:
# Train the model using the training data
knn.fit(X_train_scaled, y_train)

In [29]:
# Create predictions using the testing data
y_pred = knn.predict(X_test_scaled)

In [31]:
# Print the classification report comparing the testing data to the model predictions
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.90      0.88      6149
           1       0.64      0.57      0.61      1992

    accuracy                           0.82      8141
   macro avg       0.76      0.73      0.74      8141
weighted avg       0.81      0.82      0.81      8141



In [52]:
X.columns

Index(['age', 'workclass', 'fnlwgt', 'education.num', 'marital.status',
       'occupation', 'relationship', 'race', 'sex', 'capital.gain',
       'capital.loss', 'hours.per.week', 'native.country', 'State'],
      dtype='object')

## Part 4: Model Optimization

In [54]:
data_scaled = StandardScaler().fit_transform(X[['age', 'workclass', 'fnlwgt', 'education.num', 'marital.status', \
       'occupation', 'relationship', 'race', 'sex', 'capital.gain', \
       'capital.loss', 'hours.per.week', 'native.country', 'State']])


In [58]:
# Create a DataFrame called with the scaled data
# The column names should match those referenced in the StandardScaler step
df_scaled = pd.DataFrame(
    data_scaled,
    columns=['age', 'workclass', 'fnlwgt', 'education.num', 'marital.status', \
       'occupation', 'relationship', 'race', 'sex', 'capital.gain', \
       'capital.loss', 'hours.per.week', 'native.country', 'State']
)
X = df_scaled
# Review the DataFrame
df_scaled.head()

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,State
0,3.769612,-2.65732,-1.067997,-0.42006,2.24948,-1.554283,-0.277805,0.393668,1.422331,-0.14592,10.593507,-0.035429,0.291569,-1.49304
1,3.183112,0.09005,-0.539169,-0.42006,2.24948,-0.608387,-0.277805,0.393668,1.422331,-0.14592,10.593507,-1.817204,0.291569,-0.383084
2,2.01011,-2.65732,-0.03522,-0.03136,2.24948,-1.554283,1.589322,-1.962621,1.422331,-0.14592,10.593507,-0.035429,0.291569,-0.24434
3,1.130359,0.09005,-0.468215,-2.363558,-1.734058,0.101036,1.589322,0.393668,1.422331,-0.14592,9.461864,-0.035429,0.291569,-0.938062
4,0.177296,0.09005,0.709482,-0.03136,1.585557,0.810458,0.966947,0.393668,1.422331,-0.14592,9.461864,-0.035429,0.291569,-1.146179


In [60]:
# Split into training and testing data with new scaled data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [62]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

In [64]:
# Instantiate the KNeighborsClassifier model with n_neighbors = 3 
knn = KNeighborsClassifier(n_neighbors=3)

In [66]:
# Train the model using the training data
knn.fit(X_train_scaled, y_train)

In [68]:
# Create predictions using the testing data
y_pred = knn.predict(X_test_scaled)

In [70]:
# Print the classification report comparing the testing data to the model predictions
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.90      0.88      6149
           1       0.64      0.57      0.61      1992

    accuracy                           0.82      8141
   macro avg       0.76      0.73      0.74      8141
weighted avg       0.81      0.82      0.81      8141

