In [1]:
# Import our dependencies
from warnings import simplefilter

import hvplot.pandas
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
simplefilter(action='ignore')

In [2]:
# Load the census data. `census_df` keeps the untouched original values
# (it is joined with the predictions at the end of the notebook), while `df`
# is an independent copy that the cleaning/encoding cells below modify.
census_df = pd.read_csv("../Data/AdultCensusUpdated.csv")
df = census_df.copy()
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,State
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K,Arkansas
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K,Maryland
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K,Michigan
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K,Idaho
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K,Florida


In [3]:
# Columns that are not used as model features
unused_columns = ['fnlwgt', 'education', 'capital.gain', 'capital.loss', 'native.country']

# Drop the unused columns, then treat the "?" placeholder as a missing value
# and fill every missing value with 0
df = (
    df.drop(columns=unused_columns)
      .replace("?", np.nan)
      .fillna(0)
)
df.head()

Unnamed: 0,age,workclass,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,income,State
0,90,0,9,Widowed,0,Not-in-family,White,Female,40,<=50K,Arkansas
1,82,Private,9,Widowed,Exec-managerial,Not-in-family,White,Female,18,<=50K,Maryland
2,66,0,10,Widowed,0,Unmarried,Black,Female,40,<=50K,Michigan
3,54,Private,4,Divorced,Machine-op-inspct,Unmarried,White,Female,40,<=50K,Idaho
4,41,Private,10,Separated,Prof-specialty,Own-child,White,Female,40,<=50K,Florida


In [4]:
# Hand-written binary encodings, keyed by column name.
# Any value not listed in a column's mapping becomes NaN after `.map`.
binary_codes = {
    "income": {"<=50K": 0, ">50K": 1},
    "sex": {"Male": 0, "Female": 1},
}
for column_name, codes in binary_codes.items():
    df[column_name] = df[column_name].map(codes)
df.head()

Unnamed: 0,age,workclass,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,income,State
0,90,0,9,Widowed,0,Not-in-family,White,1,40,0,Arkansas
1,82,Private,9,Widowed,Exec-managerial,Not-in-family,White,1,18,0,Maryland
2,66,0,10,Widowed,0,Unmarried,Black,1,40,0,Michigan
3,54,Private,4,Divorced,Machine-op-inspct,Unmarried,White,1,40,0,Idaho
4,41,Private,10,Separated,Prof-specialty,Own-child,White,1,40,0,Florida


In [5]:
# A single LabelEncoder is re-fitted for every column, so after the loop it
# only remembers the classes of the last column it encoded.
label_encoder = LabelEncoder()

# Integer-encode each remaining text (object-dtype) column. Values are cast to
# str first so that the 0 used to fill missing entries is encoded alongside the
# text labels.
object_columns = df.select_dtypes(include=["object"]).columns
for column_name in object_columns:
    as_text = df[column_name].astype(str)
    df[column_name] = label_encoder.fit_transform(as_text)
df.head()

Unnamed: 0,age,workclass,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,income,State
0,90,0,9,6,0,1,4,1,40,0,3
1,82,4,9,6,4,1,4,1,18,0,19
2,66,0,10,6,0,4,2,1,40,0,21
3,54,4,4,0,7,4,4,1,40,0,11
4,41,4,10,5,10,3,4,1,40,0,8


In [6]:
# Confirm that every column is now an integer dtype (no object/text columns
# remain) before the data is passed to the scalers and models below
df.dtypes

age               int64
workclass         int32
education.num     int64
marital.status    int32
occupation        int32
relationship      int32
race              int32
sex               int64
hours.per.week    int64
income            int64
State             int32
dtype: object

## PART 2: Creating the Elbow Curve

In [7]:
# Separate the target from the features:
#   y - the binary income label (0 for "<=50K", 1 for ">50K", as mapped above)
#   X - every remaining column, used as the model features
y = df['income']
X = df.drop(columns='income')

In [8]:
# Split features and target into training and testing sets.
# random_state=1 fixes the shuffle so the split is reproducible; the split
# proportion is left at train_test_split's default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Candidate cluster counts to evaluate (k = 1 through 10)
k = list(range(1, 11))

# Inertia recorded for each candidate k, in the same order as `k`
inertia = []

# Fit one K-means model per candidate k on the feature DataFrame X and record
# the model's `inertia_` attribute
for n_clusters in k:
    k_model = KMeans(n_clusters=n_clusters, random_state=1).fit(X)
    inertia.append(k_model.inertia_)

# Collect the k / inertia pairs into a DataFrame for plotting
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

In [None]:
# Plot inertia against k as a line chart. The "elbow" (the point where the
# curve stops dropping sharply) suggests a reasonable number of clusters.
# xticks=k puts one tick at every evaluated value of k.
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

## Part 3: Instantiate a K-Nearest Neighbors Classifier

In [10]:
# Standardize the features. The scaler is fitted on the training split only,
# so the test split is transformed with training statistics rather than its own.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training split first, then the testing split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [11]:
# Instantiate a K-nearest-neighbors classifier that classifies each sample
# from its 3 nearest training samples (n_neighbors=3)
knn = KNeighborsClassifier(n_neighbors=3)

In [12]:
# Train the model on the scaled training features and their income labels
knn.fit(X_train_scaled, y_train)

In [13]:
# Predict the income class for every row of the scaled testing features
y_pred = knn.predict(X_test_scaled)

In [14]:
# Print per-class precision, recall and F1 by comparing the true test labels
# (y_test) with the model predictions (y_pred)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.89      0.87      6149
           1       0.62      0.56      0.59      1992

    accuracy                           0.81      8141
   macro avg       0.74      0.72      0.73      8141
weighted avg       0.80      0.81      0.80      8141



In [15]:
# List the feature column names for reference; the cells in Part 4 spell out
# this same list when scaling the data
X.columns

Index(['age', 'workclass', 'education.num', 'marital.status', 'occupation',
       'relationship', 'race', 'sex', 'hours.per.week', 'State'],
      dtype='object')

## Part 4: Model Optimization

In [16]:
# The ten feature columns to standardize
feature_columns = [
    'age', 'workclass', 'education.num', 'marital.status', 'occupation',
    'relationship', 'race', 'sex', 'hours.per.week', 'State',
]

# Standardize the features using statistics computed over the full dataset
# (this happens before the train/test split performed further below)
data_scaled = StandardScaler().fit_transform(X[feature_columns])


In [17]:
# Column labels for the scaled array, in the same order used when scaling
scaled_column_names = [
    'age', 'workclass', 'education.num', 'marital.status', 'occupation',
    'relationship', 'race', 'sex', 'hours.per.week', 'State',
]

# Wrap the scaled NumPy array in a DataFrame so the features keep their names
df_scaled = pd.DataFrame(data_scaled, columns=scaled_column_names)

# From this point on, X refers to the scaled features
X = df_scaled

# Review the DataFrame
X.head()

Unnamed: 0,age,workclass,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,State
0,3.769612,-2.65732,-0.42006,2.24948,-1.554283,-0.277805,0.393668,1.422331,-0.035429,-1.49304
1,3.183112,0.09005,-0.42006,2.24948,-0.608387,-0.277805,0.393668,1.422331,-1.817204,-0.383084
2,2.01011,-2.65732,-0.03136,2.24948,-1.554283,1.589322,-1.962621,1.422331,-0.035429,-0.24434
3,1.130359,0.09005,-2.363558,-1.734058,0.101036,1.589322,0.393668,1.422331,-0.035429,-0.938062
4,0.177296,0.09005,-0.03136,1.585557,0.810458,0.966947,0.393668,1.422331,-0.035429,-1.146179


In [18]:
# Split the scaled features and the target into training and testing sets.
# The call uses the same random_state as the earlier split in Part 2.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [19]:
# Fit a fresh scaler on the training split only.
# NOTE(review): X was already standardized over the full dataset in the cells
# above, so this step re-standardizes it with training-split statistics.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Transform the training split, then the testing split, with the fitted scaler
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))

In [20]:
# Instantiate the KNeighborsClassifier model with n_neighbors = 3
# NOTE(review): this is the same n_neighbors value used for the Part 3 model,
# so no hyperparameter is changed in this "optimization" section.
knn = KNeighborsClassifier(n_neighbors=3)

In [21]:
# Train the model on the re-scaled training features and their income labels
knn.fit(X_train_scaled, y_train)

In [22]:
# Predict the income class for every row of the re-scaled testing features
y_pred = knn.predict(X_test_scaled)

In [23]:
# Row labels (true classes) and column labels (predicted classes)
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]

# Confusion matrix of true vs. predicted labels, wrapped in a labelled DataFrame
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

# Fraction of test samples whose predicted label matches the true label
acc_score = accuracy_score(y_test, y_pred)

In [24]:
# Display the evaluation results in order: the labelled confusion matrix,
# the overall accuracy, and the per-class classification report
print("Confusion Matrix")
# display() renders the DataFrame as a rich table instead of plain text
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5459,690
Actual 1,873,1119


Accuracy Score : 0.8080088441223436
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.89      0.87      6149
           1       0.62      0.56      0.59      1992

    accuracy                           0.81      8141
   macro avg       0.74      0.72      0.73      8141
weighted avg       0.80      0.81      0.80      8141



In [25]:
# Print the classification report comparing the testing data to the model predictions
# (same y_test / y_pred as the previous cell, so this repeats the report shown there)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.89      0.87      6149
           1       0.62      0.56      0.59      1992

    accuracy                           0.81      8141
   macro avg       0.74      0.72      0.73      8141
weighted avg       0.80      0.81      0.80      8141



### Analyze The Important Features

In [None]:
# KNeighborsClassifier has no `feature_importances_` attribute (that is provided
# by tree-based models such as random forests), so feature importance is
# estimated with permutation importance instead: each feature column of the
# scaled test set is shuffled in turn and the resulting drop in the model's
# score is measured. n_repeats sets how many shuffles are averaged per feature;
# random_state makes the shuffles reproducible.
perm_result = permutation_importance(
    knn, X_test_scaled, y_test, n_repeats=5, random_state=1
)
importances = perm_result.importances_mean

# Pair each mean importance with its feature name and sort, most important first
sorted(zip(importances, X.columns), reverse=True)

### Export Classification Report

In [None]:
import csv

def classificationReportToPanda(report, model):
    """Save a classification report as a CSV file and display it.

    Args:
        report: Report dictionary, e.g. the result of
            ``classification_report(..., output_dict=True)``.
        model: Model name; stored in a ``model`` column and used in the
            output file name.

    The report is transposed so each report entry becomes a row, and is
    written to ``../Data/classification_report_{model}.csv`` with every
    field quoted and the index column labelled ``class``.
    """
    report_df = pd.DataFrame(report).transpose()
    report_df['model'] = model
    # NOTE(review): the scalar 'accuracy' entry presumably ends up repeated
    # across all metric columns, so its 'support' is truncated by the integer
    # cast below -- confirm before relying on that row.
    report_df = report_df.astype({'model': 'string', 'support': 'int64'})
    report_df.to_csv(
        f'../Data/classification_report_{model}.csv',
        index_label='class',
        quoting=csv.QUOTE_ALL,
    )
    display(report_df)

# Build the report as a dictionary (instead of text) and export it
report = classification_report(y_test, y_pred, output_dict=True)

classificationReportToPanda(report, 'KNeighbors')

In [None]:
# Inspect the predicted class labels for the test set
print(y_pred)

In [None]:
# One "KNeighbors" label per prediction, used to tag each result row with the
# model that produced it
model_name = ["KNeighbors"] * len(y_pred)
    

In [None]:
# Attach the predicted label and the model name to the test rows as two new
# columns, 'Results' and 'Model'
X_test = X_test.assign(Results=y_pred, Model=model_name)

In [None]:
# Verify that the 'Results' and 'Model' columns were added to the test rows
X_test.head()

In [None]:
# List the current columns of X_test (features plus the added result columns)
X_test.columns

In [None]:
# Scaled/encoded feature columns that are no longer needed; the readable
# original values come from the untouched census data in the merge below
encoded_feature_columns = [
    'age', 'workclass', 'education.num', 'marital.status', 'occupation',
    'relationship', 'race', 'sex', 'hours.per.week', 'State',
]

# Keep only the columns that were added after prediction
X_test = X_test.drop(columns=encoded_feature_columns)

In [None]:
# Merge the predictions back onto the original (un-encoded) census data.
# DataFrame.join aligns on the index and defaults to a left join, so every
# census row is kept and rows that are not in X_test get NaN in its columns.
new_df = census_df.join(X_test)

In [None]:
# Drop the rows that did not receive a prediction (i.e. the training rows).
# After the join, those rows have NaN in 'Results' and 'Model'. Restricting
# dropna to these two columns avoids also discarding test rows that merely
# have a missing value in one of the original census columns.
new_df = new_df.dropna(subset=['Results', 'Model'])

In [None]:
# Verify that the remaining rows show the original census values together
# with the predicted result and model name
new_df.head()

In [None]:
# Export the predicted rows to CSV. The DataFrame index is written as well,
# because to_csv's `index` option is left at its default.
new_df.to_csv("../Data/KNeighbors.csv")