In [1]:
# Import dependencies
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the cleaned adult-income CSV into a DataFrame.
# The file location can be supplied through the ADULT_CLEANED_CSV environment variable so the
# notebook can run on machines other than the author's. When the variable is unset (or empty),
# the original hard-coded location is used, so existing behavior is unchanged.
DEFAULT_CSV_PATH = "/Users/jadesmith/Desktop/ptda-dec-2023/Projects/Project_4/Project_4_Income_Prediction_Model/Resources/adult_cleaned.csv"
csv_path = os.environ.get("ADULT_CLEANED_CSV") or DEFAULT_CSV_PATH

# The target column is read with dtype str, so its labels are kept as text rather than integers.
adult_cleaned_df = pd.read_csv(csv_path, dtype={'Income over 50k? 0=no 1=yes': str})
adult_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30718 entries, 0 to 30717
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   age                          30718 non-null  int64 
 1   workclass                    30718 non-null  object
 2   education                    30718 non-null  object
 3   marital.status               30718 non-null  object
 4   occupation                   30718 non-null  object
 5   relationship                 30718 non-null  object
 6   race                         30718 non-null  object
 7   sex                          30718 non-null  object
 8   hours.per.week               30718 non-null  int64 
 9   Income over 50k? 0=no 1=yes  30718 non-null  object
dtypes: int64(2), object(8)
memory usage: 2.3+ MB


In [3]:
# Separate the income label from the features, one-hot encode the features,
# and hold out 20% of the rows as a test set (seeded so the split is repeatable)
y = adult_cleaned_df['Income over 50k? 0=no 1=yes']
X = pd.get_dummies(adult_cleaned_df.drop('Income over 50k? 0=no 1=yes', axis=1))
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Standardize the features with a StandardScaler fitted on the training split only;
# the test split is transformed with that same fitted scaler.
# X_train / X_test are rebound to the scaled values here, so re-run the split cell
# before re-running this one.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [5]:
# Build a seeded logistic-regression classifier and fit it on the training split
model = LogisticRegression(random_state=42).fit(X_train, y_train)
model  # fit() returns the estimator itself; show it as the cell output

In [6]:
# Predict income labels for the held-out test rows with the fitted model
y_pred = model.predict(X=X_test)

In [7]:
# Score the test-set predictions: overall accuracy, confusion matrix,
# and the per-class text report (all computed from y_test vs. y_pred)
accuracy, cm, cr = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

print(f"Accuracy: {accuracy}")
# One call, newline-separated, prints each heading followed by its value
print("Confusion Matrix:", cm, "Classification Report:", cr, sep="\n")

Accuracy: 0.8365885416666666
Confusion Matrix:
[[4285  354]
 [ 650  855]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.92      0.90      4639
           1       0.71      0.57      0.63      1505

    accuracy                           0.84      6144
   macro avg       0.79      0.75      0.76      6144
weighted avg       0.83      0.84      0.83      6144



In [8]:
# Reduce dimensions to 2D using PCA for visualization purposes.
# random_state is fixed (same seed as the split and the classifier) because
# svd_solver='auto' may select the randomized SVD solver, in which case an unseeded
# PCA can return slightly different coordinates on every run.
pca = PCA(n_components=2, random_state=42)
X_train_pca = pca.fit_transform(X_train)  # fit the projection on the training split only
X_test_pca = pca.transform(X_test)        # reuse the training-fitted projection

In [9]:
# Gather the metrics, the 2-D PCA projections and the labels into one dict.
# Arrays and Series are converted with .tolist() so json can serialize them.
results = dict(
    accuracy=accuracy,
    confusion_matrix=cm.tolist(),
    classification_report=cr,
    X_train_pca=X_train_pca.tolist(),
    X_test_pca=X_test_pca.tolist(),
    y_train=y_train.tolist(),
    y_test=y_test.tolist(),
)

# Write the dict to model_results.json in the current working directory
with open('model_results.json', 'w') as out_file:
    json.dump(results, out_file)

print("Results exported to 'model_results.json'")

Results exported to 'model_results.json'
