In [30]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import os

# Ensure the "models" output directory is present; exist_ok makes this a
# no-op when the directory already exists, so the cell can be re-run safely.
os.makedirs(name="models", exist_ok=True)


In [31]:
# Read the raw dataset from the local data directory.
df = pd.read_csv("./data/adult.csv")

# Evaluated but not rendered: only a cell's final expression is displayed.
df.head()

# Report the frame's dimensions, then its column index on a separate line.
print("Shape:", df.shape)
print("\nColumns:", df.columns, sep="\n")

Shape: (32561, 15)

Columns:
Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income'],
      dtype='object')


In [32]:
def _is_text_column(series):
    """Return True when `series` holds text (object dtype or a pandas string dtype)."""
    return pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)


# Remove leading/trailing spaces in every text column; other columns pass
# through untouched.
df = df.apply(lambda x: x.str.strip() if _is_text_column(x) else x)

# '?' is the dataset's placeholder for a missing value: turn it into NaN and
# drop every row that contains one. Chained (no inplace=True) so the cell
# yields a fresh frame instead of mutating one in place.
df = df.replace("?", np.nan).dropna()

print("New Shape after cleaning:", df.shape)

# Encode each text column as integer codes. A separate LabelEncoder is fitted
# and kept per column: re-fitting one shared encoder overwrites its classes_
# on every iteration, leaving no way to map a category back to its code (or
# to encode new rows the same way) for any column but the last one.
encoders = {}
for col in df.columns:
    if _is_text_column(df[col]):
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        encoders[col] = le

# Separate the feature matrix from the "income" target column.
X = df.drop("income", axis=1)
y = df["income"]

print("Features shape:", X.shape)


New Shape after cleaning: (30162, 15)
Features shape: (30162, 14)


In [33]:
# Split BEFORE scaling so the held-out rows never influence the scaler's
# mean/variance. Fitting the scaler on all of X leaks test-set statistics
# into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from training rows only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics unchanged

# Still defined for any code that refers to the fully scaled matrix; it is
# produced with the train-fitted scaler.
X_scaled = scaler.transform(X)

# NOTE(review): the fitted scaler is not persisted here. Anything that loads
# the pickled models must apply this same scaling to its inputs -- confirm how
# the consuming app handles that.

In [34]:
# Candidate classifiers, keyed by the display name that is also used for the
# metrics table and the pickle file name.
# The tree-based estimators are seeded (same value as the train/test split's
# random_state) so a fresh kernel run reproduces the same fitted models and
# metrics; unseeded, they change from run to run.
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

# One row per model: [name, accuracy, auc, precision, recall, f1, mcc].
results = []

for name, model in models.items():

    model.fit(X_train, y_train)

    # Hard class predictions for the threshold-based metrics, and the
    # probability of the second class (column 1) for ROC AUC.
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, preds)
    auc = roc_auc_score(y_test, probs)
    precision = precision_score(y_test, preds)
    recall = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    mcc = matthews_corrcoef(y_test, preds)

    results.append([name, accuracy, auc, precision, recall, f1, mcc])

    # Save model for Streamlit app. The context manager closes the file
    # handle deterministically; a bare open() passed to pickle.dump leaves
    # it open until garbage collection.
    with open(f"models/{name}.pkl", "wb") as model_file:
        pickle.dump(model, model_file)


print("Training Completed!")

Training Completed!


In [35]:
# Column labels, in the same order as the values appended to each results row.
metric_columns = ["Model", "Accuracy", "AUC", "Precision", "Recall", "F1", "MCC"]
results_df = pd.DataFrame(data=results, columns=metric_columns)

# Evaluated but not rendered: only a cell's final expression is displayed.
results_df

# Write the comparison table to disk without the row index.
results_df.to_csv("model_metrics.csv", index=False)
print("Metrics saved!")

Metrics saved!
