In [6]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import os

# Load dataset
df = pd.read_csv("https://huggingface.co/datasets/scikit-learn/adult-census-income/resolve/main/adult.csv")

# Encode categorical features
for col in df.select_dtypes(include='object'):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

X = df.drop("income", axis=1)
y = df["income"]

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(eval_metric="logloss")
}

results = []

# Create the 'model' directory if it doesn't exist
os.makedirs('model', exist_ok=True)

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]

    metrics = {
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }

    results.append(metrics)
    joblib.dump(model, f"model/{name.replace(' ', '_')}.pkl")

#pd.DataFrame(results).to_csv("/model_metrics.csv", index=False)
print(pd.DataFrame(results))
# Install Streamlit
!pip install streamlit

import streamlit as st
import pandas as pd
import joblib
from sklearn.metrics import classification_report, confusion_matrix

st.title("ML Assignment 2 – Classification Models")

uploaded_file = st.file_uploader("Upload Test Dataset (CSV)", type=["csv"])

model_name = st.selectbox(
    "Select Model",
    [
        "Logistic Regression",
        "Decision Tree",
        "KNN",
        "Naive Bayes",
        "Random Forest",
        "XGBoost"
    ]
)

if uploaded_file:
    df = pd.read_csv(uploaded_file)

    for col in df.select_dtypes(include='object'):
        df[col] = df[col].astype('category').cat.codes

    X = df.drop("income", axis=1)
    y = df["income"]

    model = joblib.load(f"model/{model_name.replace(' ', '_')}.pkl")

    y_pred = model.predict(X)

    st.subheader("Evaluation Metrics")
    st.text(classification_report(y, y_pred))

    st.subheader("Confusion Matrix")
    st.write(confusion_matrix(y, y_pred))

import streamlit as st
import pandas as pd
import joblib
from sklearn.metrics import classification_report, confusion_matrix

st.title("ML Assignment 2 – Classification Models")

uploaded_file = st.file_uploader("Upload Test Dataset (CSV)", type=["csv"])

model_name = st.selectbox(
    "Select Model",
    [
        "Logistic Regression",
        "Decision Tree",
        "KNN",
        "Naive Bayes",
        "Random Forest",
        "XGBoost"
    ]
)

if uploaded_file:
    df = pd.read_csv(uploaded_file)

    for col in df.select_dtypes(include='object'):
        df[col] = df[col].astype('category').cat.codes

    X = df.drop("income", axis=1)
    y = df["income"]

    model = joblib.load(f"model/{model_name.replace(' ', '_')}.pkl")

    y_pred = model.predict(X)

    st.subheader("Evaluation Metrics")
    st.text(classification_report(y, y_pred))

    st.subheader("Confusion Matrix")
    st.write(confusion_matrix(y, y_pred))

                 Model  Accuracy       AUC  Precision    Recall        F1  \
0  Logistic Regression  0.825580  0.848023   0.705641  0.447625  0.547771   
1        Decision Tree  0.808844  0.740961   0.591940  0.611581  0.601600   
2                  KNN  0.830646  0.849744   0.660029  0.582303  0.618735   
3          Naive Bayes  0.801628  0.851748   0.662252  0.325309  0.436300   
4        Random Forest  0.855059  0.903462   0.728604  0.614834  0.666902   
5              XGBoost  0.870413  0.925298   0.769231  0.644112  0.701133   

        MCC  
0  0.464093  
1  0.476009  
2  0.512085  
3  0.363512  
4  0.578512  
5  0.623207  


2026-02-14 13:45:10.107 
  command:

    streamlit run /home/cloud/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py [ARGUMENTS]
2026-02-14 13:45:10.126 Session state does not function when running a script without `streamlit run`
