## Load Data

In [1]:
import os

import pandas as pd
from sklearn.preprocessing import StandardScaler

DATA_PATH = '../data/processed/'


def _read_split(filename, squeeze=False):
    """Read one CSV file located under DATA_PATH.

    Parameters
    ----------
    filename : str
        File name relative to DATA_PATH.
    squeeze : bool, default False
        When True, a single-column frame is collapsed into a Series.
        Frames with more than one column are returned unchanged.

    Returns
    -------
    pandas.DataFrame or pandas.Series
    """
    frame = pd.read_csv(os.path.join(DATA_PATH, filename))
    # Squeeze the column axis only: a bare .squeeze() would also collapse a
    # one-row, one-column file all the way down to a scalar.
    return frame.squeeze("columns") if squeeze else frame


X_train = _read_split('X_train.csv')
X_val = _read_split('X_val.csv')
X_test = _read_split('X_test.csv')

y_train = _read_split('y_train.csv', squeeze=True)  # Series if single column
y_val = _read_split('y_val.csv', squeeze=True)
y_test = _read_split('y_test.csv', squeeze=True)


ModuleNotFoundError: No module named 'pandas'

In [None]:
# ======================
# 1. Custom categorical encoding
# ======================

# Department: one indicator column per category, no reference level removed.
# Each split is modified in place: the raw column is dropped and the
# indicator columns are appended.
for split in (X_train, X_val, X_test):
    department_dummies = pd.get_dummies(
        split['Department'], prefix='Department', drop_first=False
    )
    split.drop(columns='Department', inplace=True)
    split[department_dummies.columns] = department_dummies

# EducationField: drop ONLY 'Other'
def encode_educationfield(df):
    """One-hot encode 'EducationField', leaving out the 'Other' indicator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an 'EducationField' column.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'EducationField' and with one
        'EducationField_<value>' column per observed value except 'Other'.
        The input frame is not modified.
    """
    indicators = pd.get_dummies(df['EducationField'], prefix='EducationField')
    # errors='ignore' makes this a no-op when 'Other' never occurs in df.
    indicators = indicators.drop(columns='EducationField_Other', errors='ignore')
    encoded = df.drop(columns='EducationField')
    encoded[indicators.columns] = indicators
    return encoded

# encode_educationfield returns a new frame, so rebind each split name to its result.
X_train, X_val, X_test = map(encode_educationfield, (X_train, X_val, X_test))

# JobRole: one indicator column per category, no reference level removed.
for split in (X_train, X_val, X_test):
    role_dummies = pd.get_dummies(split['JobRole'], prefix='JobRole', drop_first=False)
    split.drop(columns='JobRole', inplace=True)
    split[role_dummies.columns] = role_dummies

# The remaining categorical variables: drop the first (reference) level.
# The level set is pooled over all three splits and fixed before encoding.
# Encoding each split on its own lets get_dummies drop whichever level sorts
# first *within that split*, so a split that lacks the lowest level would
# silently use a different reference level than the training data.
drop_first_vars = ['BusinessTravel', 'Gender', 'MaritalStatus', 'OverTime']
for col in drop_first_vars:
    levels = sorted(
        set(X_train[col].dropna()) | set(X_val[col].dropna()) | set(X_test[col].dropna())
    )
    level_dtype = pd.CategoricalDtype(categories=levels)
    for X in [X_train, X_val, X_test]:
        # A categorical input makes get_dummies emit a column for every
        # declared level (observed or not) before removing the first one.
        dummies = pd.get_dummies(X[col].astype(level_dtype), prefix=col, drop_first=True)
        X.drop(col, axis=1, inplace=True)
        X[dummies.columns] = dummies

# Ensure all sets have the same columns, in the same (sorted) order.
all_columns = sorted(set(X_train.columns) | set(X_val.columns) | set(X_test.columns))

# reindex adds any column a split is missing (filled with 0), orders the
# columns, and returns a new frame, so the names must be rebound explicitly.
# Rebinding a loop variable would leave X_train / X_val / X_test untouched.
X_train = X_train.reindex(columns=all_columns, fill_value=0).reset_index(drop=True)
X_val = X_val.reindex(columns=all_columns, fill_value=0).reset_index(drop=True)
X_test = X_test.reindex(columns=all_columns, fill_value=0).reset_index(drop=True)

# ======================
# 2. Scaling numeric & ordinal features
# ======================

ordinal_cols = [
    'Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel',
    'JobSatisfaction', 'PerformanceRating', 'RelationshipSatisfaction', 'WorkLifeBalance'
]
# Prefixes of the indicator columns created by the encoding step; those
# columns are left unscaled.
dummy_prefixes = (
    'Department_', 'EducationField_', 'JobRole_', 'BusinessTravel_',
    'Gender_', 'MaritalStatus_', 'OverTime_'
)
numeric_cols = [
    col for col in X_train.columns
    if (X_train[col].dtype in ['int64', 'float64'])
    and (col in ordinal_cols or not col.startswith(dummy_prefixes))
]
# numeric_cols already picks up every int64/float64 ordinal column, so a plain
# concatenation would list those labels twice and scale each of them twice.
# dict.fromkeys removes the repeats while keeping first-seen order.
scale_cols = list(dict.fromkeys(numeric_cols + ordinal_cols))

# Fit on the training split only; validation and test reuse the training
# mean and standard deviation.
scaler = StandardScaler()
X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])
X_val[scale_cols] = scaler.transform(X_val[scale_cols])
X_test[scale_cols] = scaler.transform(X_test[scale_cols])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

# Baseline classifier: logistic regression fitted directly on the
# preprocessed training features (fit returns the estimator itself).
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)


In [None]:
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
)

# Hard class predictions and positive-class probabilities for both held-out splits.
y_pred_val, y_pred_test = (model.predict(features) for features in (X_val, X_test))
y_val_proba, y_test_proba = (
    model.predict_proba(features)[:, 1] for features in (X_val, X_test)
)

# (truth, estimate) pairs: label-based metrics take the hard predictions,
# ROC AUC takes the positive-class probabilities.
val_hard, test_hard = (y_val, y_pred_val), (y_test, y_pred_test)
val_soft, test_soft = (y_val, y_val_proba), (y_test, y_test_proba)

# Each row is (caption, metric function, inputs); rows are printed in order.
report_plan = [
    ("Validation Accuracy:", accuracy_score, val_hard),
    ("Test Accuracy:", accuracy_score, test_hard),
    ("Validation ROC AUC:", roc_auc_score, val_soft),
    ("Test ROC AUC:", roc_auc_score, test_soft),
    ("Validation Precision:", precision_score, val_hard),
    ("Test Precision:", precision_score, test_hard),
    ("Validation Recall:", recall_score, val_hard),
    ("Test Recall:", recall_score, test_hard),
    ("Validation F1 Score:", f1_score, val_hard),
    ("Test F1 Score:", f1_score, test_hard),
    ("\nValidation Confusion Matrix:\n", confusion_matrix, val_hard),
    ("\nTest Confusion Matrix:\n", confusion_matrix, test_hard),
    ("\nClassification report for validation set:\n", classification_report, val_hard),
    ("\nClassification report for test set:\n", classification_report, test_hard),
]
for caption, scorer, (truth, estimate) in report_plan:
    print(caption, scorer(truth, estimate))
