In [9]:
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import lightgbm as lgb
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline
import optuna

df_date = pd.read_csv(r"RegisteredCasesattheCommunityMediationCentre.csv")
df_type = pd.read_csv(r"SourceofCasesRegisteredattheCommunityMediationCentre.csv")
df_rs = pd.read_csv(r"RelationshipofPartiesinCasesRegisteredattheCommunityMediationCentre.csv")
df_outcome = pd.read_csv(r"OutcomeofCasesRegisteredattheCommunityMediationCentre.csv")


In [10]:
df = df_date.merge(
    df_type,
    on=['case_number'],
    how="inner"
)
df = df.merge(
    df_rs,
    on=['case_number'],
    how="inner"
)
df = df.merge(
    df_outcome,
    on=['case_number'],
    how="inner"
)
df.head(5)

Unnamed: 0,date_registered,case_number,type_of_intake,type_of_dispute,outcome_of_cases
0,2011-01-03,CAS-02624-B3K1R0,Direct Intake,Friends,Not Mediated
1,2011-01-03,CAS-02766-L1V3G1,External Agency,Neighbour,Not Mediated
2,2011-01-03,CAS-02549-D6L2N3,External Agency,Neighbour,Not Mediated
3,2011-01-03,CAS-02376-R0X8R9,External Agency,Neighbour,Not Mediated
4,2011-01-04,CAS-02790-Y9S0Y4,Direct Intake,Neighbour,Mediation With Settlement


In [11]:
# Preprocessing

data = df 
data['date_registered'] = pd.to_datetime(data['date_registered'])
data['is_weekend'] = data['date_registered'].dt.dayofweek.apply(lambda x: 1 if x >= 5 else 0)

clean_mapping = {
    'Applications through Contact Centre': 'Contact Centre',
    'Court-Ordered (Community Dispute Resolution Tribunal)': 'Court-Ordered',
    "Court-Ordered- Magistrate's Complaint": 'Court-Ordered Magistrate',
    "Court-Ordered- Magistrate's Complaint '": 'Court-Ordered Magistrate',
    'Courts': 'Courts',
    'Direct Correspondence': 'Direct Correspondence',
    'Direct Intake': 'Direct Intake',
    'External Agency': 'External Agency',
    'External Agency Referrals': 'External Agency Referrals',
    'External Agency Referrals  -Others': 'External Agency Referrals - Others',
    'External Agency Referrals - Housing Development Board (HDB)': 'External Agency Referrals - HDB',
    'External Agency Referrals - Member of Parliament': 'External Agency Referrals - MP',
    'External Agency Referrals - Others': 'External Agency Referrals - Others',
    'External Agency Referrals - Singapore Police Force (SPF)': 'External Agency Referrals - SPF',
    'External Agency Referrals - Town Council': 'External Agency Referrals - Town Council',
    'External Agency Referrals -Housing Development Board (HDB)': 'External Agency Referrals - HDB',
    'External Agency Referrals -Member of Parliament': 'External Agency Referrals - MP',
    'External Agency Referrals -Town Council': 'External Agency Referrals - Town Council',
    'Walk Ins': 'Walk-Ins'
}
data['type_of_intake'] = data['type_of_intake'].str.replace("\\", "", regex=False).str.strip().replace(clean_mapping)

cat_cols = ['type_of_intake', 'type_of_dispute']
fitted_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    fitted_encoders[col] = le 

features = ['is_weekend', 'type_of_intake', 'type_of_dispute']
joblib.dump(fitted_encoders['type_of_intake'], 'label_encoder_intake.pkl')
joblib.dump(fitted_encoders['type_of_dispute'], 'label_encoder_dispute.pkl')
print("Fitted LabelEncoders saved successfully!")

data['proceed_to_mediation'] = data['outcome_of_cases'].apply(lambda x: 0 if x == 'Not Mediated' else 1)
mediation_cases = data[data['proceed_to_mediation'] == 1].copy()
mediation_cases['settled'] = mediation_cases['outcome_of_cases'].apply(
    lambda x: 1 if x == 'Mediation With Settlement' else 0
)

Fitted LabelEncoders saved successfully!


In [12]:
import joblib
import os

try:
    # Load the fitted LabelEncoder objects
    le_intake = joblib.load("label_encoder_intake.pkl")
    le_dispute = joblib.load("label_encoder_dispute.pkl")

    # Access the classes_ attribute to see the mappings and print them
    print("--- Mapping for 'type_of_intake' ---")
    for i, category in enumerate(le_intake.classes_):
        print(f"'{category}' -> {i}")

    print("\n--- Mapping for 'type_of_dispute' ---")
    for i, category in enumerate(le_dispute.classes_):
        print(f"'{category}' -> {i}")

except FileNotFoundError as e:
    print(f"Error: {e}. Please ensure 'label_encoder_intake.pkl' and 'label_encoder_dispute.pkl' exist in the current directory.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


--- Mapping for 'type_of_intake' ---
'Contact Centre' -> 0
'Court-Ordered' -> 1
'Court-Ordered Magistrate' -> 2
'Courts' -> 3
'Direct Correspondence' -> 4
'Direct Intake' -> 5
'External Agency' -> 6
'External Agency Referrals' -> 7
'External Agency Referrals - HDB' -> 8
'External Agency Referrals - MP' -> 9
'External Agency Referrals - Others' -> 10
'External Agency Referrals - SPF' -> 11
'External Agency Referrals - Town Council' -> 12
'Walk-Ins' -> 13

--- Mapping for 'type_of_dispute' ---
'Colleagues' -> 0
'Commercial Entities' -> 1
'Commercial Entity and An Individual' -> 2
'Family' -> 3
'Friends' -> 4
'Landlord-Tenant' -> 5
'Neighbour' -> 6
'Others' -> 7
'Strangers' -> 8


In [8]:
X = data[features]
y = data['proceed_to_mediation']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define models
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, class_weight='balanced'),
    'RandomForest': RandomForestClassifier(n_estimators=200, max_depth=4, class_weight='balanced', random_state=42),
    'XGBoost': XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.1, eval_metric='logloss', random_state=42),
    'LightGBM': lgb.LGBMClassifier(n_estimators=200, max_depth=4, learning_rate=0.1, class_weight='balanced', random_state=42, verbose=-1, force_row_wise=True),
    # SVM needs scaling and probability=True
    'SVM': Pipeline([
        ('scaler', StandardScaler()),
        ('svc', SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42))
    ]),
    # MLP needs scaling
    'MLP': Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', MLPClassifier(hidden_layer_sizes=(50,50), max_iter=500, random_state=42))
    ])
}


# Train, predict, evaluate
results = []

for name, model in models.items():
    print(f"\n{name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Probabilities
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    else:
        y_prob = model.predict_proba(X_test)[:, 1]
    
    # Metrics
    roc_auc = roc_auc_score(y_test, y_prob)
    print("ROC-AUC:", roc_auc)
    
    results.append({
        'model': name,
        'roc_auc': roc_auc
    })


# Summary table
results_df = pd.DataFrame(results)
print("\nModel Comparison")
print(results_df.sort_values(by='roc_auc', ascending=False))



LogisticRegression
ROC-AUC: 0.6711976682564916

RandomForest
ROC-AUC: 0.7393614202437733

XGBoost
ROC-AUC: 0.7461691329338388

LightGBM
ROC-AUC: 0.7477885124943949

SVM


KeyboardInterrupt: 

In [None]:
X = data[features]
y = data['proceed_to_mediation']

print("Class distribution:")
print(y.value_counts())

# Define imbalance strategies
strategies = {
    "none": None,  # default
    "scale_pos_weight": "scale_pos_weight",  # Adding weight to balance imbalance
}

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = []

for name, strategy in strategies.items():
    print(f"\nStrategy: {name}")
    
    if strategy is None:
        model = XGBClassifier(eval_metric='logloss', random_state=42)
        roc_auc_scores = cross_val_score(model, X, y, cv=kf, scoring='roc_auc')
        print("ROC-AUC scores for each fold:", roc_auc_scores)
        mean_roc_auc = roc_auc_scores.mean()
    
    else: 
        n_pos = y.sum()
        n_neg = len(y) - n_pos
        pos_weight = n_neg / n_pos
        model = XGBClassifier(scale_pos_weight=pos_weight, eval_metric='logloss', random_state=42)
        roc_auc_scores = cross_val_score(model, X, y, cv=kf, scoring='roc_auc')
        print("ROC-AUC scores for each fold:", roc_auc_scores)
        mean_roc_auc = roc_auc_scores.mean()
    
    print("Mean ROC-AUC:", mean_roc_auc)
    
    results.append({"strategy": name, "roc_auc": mean_roc_auc})

# Compare strategies
results_df = pd.DataFrame(results).sort_values(by="roc_auc", ascending=False)
print("\nStrategy Comparison")
print(results_df)

Class distribution:
proceed_to_mediation
0    4417
1    2776
Name: count, dtype: int64

Strategy: none
ROC-AUC scores for each fold: [0.71490053 0.75483266 0.73425495 0.74647343 0.73138869]
Mean ROC-AUC: 0.7363700533417489

Strategy: scale_pos_weight
ROC-AUC scores for each fold: [0.71501264 0.75453712 0.73381091 0.74502872 0.73096018]
Mean ROC-AUC: 0.7358699131139365

Strategy Comparison
           strategy  roc_auc
0              none  0.73637
1  scale_pos_weight  0.73587


In [None]:
X1 = data[features]
y1 = data['proceed_to_mediation']

X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.2, random_state=42, stratify=y1
)

# Optuna objective function
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'eval_metric': 'logloss'
    }
    model = XGBClassifier(**params)
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc = cross_val_score(model, X1_train, y1_train, cv=cv, scoring='roc_auc').mean()
    return auc

# Run Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)  

print("Best trial:")
print(study.best_trial.params)

# Train XGBoost with best params
best_params = study.best_trial.params
model1 = XGBClassifier(**best_params, eval_metric='logloss')
model1.fit(X1_train, y1_train)

# Predictions
y1_pred = model1.predict(X1_test)
y1_prob = model1.predict_proba(X1_test)[:,1]

# Save model
joblib.dump(model1, "xgb_best_proceed_to_mediation.pkl")
print("Model saved to xgb_best_proceed_to_mediation.pkl")

# Evaluation
print("Model 1: Proceed to Mediation")
print(classification_report(y1_test, y1_pred))
print("ROC-AUC Score:", roc_auc_score(y1_test, y1_prob))

[I 2025-09-21 05:19:59,915] A new study created in memory with name: no-name-cc5163b6-9957-4518-9207-3abc1836081c


[I 2025-09-21 05:20:01,691] Trial 0 finished with value: 0.7302916444602742 and parameters: {'n_estimators': 416, 'max_depth': 10, 'learning_rate': 0.22678665838474046, 'subsample': 0.7086713919288441, 'colsample_bytree': 0.7085082097988054, 'gamma': 4.420462995882854, 'reg_alpha': 1.2239464910283075, 'reg_lambda': 2.558475647067113}. Best is trial 0 with value: 0.7302916444602742.
[I 2025-09-21 05:20:04,716] Trial 1 finished with value: 0.7334443339227044 and parameters: {'n_estimators': 358, 'max_depth': 10, 'learning_rate': 0.02264404626476597, 'subsample': 0.7476239499043171, 'colsample_bytree': 0.760972637618154, 'gamma': 1.6555596407377104, 'reg_alpha': 2.5331608681294995, 'reg_lambda': 0.9170154179036927}. Best is trial 1 with value: 0.7334443339227044.
[I 2025-09-21 05:20:06,726] Trial 2 finished with value: 0.7336292558230347 and parameters: {'n_estimators': 495, 'max_depth': 10, 'learning_rate': 0.16190705771511438, 'subsample': 0.7804983848464335, 'colsample_bytree': 0.78420

Best trial:
{'n_estimators': 266, 'max_depth': 8, 'learning_rate': 0.07051473939072679, 'subsample': 0.5002144196617212, 'colsample_bytree': 0.7864623440935495, 'gamma': 0.03602834054381043, 'reg_alpha': 3.574986045903887, 'reg_lambda': 3.1148662391144124}
Model saved to xgb_best_proceed_to_mediation.pkl
Model 1: Proceed to Mediation
              precision    recall  f1-score   support

           0       0.75      0.81      0.78       884
           1       0.65      0.58      0.61       555

    accuracy                           0.72      1439
   macro avg       0.70      0.69      0.70      1439
weighted avg       0.71      0.72      0.71      1439

ROC-AUC Score: 0.751131221719457


# Others

In [None]:
X = mediation_cases[features]
y = mediation_cases['settled']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define models
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, class_weight='balanced'),
    'RandomForest': RandomForestClassifier(n_estimators=400, max_depth=5, class_weight='balanced', random_state=42),
    'XGBoost': XGBClassifier(n_estimators=400, max_depth=5, learning_rate=0.1, eval_metric='logloss', random_state=42),
    'LightGBM': lgb.LGBMClassifier(n_estimators=400, max_depth=5, learning_rate=0.1, class_weight='balanced', random_state=42, verbose=-1, force_row_wise=True),
    # SVM needs scaling and probability=True
    'SVM': Pipeline([
        ('scaler', StandardScaler()),
        ('svc', SVC(kernel='rbf', probability=True, random_state=42))
    ]),
    # MLP needs scaling
    'MLP': Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', MLPClassifier(hidden_layer_sizes=(50,50), max_iter=500, random_state=42))
    ])
}

# Train, predict, evaluate
results = []

for name, model in models.items():
    print(f"\n{name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    else:
        y_prob = model.predict_proba(X_test)[:, 1]
    
    # Metrics
    roc_auc = roc_auc_score(y_test, y_prob)
    print("ROC-AUC:", roc_auc)
    
    results.append({
        'model': name,
        'roc_auc': roc_auc
    })

# Summary table
results_df = pd.DataFrame(results)
print("\nModel Comparison")
print(results_df.sort_values(by='roc_auc', ascending=False))



LogisticRegression
ROC-AUC: 0.5712475335102402

RandomForest
ROC-AUC: 0.651961284615908

XGBoost
ROC-AUC: 0.6533986527862828

LightGBM
ROC-AUC: 0.6496563924610465

SVM
ROC-AUC: 0.49517758726270666

MLP
ROC-AUC: 0.644757433489828

=== Model Comparison ===
                model   roc_auc
2             XGBoost  0.653399
1        RandomForest  0.651961
3            LightGBM  0.649656
5                 MLP  0.644757
0  LogisticRegression  0.571248
4                 SVM  0.495178


In [None]:
# Solving class imabalance
print(mediation_cases['settled'].value_counts())
X = mediation_cases[features]
y = mediation_cases['settled']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define imbalance strategies
strategies = {
    "none": None,
    "scale_pos_weight": "scale_pos_weight",
    "undersample": RandomUnderSampler(sampling_strategy=0.5, random_state=42),
    "oversample": RandomOverSampler(sampling_strategy=0.5, random_state=42)
}


kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []

for name, strategy in strategies.items():
    print(f"\nStrategy: {name}")
    
    if strategy is None:
        model = XGBClassifier(eval_metric='logloss', random_state=42)
        roc_auc = cross_val_score(model, X, y, cv=kf, scoring='roc_auc').mean()
    
    elif strategy == "scale_pos_weight":
        n_pos = y.sum()
        n_neg = len(y) - n_pos
        pos_weight = n_neg / n_pos
        model = XGBClassifier(scale_pos_weight=pos_weight, eval_metric='logloss', random_state=42)
        roc_auc = cross_val_score(model, X, y, cv=kf, scoring='roc_auc').mean()
    
    else:  # undersample or oversample
        model = XGBClassifier(eval_metric='logloss', random_state=42)
        pipeline = ImbPipeline(steps=[('resample', strategy), ('model', model)])
        roc_auc = cross_val_score(pipeline, X, y, cv=kf, scoring='roc_auc').mean()
    
    print("Mean ROC-AUC:", roc_auc)
    
    results.append({"strategy": name, "roc_auc": roc_auc})


# Compare strategies
results_df = pd.DataFrame(results).sort_values(by="roc_auc", ascending=False)
print("\nStrategy Comparison")
print(results_df)

settled
1    2069
0     707
Name: count, dtype: int64

Strategy: none
Mean ROC-AUC: 0.6073036721344015

Strategy: scale_pos_weight
Mean ROC-AUC: 0.6110359965818372

Strategy: undersample
Mean ROC-AUC: 0.6132974175004213

Strategy: oversample
Mean ROC-AUC: 0.6012791874885154

Strategy Comparison
           strategy   roc_auc
2       undersample  0.613297
1  scale_pos_weight  0.611036
0              none  0.607304
3        oversample  0.601279


In [None]:
X = mediation_cases[features]
y = mediation_cases['settled']

X1_train, X1_test, y1_train, y1_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Optuna objective function
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'eval_metric': 'logloss'
    }

    undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
    pipeline = ImbPipeline(steps=[('sampler', undersampler), ('classifier', XGBClassifier(**params))])
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_score = cross_val_score(pipeline, X1_train, y1_train, cv=cv, scoring='roc_auc').mean()
    
    return f1_score

# Run Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

print("Best trial:")
print(study.best_trial.params)

# Train XGBoost with best params
best_params = study.best_trial.params
final_undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
final_pipeline = ImbPipeline(steps=[('sampler', final_undersampler), ('classifier', XGBClassifier(**best_params, eval_metric='logloss'))])

final_pipeline.fit(X1_train, y1_train)

# Predictions
y1_pred = final_pipeline.predict(X1_test)
y1_prob = final_pipeline.predict_proba(X1_test)[:,1]

# Save model
joblib.dump(final_pipeline, "xgb_best_settled_undersampled.pkl")
print("Model saved to xgb_best_settled_undersampled.pkl")

# Evaluation
print("Model 1: Proceed to Mediation")
print(classification_report(y1_test, y1_pred))
print("ROC-AUC Score:", roc_auc_score(y1_test, y1_prob))

[I 2025-09-21 05:22:56,702] A new study created in memory with name: no-name-0e7b8a58-46b9-4802-aecb-87a2064402ce


[I 2025-09-21 05:22:57,151] Trial 0 finished with value: 0.609263962783734 and parameters: {'n_estimators': 88, 'max_depth': 8, 'learning_rate': 0.024723317405224755, 'subsample': 0.8117895640618558, 'colsample_bytree': 0.6511391142806084, 'gamma': 1.1085191809445054, 'reg_alpha': 3.4658719417066157, 'reg_lambda': 1.0690190863504552}. Best is trial 0 with value: 0.609263962783734.
[I 2025-09-21 05:22:57,917] Trial 1 finished with value: 0.6044969654840521 and parameters: {'n_estimators': 210, 'max_depth': 9, 'learning_rate': 0.033242439349946286, 'subsample': 0.5489571040729241, 'colsample_bytree': 0.7401245203132676, 'gamma': 2.464040230292663, 'reg_alpha': 4.729363268445475, 'reg_lambda': 2.0683563229830746}. Best is trial 0 with value: 0.609263962783734.
[I 2025-09-21 05:22:58,729] Trial 2 finished with value: 0.6133171136005133 and parameters: {'n_estimators': 204, 'max_depth': 9, 'learning_rate': 0.05135956719378261, 'subsample': 0.7540633689620768, 'colsample_bytree': 0.796022198