In [9]:
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import lightgbm as lgb
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline
import optuna

df_date = pd.read_csv(r"RegisteredCasesattheCommunityMediationCentre.csv")
df_type = pd.read_csv(r"SourceofCasesRegisteredattheCommunityMediationCentre.csv")
df_rs = pd.read_csv(r"RelationshipofPartiesinCasesRegisteredattheCommunityMediationCentre.csv")
df_outcome = pd.read_csv(r"OutcomeofCasesRegisteredattheCommunityMediationCentre.csv")


In [10]:
df = df_date.merge(
    df_type,
    on=['case_number'],
    how="inner"
)
df = df.merge(
    df_rs,
    on=['case_number'],
    how="inner"
)
df = df.merge(
    df_outcome,
    on=['case_number'],
    how="inner"
)
df.head(5)

Unnamed: 0,date_registered,case_number,type_of_intake,type_of_dispute,outcome_of_cases
0,2011-01-03,CAS-02624-B3K1R0,Direct Intake,Friends,Not Mediated
1,2011-01-03,CAS-02766-L1V3G1,External Agency,Neighbour,Not Mediated
2,2011-01-03,CAS-02549-D6L2N3,External Agency,Neighbour,Not Mediated
3,2011-01-03,CAS-02376-R0X8R9,External Agency,Neighbour,Not Mediated
4,2011-01-04,CAS-02790-Y9S0Y4,Direct Intake,Neighbour,Mediation With Settlement


In [11]:
# Preprocessing

data = df 
data['date_registered'] = pd.to_datetime(data['date_registered'])
data['is_weekend'] = data['date_registered'].dt.dayofweek.apply(lambda x: 1 if x >= 5 else 0)

clean_mapping = {
    'Applications through Contact Centre': 'Contact Centre',
    'Court-Ordered (Community Dispute Resolution Tribunal)': 'Court-Ordered',
    "Court-Ordered- Magistrate's Complaint": 'Court-Ordered Magistrate',
    "Court-Ordered- Magistrate's Complaint '": 'Court-Ordered Magistrate',
    'Courts': 'Courts',
    'Direct Correspondence': 'Direct Correspondence',
    'Direct Intake': 'Direct Intake',
    'External Agency': 'External Agency',
    'External Agency Referrals': 'External Agency Referrals',
    'External Agency Referrals  -Others': 'External Agency Referrals - Others',
    'External Agency Referrals - Housing Development Board (HDB)': 'External Agency Referrals - HDB',
    'External Agency Referrals - Member of Parliament': 'External Agency Referrals - MP',
    'External Agency Referrals - Others': 'External Agency Referrals - Others',
    'External Agency Referrals - Singapore Police Force (SPF)': 'External Agency Referrals - SPF',
    'External Agency Referrals - Town Council': 'External Agency Referrals - Town Council',
    'External Agency Referrals -Housing Development Board (HDB)': 'External Agency Referrals - HDB',
    'External Agency Referrals -Member of Parliament': 'External Agency Referrals - MP',
    'External Agency Referrals -Town Council': 'External Agency Referrals - Town Council',
    'Walk Ins': 'Walk-Ins'
}
data['type_of_intake'] = data['type_of_intake'].str.replace("\\", "", regex=False).str.strip().replace(clean_mapping)

cat_cols = ['type_of_intake', 'type_of_dispute']
fitted_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    fitted_encoders[col] = le 

features = ['is_weekend', 'type_of_intake', 'type_of_dispute']
joblib.dump(fitted_encoders['type_of_intake'], 'label_encoder_intake.pkl')
joblib.dump(fitted_encoders['type_of_dispute'], 'label_encoder_dispute.pkl')
print("Fitted LabelEncoders saved successfully!")

data['proceed_to_mediation'] = data['outcome_of_cases'].apply(lambda x: 0 if x == 'Not Mediated' else 1)
mediation_cases = data[data['proceed_to_mediation'] == 1].copy()
mediation_cases['settled'] = mediation_cases['outcome_of_cases'].apply(
    lambda x: 1 if x == 'Mediation With Settlement' else 0
)

Fitted LabelEncoders saved successfully!


In [12]:
import joblib
import os

try:
    # Load the fitted LabelEncoder objects
    le_intake = joblib.load("label_encoder_intake.pkl")
    le_dispute = joblib.load("label_encoder_dispute.pkl")

    # Access the classes_ attribute to see the mappings and print them
    print("--- Mapping for 'type_of_intake' ---")
    for i, category in enumerate(le_intake.classes_):
        print(f"'{category}' -> {i}")

    print("\n--- Mapping for 'type_of_dispute' ---")
    for i, category in enumerate(le_dispute.classes_):
        print(f"'{category}' -> {i}")

except FileNotFoundError as e:
    print(f"Error: {e}. Please ensure 'label_encoder_intake.pkl' and 'label_encoder_dispute.pkl' exist in the current directory.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


--- Mapping for 'type_of_intake' ---
'Contact Centre' -> 0
'Court-Ordered' -> 1
'Court-Ordered Magistrate' -> 2
'Courts' -> 3
'Direct Correspondence' -> 4
'Direct Intake' -> 5
'External Agency' -> 6
'External Agency Referrals' -> 7
'External Agency Referrals - HDB' -> 8
'External Agency Referrals - MP' -> 9
'External Agency Referrals - Others' -> 10
'External Agency Referrals - SPF' -> 11
'External Agency Referrals - Town Council' -> 12
'Walk-Ins' -> 13

--- Mapping for 'type_of_dispute' ---
'Colleagues' -> 0
'Commercial Entities' -> 1
'Commercial Entity and An Individual' -> 2
'Family' -> 3
'Friends' -> 4
'Landlord-Tenant' -> 5
'Neighbour' -> 6
'Others' -> 7
'Strangers' -> 8


# Model 1

In [13]:
X = data[features]
y = data['proceed_to_mediation']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define models
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, class_weight='balanced'),
    'RandomForest': RandomForestClassifier(n_estimators=200, max_depth=4, class_weight='balanced', random_state=42),
    'XGBoost': XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.1, eval_metric='logloss', random_state=42),
    'LightGBM': lgb.LGBMClassifier(n_estimators=200, max_depth=4, learning_rate=0.1, class_weight='balanced', random_state=42, verbose=-1, force_row_wise=True),
    # SVM needs scaling and probability=True
    'SVM': Pipeline([
        ('scaler', StandardScaler()),
        ('svc', SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42))
    ]),
    # MLP needs scaling
    'MLP': Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', MLPClassifier(hidden_layer_sizes=(50,50), max_iter=500, random_state=42))
    ])
}


# Train, predict, evaluate
results = []

for name, model in models.items():
    print(f"\n{name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Probabilities
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    else:
        y_prob = model.predict_proba(X_test)[:, 1]
    
    # Metrics
    roc_auc = roc_auc_score(y_test, y_prob)
    print("ROC-AUC:", roc_auc)
    
    results.append({
        'model': name,
        'roc_auc': roc_auc
    })


# Summary table
results_df = pd.DataFrame(results)
print("\nModel Comparison")
print(results_df.sort_values(by='roc_auc', ascending=False))



LogisticRegression
ROC-AUC: 0.6712241653418125

RandomForest
ROC-AUC: 0.7393614202437733

XGBoost
ROC-AUC: 0.7461691329338388

LightGBM
ROC-AUC: 0.7482695365048306

SVM
ROC-AUC: 0.7378887937711467

MLP
ROC-AUC: 0.7421344421344421

Model Comparison
                model   roc_auc
3            LightGBM  0.748270
2             XGBoost  0.746169
5                 MLP  0.742134
1        RandomForest  0.739361
4                 SVM  0.737889
0  LogisticRegression  0.671224


In [14]:
X = data[features]
y = data['proceed_to_mediation']

print("Class distribution:")
print(y.value_counts())

# Define imbalance strategies
strategies = {
    "none": None,  # default
    "scale_pos_weight": "scale_pos_weight",  # Adding weight to balance imbalance
}

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = []

for name, strategy in strategies.items():
    print(f"\nStrategy: {name}")
    
    if strategy is None:
        model = XGBClassifier(eval_metric='logloss', random_state=42)
        roc_auc_scores = cross_val_score(model, X, y, cv=kf, scoring='roc_auc')
        print("ROC-AUC scores for each fold:", roc_auc_scores)
        mean_roc_auc = roc_auc_scores.mean()
    
    else: 
        n_pos = y.sum()
        n_neg = len(y) - n_pos
        pos_weight = n_neg / n_pos
        model = XGBClassifier(scale_pos_weight=pos_weight, eval_metric='logloss', random_state=42)
        roc_auc_scores = cross_val_score(model, X, y, cv=kf, scoring='roc_auc')
        print("ROC-AUC scores for each fold:", roc_auc_scores)
        mean_roc_auc = roc_auc_scores.mean()
    
    print("Mean ROC-AUC:", mean_roc_auc)
    
    results.append({"strategy": name, "roc_auc": mean_roc_auc})

# Compare strategies
results_df = pd.DataFrame(results).sort_values(by="roc_auc", ascending=False)
print("\nStrategy Comparison")
print(results_df)

Class distribution:
proceed_to_mediation
0    4417
1    2776
Name: count, dtype: int64

Strategy: none
ROC-AUC scores for each fold: [0.71491684 0.75438017 0.73422644 0.74640201 0.73138869]
Mean ROC-AUC: 0.7362628297127627

Strategy: scale_pos_weight
ROC-AUC scores for each fold: [0.71489442 0.75501406 0.73383128 0.74491853 0.73096018]
Mean ROC-AUC: 0.7359236949262945

Strategy Comparison
           strategy   roc_auc
0              none  0.736263
1  scale_pos_weight  0.735924


In [15]:
X1 = data[features]
y1 = data['proceed_to_mediation']

X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.2, random_state=42, stratify=y1
)

# Optuna objective function
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'eval_metric': 'logloss'
    }
    model = XGBClassifier(**params)
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc = cross_val_score(model, X1_train, y1_train, cv=cv, scoring='roc_auc').mean()
    return auc

# Run Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)  

print("Best trial:")
print(study.best_trial.params)

# Train XGBoost with best params
best_params = study.best_trial.params
model1 = XGBClassifier(**best_params, eval_metric='logloss')
model1.fit(X1_train, y1_train)

# Predictions
y1_pred = model1.predict(X1_test)
y1_prob = model1.predict_proba(X1_test)[:,1]

# Save model
joblib.dump(model1, "xgb_best_proceed_to_mediation.pkl")
print("Model saved to xgb_best_proceed_to_mediation.pkl")

# Evaluation
print("Model 1: Proceed to Mediation")
print(classification_report(y1_test, y1_pred))
print("ROC-AUC Score:", roc_auc_score(y1_test, y1_prob))

[I 2025-09-21 05:54:13,029] A new study created in memory with name: no-name-27484f23-751e-4eff-96fc-e6b518b1fdb3


[I 2025-09-21 05:54:15,708] Trial 0 finished with value: 0.7294646144081959 and parameters: {'n_estimators': 445, 'max_depth': 10, 'learning_rate': 0.09891525085426658, 'subsample': 0.8190944508970107, 'colsample_bytree': 0.7138656024413217, 'gamma': 3.622970352718611, 'reg_alpha': 4.135602572191729, 'reg_lambda': 4.044916154147586}. Best is trial 0 with value: 0.7294646144081959.
[I 2025-09-21 05:54:19,263] Trial 1 finished with value: 0.7244204975882355 and parameters: {'n_estimators': 253, 'max_depth': 7, 'learning_rate': 0.021247661363377208, 'subsample': 0.7214220855872409, 'colsample_bytree': 0.5764095871281757, 'gamma': 0.3998532888491413, 'reg_alpha': 4.133446587965835, 'reg_lambda': 3.902738860790656}. Best is trial 0 with value: 0.7294646144081959.
[I 2025-09-21 05:54:24,607] Trial 2 finished with value: 0.7332053621546712 and parameters: {'n_estimators': 459, 'max_depth': 3, 'learning_rate': 0.011129737831156573, 'subsample': 0.7706217972218773, 'colsample_bytree': 0.8625625

Best trial:
{'n_estimators': 379, 'max_depth': 8, 'learning_rate': 0.15532074809497792, 'subsample': 0.5053783474006384, 'colsample_bytree': 0.5250440580389345, 'gamma': 1.3322317854765875, 'reg_alpha': 1.1514206778435376, 'reg_lambda': 2.5760223616923694}
Model saved to xgb_best_proceed_to_mediation.pkl
Model 1: Proceed to Mediation
              precision    recall  f1-score   support

           0       0.75      0.81      0.78       884
           1       0.65      0.58      0.61       555

    accuracy                           0.72      1439
   macro avg       0.70      0.69      0.70      1439
weighted avg       0.71      0.72      0.72      1439

ROC-AUC Score: 0.7485416411886999


# Model 2

In [16]:
X = mediation_cases[features]
y = mediation_cases['settled']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define models
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, class_weight='balanced'),
    'RandomForest': RandomForestClassifier(n_estimators=400, max_depth=5, class_weight='balanced', random_state=42),
    'XGBoost': XGBClassifier(n_estimators=400, max_depth=5, learning_rate=0.1, eval_metric='logloss', random_state=42),
    'LightGBM': lgb.LGBMClassifier(n_estimators=400, max_depth=5, learning_rate=0.1, class_weight='balanced', random_state=42, verbose=-1, force_row_wise=True),
    # SVM needs scaling and probability=True
    'SVM': Pipeline([
        ('scaler', StandardScaler()),
        ('svc', SVC(kernel='rbf', probability=True, random_state=42))
    ]),
    # MLP needs scaling
    'MLP': Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', MLPClassifier(hidden_layer_sizes=(50,50), max_iter=500, random_state=42))
    ])
}

# Train, predict, evaluate
results = []

for name, model in models.items():
    print(f"\n{name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    else:
        y_prob = model.predict_proba(X_test)[:, 1]
    
    # Metrics
    roc_auc = roc_auc_score(y_test, y_prob)
    print("ROC-AUC:", roc_auc)
    
    results.append({
        'model': name,
        'roc_auc': roc_auc
    })

# Summary table
results_df = pd.DataFrame(results)
print("\nModel Comparison")
print(results_df.sort_values(by='roc_auc', ascending=False))



LogisticRegression
ROC-AUC: 0.5715537184459414

RandomForest
ROC-AUC: 0.651961284615908

XGBoost
ROC-AUC: 0.6536197863509559

LightGBM
ROC-AUC: 0.6496563924610465

SVM
ROC-AUC: 0.4598642580118391

MLP
ROC-AUC: 0.637357964210383

Model Comparison
                model   roc_auc
2             XGBoost  0.653620
1        RandomForest  0.651961
3            LightGBM  0.649656
5                 MLP  0.637358
0  LogisticRegression  0.571554
4                 SVM  0.459864


In [17]:
# Solving class imabalance
print(mediation_cases['settled'].value_counts())
X = mediation_cases[features]
y = mediation_cases['settled']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define imbalance strategies
strategies = {
    "none": None,
    "scale_pos_weight": "scale_pos_weight",
    "undersample": RandomUnderSampler(sampling_strategy=0.5, random_state=42),
    "oversample": RandomOverSampler(sampling_strategy=0.5, random_state=42)
}


kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []

for name, strategy in strategies.items():
    print(f"\nStrategy: {name}")
    
    if strategy is None:
        model = XGBClassifier(eval_metric='logloss', random_state=42)
        roc_auc = cross_val_score(model, X, y, cv=kf, scoring='roc_auc').mean()
    
    elif strategy == "scale_pos_weight":
        n_pos = y.sum()
        n_neg = len(y) - n_pos
        pos_weight = n_neg / n_pos
        model = XGBClassifier(scale_pos_weight=pos_weight, eval_metric='logloss', random_state=42)
        roc_auc = cross_val_score(model, X, y, cv=kf, scoring='roc_auc').mean()
    
    else:  # undersample or oversample
        model = XGBClassifier(eval_metric='logloss', random_state=42)
        pipeline = ImbPipeline(steps=[('resample', strategy), ('model', model)])
        roc_auc = cross_val_score(pipeline, X, y, cv=kf, scoring='roc_auc').mean()
    
    print("Mean ROC-AUC:", roc_auc)
    
    results.append({"strategy": name, "roc_auc": roc_auc})


# Compare strategies
results_df = pd.DataFrame(results).sort_values(by="roc_auc", ascending=False)
print("\nStrategy Comparison")
print(results_df)

settled
1    2069
0     707
Name: count, dtype: int64

Strategy: none
Mean ROC-AUC: 0.6074113310779415

Strategy: scale_pos_weight
Mean ROC-AUC: 0.6103207847886492

Strategy: undersample
Mean ROC-AUC: 0.612559181181011

Strategy: oversample
Mean ROC-AUC: 0.6010532165714321

Strategy Comparison
           strategy   roc_auc
2       undersample  0.612559
1  scale_pos_weight  0.610321
0              none  0.607411
3        oversample  0.601053


In [18]:
X = mediation_cases[features]
y = mediation_cases['settled']

X1_train, X1_test, y1_train, y1_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Optuna objective function
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'eval_metric': 'logloss'
    }

    undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
    pipeline = ImbPipeline(steps=[('sampler', undersampler), ('classifier', XGBClassifier(**params))])
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_score = cross_val_score(pipeline, X1_train, y1_train, cv=cv, scoring='roc_auc').mean()
    
    return f1_score

# Run Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

print("Best trial:")
print(study.best_trial.params)

# Train XGBoost with best params
best_params = study.best_trial.params
final_undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
final_pipeline = ImbPipeline(steps=[('sampler', final_undersampler), ('classifier', XGBClassifier(**best_params, eval_metric='logloss'))])

final_pipeline.fit(X1_train, y1_train)

# Predictions
y1_pred = final_pipeline.predict(X1_test)
y1_prob = final_pipeline.predict_proba(X1_test)[:,1]

# Save model
joblib.dump(final_pipeline, "xgb_best_settled_undersampled.pkl")
print("Model saved to xgb_best_settled_undersampled.pkl")

# Evaluation
print("Model 1: Proceed to Mediation")
print(classification_report(y1_test, y1_pred))
print("ROC-AUC Score:", roc_auc_score(y1_test, y1_prob))

[I 2025-09-21 05:55:59,737] A new study created in memory with name: no-name-19aff0fd-4b68-4ff4-924c-4bc9f9139b4e
[I 2025-09-21 05:56:03,659] Trial 0 finished with value: 0.6108440499425181 and parameters: {'n_estimators': 391, 'max_depth': 8, 'learning_rate': 0.014714639931749283, 'subsample': 0.6568526303724113, 'colsample_bytree': 0.5259549842900505, 'gamma': 0.18356793448023712, 'reg_alpha': 3.308992697972048, 'reg_lambda': 3.4457385906334808}. Best is trial 0 with value: 0.6108440499425181.
[I 2025-09-21 05:56:06,922] Trial 1 finished with value: 0.6072962061866696 and parameters: {'n_estimators': 326, 'max_depth': 5, 'learning_rate': 0.10053973968838023, 'subsample': 0.6165801524271033, 'colsample_bytree': 0.9658817728027016, 'gamma': 1.700794433048407, 'reg_alpha': 4.1121101571410765, 'reg_lambda': 4.467798208990697}. Best is trial 0 with value: 0.6108440499425181.
[I 2025-09-21 05:56:09,095] Trial 2 finished with value: 0.6186402160254525 and parameters: {'n_estimators': 129, '

Best trial:
{'n_estimators': 129, 'max_depth': 10, 'learning_rate': 0.15681243253500787, 'subsample': 0.6436439662754474, 'colsample_bytree': 0.9232996275337322, 'gamma': 0.05009685839428957, 'reg_alpha': 1.9795832219259324, 'reg_lambda': 4.085333194196966}
Model saved to xgb_best_settled_undersampled.pkl
Model 1: Proceed to Mediation
              precision    recall  f1-score   support

           0       0.34      0.77      0.47       142
           1       0.86      0.49      0.63       414

    accuracy                           0.56       556
   macro avg       0.60      0.63      0.55       556
weighted avg       0.73      0.56      0.59       556

ROC-AUC Score: 0.6517316459141321
