### Dataset ready

In [None]:
import pandas as pd
import random
import uuid
import math
from faker import Faker

fake = Faker("en_IN")

STATES = [
    "Maharashtra","Delhi","Tamil Nadu","Uttar Pradesh","Gujarat",
    "Karnataka","Rajasthan","West Bengal","Punjab","Haryana",
    "Telangana","Andhra Pradesh","Chhattisgarh","Odisha","Bihar",
    "Jharkhand","Assam","Goa","Manipur","Meghalaya","Mizoram",
    "Nagaland","Sikkim","Tripura"
]

DISPUTE_TYPES = [
    "invoice_non_payment",
    "interest_on_delay",
    "goods_rejection",
    "short_payment",
    "service_non_payment",
    "others"
]

DOCUMENTS = [
    "invoice",
    "purchase_order",
    "delivery_challan",
    "email_correspondence"
]

# Simulate state efficiency factor (legal speed index)
STATE_EFFICIENCY = {state: random.uniform(0.8, 1.2) for state in STATES}

def sigmoid(x):
    return 1 / (1 + math.exp(-x))

def generate_case():

    claim = round(random.uniform(50_000, 5_000_000), 2)
    delay_days = random.randint(30, 800)
    docs = random.sample(DOCUMENTS, k=random.randint(1, 4))
    doc_score = round(len(docs) / 4, 2)
    dispute_type = random.choice(DISPUTE_TYPES)
    jurisdiction = random.choice(STATES)

    # Latent opponent cooperation factor
    cooperation_factor = random.uniform(-1, 1)

    # --- Domain-Driven Scoring ---

    score = 0

    # Claim effect (small claims settle faster)
    if claim < 500_000:
        score += 1.5
    elif claim < 2_000_000:
        score += 0.5
    else:
        score -= 1

    # Delay effect (longer delay increases pressure)
    if delay_days > 365:
        score += 2
    elif delay_days > 180:
        score += 1

    # Documentation strength
    score += doc_score * 2

    # Dispute type effect
    if dispute_type in ["invoice_non_payment", "service_non_payment"]:
        score += 1.5
    elif dispute_type == "goods_rejection":
        score -= 1

    # Jurisdiction efficiency
    score *= STATE_EFFICIENCY[jurisdiction]

    # Add cooperation randomness
    score += cooperation_factor

    # Convert score to probability
    settlement_probability = sigmoid(score - 2)

    is_settlement = 1 if random.random() < settlement_probability else 0

    # Determine final outcome
    if is_settlement:
        outcome = "settlement"
        settle_min = round(random.uniform(0.65, 0.85), 2)
        settle_max = round(random.uniform(settle_min, 0.95), 2)
    else:
        outcome = random.choices(
            ["award_in_favor", "rejected", "pending"],
            weights=[0.4, 0.3, 0.3]
        )[0]
        settle_min = None
        settle_max = None

    return {
        "case_id": f"SYN_{uuid.uuid4().hex[:10]}",
        "dispute_type": dispute_type,
        "claim_amount": claim,
        "delay_days": delay_days,
        "document_count": len(docs),
        "document_completeness_score": doc_score,
        "jurisdiction": jurisdiction,
        "final_outcome": outcome,
        "settlement_min_ratio": settle_min,
        "settlement_max_ratio": settle_max,
        "is_settlement": is_settlement
    }


# Generate dataset
N = 20000
data = [generate_case() for _ in range(N)]

with open("msme_synthetic_cases.json", "w") as f:
    import json
    json.dump(data, f, indent=2)

print("✅ Improved dataset generated:", len(data))


# Traiing cell

In [None]:
import json
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, classification_report
from xgboost import XGBClassifier

# Load JSON
with open("msme_synthetic_cases.json") as f:
    data = json.load(f)

df = pd.DataFrame(data)

# Encode categorical fields
dispute_encoder = LabelEncoder()
state_encoder = LabelEncoder()
df["dispute_type_enc"] = dispute_encoder.fit_transform(df["dispute_type"])
df["jurisdiction_enc"] = state_encoder.fit_transform(df["jurisdiction"])

FEATURES = [
    "claim_amount",
    "delay_days",
    "document_count",
    "document_completeness_score",
    "dispute_type_enc",
    "jurisdiction_enc"
]

X = df[FEATURES]
y = df["is_settlement"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric="logloss",
    random_state=42
)

model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]
y_pred = (y_prob > 0.3).astype(int)
# cross validation here 
from sklearn.model_selection import cross_val_score

cv_auc = cross_val_score(
    model,
    X,
    y,
    cv=5,
    scoring="roc_auc"
)

print("Cross-Validated AUC Scores:", cv_auc)
print("Mean CV AUC:", round(cv_auc.mean(), 3))
print("Std Dev CV AUC:", round(cv_auc.std(), 3))

joblib.dump(model, "xgb_model.pkl")
joblib.dump(dispute_encoder, "dispute_encoder.pkl")
joblib.dump(state_encoder, "state_encoder.pkl")
print("Model saved as xgb_model.pkl")
print("ROC-AUC:", round(roc_auc_score(y_test, y_prob), 3))
print(classification_report(y_test, y_pred))
from sklearn.metrics import roc_curve
import numpy as np

fpr, tpr, thresholds = roc_curve(y_test, y_prob)

# Youden’s J statistic (tpr - fpr)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]

print("Optimal Threshold:", round(optimal_threshold, 3))

# Recalculate predictions
y_pred_opt = (y_prob > optimal_threshold).astype(int)

print(classification_report(y_test, y_pred_opt))



In [33]:
import pandas as pd

importance_df = pd.DataFrame({
    "feature": FEATURES,
    "importance": model.feature_importances_
}).sort_values(by="importance", ascending=False)

print(importance_df)


                       feature  importance
4             dispute_type_enc    0.301686
2               document_count    0.180041
1                   delay_days    0.163183
0                 claim_amount    0.153753
3  document_completeness_score    0.124395
5             jurisdiction_enc    0.076944


In [34]:
import numpy as np

# get contribution values
contrib = model.get_booster().predict(
    xgb.DMatrix(X_test),
    pred_contribs=True
)

# contributions for first case
contrib_df = pd.DataFrame({
    "feature": FEATURES + ["bias"],
    "contribution": contrib[0]
})

print(contrib_df.sort_values(by="contribution", ascending=False))


NameError: name 'xgb' is not defined

In [None]:
!pip install shap