# Reordered / Minimal Notebook: Predicting multiple binary targets
**Purpose:** A compact notebook that keeps only the necessary steps:
- Load data
- Quick EDA (target identification, missingness, class balance)
- Preprocessing
- Train multi-output RandomForest (per-target class balance considered)
- Evaluate using precision/recall/F1 for the 1-class and 0-class
- Predict on unlabeled clients and discuss validation strategy


In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, precision_recall_fscore_support
import joblib


In [None]:
# Read the raw client table; change the path here if the CSV lives elsewhere.
df = pd.read_csv(
    "Technical_Interview_dataset2.csv",
)
# Report the table dimensions as a (rows, cols) tuple.
print(f'Rows, cols: {df.shape}')
# Preview the first five records (last expression, so the notebook renders it).
df.head(5)

In [None]:
# Targets are the columns whose name carries the '_tgt' suffix; every other
# column except the 'client_id' key is treated as a model feature.
target_cols = list(filter(lambda name: name.endswith('_tgt'), df.columns))
print('Targets found:', target_cols)
feature_cols = [
    name
    for name in df.columns
    if name != 'client_id' and name not in target_cols
]
print('Number of features:', len(feature_cols))
# Sanity check: how many rows repeat a client_id that already appeared earlier?
print('Duplicate client_id count:', df.duplicated(subset=['client_id']).sum())

In [None]:
# Per-target share of missing labels (largest first) and the raw value counts
# of every target, with NaN counted as its own value.
missing = df[target_cols].isna().mean().sort_values(ascending=False)
counts = df[target_cols].apply(pd.Series.value_counts, dropna=False)
display(missing.rename('fraction_null').to_frame().T)
display(counts)
# The ten features with the highest share of missing values.
feat_missing = df[feature_cols].isna().mean().sort_values(ascending=False).iloc[:10]
display(feat_missing.rename('fraction_null').to_frame())

In [None]:
# Keep only the clients that carry a label for at least one target; rows with
# every target missing are left out of the modeling frame.
has_label = df[target_cols].notna().any(axis=1)
df_model = df.loc[has_label].reset_index(drop=True)
print('Rows with at least one label:', len(df_model))

# Feature matrix and label frame. Y deliberately keeps its NaNs so that later
# cells can tell which (row, target) pairs are actually labeled.
X = df_model.loc[:, feature_cols].copy()
Y = df_model.loc[:, target_cols].copy()
(X.shape, Y.shape)

In [None]:
# Preprocessing:
#   * numeric features     -> median imputation, then standard scaling
#   * non-numeric features -> fill NaN with the literal 'missing', cast to str, one-hot encode
# NOTE(review): the imputer and scaler are fitted on every modeling row, i.e. before the
# train/test split that happens in a later cell, so held-out rows influence the fitted
# medians and scales. Fit inside the training fold if a strict hold-out estimate is needed.
num_cols = X.select_dtypes(include=['number']).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

# SimpleImputer(strategy='median') discards columns that are entirely NaN at fit time.
# The transformed array would then be narrower than `num_cols` and the DataFrame
# construction below would raise a shape mismatch, so drop such columns up front and
# keep `num_cols` in sync with what the pipeline actually outputs.
empty_num_cols = [c for c in num_cols if X[c].isnull().all()]
if empty_num_cols:
    print('Dropping all-NaN numeric cols:', empty_num_cols)
    num_cols = [c for c in num_cols if c not in empty_num_cols]

print('Numeric cols:', len(num_cols), 'Categorical cols:', len(cat_cols))

# Numeric pipeline; kept as a fitted object because later cells call num_pipe.transform(...)
num_pipe = Pipeline([
    ('imp', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

X_num = pd.DataFrame(num_pipe.fit_transform(X[num_cols]), columns=num_cols, index=X.index)

# Categorical handling (lightweight). drop_first=True removes one level per column as
# the implicit baseline; X_cat.columns is the reference layout for any data scored later.
if cat_cols:
    X_cat = X[cat_cols].fillna('missing').astype(str)
    X_cat = pd.get_dummies(X_cat, drop_first=True)
    X_prepared = pd.concat([X_num, X_cat], axis=1)
else:
    X_prepared = X_num

print('Prepared X shape:', X_prepared.shape)

In [None]:
# Hold out 20% of the rows for testing. Stratification uses a single proxy column:
# the first target (in column order) that has at least one non-null label.
first_label = next((c for c in target_cols if df_model[c].notnull().any()), None)
print('Proxy stratify target:', first_label)

# Rows where the proxy target is unlabeled are counted as class 0, purely for stratifying.
y_strat = df_model[first_label].fillna(0)
X_train, X_test, y_train_df, y_test_df = train_test_split(
    X_prepared,
    Y,
    stratify=y_strat,
    test_size=0.2,
    random_state=42,
)
print('Train/test shapes:', X_train.shape, X_test.shape)

In [None]:
# Train one RandomForest per target and expose them through a MultiOutputClassifier.
#
# Why not multi_clf.fit(X_train, y_train_df.fillna(0))? Filling missing labels with 0
# teaches every forest that an *unlabeled* client is a *negative* one, which injects
# label noise and biases the model against the 1 class. MultiOutputClassifier.fit has
# no per-target row mask, so each forest is fitted here on the rows that actually carry
# a label for its target, and the fitted forests are then installed on the wrapper.
# multi_clf.predict(X) still returns an (n_samples, n_targets) array in target_cols order.
from sklearn.base import clone  # local import: only this cell needs it

# class_weight='balanced' re-weights classes inversely to their frequency so the
# minority (1) class is not drowned out.
base_rf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42, class_weight='balanced')
multi_clf = MultiOutputClassifier(base_rf, n_jobs=-1)

fitted_estimators = []
for tgt in target_cols:
    labeled = y_train_df[tgt].notnull()
    if labeled.any():
        X_fit, y_fit = X_train.loc[labeled], y_train_df.loc[labeled, tgt]
    else:
        # No labeled training rows for this target: fall back to the all-zero labels
        # so the wrapper still has one estimator per target column.
        print(f'No training labels for {tgt}; fitting a constant-0 model')
        X_fit, y_fit = X_train, y_train_df[tgt].fillna(0)
    fitted_estimators.append(clone(base_rf).fit(X_fit, y_fit))

# Mirror the attributes MultiOutputClassifier.fit would have set.
multi_clf.estimators_ = fitted_estimators
multi_clf.classes_ = [est.classes_ for est in fitted_estimators]

# Save model
joblib.dump(multi_clf, '/mnt/data/multi_clf_joblib.pkl')
print('Model trained and saved to /mnt/data/multi_clf_joblib.pkl')

In [None]:
# Evaluate each target on the test rows that actually have a label for it.
# Predict once for the whole test set: multi_clf.predict runs every per-target
# estimator, so calling it inside the loop would repeat all of that work per target.
y_pred_all = np.asarray(multi_clf.predict(X_test))
if y_pred_all.ndim == 1:
    # Single-output fallback: keep the column indexing below valid.
    y_pred_all = y_pred_all.reshape(-1, 1)

results = {}
for i, tgt in enumerate(target_cols):
    # Rows whose true label for this target is missing are excluded from its metrics.
    mask = y_test_df[tgt].notnull()
    if not mask.any():
        print(f'No test labels for {tgt}, skipping')
        continue
    y_true = y_test_df.loc[mask, tgt].astype(int)
    y_pred_t = y_pred_all[mask.to_numpy(), i]
    # labels=[1, 0] fixes the order of the returned arrays: index 0 -> class 1, index 1 -> class 0.
    p, r, f, sup = precision_recall_fscore_support(y_true, y_pred_t, labels=[1, 0], zero_division=0)
    results[tgt] = {
        'precision_1': p[0], 'recall_1': r[0], 'f1_1': f[0],
        'precision_0': p[1], 'recall_0': r[1], 'f1_0': f[1],
        'support': int(sup.sum()),  # number of labeled test rows for this target (both classes)
    }

# One row per evaluated target, best 1-class F1 first. Sorting is skipped when no
# target could be evaluated, because the 'f1_1' column would not exist.
results_df = pd.DataFrame.from_dict(results, orient='index')
if not results_df.empty:
    results_df = results_df.sort_values('f1_1', ascending=False)
results_df

In [None]:
# Score the population that is missing at least one target label.
# A row may still carry known labels for some targets; those are preserved below.
df_unlabeled = df[df[target_cols].isnull().any(axis=1)].copy()
print('Unlabeled rows count:', df_unlabeled.shape[0])

if not df_unlabeled.empty:
    X_unl = df_unlabeled[feature_cols]
    # Reuse the already-fitted numeric pipeline (transform only, no refit).
    X_unl_num = pd.DataFrame(num_pipe.transform(X_unl[num_cols]), columns=num_cols, index=X_unl.index)
    if cat_cols:
        X_unl_cat = X_unl[cat_cols].fillna('missing').astype(str)
        # Encode every level here (no drop_first). With drop_first=True the dropped
        # baseline depends on which levels happen to occur in *this* subset, and it can
        # differ from the baseline dropped at training time; rows would then be silently
        # encoded as the wrong category after the reindex.
        X_unl_cat = pd.get_dummies(X_unl_cat)
        # Align to the training layout: unseen levels and the training baseline column
        # are dropped, training levels absent from this subset become all-zero columns.
        X_unl_cat = X_unl_cat.reindex(columns=X_cat.columns, fill_value=0)
        X_unl_prepared = pd.concat([X_unl_num, X_unl_cat], axis=1)
    else:
        X_unl_prepared = X_unl_num
    preds = multi_clf.predict(X_unl_prepared)
    preds_df = pd.DataFrame(preds, columns=target_cols, index=df_unlabeled.index)
    # Fill only the missing target cells with predictions; observed labels stay untouched.
    known = df_unlabeled[target_cols].notnull()
    df_unlabeled[target_cols] = df_unlabeled[target_cols].where(known, preds_df)
    # An expression inside an `if` block is not auto-rendered by the notebook, so display it.
    display(df_unlabeled[['client_id'] + target_cols].head())
    df_unlabeled.to_csv('/mnt/data/unlabeled_with_preds.csv', index=False)
    print('Predictions saved to /mnt/data/unlabeled_with_preds.csv')

## Notes and next steps
- We trained a simple **multi-output RandomForest** with `class_weight='balanced'` to prioritize detection of the **1** class (the 'bad' event).
- **Evaluation:** we masked rows where the true label was missing when computing per-target metrics. Use precision/recall/F1 on the 1-class as primary metrics (precision to avoid false positives, recall to catch bad customers).
- **Combining models:** a single multi-output model is acceptable when features are shared; the alternative is one model per target when per-target hyperparameter tuning is needed.
- **Applying to unlabeled:** Save predictions and monitor downstream business signals (e.g., fraction flagged; manual review of a sample; future label arrival) to validate model calibration.
- **Improvements:** cross-validated threshold tuning, probability calibration (Platt/isotonic), and a separate probability threshold for each target to balance false positives against false negatives.
