In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import yaml
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix
)
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Load the run configuration. The encoding is given explicitly so the read
# does not depend on the platform's default locale encoding.
with open("../model/params.yaml", "r", encoding="utf-8") as f:
    params = yaml.safe_load(f)

data_dir = Path(params['data_dir'])
model_dir = Path(params['model_dir'])

# Every artifact below is written into model_dir; create it up front so a
# missing directory does not abort the run after the model has been trained.
model_dir.mkdir(parents=True, exist_ok=True)


# Load the training table; the 'churn' column is used as the target below.
df_all = pd.read_csv(data_dir / "train.csv")

# Work on a boolean view of the target. Applying `~` to an integer 0/1 column
# is a bitwise NOT (0 -> -1, 1 -> -2), which yields negative "not churn"
# counts and percentages; on a boolean Series `~` is a logical negation.
is_churn = df_all['churn'].astype(bool)

print(f"total: {len(df_all)}")
print(f"churn: {is_churn.sum()} ({is_churn.mean():.2%})")
print(f"not churn: {(~is_churn).sum()} ({(~is_churn).mean():.2%})")

# Every column except the target is treated as a feature (file order kept).
all_columns = df_all.columns[df_all.columns != 'churn'].tolist()

# Features stored as int64/float64 are numeric; all remaining features are
# handled as categorical.
numeric_frame = df_all[all_columns].select_dtypes(include=['int64', 'float64'])
numeric_columns = list(numeric_frame.columns)
categorical_columns = [
    name for name in all_columns if name not in numeric_columns
]

for label, group in (
    ("all features", all_columns),
    ("num", numeric_columns),
    ("cat", categorical_columns),
):
    print(f"{label}: {len(group)}")

# Feature matrix (a copy, so df_all stays untouched) and target vector.
y = df_all['churn'].values
X = df_all[all_columns].copy()

# Label Encoding for categorical features: one encoder per column, kept in a
# dict so the same mapping can be reused later. Values are cast to str before
# encoding.
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    X[name] = encoder.fit_transform(X[name].astype(str))

n_encoded = len(categorical_columns)
print(f"Label encoding on ({n_encoded} features)")

# Split BEFORE fitting any preprocessing so the imputer medians and the
# scaler mean/variance are learned from training rows only. Fitting them on
# the full table would leak test-set statistics into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Imputation for missing values: fit on the training split, then apply the
# same medians to the test split. The original row index is kept.
imputer = SimpleImputer(strategy='median')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train_raw),
    columns=all_columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    imputer.transform(X_test_raw),
    columns=all_columns,
    index=X_test_raw.index,
)

# Scaling for numeric features: statistics come from the training split only.
scaler = StandardScaler()
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

print(f"scaling({len(numeric_columns)}features)")

print(f"train: {len(X_train)}")
print(f"churn: {y_train.sum()} ({y_train.mean():.2%})")
print(f"not: {len(y_train) - y_train.sum()} ({(1-y_train.mean()):.2%})")

print(f"test: {len(X_test)}")
print(f"churn: {y_test.sum()} ({y_test.mean():.2%})")
print(f"not: {len(y_test) - y_test.sum()} ({(1-y_test.mean()):.2%})")

xgb_model = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    gamma=0.1,
    random_state=42,
    eval_metric='logloss',
    early_stopping_rounds=20,
    tree_method='hist'
)

# Early stopping monitors the last entry of eval_set. Using the test split
# there would choose the number of trees on the same rows that are later used
# to report metrics, so a validation split is carved out of the training data
# and the test split stays untouched until evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

# In the training log, validation_0 is the fitting split and validation_1 is
# the held-out validation split that drives early stopping.
xgb_model.fit(
    X_fit, y_fit,
    eval_set=[(X_fit, y_fit), (X_val, y_val)],
    verbose=10
)

# Hard class predictions plus the second column of predict_proba for both
# splits; the probabilities feed the AUC computation.
y_train_pred = xgb_model.predict(X_train)
y_train_proba = xgb_model.predict_proba(X_train)[:, 1]

y_test_pred = xgb_model.predict(X_test)
y_test_proba = xgb_model.predict_proba(X_test)[:, 1]


# Print the same five metrics for the train split and then the test split,
# with a blank line between the two groups.
split_reports = (
    (" Train Accu", y_train, y_train_pred, y_train_proba),
    (" Test Accu", y_test, y_test_pred, y_test_proba),
)
for position, (accuracy_label, truth, predicted, scores) in enumerate(split_reports):
    if position:
        print()
    print(f"{accuracy_label}: {accuracy_score(truth, predicted):.4f}")
    print(f" Prec: {precision_score(truth, predicted):.4f}")
    print(f" Recall: {recall_score(truth, predicted):.4f}")
    print(f" F1: {f1_score(truth, predicted):.4f}")
    print(f" AUC:   {roc_auc_score(truth, scores):.4f}")

# Per-class precision/recall/F1 on the test split.
print(classification_report(y_test, y_test_pred,
                          target_names=['not churn', 'churn']))

# Confusion matrix on the test split: rows are true classes, columns are
# predicted classes.
cm = confusion_matrix(y_test, y_test_pred)
print("\n confusion_matrix:")
# Header typo fixed ("churm" -> "churn"); plain string since it has no fields.
print("                 Pred not churn  Pred churn")
print(f"Real not churn  {cm[0,0]:6d}    {cm[0,1]:6d}")
print(f"Real churn      {cm[1,0]:6d}    {cm[1,1]:6d}")


# Pair each feature name with the fitted model's importance score and rank
# the table from most to least important.
importance_table = pd.DataFrame({
    'feature': all_columns,
    'importance': xgb_model.feature_importances_,
})
feature_importance = importance_table.sort_values(
    'importance', ascending=False
)

print("\nTop 20 features:")
top_20_text = feature_importance.head(20).to_string(index=False)
print(top_20_text)

# Persist the complete ranking (not only the top 20) next to the model files.
importance_csv_path = model_dir / 'standalone_xgb_feature_importance.csv'
feature_importance.to_csv(importance_csv_path, index=False)

# Horizontal bar chart of the 20 highest-ranked features, saved as a PNG.
top_features = feature_importance.head(20)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Feature Importance')
ax.set_title('Top 20 Most Important Features (Standalone XGBoost)')
# Flip the y axis so the first (highest-importance) row is drawn at the top.
ax.invert_yaxis()
fig.tight_layout()
fig.savefig(model_dir / 'standalone_xgb_feature_importance.png', dpi=150)

from sklearn.base import clone

# Unfitted copy of the model with early stopping switched off -- presumably
# because no eval_set is supplied during cross-validation.
xgb_for_cv = clone(xgb_model).set_params(early_stopping_rounds=None)

# 5-fold ROC-AUC on the training split, using all available cores.
cv_scores = cross_val_score(
    xgb_for_cv, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1
)

cv_mean = cv_scores.mean()
cv_two_sigma = cv_scores.std() * 2
fold_aucs = [format(score, '.4f') for score in cv_scores]

print(f" cross val AUC: {cv_mean:.4f} (+/- {cv_two_sigma:.4f})")
print(f" AUC: {fold_aucs}")


# Persist the model and each preprocessing object under its own file name in
# model_dir; dict order fixes the order in which the files are written.
artifacts = {
    'standalone_xgb.joblib': xgb_model,
    'standalone_label_encoders.joblib': label_encoders,
    'standalone_scaler.joblib': scaler,
    'standalone_imputer.joblib': imputer,
    'standalone_columns.joblib': all_columns,
    'standalone_numeric_columns.joblib': numeric_columns,
    'standalone_categorical_columns.joblib': categorical_columns,
}
for file_name, artifact in artifacts.items():
    joblib.dump(artifact, model_dir / file_name)

# Bundle the test features, labels and this model's predictions in one file.
test_data = dict(
    X_test=X_test,
    y_test=y_test,
    y_pred=y_test_pred,
    y_proba=y_test_proba,
)
joblib.dump(test_data, model_dir / 'standalone_test_results.joblib')
print("results saved for comparison")

print(
    "Saved:\n"
    "  - standalone_xgb.joblib\n"
    "  - standalone_*_encoders/scaler/imputer.joblib\n"
    "  - standalone_xgb_feature_importance.csv\n"
    "  - standalone_test_results.joblib\n"
)


total: 3941
churn: 674 (17.10%)
not churn: -4615 (-117.10%)
all features: 10
num: 8
cat: 2
Label encoding on (2 features)
scaling(8features)
train: 3152
churn: 539 (17.10%)
not: 2613 (82.90%)
test: 789
churn: 135 (17.11%)
not: 654 (82.89%)
[0]	validation_0-logloss:0.44709	validation_1-logloss:0.45110
[10]	validation_0-logloss:0.33093	validation_1-logloss:0.34857
[20]	validation_0-logloss:0.27273	validation_1-logloss:0.29741
[30]	validation_0-logloss:0.24035	validation_1-logloss:0.27251
[40]	validation_0-logloss:0.21851	validation_1-logloss:0.25584
[50]	validation_0-logloss:0.20141	validation_1-logloss:0.24623
[60]	validation_0-logloss:0.18994	validation_1-logloss:0.24019
[70]	validation_0-logloss:0.18132	validation_1-logloss:0.23488
[80]	validation_0-logloss:0.17179	validation_1-logloss:0.22986
[90]	validation_0-logloss:0.16489	validation_1-logloss:0.22563
[100]	validation_0-logloss:0.15788	validation_1-logloss:0.22220
[110]	validation_0-logloss:0.15165	validation_1-logloss:0.21793
[12