**1. Importing the dependencies**

In [9]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import FunctionTransformer


from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

**2. Data Loading and Understanding**

In [10]:
# Load the raw customer table by a path relative to the notebook's working
# directory (the data-preparation cell below reads the same file this way),
# instead of an absolute path that only exists on one machine.
df = pd.read_csv("customer_Info copy.csv")



👉 반영한 변경점:

LabelEncoder 제거 → OneHotEncoder 사용 (범주형 안정적 처리, 새로운 값 들어와도 에러 안 남).

SMOTE + RandomForestClassifier 를 imblearn.pipeline.Pipeline 안에 넣음 → 데이터 누수 방지.

GridSearchCV + StratifiedKFold 로 Recall 최적화.

CalibratedClassifierCV 로 확률 보정.

모델 저장 시 feature 순서 불필요 (Pipeline 자체가 feature 순서 처리).

In [11]:
# ---------------------------
# 1. Load data
# ---------------------------
df = pd.read_csv("customer_Info copy.csv")

# Drop the customer identifier so it does not end up among the features.
df = df.drop(columns=["customerID"])

# TotalCharges -> numeric. Entries that cannot be parsed become NaN
# (errors="coerce") and are then filled with 0.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0)

# Encode the target with an explicit mapping. Series.replace on an object
# column relies on pandas' deprecated silent downcasting to end up with an
# integer dtype; Series.map plus an explicit cast does not.
churn_codes = df["Churn"].map({"Yes": 1, "No": 0})
if churn_codes.isna().any():
    # Fail loudly instead of letting an unmapped label leak into the target.
    unexpected = sorted(df.loc[churn_codes.isna(), "Churn"].astype(str).unique())
    raise ValueError(f"Unexpected Churn labels: {unexpected}")
df["Churn"] = churn_codes.astype("int64")

# ---------------------------
# 2. Split features / target
# ---------------------------
X = df.drop(columns=["Churn"])
y = df["Churn"]

# ---------------------------
# 3. Column groups
# ---------------------------
numeric_features = ["tenure", "MonthlyCharges", "TotalCharges"]
# Every remaining feature column is treated as categorical.
categorical_features = [col for col in X.columns if col not in numeric_features]

# ---------------------------
# 4. Preprocessing
# ---------------------------
# Numeric columns pass through unchanged; categorical columns are one-hot
# encoded, and categories unseen during fit are ignored rather than raising.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)



# ---------------------------
# 5. Full pipeline (preprocessing + SMOTE + model)
# ---------------------------
def clean_numeric(X, columns=("TotalCharges", "MonthlyCharges", "tenure")):
    """Return a copy of ``X`` with the given columns coerced to numbers.

    Each listed column is passed through ``pd.to_numeric(errors="coerce")``,
    so entries that cannot be parsed become NaN, and those NaNs are then
    replaced by 0. The input frame is not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing every column named in ``columns``.
    columns : iterable of str, optional
        Columns to coerce. Defaults to the three numeric columns this
        notebook uses, so existing ``clean_numeric(X)`` calls are unchanged.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with the listed columns converted.

    Raises
    ------
    KeyError
        If a listed column is missing from ``X``.
    """
    cleaned = X.copy()
    for col in columns:
        cleaned[col] = pd.to_numeric(cleaned[col], errors="coerce").fillna(0)
    return cleaned

# Wrap the cleaning function so it can run as the first pipeline step.
numeric_cleaner = FunctionTransformer(clean_numeric)

# Step order: coerce numeric columns -> encode columns -> oversample with
# SMOTE -> random forest. This is imblearn's Pipeline, which is what allows a
# sampler such as SMOTE to appear as a step.
pipeline_steps = [
    ("cleaner", numeric_cleaner),
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("clf", RandomForestClassifier(class_weight="balanced", random_state=42)),
]
pipeline = Pipeline(steps=pipeline_steps)



# ---------------------------
# 6. Hyper-parameter search space
# ---------------------------
# Keys use the "<step>__<param>" form so they reach the "clf" pipeline step.
# 2 * 3 * 2 * 2 * 2 = 48 candidate combinations.
param_grid = dict(
    clf__n_estimators=[200, 500],
    clf__max_depth=[10, 20, None],
    clf__min_samples_split=[2, 5],
    clf__min_samples_leaf=[1, 2],
    clf__max_features=["sqrt", "log2"],
)

# ---------------------------
# 7. Stratified K-Fold + GridSearchCV
# ---------------------------
# Five shuffled, stratified folds with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidates are ranked by recall; n_jobs=-1 runs the fits in parallel.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring="recall",
    cv=skf,
    n_jobs=-1,
    verbose=2,
)

  df["Churn"] = df["Churn"].replace({"Yes": 1, "No": 0})


In [12]:
# ---------------------------
# 8. Train/test split, then fit
# ---------------------------
# Hold out 20% of the rows as a test set, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# The grid search is fitted on the training partition only.
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated recall.
best_params = grid_search.best_params_
best_cv_recall = grid_search.best_score_
print("ÏµúÏ†Å ÌååÎùºÎØ∏ÌÑ∞:", best_params)
print("ÏµúÍ≥† Recall (CV ÌèâÍ∑†):", best_cv_recall)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
ÏµúÏ†Å ÌååÎùºÎØ∏ÌÑ∞: {'clf__max_depth': 10, 'clf__max_features': 'log2', 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 2, 'clf__n_estimators': 200}
ÏµúÍ≥† Recall (CV ÌèâÍ∑†): 0.6240802675585285


In [13]:
# ---------------------------
# 9. Probability calibration (CalibratedClassifierCV)
# ---------------------------
# Start from the best pipeline found by the grid search.
best_pipeline = grid_search.best_estimator_

# Sigmoid calibration with 5-fold CV; isotonic is an alternative when the
# data set is large.
calibrated_rfc = CalibratedClassifierCV(best_pipeline, method="sigmoid", cv=5)

# Left as the last expression so the fitted estimator is displayed.
calibrated_rfc.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,method,'sigmoid'
,cv,5
,n_jobs,
,ensemble,'auto'

0,1,2
,func,<function cle...0015C2EF1DDA0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'log2'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
# ---------------------------
# 10. Evaluation
# ---------------------------
# Column 1 of predict_proba is used as the churn probability.
test_proba = calibrated_rfc.predict_proba(X_test)
y_prob = test_proba[:, 1]

# Flag churn above a 0.3 cut-off; the threshold can be adjusted.
is_churn = y_prob > 0.3
y_pred = is_churn.astype(int)

# Threshold-dependent metrics use y_pred; ROC-AUC uses the raw probabilities.
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_prob)

print("Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_report)
print("ROC-AUC:", test_auc)


Confusion Matrix:
 [[811 224]
 [104 270]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.78      0.83      1035
           1       0.55      0.72      0.62       374

    accuracy                           0.77      1409
   macro avg       0.72      0.75      0.73      1409
weighted avg       0.80      0.77      0.78      1409

ROC-AUC: 0.8359993799891499


In [15]:
# ---------------------------
# 11. Save the model
# ---------------------------
# NOTE: the pipeline inside this object holds a FunctionTransformer built from
# the notebook-level function `clean_numeric`. pickle stores functions by
# reference (module + name), so the process that unpickles this file must be
# able to resolve `clean_numeric` too.
with open("pipeline_customer_churn_model.pkl", mode="wb") as model_file:
    pickle.dump(calibrated_rfc, model_file)

print("‚úÖ Î≥¥Ï†ïÎêú Î™®Îç∏ Ï†ÄÏû• ÏôÑÎ£å")


# Pipeline approach (the code in this notebook)
# The saved object is the calibrated classifier wrapping the whole pipeline:
# numeric cleaning + preprocessing (OneHotEncoder) + SMOTE + RandomForest.
# So no separate preprocessing step is needed.
# Only this one pkl file (pipeline_customer_churn_model.pkl) is required.
# To predict, pass new data as a raw DataFrame; the pipeline handles preprocessing and prediction.

‚úÖ Î≥¥Ï†ïÎêú Î™®Îç∏ Ï†ÄÏû• ÏôÑÎ£å


In [16]:
# ---------------------------
# 12. Example: predicting for a new customer
# ---------------------------
# One raw customer record, using the same column names as the training
# features (no customerID, no Churn).
input_data = {
    'gender': 'Female',
    'SeniorCitizen': 0,
    'Partner': 'Yes',
    'Dependents': 'No',
    'tenure': 1,
    'PhoneService': 'No',
    'MultipleLines': 'No phone service',
    'InternetService': 'DSL',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'Yes',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'No',
    'StreamingMovies': 'No',
    'Contract': 'Month-to-month',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check',
    'MonthlyCharges': 29.85,
    'TotalCharges': 29.85
}

input_df = pd.DataFrame([input_data])

# Load the saved model. pickle.load can execute arbitrary code, so only load
# files that were produced by this notebook or another trusted source.
with open("pipeline_customer_churn_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)

# Use the same 0.3 cut-off that the evaluation cell (section 10) reports its
# confusion matrix and classification report for, so those metrics describe
# this decision rule. If a different cut-off is wanted, change it in the
# evaluation cell as well and re-check the metrics.
CHURN_THRESHOLD = 0.3

# Predict. NOTE: y_prob / y_pred reuse the names of the test-set arrays from
# the evaluation cell and overwrite them.
y_prob = loaded_model.predict_proba(input_df)[:, 1]
y_pred = (y_prob > CHURN_THRESHOLD).astype(int)

print(f"Prediction: {'Churn' if y_pred[0] == 1 else 'No Churn'}")
print(f"Churn Probability: {y_prob[0]:.4f}")

Prediction: Churn
Churn Probability: 0.4355
