**1. Importing the dependencies**

In [9]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import FunctionTransformer


from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

**2. Data Loading and Understanding**

In [10]:
# Load the raw customer table through a path relative to the notebook's
# working directory instead of one machine's absolute Windows path, so the
# notebook can run from any checkout that keeps the CSV next to it.
df = pd.read_csv("customer_Info copy.csv")



👉 반영한 변경점:

LabelEncoder 제거 → OneHotEncoder 사용 (범주형 안정적 처리, 새로운 값 들어와도 에러 안 남).

SMOTE + RandomForestClassifier 를 imblearn.pipeline.Pipeline 안에 넣음 → 데이터 누수 방지.

GridSearchCV + StratifiedKFold 로 Recall 최적화.

CalibratedClassifierCV 로 확률 보정.

모델 저장 시 feature 순서 불필요 (Pipeline 자체가 feature 순서 처리).

In [11]:
# ---------------------------
# 1. Load the data
# ---------------------------
df = pd.read_csv("customer_Info copy.csv")

# Drop the ID column, which is not needed as a feature.
df = df.drop(columns=["customerID"])

# TotalCharges -> numeric; entries that cannot be parsed become NaN and are
# then replaced with 0.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0)

# Encode the target as integers.
# `Series.replace` on an object column relies on pandas' silent downcasting,
# which is deprecated and emits a FutureWarning; `map` plus an explicit cast
# yields an int64 column directly. Labels other than "Yes"/"No" map to NaN,
# so fail loudly instead of passing them on to the model.
churn_encoded = df["Churn"].map({"Yes": 1, "No": 0})
if churn_encoded.isna().any():
    unexpected = sorted(df.loc[churn_encoded.isna(), "Churn"].astype(str).unique())
    raise ValueError(f"Unexpected Churn labels: {unexpected}")
df["Churn"] = churn_encoded.astype("int64")

# ---------------------------
# 2. Split features / target
# ---------------------------
X = df.drop(columns=["Churn"])
y = df["Churn"]

# ---------------------------
# 3. Column groups
# ---------------------------
numeric_features = ["tenure", "MonthlyCharges", "TotalCharges"]
# Every remaining feature column is treated as categorical.
categorical_features = [col for col in X.columns if col not in numeric_features]

# ---------------------------
# 4. Preprocessing
# ---------------------------
# Numeric columns pass through unchanged; categorical columns are one-hot
# encoded, with categories unseen during fit ignored at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)



# ---------------------------
# 5. Full pipeline (cleaning + preprocessing + SMOTE + model)
# ---------------------------
def clean_numeric(X, columns=("TotalCharges", "MonthlyCharges", "tenure")):
    """Return a copy of ``X`` with the given columns coerced to numeric.

    Values that cannot be parsed as numbers become NaN (``errors="coerce"``)
    and are then replaced with 0.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame. It is copied, not modified.
    columns : iterable of str, optional
        Names of the columns to coerce. The default is the three numeric
        columns this notebook uses, so ``clean_numeric(X)`` behaves exactly
        as it did before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` in which every listed column has been converted.

    Raises
    ------
    KeyError
        If a listed column is not present in ``X``.
    """
    cleaned = X.copy()
    for col in columns:
        cleaned[col] = pd.to_numeric(cleaned[col], errors="coerce").fillna(0)
    return cleaned

# Wrap the cleaning function so it can run as the first pipeline step.
numeric_cleaner = FunctionTransformer(clean_numeric)

# Steps in execution order: numeric cleaning -> column preprocessing ->
# SMOTE oversampling -> random forest classifier.
pipeline_steps = [
    ("cleaner", numeric_cleaner),
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("clf", RandomForestClassifier(random_state=42, class_weight="balanced")),
]
pipeline = Pipeline(steps=pipeline_steps)

# ---------------------------
# 6. Hyper-parameter search space
# ---------------------------
# Keys use the "<step>__<param>" form, so they address the "clf" step.
param_grid = dict(
    clf__n_estimators=[200, 500],
    clf__max_depth=[10, 20, None],
    clf__min_samples_split=[2, 5],
    clf__min_samples_leaf=[1, 2],
    clf__max_features=["sqrt", "log2"],
)

# ---------------------------
# 7. Stratified K-Fold + GridSearchCV
# ---------------------------
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over param_grid, scored on recall, using all CPU cores.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring="recall",
    cv=skf,
    n_jobs=-1,
    verbose=2,
)

  df["Churn"] = df["Churn"].replace({"Yes": 1, "No": 0})


In [12]:
# ---------------------------
# 8. Train/test split, then fit
# ---------------------------
# Hold out 20% of the rows, stratified on the target, with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Run the grid search on the training portion only.
grid_search.fit(X_train, y_train)

print("최적 파라미터:", grid_search.best_params_)
print("최고 Recall (CV 평균):", grid_search.best_score_)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
최적 파라미터: {'clf__max_depth': 10, 'clf__max_features': 'log2', 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 2, 'clf__n_estimators': 200}
최고 Recall (CV 평균): 0.6240802675585285


In [13]:
# ---------------------------
# 9. Probability calibration (CalibratedClassifierCV)
# ---------------------------
# Start from the pipeline refit with the best grid-search parameters.
best_pipeline = grid_search.best_estimator_

# Sigmoid calibration with 5-fold CV; isotonic is an alternative when the
# dataset is large.
calibrated_rfc = CalibratedClassifierCV(estimator=best_pipeline, method="sigmoid", cv=5)

# Kept as the cell's last expression so the fitted estimator is displayed.
calibrated_rfc.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,method,'sigmoid'
,cv,5
,n_jobs,
,ensemble,'auto'

0,1,2
,func,<function cle...0015C2EF1DDA0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'log2'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
# ---------------------------
# 10. Evaluation
# ---------------------------
# Probability of class 1 (churn) for each held-out row.
y_prob = calibrated_rfc.predict_proba(X_test)[:, 1]
# Predict churn when the probability exceeds 0.3; the cut-off is tunable.
y_pred = (y_prob > 0.3).astype(int)

test_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", test_confusion)

test_report = classification_report(y_test, y_pred)
print("Classification Report:\n", test_report)

# ROC-AUC is computed from the probabilities, not the thresholded labels.
test_auc = roc_auc_score(y_test, y_prob)
print("ROC-AUC:", test_auc)


Confusion Matrix:
 [[811 224]
 [104 270]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.78      0.83      1035
           1       0.55      0.72      0.62       374

    accuracy                           0.77      1409
   macro avg       0.72      0.75      0.73      1409
weighted avg       0.80      0.77      0.78      1409

ROC-AUC: 0.8359993799891499


In [15]:
# ---------------------------
# 11. Persist the model
# ---------------------------
with open("pipeline_customer_churn_model.pkl", "wb") as model_file:
    pickle.dump(calibrated_rfc, model_file)

print("✅ 보정된 모델 저장 완료")


# Notes on the pipeline approach used in this notebook:
# - The pickled object is the calibrated classifier, which wraps the tuned
#   pipeline (numeric cleaning, one-hot encoding, SMOTE, random forest).
# - No separate preprocessing artefact is saved; the single file
#   pipeline_customer_churn_model.pkl is the only thing written here.
# - To predict, pass new rows as a DataFrame with the raw columns to the
#   loaded object; the wrapped pipeline applies the preprocessing itself.

✅ 보정된 모델 저장 완료


In [16]:
# ---------------------------
# 12. Example: score a new customer
# ---------------------------
# One raw record using the same column names as the training features.
input_data = {
    'gender': 'Female',
    'SeniorCitizen': 0,
    'Partner': 'Yes',
    'Dependents': 'No',
    'tenure': 1,
    'PhoneService': 'No',
    'MultipleLines': 'No phone service',
    'InternetService': 'DSL',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'Yes',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'No',
    'StreamingMovies': 'No',
    'Contract': 'Month-to-month',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check',
    'MonthlyCharges': 29.85,
    'TotalCharges': 29.85
}

input_df = pd.DataFrame([input_data])

# Load the saved model.
# SECURITY: pickle.load can execute arbitrary code; only load files that this
# project produced itself.
# NOTE(review): the pickled pipeline wraps `clean_numeric` in a
# FunctionTransformer, and pickle stores functions by reference, so that
# function must be defined in the session that loads the file.
with open("pipeline_customer_churn_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)

# Predict with the same 0.3 cut-off that the evaluation cell used, so the
# reported confusion matrix / recall describe this decision rule (the cell
# previously used 0.4, which was never evaluated).
CHURN_THRESHOLD = 0.3
y_prob = loaded_model.predict_proba(input_df)[:, 1]
y_pred = (y_prob > CHURN_THRESHOLD).astype(int)

print(f"Prediction: {'Churn' if y_pred[0] == 1 else 'No Churn'}")
print(f"Churn Probability: {y_prob[0]:.4f}")

Prediction: Churn
Churn Probability: 0.4355
