In [9]:
import pandas as pd
import numpy as np
import cloudpickle

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline


# ---------------------------
# 1. Load the data
# ---------------------------
df = pd.read_csv("customer_Info copy.csv")
# Presumably customerID is only a row identifier; it is excluded from the features.
df = df.drop(columns=["customerID"])
# Entries that cannot be parsed as numbers become NaN and are then treated as 0.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0)

# Encode the target with an explicit mapping. Series.replace on an object
# column emits pandas' silent-downcasting FutureWarning and would leave any
# unexpected label in place; map() turns such labels into NaN so they can be
# rejected here instead of reaching the model.
churn_codes = df["Churn"].map({"Yes": 1, "No": 0})
if churn_codes.isna().any():
    unexpected = sorted(df.loc[churn_codes.isna(), "Churn"].astype(str).unique())
    raise ValueError(f"Unexpected Churn labels: {unexpected}")
df["Churn"] = churn_codes.astype("int64")

# Feature matrix and binary target (1 = "Yes", 0 = "No").
X = df.drop(columns=["Churn"])
y = df["Churn"]

# Every column that is not listed as numeric is treated as categorical.
numeric_features = ["tenure", "MonthlyCharges", "TotalCharges"]
categorical_features = [col for col in X.columns if col not in numeric_features]


# ---------------------------
# 2. Preprocessing + model pipeline
# ---------------------------
def clean_numeric(X):
    """Return a copy of *X* whose numeric columns are coerced to numbers.

    The columns "TotalCharges", "MonthlyCharges" and "tenure" are converted
    with ``pd.to_numeric``; values that cannot be parsed become NaN and are
    then replaced by 0. All other columns are left untouched, and the input
    frame itself is not modified.

    Raises:
        KeyError: if any of the three columns is missing from *X*.
    """
    cleaned = X.copy()
    for name in ("TotalCharges", "MonthlyCharges", "tenure"):
        as_number = pd.to_numeric(cleaned[name], errors="coerce")
        cleaned[name] = as_number.fillna(0)
    return cleaned

# First pipeline stage: coerce the numeric columns via clean_numeric.
numeric_cleaner = FunctionTransformer(clean_numeric)

# Numeric columns pass through unchanged; every other column is one-hot
# encoded, and categories not seen during fit are ignored at transform time.
column_transformers = [
    ("num", "passthrough", numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# imblearn's Pipeline is used so that SMOTE can sit between the preprocessing
# and the classifier as a resampling step.
pipeline_steps = [
    ("cleaner", numeric_cleaner),
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("clf", RandomForestClassifier(random_state=42, class_weight="balanced")),
]
pipeline = Pipeline(steps=pipeline_steps)


# ---------------------------
# 3. Hyperparameter tuning
# ---------------------------
# The "clf__" prefix routes each parameter to the pipeline's "clf" step.
# 2 * 3 * 2 * 2 * 2 = 48 candidate combinations.
param_grid = dict(
    clf__n_estimators=[200, 500],
    clf__max_depth=[10, 20, None],
    clf__min_samples_split=[2, 5],
    clf__min_samples_leaf=[1, 2],
    clf__max_features=["sqrt", "log2"],
)

# Shuffled, stratified 5-fold split with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search scored on recall, using all available cores.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring="recall",
    cv=skf,
    n_jobs=-1,
    verbose=2,
)


# ---------------------------
# 4. Training & calibration
# ---------------------------
# Stratified 80/20 split with a fixed seed.
# NOTE(review): X_test / y_test are not used anywhere in the visible code.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Run the grid search on the training portion only.
grid_search.fit(X_train, y_train)
print("최적 파라미터:", grid_search.best_params_)
print("최고 Recall:", grid_search.best_score_)

# Pipeline refit by GridSearchCV with the best parameter combination.
best_pipeline = grid_search.best_estimator_

# Wrap the tuned pipeline in a sigmoid calibrator with 5-fold internal CV,
# fitted on the same training portion.
calibrated_rfc = CalibratedClassifierCV(
    estimator=best_pipeline, method="sigmoid", cv=5
).fit(X_train, y_train)

# ---------------------------
# 5. KMeans for additional segmentation
# ---------------------------
# Positive-class probability for every row of X (training and held-out rows).
y_prob = calibrated_rfc.predict_proba(X)[:, 1]

# Two-column clustering frame: predicted probability and monthly charge.
X_cluster = pd.DataFrame(
    dict(ChurnProbability=y_prob, MonthlyCharges=X["MonthlyCharges"])
)

# Standardise both columns before clustering.
scaler = StandardScaler().fit(X_cluster)
X_scaled = scaler.transform(X_cluster)

# Four clusters, 10 initialisations, fixed seed.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10).fit(X_scaled)


# ---------------------------
# 6. Save (model + scaler + clusterer)
# ---------------------------
# All three fitted objects are stored together in one dictionary.
bundle = dict(model=calibrated_rfc, scaler=scaler, kmeans=kmeans)

# cloudpickle is used for serialisation; the pipeline wraps clean_numeric,
# a function defined in this script, inside a FunctionTransformer.
with open("pipeline_customer_churn_model.pkl", "wb") as model_file:
    cloudpickle.dump(bundle, model_file)

print("✅ 모델 + KMeans + Scaler 저장 완료")


  df["Churn"] = df["Churn"].replace({"Yes": 1, "No": 0})


Fitting 5 folds for each of 48 candidates, totalling 240 fits
최적 파라미터: {'clf__max_depth': 10, 'clf__max_features': 'log2', 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 2, 'clf__n_estimators': 200}
최고 Recall: 0.6240802675585285
✅ 모델 + KMeans + Scaler 저장 완료
