**1. Importing the dependencies**

In [156]:
# ======================================
# 1. Import Libraries
# ======================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler


**2. Data Loading and Understanding**

In [157]:
# load teh csv data to a pandas dataframe
df = pd.read_csv("customer_info.csv")

In [158]:
df.shape

(7043, 21)

In [159]:
""" 
고객 특성 → gender, SeniorCitizen, Partner(고객이 배우자(파트너)가 있는지 여부), Dependents(부양가족(아이, 부모 등)이 있는지 여부)
서비스 사용 여부 → PhoneService(고객이 전화 서비스를 이용하는지 여부), MultipleLines, InternetService, OnlineSecurity, DeviceProtection, TechSupport, StreamingTV, StreamingMovies
계약/결제 관련 → Contract(계약 형태), PaperlessBilling(종이 청구서 대신 전자 청구서를 사용하는지 여부), PaymentMethod
금액 관련 → MonthlyCharges, TotalCharges, tenure(고객이 해당 회사와 계약을 유지한 개월 수)
타겟 변수 → Churn : Yes = 서비스를 해지한 고객, No = 유지 중인 고객
"""

' \n고객 특성 → gender, SeniorCitizen, Partner(고객이 배우자(파트너)가 있는지 여부), Dependents(부양가족(아이, 부모 등)이 있는지 여부)\n서비스 사용 여부 → PhoneService(고객이 전화 서비스를 이용하는지 여부), MultipleLines, InternetService, OnlineSecurity, DeviceProtection, TechSupport, StreamingTV, StreamingMovies\n계약/결제 관련 → Contract(계약 형태), PaperlessBilling(종이 청구서 대신 전자 청구서를 사용하는지 여부), PaymentMethod\n금액 관련 → MonthlyCharges, TotalCharges, tenure(고객이 해당 회사와 계약을 유지한 개월 수)\n타겟 변수 → Churn : Yes = 서비스를 해지한 고객, No = 유지 중인 고객\n'

In [160]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [161]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [162]:
# churn_rate = df['Churn'].value_counts(normalize=True) * 100
# print(churn_rate)

In [163]:
pd.set_option("display.max_columns", None) 
#Pandas 데이터프레임을 출력할 때 한 화면에 모든 컬럼을 다 보이게 해주는 옵션

In [164]:
df.head(2)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


In [165]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [166]:
# dropping customerID column as this is not required for modelling
df = df.drop(columns=["customerID"])

In [167]:
df.head(2)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


In [168]:
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [169]:
print(df["gender"].unique())
#데이터프레임 df에서 "gender" 컬럼만 선택.해당 컬럼에 들어있는 고유값(unique values) 을 중복 없이 반환

['Female' 'Male']


In [170]:
print(df["SeniorCitizen"].unique())

[0 1]


In [171]:
print(df.head())
print(df.dtypes)

   gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  Female              0     Yes         No       1           No   
1    Male              0      No         No      34          Yes   
2    Male              0      No         No       2          Yes   
3    Male              0      No         No      45           No   
4  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0  No phone service             DSL             No          Yes   
1                No             DSL            Yes           No   
2                No             DSL            Yes          Yes   
3  No phone service             DSL            Yes           No   
4                No     Fiber optic             No           No   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0               No          No          No              No  Month-to-month   
1              Yes          No  

In [172]:
# printing the unique values in all the columns
# 숫자형 변수 빼고 나머지 컬럼들의 고유한 값들(중복빼고)을 출력


numerical_features_list = ["tenure", "MonthlyCharges", "TotalCharges"]


for col in df.columns:
  if col not in numerical_features_list:
    print(col, df[col].unique())
    print("-"*50)



gender ['Female' 'Male']
--------------------------------------------------
SeniorCitizen [0 1]
--------------------------------------------------
Partner ['Yes' 'No']
--------------------------------------------------
Dependents ['No' 'Yes']
--------------------------------------------------
PhoneService ['No' 'Yes']
--------------------------------------------------
MultipleLines ['No phone service' 'No' 'Yes']
--------------------------------------------------
InternetService ['DSL' 'Fiber optic' 'No']
--------------------------------------------------
OnlineSecurity ['No' 'Yes' 'No internet service']
--------------------------------------------------
OnlineBackup ['Yes' 'No' 'No internet service']
--------------------------------------------------
DeviceProtection ['No' 'Yes' 'No internet service']
--------------------------------------------------
TechSupport ['No' 'Yes' 'No internet service']
--------------------------------------------------
StreamingTV ['No' 'Yes' 'No internet 

In [173]:
df["TotalCharges"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 7043 entries, 0 to 7042
Series name: TotalCharges
Non-Null Count  Dtype 
--------------  ----- 
7043 non-null   object
dtypes: object(1)
memory usage: 55.2+ KB


In [174]:

# Clean TotalCharges (convert to numeric, handle blanks)
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df = df.dropna(subset=["TotalCharges"])

# Target encoding (Yes=1, No=0)
df["Churn"] = df["Churn"].map({"No":0, "Yes":1})

# ======================================
# 3. Encode Categorical Features
# ======================================
object_columns = df.select_dtypes(include=["object"]).columns.tolist()
encoders = {}

for col in object_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# Save encoders
with open("encoders.pkl", "wb") as f:
    pickle.dump(encoders, f)

<!-- ```📌 예시 (tenure 기준)
25% = 9개월
→ 고객 중 25%는 가입 9개월 이하에서 이탈하거나 아직 유지 중
50% = 29개월
→ 고객의 절반은 29개월 이하, 절반은 29개월 이상
75% = 55개월
→ 고객의 75%는 55개월 이하, 상위 25% 충성 고객은 55개월 이상 유지 


✅ 정리
읽을 때는 말씀하신 것처럼:
25% 값 = “하위 25% 그룹의 상한선”
50% 값 = “전체를 절반으로 나누는 기준점”
75% 값 = “상위 25% 그룹의 하한선”
👉 그래서 “25% 미만의 사람들은…” / “50%는 절반의 고객들은…” / “75%는 상위 25% 고객은…” 이런 식으로 해석하면 맞습니다 ✔️ -->



In [175]:
#LabelEncoder 는 알파벳(사전) 순으로 고유값을 정렬해서 숫자를 붙입니다. 
# ['No', 'No internet service', 'Yes']
# 0, 1, 2 

encoders

# 여기선 다 label encoding을 했지만, 실제로 3개의 값을 가진게 있을경우엔 hot encoding을 해야함. 하지만, 일단 보고 필요하면 하는걸로!


{'gender': LabelEncoder(),
 'Partner': LabelEncoder(),
 'Dependents': LabelEncoder(),
 'PhoneService': LabelEncoder(),
 'MultipleLines': LabelEncoder(),
 'InternetService': LabelEncoder(),
 'OnlineSecurity': LabelEncoder(),
 'OnlineBackup': LabelEncoder(),
 'DeviceProtection': LabelEncoder(),
 'TechSupport': LabelEncoder(),
 'StreamingTV': LabelEncoder(),
 'StreamingMovies': LabelEncoder(),
 'Contract': LabelEncoder(),
 'PaperlessBilling': LabelEncoder(),
 'PaymentMethod': LabelEncoder()}

In [176]:
encoders["OnlineSecurity"].classes_

array(['No', 'No internet service', 'Yes'], dtype=object)

In [177]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85,0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.5,0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15,1
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.3,1840.75,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,151.65,1


**Traianing and test data split**

In [178]:
# splitting the features and target
X = df.drop(columns=["Churn"])
y = df["Churn"]

In [179]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)   # train/test split 전에 scaling

In [180]:
# split training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [181]:
print(y_train.shape)

(5625,)


In [182]:
print(y_train.value_counts())

Churn
0    4130
1    1495
Name: count, dtype: int64


In [183]:
# ======================================
# 4. Train-Test Split
# ======================================
X = df.drop(columns=["Churn"])
y = df["Churn"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)



In [184]:
# ======================================
# 5. Handle Imbalance (SMOTE)
# ======================================
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_train_smote.value_counts())

# ======================================
# 6. Baseline Model Comparison
# ======================================
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss"),
    "Logistic Regression": LogisticRegression(max_iter=1000, solver="liblinear")  # 여기 수정
}

cv_scores = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    scores = cross_val_score(model, X_train_smote, y_train_smote, cv=5, scoring="f1")
    cv_scores[name] = np.mean(scores)
    print(f"{name} F1 Score (CV avg): {np.mean(scores):.3f}")

print("\nBaseline comparison:", cv_scores)

# ======================================
# 7. Hyperparameter Tuning (Random Forest)
# ======================================
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

grid = GridSearchCV(
    RandomForestClassifier(random_state=42, class_weight="balanced"),  # ✅ 여기!
    param_grid, cv=5, scoring="f1", n_jobs=-1
)

grid.fit(X_train_smote, y_train_smote)
print("\nBest RandomForest Params:", grid.best_params_)
best_rfc = grid.best_estimator_

# ======================================
# 8. Evaluation on Test Set
# ======================================
y_pred = best_rfc.predict(X_test)
y_pred_prob = best_rfc.predict_proba(X_test)[:,1]

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_prob))

Before SMOTE: Churn
0    4130
1    1495
Name: count, dtype: int64
After SMOTE: Churn
0    4130
1    4130
Name: count, dtype: int64

Training Decision Tree...
Decision Tree F1 Score (CV avg): 0.782

Training Random Forest...
Random Forest F1 Score (CV avg): 0.832

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost F1 Score (CV avg): 0.827

Training Logistic Regression...
Logistic Regression F1 Score (CV avg): 0.804

Baseline comparison: {'Decision Tree': np.float64(0.7822864159333154), 'Random Forest': np.float64(0.8323511409199031), 'XGBoost': np.float64(0.8270582808694347), 'Logistic Regression': np.float64(0.8041763614471753)}

Best RandomForest Params: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}

Accuracy: 0.7626154939587776
Confusion Matrix:
 [[845 188]
 [146 228]]
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.82      0.83      1033
           1       0.55      0.61      0.58       374

    accuracy                           0.76      1407
   macro avg       0.70      0.71      0.71      1407
weighted avg       0.77      0.76      0.77      1407

ROC-AUC: 0.8122634867552583


In [185]:
# 1. 저장된 모델 불러오기
with open("customer_churn_model.pkl", "rb") as f:
    model_data = pickle.load(f)

loaded_model = model_data["model"]
feature_names = model_data["features_names"]

# 2. 입력 데이터 준비
input_data_df = pd.DataFrame([input_data])

# 3. 인코딩 적용
with open("encoders.pkl", "rb") as f:
    encoders = pickle.load(f)

for column, encoder in encoders.items():
    input_data_df[column] = encoder.transform(input_data_df[column])

# 4. 예측
prediction = loaded_model.predict(input_data_df)
pred_prob = loaded_model.predict_proba(input_data_df)

print(f"Prediction: {'Churn' if prediction[0] == 1 else 'No Churn'}")
print(f"Prediction Probability: {pred_prob}")


   

Prediction: No Churn
Prediction Probability: [[0.83 0.17]]


In [186]:
# 새로운 고객 데이터를 넣어서 예측해보기

input_data = {
    'gender': 'Female',
    'SeniorCitizen': 0,
    'Partner': 'Yes',
    'Dependents': 'No',
    'tenure': 1,
    'PhoneService': 'No',
    'MultipleLines': 'No phone service',
    'InternetService': 'DSL',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'Yes',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'No',
    'StreamingMovies': 'No',
    'Contract': 'Month-to-month',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check',
    'MonthlyCharges': 29.85,
    'TotalCharges': 29.85
}


input_data_df = pd.DataFrame([input_data])

#저장된 인코더 불러오기
with open("encoders.pkl", "rb") as f:
  encoders = pickle.load(f)


#범주형 변수 인코딩(입력된 문자열 값을 학습할 때와 똑같은 숫자 규칙으로 변환)
for column, encoder in encoders.items():
  input_data_df[column] = encoder.transform(input_data_df[column])

# make a prediction
# predict → 최종 클래스 (0=No Churn, 1=Churn)
# predict_proba → 확률 값 (예: [0.72, 0.28] → No Churn=72%, Churn=28%)
prediction = loaded_model.predict(input_data_df)
pred_prob = loaded_model.predict_proba(input_data_df)

print(prediction)

# results
print(f"Prediction: {'Churn' if prediction[0] == 1 else 'No Churn'}")
print(f"Prediciton Probability: {pred_prob}")

[0]
Prediction: No Churn
Prediciton Probability: [[0.83 0.17]]
