# 1. 패키지 불러오기

In [1]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# 2. eda

In [2]:
train_df = pd.read_csv('C:/Users/eunseok/Desktop/vscode/data/customer_churn_train.csv')

In [3]:
train_df.head()


Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,2.0,30.0,Female,39.0,14.0,5.0,18.0,Standard,Annual,932.0,17.0,1.0
1,3.0,65.0,Female,49.0,1.0,10.0,8.0,Basic,Monthly,557.0,6.0,1.0
2,4.0,55.0,Female,14.0,4.0,6.0,18.0,Basic,Quarterly,185.0,3.0,1.0
3,5.0,58.0,Male,38.0,21.0,7.0,7.0,Standard,Monthly,396.0,29.0,1.0
4,6.0,23.0,Male,32.0,20.0,5.0,8.0,Basic,Monthly,617.0,20.0,1.0


In [4]:
train_df.head().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CustomerID         5 non-null      float64
 1   Age                5 non-null      float64
 2   Gender             5 non-null      object 
 3   Tenure             5 non-null      float64
 4   Usage Frequency    5 non-null      float64
 5   Support Calls      5 non-null      float64
 6   Payment Delay      5 non-null      float64
 7   Subscription Type  5 non-null      object 
 8   Contract Length    5 non-null      object 
 9   Total Spend        5 non-null      float64
 10  Last Interaction   5 non-null      float64
 11  Churn              5 non-null      float64
dtypes: float64(9), object(3)
memory usage: 612.0+ bytes


In [5]:
train_df.describe()

Unnamed: 0,CustomerID,Age,Tenure,Usage Frequency,Support Calls,Payment Delay,Total Spend,Last Interaction,Churn
count,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0
mean,225398.667955,39.373153,31.256336,15.807494,3.604437,12.965722,631.616223,14.480868,0.567107
std,129531.91855,12.442369,17.255727,8.586242,3.070218,8.258063,240.803001,8.596208,0.495477
min,2.0,18.0,1.0,1.0,0.0,0.0,100.0,1.0,0.0
25%,113621.75,29.0,16.0,9.0,1.0,6.0,480.0,7.0,0.0
50%,226125.5,39.0,32.0,16.0,3.0,12.0,661.0,14.0,1.0
75%,337739.25,48.0,46.0,23.0,6.0,19.0,830.0,22.0,1.0
max,449999.0,65.0,60.0,30.0,10.0,30.0,1000.0,30.0,1.0


In [6]:
train_df.isnull().sum()

CustomerID           1
Age                  1
Gender               1
Tenure               1
Usage Frequency      1
Support Calls        1
Payment Delay        1
Subscription Type    1
Contract Length      1
Total Spend          1
Last Interaction     1
Churn                1
dtype: int64

In [7]:
train_df.isna().sum(axis=1).value_counts()

0     440832
12         1
Name: count, dtype: int64

na가 한 열에만 있음

In [8]:
mask_one_na = train_df.isna().sum(axis=1) == 1
rows_with_one_na = train_df[mask_one_na]

In [9]:
rows_with_one_na

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn


In [10]:
mask_one_na = train_df.isna().sum(axis=1) == 12
rows_with_one_na = train_df[mask_one_na]
print("NA가 1개인 행 개수:", len(rows_with_one_na))
print("해당 행 인덱스:", rows_with_one_na.index.tolist())

NA가 1개인 행 개수: 1
해당 행 인덱스: [199295]


In [11]:
train_df.loc[199295]

CustomerID           NaN
Age                  NaN
Gender               NaN
Tenure               NaN
Usage Frequency      NaN
Support Calls        NaN
Payment Delay        NaN
Subscription Type    NaN
Contract Length      NaN
Total Spend          NaN
Last Interaction     NaN
Churn                NaN
Name: 199295, dtype: object

In [12]:
train_df = train_df.drop(index=199295)

In [13]:
train_df.isna().sum(axis=1).value_counts()

0    440832
Name: count, dtype: int64

In [14]:
train_df.isnull().sum()

CustomerID           0
Age                  0
Gender               0
Tenure               0
Usage Frequency      0
Support Calls        0
Payment Delay        0
Subscription Type    0
Contract Length      0
Total Spend          0
Last Interaction     0
Churn                0
dtype: int64

na값이 없어진 것을 확인

# 3. 전처리

In [15]:
numeric_features = ['Age', 'Tenure', 'Usage Frequency', 'Support Calls', 'Payment Delay', 'Total Spend', 'Last Interaction']
categorical_features = ['Gender', 'Subscription Type', 'Contract Length']

수치형이랑 범주형 분리

In [16]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)


- 수치형 컬럼을 평균 0, 분산 1의 정규 분포로 스케일링
- 범주형 변수에 대해 원-핫 인코딩
- 마지막에 통합

# 4. 모델링 파이프라인

In [17]:
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20]
}

gs = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# 5. 학습

In [18]:
X_train = train_df.drop(columns=['CustomerID', 'Churn'])
y_train = train_df['Churn']

gs.fit(X_train, y_train)
print("Best parameters:\n", gs.best_params_)
print("Best CV accuracy:\n", gs.best_score_)

Best parameters:
 {'classifier__max_depth': None, 'classifier__n_estimators': 200}
Best CV accuracy:
 0.9993035899414047


# 6. 평가

In [19]:

y_pred_train = gs.predict(X_train)
print("Train Classification Report:\n", classification_report(y_train, y_pred_train))
print("Train Accuracy:\n", accuracy_score(y_train, y_pred_train))

Train Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    190833
         1.0       1.00      1.00      1.00    249999

    accuracy                           1.00    440832
   macro avg       1.00      1.00      1.00    440832
weighted avg       1.00      1.00      1.00    440832

Train Accuracy:
 1.0


# 6. 예측

In [21]:
test_df = pd.read_csv('C:/Users/eunseok/Desktop/vscode/data/customer_churn_test.csv')
X_test = test_df.drop(columns=['CustomerID'])
test_pred = gs.predict(X_test)

In [24]:
output = pd.DataFrame({'CustomerID': test_df['CustomerID'], 'Churn': test_pred})
output.head()

Unnamed: 0,CustomerID,Churn
0,1,1.0
1,2,1.0
2,3,1.0
3,4,1.0
4,5,1.0


test.csv로 저장

In [25]:
output.to_csv('test.csv', index=False)