In [2]:
# -----------------------------
# 문제정의
# -----------------------------
# 고객이 계정을 유지할지, 아니면 해지(이탈)할지를 예측

# -------------
# 컬럼해석
# -------------
# CustomerID: 고객 고유 식별자
# Surname: 고객 성(Last name)
# CreditScore: 신용 점수(숫자)
# Geography: 거주 국가(France/Spain/Germany)
# Gender: 성별(Male/Female)
# Age: 나이
# Tenure: 은행 이용 기간(연수)
# Balance: 계좌 잔액
# NumOfProducts: 이용 중인 은행 상품 수(예: 예금, 카드 등)
# HasCrCard: 신용카드 보유 여부(1=예, 0=아니오)
# IsActiveMember: 활성 회원 여부(1=예, 0=아니오)
# EstimatedSalary: 추정 급여(연봉)
# Exited: 이탈 여부(1=이탈, 0=유지) (대회 타깃 변수)

# 라이브러리 가져오기

In [3]:
# 판다스 라이브러리
import pandas as pd

# 데이터 불러오기
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

# 탐색적 데이터 분석(EDA)

In [4]:
train.head()
# y : Exited
# test.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [5]:
# 데이터 크기 확인
train.shape, test.shape

((165034, 14), (110023, 13))

In [6]:
# 결측치 확인
train.isnull().sum()
test.isnull().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64

In [7]:
# 자료형(타입)

In [8]:
test.info()
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110023 entries, 0 to 110022
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               110023 non-null  int64  
 1   CustomerId       110023 non-null  int64  
 2   Surname          110023 non-null  object 
 3   CreditScore      110023 non-null  int64  
 4   Geography        110023 non-null  object 
 5   Gender           110023 non-null  object 
 6   Age              110023 non-null  float64
 7   Tenure           110023 non-null  int64  
 8   Balance          110023 non-null  float64
 9   NumOfProducts    110023 non-null  int64  
 10  HasCrCard        110023 non-null  float64
 11  IsActiveMember   110023 non-null  float64
 12  EstimatedSalary  110023 non-null  float64
dtypes: float64(5), int64(5), object(3)
memory usage: 10.9+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165034 entries, 0 to 165033
Data columns (total 14 columns):
 #   Colum

In [9]:
# 수치형 컬럼 통계값 확인
test.describe()
train.describe()

# 범주형 컬럼 통계값 확인
test.describe(include='O')
train.describe(include='O')

#label(traget)별 개수 확인
test['IsActiveMember'].value_counts()
train['IsActiveMember'].value_counts()

IsActiveMember
0.0    82885
1.0    82149
Name: count, dtype: int64

# 데이터 전처리

In [10]:
# 이상치 처리
# 문제없음

In [17]:
# 라벨(정답, y값)분리
y_train = train.pop("Exited")
# train의 Exited컬럼이 분리되면서 train의 컬럼 개수는 14->13개로 줄음

In [12]:
# 불균형
print(test.shape, train.shape)

(110023, 13) (165034, 14)


In [13]:
# 라벨 인코딩

from sklearn.preprocessing import LabelEncoder
cols = train.columns[train.dtypes == object] # 범주형 -> 수치형
for col in cols:
    le = LabelEncoder()

    #피팅되는 경우가 안 맞는 경우 각 행마다 이동할 때 train,text행을 합치기
    le.fit(pd.concat([train[col],test[col]] ,axis=0))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])
train
test

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,1581,586,0,0,23.0,2,0.00,2,0.0,1.0,160976.75
1,165035,15782418,1935,683,0,0,46.0,2,0.00,1,1.0,0.0,72549.27
2,165036,15807120,1331,656,0,0,34.0,7,0.00,2,1.0,0.0,138882.09
3,165037,15808905,1955,681,0,1,36.0,8,0.00,1,1.0,0.0,113931.57
4,165038,15607314,1153,752,1,1,38.0,10,121263.62,1,1.0,0.0,139431.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110018,275052,15662091,2057,570,2,1,29.0,7,116099.82,1,1.0,1.0,148087.62
110019,275053,15774133,620,575,0,0,36.0,4,178032.53,1,1.0,1.0,42181.68
110020,275054,15728456,434,712,0,1,31.0,2,0.00,2,1.0,0.0,16287.38
110021,275055,15687541,2834,709,0,0,32.0,3,0.00,1,1.0,1.0,158816.58


In [15]:
# y인코딩은 굳이 필요없음! 이미 0, 1 상태

In [18]:
#---------------------------------------
# 검증 데이터 분할
#---------------------------------------
from sklearn.model_selection import train_test_split

# 학습 , 검증,  학습   , 검증
X_train, X_val, y_train, y_val = train_test_split(train,
                                                  y_train,
                                                  test_size=0.2,
                                                  random_state=0)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((132027, 13), (33007, 13), (132027,), (33007,))

In [None]:
#---------------------------------------
# 모델 학습
#---------------------------------------

In [26]:
# 랜덤포레스트
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
pred1=rf.predict_proba(X_val)
#f1score나 정확도 구할 때 사용
pred1_label=rf.predict(X_val)

# LightGBM
import lightgbm as lgb
lgbmc = lgb.LGBMClassifier(random_state=0, verbose=-1)
lgbmc.fit(X_train, y_train)
pred2 = lgbmc.predict_proba(X_val)
pred2_label=rf.predict(X_val)

In [24]:
print(pred1[:,1])
print()
print(pred2[:,1])

[0.11 0.21 0.75 ... 0.28 0.05 0.36]

[0.06442807 0.14527317 0.68723293 ... 0.23911734 0.02022277 0.30957594]


In [None]:
#---------------------------------------
# 성능 평가
#---------------------------------------

In [27]:
#-----------------
#랜덤포레스트 성능 수치
#-----------------
from sklearn.metrics import roc_auc_score
roc_auc = roc_auc_score(y_val, pred1[:,1])
print('roc_auc:', roc_auc)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_val, pred1_label)
print('accuracy_score:', accuracy)

from sklearn.metrics import f1_score
f1 = f1_score(y_val, pred1_label, pos_label=1) # 이탈한 고객 = 1
print('f1_score:', f1)

# 파라미터 값을 적절히 수정하는 튜닝 or 데이터 균형을 맞추기 위한 샘플 삽입을 통해 균형을
# 맞추는 것을 추천!

roc_auc: 0.8801463308572731
accuracy_score: 0.8627260884054897
f1_score: 0.6168935486598461


In [31]:
#-----------------
#LIGHTGBM 성능수치
#-----------------
from sklearn.metrics import roc_auc_score
roc_auc2_lgm = roc_auc_score(y_val, pred2[:,1])
print('roc_auc:', roc_auc2_lgm)

from sklearn.metrics import accuracy_score
accuracy2_lgm = accuracy_score(y_val, pred2_label)
print('accuracy_score:', accuracy2_lgm)

from sklearn.metrics import f1_score
f1_lgm = f1_score(y_val, pred2_label, pos_label=1) # 이탈한 고객 = 1
print('f2_score:', f1_lgm)

roc_auc: 0.8902162094130004
accuracy_score: 0.8627260884054897
f2_score: 0.6168935486598461


In [None]:
#---------------------------------------
# 파일로 저장
#---------------------------------------

In [35]:
pred = lgbmc.predict_proba(test)
# pred #테스트를 넣었을 때 예측값 ,를 기준으로 오른쪽이 이탈확률

# lgbmc.classes_ #라벨에 있는 클래스들

submit = pd.DataFrame({'pred':pred[:,1]}) # 양성인 확률만 뽑아내서
submit.to_csv("result.csv", index=False) #파일로 제출

pd.read_csv("result.csv")

Unnamed: 0,pred
0,0.024092
1,0.814112
2,0.027926
3,0.220899
4,0.341188
...,...
110018,0.036409
110019,0.165881
110020,0.017981
110021,0.171473
