# 5장 회원 탈퇴를 예측하는 테크닉 10

- 탈퇴/지속 회원 -> 의사결정 트리 -> 탈퇴 예측

- use_log_months.csv: 스포츠 센터 이용 이력 데이터(2018.4~2019.3)
- customer_join.csv: 이용 이력을 포함한 고객 데이터

In [1]:
import warnings
# Silence every warning for the rest of the session so cell output stays clean.
# NOTE(review): this blanket filter also hides pandas warnings (for example the
# chained-assignment ones) that can point at real problems; consider narrowing it.
warnings.filterwarnings('ignore')

## 41. 데이터를 읽어 들이고 이용 데이터를 수정하자.

In [2]:
import pandas as pd
# Customer table that already includes per-customer usage statistics.
customer = pd.read_csv('customer_join.csv')
# Usage log with one row per customer and month ("연월", "customer_id", "count").
uselog_months = pd.read_csv('use_log_months.csv')

In [3]:
# uselog: one row per customer and month, holding that month's usage count
# (count_0) next to the previous month's usage count (count_1).
year_months = list(uselog_months["연월"].unique())

# Collect the per-month frames and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
monthly_frames = []
for prev_month, cur_month in zip(year_months, year_months[1:]):
    # rename()/drop() return new frames, so the filtered slices of
    # uselog_months are never modified in place.
    current = uselog_months.loc[uselog_months["연월"] == cur_month].rename(
        columns={"count": "count_0"}   # count_0: usage count in the current month
    )
    previous = (
        uselog_months.loc[uselog_months["연월"] == prev_month]
        .drop(columns="연월")
        .rename(columns={"count": "count_1"})   # count_1: usage count one month earlier
    )
    # Left join: customers with no row in the previous month get NaN in count_1.
    monthly_frames.append(pd.merge(current, previous, on="customer_id", how="left"))

# With fewer than two months there is nothing to pair; keep an empty frame.
uselog = pd.concat(monthly_frames, ignore_index=True) if monthly_frames else pd.DataFrame()

uselog.head()

Unnamed: 0,연월,customer_id,count_0,count_1
0,201805,AS002855,5,4.0
1,201805,AS009373,4,3.0
2,201805,AS015233,7,
3,201805,AS015315,3,6.0
4,201805,AS015739,5,7.0


## 42. 탈퇴 전월의 탈퇴 고객 데이터를 작성하자.

In [4]:
# exit_uselog: usage rows of churned customers for the month before they left
from dateutil.relativedelta import relativedelta   # still needed by the later "period" cell

exit_customer = customer.loc[customer["is_deleted"] == 1].copy()   # churned customers only
exit_customer["end_date"] = pd.to_datetime(exit_customer["end_date"])

# exit_date = end_date minus one calendar month, computed for the whole column
# at once. Writing element by element through exit_customer["exit_date"].iloc[i]
# is chained assignment, which pandas does not guarantee to write back to the
# frame (and never does under Copy-on-Write).
exit_customer["exit_date"] = exit_customer["end_date"] - pd.DateOffset(months=1)

# Join key: the year-month (YYYYMM string) of the month before the membership ended.
exit_customer["연월"] = exit_customer["exit_date"].dt.strftime("%Y%m")
uselog["연월"] = uselog["연월"].astype(str)   # same dtype on both sides of the join
exit_uselog = pd.merge(uselog, exit_customer, on=["customer_id", "연월"], how="left")

print(len(uselog))
exit_uselog.head()

33851


Unnamed: 0,연월,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
0,201805,AS002855,5,4.0,,,,,NaT,,...,,,,,,,,,,
1,201805,AS009373,4,3.0,,,,,NaT,,...,,,,,,,,,,
2,201805,AS015233,7,,,,,,NaT,,...,,,,,,,,,,
3,201805,AS015315,3,6.0,,,,,NaT,,...,,,,,,,,,,
4,201805,AS015739,5,7.0,,,,,NaT,,...,,,,,,,,,,


In [5]:
# Keep only the rows that matched a churned customer; rows without a match
# have no "name" after the left join.
exit_uselog = exit_uselog.loc[exit_uselog["name"].notna()]
print(exit_uselog.shape[0])
# Number of distinct customers among the remaining rows.
print(exit_uselog["customer_id"].nunique(dropna=False))
exit_uselog.head()

1104
1104


Unnamed: 0,연월,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
19,201805,AS055680,3,3.0,XXXXX,C01,M,2018-03-01,2018-06-30,CA1,...,10500.0,일반,3.0,3.0,3.0,3.0,0.0,2018-06-30,3.0,2018-05-30 00:00:00
57,201805,AS169823,2,3.0,XX,C01,M,2017-11-01,2018-06-30,CA1,...,10500.0,일반,3.0,3.0,4.0,2.0,1.0,2018-06-30,7.0,2018-05-30 00:00:00
110,201805,AS305860,5,3.0,XXXX,C01,M,2017-06-01,2018-06-30,CA1,...,10500.0,일반,3.333333,3.0,5.0,2.0,0.0,2018-06-30,12.0,2018-05-30 00:00:00
128,201805,AS363699,5,3.0,XXXXX,C01,M,2018-02-01,2018-06-30,CA1,...,10500.0,일반,3.333333,3.0,5.0,2.0,0.0,2018-06-30,4.0,2018-05-30 00:00:00
147,201805,AS417696,1,4.0,XX,C03,F,2017-09-01,2018-06-30,CA1,...,6000.0,일반,2.0,1.0,4.0,1.0,0.0,2018-06-30,9.0,2018-05-30 00:00:00


## 43. 지속 회원의 데이터를 작성하자

In [6]:
# conti_uselog: usage rows joined with the customers who are still members
conti_customer = customer.loc[customer["is_deleted"] == 0]
conti_uselog = uselog.merge(conti_customer, how="left", on="customer_id")
print(len(conti_uselog))
# Rows with no matching continuing customer have no "name"; discard them.
conti_uselog = conti_uselog.loc[conti_uselog["name"].notna()]
print(len(conti_uselog))

33851
27422


- 탈퇴 데이터: 1104, 지속 데이터: 27422 -> 데이터 불균형 -> 언더샘플링으로 샘플 수 조정
- 데이터 섞고 중복 제거하기

In [7]:
# Shuffle all rows (frac=1), then keep the first row seen for each customer_id,
# which leaves one randomly chosen month per continuing member.
conti_uselog = (
    conti_uselog
    .sample(frac=1)
    .reset_index(drop=True)
    .drop_duplicates(subset="customer_id")
)
print(len(conti_uselog))
conti_uselog.head()

2842


Unnamed: 0,연월,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,class_name,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period
0,201902,IK967528,2,7.0,XXXX,C03,M,2017-01-01,,CA1,...,야간,6000.0,일반,5.416667,5.5,7.0,2.0,1.0,2019-04-30,27.0
1,201806,HI599892,5,9.0,XXXXX,C03,M,2017-02-01,,CA1,...,야간,6000.0,일반,5.583333,5.0,9.0,3.0,1.0,2019-04-30,26.0
2,201806,PL500772,5,7.0,XXXXX,C02,F,2017-02-01,,CA1,...,주간,7500.0,일반,6.0,6.0,9.0,4.0,1.0,2019-04-30,26.0
3,201902,IK676041,6,2.0,XXXXX,C01,F,2016-08-01,,CA1,...,종일,10500.0,일반,5.25,5.5,7.0,2.0,1.0,2019-04-30,32.0
4,201806,TS124176,10,,XXXXX,C02,F,2018-06-04,,CA2,...,주간,7500.0,입회비반액할인,7.8,8.0,10.0,5.0,1.0,2019-04-30,10.0


In [8]:
# predict_data: continuing-member rows followed by churned-member rows,
# re-indexed from 0 so later cells can address rows by position.
predict_data = pd.concat([conti_uselog, exit_uselog],ignore_index=True)
print(len(predict_data))
predict_data.head()

3946


Unnamed: 0,연월,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
0,201902,IK967528,2,7.0,XXXX,C03,M,2017-01-01,NaT,CA1,...,6000.0,일반,5.416667,5.5,7.0,2.0,1.0,2019-04-30,27.0,
1,201806,HI599892,5,9.0,XXXXX,C03,M,2017-02-01,NaT,CA1,...,6000.0,일반,5.583333,5.0,9.0,3.0,1.0,2019-04-30,26.0,
2,201806,PL500772,5,7.0,XXXXX,C02,F,2017-02-01,NaT,CA1,...,7500.0,일반,6.0,6.0,9.0,4.0,1.0,2019-04-30,26.0,
3,201902,IK676041,6,2.0,XXXXX,C01,F,2016-08-01,NaT,CA1,...,10500.0,일반,5.25,5.5,7.0,2.0,1.0,2019-04-30,32.0,
4,201806,TS124176,10,,XXXXX,C02,F,2018-06-04,NaT,CA2,...,7500.0,입회비반액할인,7.8,8.0,10.0,5.0,1.0,2019-04-30,10.0,


## 44. 예측할 달의 재적 기간을 작성하자

In [9]:
# "period": whole months of membership from start_date up to the month being
# predicted (now_date = first day of the row's year-month).
predict_data["period"] = 0   # created first so the column keeps its original position
predict_data["now_date"] = pd.to_datetime(predict_data["연월"], format="%Y%m")
predict_data["start_date"] = pd.to_datetime(predict_data["start_date"])

# Compute every value first and assign the column in one step.
# predict_data["period"][i] = ... is chained assignment: pandas does not
# guarantee it writes back to the frame, and under Copy-on-Write it never does.
month_deltas = [
    relativedelta(now, start)
    for now, start in zip(predict_data["now_date"], predict_data["start_date"])
]
predict_data["period"] = [int(d.years * 12 + d.months) for d in month_deltas]
predict_data.head()

Unnamed: 0,연월,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date,period,now_date
0,201902,IK967528,2,7.0,XXXX,C03,M,2017-01-01,NaT,CA1,...,5.416667,5.5,7.0,2.0,1.0,2019-04-30,27.0,,25,2019-02-01
1,201806,HI599892,5,9.0,XXXXX,C03,M,2017-02-01,NaT,CA1,...,5.583333,5.0,9.0,3.0,1.0,2019-04-30,26.0,,16,2018-06-01
2,201806,PL500772,5,7.0,XXXXX,C02,F,2017-02-01,NaT,CA1,...,6.0,6.0,9.0,4.0,1.0,2019-04-30,26.0,,16,2018-06-01
3,201902,IK676041,6,2.0,XXXXX,C01,F,2016-08-01,NaT,CA1,...,5.25,5.5,7.0,2.0,1.0,2019-04-30,32.0,,30,2019-02-01
4,201806,TS124176,10,,XXXXX,C02,F,2018-06-04,NaT,CA2,...,7.8,8.0,10.0,5.0,1.0,2019-04-30,10.0,,0,2018-06-01


## 45. 결측치를 제거하자

In [10]:
# Count the missing values in each column
predict_data.isna().sum()

Unnamed: 0,0
연월,0
customer_id,0
count_0,0
count_1,252
name,0
class,0
gender,0
start_date,0
end_date,2842
campaign_id,0


- count_1, end_date, exit_date에 결측치가 있다.
- end_date, exit_date는 탈퇴 고객만 있으므로 유지회원이 결측치가 된다.

In [11]:
# Drop the rows that have no previous-month usage count (count_1)
predict_data = predict_data.dropna(subset=["count_1"])
predict_data.isna().sum()

Unnamed: 0,0
연월,0
customer_id,0
count_0,0
count_1,0
name,0
class,0
gender,0
start_date,0
end_date,2642
campaign_id,0


## 46. 문자열 변수를 처리할 수 있게 가공하자

> 문자열 데이터는 어떻게 처리할까?

- 더미 변수를 만들자!
  -> get_dummies

  - 설명변수: count_1(1개월 전의 이용 횟수) & 카테고리 변수 campaign_name, class_name, gender & routine_flg(정기 이용 여부 플래그) & period(재적 기간)
  - 목적변수: is_deleted(탈퇴 플래그)

- 분류는 이산치를 목적 변수(ex. 탈퇴/유지)로 사용한다.



In [12]:
# Columns used by the model: the explanatory variables plus the
# target flag "is_deleted" (listed last).
target_col = [
    "campaign_name",
    "class_name",
    "gender",
    "count_1",
    "routine_flg",
    "period",
    "is_deleted",
]
predict_data = predict_data.loc[:, target_col]
predict_data.head()

Unnamed: 0,campaign_name,class_name,gender,count_1,routine_flg,period,is_deleted
0,일반,야간,M,7.0,1.0,25,0.0
1,일반,야간,M,9.0,1.0,16,0.0
2,일반,주간,F,7.0,1.0,16,0.0
3,일반,종일,F,2.0,1.0,30,0.0
5,일반,야간,M,5.0,1.0,33,0.0


In [13]:
# One-hot encode the string columns (campaign_name, class_name, gender);
# the numeric columns are passed through unchanged.
predict_data = pd.get_dummies(predict_data)
predict_data.head()

Unnamed: 0,count_1,routine_flg,period,is_deleted,campaign_name_일반,campaign_name_입회비무료,campaign_name_입회비반액할인,class_name_야간,class_name_종일,class_name_주간,gender_F,gender_M
0,7.0,1.0,25,0.0,True,False,False,True,False,False,False,True
1,9.0,1.0,16,0.0,True,False,False,True,False,False,False,True
2,7.0,1.0,16,0.0,True,False,False,False,False,True,True,False
3,2.0,1.0,30,0.0,True,False,False,False,True,False,True,False
5,5.0,1.0,33,0.0,True,False,False,True,False,False,False,True




> 더미 변수 주의!

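범주가 k개인 문자열 변수는 더미 변수 k-1개만으로 표현할 수 있다. 남은 더미 변수가 모두 0(False)이면 지운 범주를 뜻하므로, 변수마다 더미 열을 하나씩 지워도 정보가 손실되지 않는다.

- campaign_name_일반 삭제: 입회비무료·입회비반액할인이 모두 False이면 일반
- class_name_야간 삭제: 종일·주간이 모두 False이면 야간
- gender_M 삭제: gender_F가 False이면 M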




In [14]:
# Remove one dummy column per categorical variable; the removed category is
# then represented by all of that variable's remaining dummies being False.
predict_data = predict_data.drop(
    columns=["campaign_name_일반", "class_name_야간", "gender_M"]
)
predict_data.head()

Unnamed: 0,count_1,routine_flg,period,is_deleted,campaign_name_입회비무료,campaign_name_입회비반액할인,class_name_종일,class_name_주간,gender_F
0,7.0,1.0,25,0.0,False,False,False,False,False
1,9.0,1.0,16,0.0,False,False,False,False,False
2,7.0,1.0,16,0.0,False,False,False,True,True
3,2.0,1.0,30,0.0,False,False,True,False,True
5,5.0,1.0,33,0.0,False,False,False,False,False


## 47. 의사결정 트리를 사용해서 탈퇴 예측 모델을 구축하자

In [15]:
# Decision-tree classification model
from sklearn.tree import DecisionTreeClassifier
import sklearn.model_selection

# NOTE(review): `exit` shadows the builtin of the same name; it is kept because
# a later cell reuses both `exit` and `conti`.
exit = predict_data.loc[predict_data["is_deleted"] == 1]
# Undersample: draw as many continuing members as there are churned members.
conti = predict_data.loc[predict_data["is_deleted"] == 0].sample(len(exit))

# X starts as churned + continuing rows; pop() removes the target column from X
# and returns it, leaving only the feature columns behind.
X = pd.concat([exit, conti], ignore_index=True)
y = X.pop("is_deleted")

# Split into training and test sets (train_test_split defaults).
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y)

model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)             # fit on the training split
y_test_pred = model.predict(X_test)     # predict the held-out split
print(y_test_pred)

[1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1.
 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1. 0.
 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0.
 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0.
 0. 1. 1. 0. 1. 1. 0. 0. 0. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0.
 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0.
 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 0. 1. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1.
 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1. 0. 1. 1.
 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0.
 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1.
 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1. 1. 0.
 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1.
 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1.

In [16]:
# Put the true labels and the predicted labels side by side
results_test = pd.DataFrame({"y_test":y_test ,"y_pred":y_test_pred })
results_test.head()

Unnamed: 0,y_test,y_pred
255,1.0,1.0
2037,0.0,0.0
1869,0.0,1.0
371,1.0,1.0
1524,0.0,0.0


## 48. 예측 모델을 평가하고 모델을 튜닝해보자

In [17]:
# Accuracy = number of correct predictions / number of predictions
data_count = results_test.shape[0]                                        # total rows
correct = int((results_test["y_test"] == results_test["y_pred"]).sum())   # rows predicted correctly
score_test = correct / data_count
print(score_test)

0.8650190114068441


In [18]:
print(model.score(X_test, y_test))    # accuracy on the test split
print(model.score(X_train, y_train))    # accuracy on the training split

0.8650190114068441
0.9746514575411914


- 0.97(train) >> 0.87(test): 과적합

> 과적합 해결

- 데이터 늘리기, 변수 재검토, 모델 파라미터 변경
- 의사결정 트리의 경우: 트리의 깊이 얕게 조절(max_depth)

In [19]:
# Rebuild the feature matrix and target, re-split, and refit with a shallower tree.
X = pd.concat([exit, conti], ignore_index=True)
y = X.pop("is_deleted")   # removes the target column from X and returns it
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y)

# Limit the depth to 5 to reduce overfitting.
model = DecisionTreeClassifier(max_depth=5, random_state=0)
model.fit(X_train, y_train)

# Accuracy on the test split first, then on the training split.
for features, labels in ((X_test, y_test), (X_train, y_train)):
    print(model.score(features, labels))

0.9201520912547528
0.9169835234474017


## 49. 모델에 기여하는 변수를 확인하자

- 기여도 확인: model.feature_importances_

In [20]:
# Pair each feature column with the fitted tree's feature importance.
# NOTE(review): the column is named "coefficient", but the values are
# model.feature_importances_, not regression coefficients.
importance = pd.DataFrame({"feature_names":X.columns, "coefficient":model.feature_importances_})
importance

Unnamed: 0,feature_names,coefficient
0,count_1,0.338352
1,routine_flg,0.12744
2,period,0.529224
3,campaign_name_입회비무료,0.0
4,campaign_name_입회비반액할인,0.0
5,class_name_종일,0.002153
6,class_name_주간,0.002463
7,gender_F,0.000368


> 기여도가 높은 변수: 1개월 전의 이용 횟수, 정기 이용 여부, 재적 기간

## 50. 회원 탈퇴를 예측하자

In [21]:
# 예측하고 싶은 회원 데이터
count_1 = 3
routing_flg = 1
period = 10
campaign_name = "입회비무료"
class_name = "종일"
gender = "M"

In [22]:
# Encode the categorical inputs in the same layout as the training columns:
#   count_1, routine_flg, period,
#   campaign_name_입회비무료, campaign_name_입회비반액할인,
#   class_name_종일, class_name_주간, gender_F
# The dropped dummies (일반 / 야간 / M) are the all-zero baselines.
campaign_name_codes = {
    "입회비무료": [1, 0],
    "입회비반액할인": [0, 1],
    # Spelling used by the earlier version of this cell; it does not occur in
    # the data but is accepted so existing inputs keep working.
    "입회비반값할인": [0, 1],
    "일반": [0, 0],
}
class_name_codes = {
    "종일": [1, 0],
    "주간": [0, 1],
    "야간": [0, 0],
}
gender_codes = {
    "F": [1],
    "M": [0],
}

# Fail loudly on an unknown category instead of leaving a *_list variable
# undefined (or stale from a previous run of the cell).
if campaign_name not in campaign_name_codes:
    raise ValueError(f"unknown campaign_name: {campaign_name!r}")
if class_name not in class_name_codes:
    raise ValueError(f"unknown class_name: {class_name!r}")
if gender not in gender_codes:
    raise ValueError(f"unknown gender: {gender!r}")

campaign_name_list = campaign_name_codes[campaign_name]
class_name_list = class_name_codes[class_name]
gender_list = gender_codes[gender]

# input_data: numeric features first, then the dummy bits in column order
input_data = [count_1, routing_flg, period]
input_data.extend(campaign_name_list)
input_data.extend(class_name_list)
input_data.extend(gender_list)

In [23]:
print(model.predict([input_data]))            # predicted class: 0 (continues) / 1 (churns)
print(model.predict_proba([input_data]))      # predicted probability of class 0 and class 1

[1.]
[[0.02985075 0.97014925]]


- 탈퇴가 예상된다. 약 97%(0.97)의 확률로 탈퇴라고 예측한다.