# Create_Model.py 개요
## 1.1사용함수 정리
* pandas : DataFrame 형식 및 연산
* numpy : 기본 연산자 
* xgboost,lightgbm,extraTree,randomForest : stacking시 사용하는 1-level base 모델 및 2-level 모델
* veckstack : stacking 함수
* joblib : 모델 저장
* sklearn : 변수 scaling 
* keras : 생존일 예측시 사용하는 신경망 모델.


In [21]:
%pip install vecstack
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from vecstack import StackingTransformer
import joblib
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
import keras.backend as K
from sklearn.pipeline import Pipeline



## 1.2 데이터 로드
* 전처리 단계에서 사용한 데이터 로드
* 데이터 불균형 해소를 위해 undersampling (이탈 vs 생존)

In [0]:
## train_preprocess 데이터 로드
data=  pd.read_csv('preprocess/train_preprocess_1.csv') 


## undersampling
data.loc[data['churn']==1].shape 
a_1 = data.loc[data['churn']==1]
a_0 = data.loc[data['churn']==0].sample(data.loc[data['churn']==1].shape [0])
b = pd.concat([a_1,a_0])

## input / label 분리
y_train = b[['acc_id','churn','churn_week','survival_time']]
x_train = b.iloc[:,6:]

###  범주형 자료 encoding

In [19]:
def oneHotEncode(df,colNames):
    for col in colNames:
        if( df[col].dtype == np.dtype('object')):
            dummies = pd.get_dummies(df[col],prefix=col)
            df = pd.concat([df,dummies],axis=1)

            #drop the encoded column
            df.drop([col],axis = 1 , inplace=True)
    return df

def encoding_cate(x):

    x['pay_yn'] = x['pay_yn'].astype(object)
    x['most_trade_item'] = x['most_trade_item'].astype(object)
    x['most_trade_time'] = x['most_trade_time'].astype(object)

    print('There were {} columns before encoding categorical features'.format(x.shape[1]))
    combined = oneHotEncode(x, ['pay_yn','most_trade_time','most_trade_item'])
    print('There are {} columns after encoding categorical features'.format(combined.shape[1]))
    return combined

## 범주형 함수 encoding 적용
train_combined = encoding_cate(x_train)



There were 129 columns before encoding categorical features
There are 140 columns after encoding categorical features


## 2.1 이탈 유무 binary 분류
### 2.1.1 stacking에 적용할 1-level 함수 사용
* XGBoost, LightGBM, ExtraTree, RandomForest 를 기본함수로 stacking 첫번째 단계에 사용

In [20]:
xgb = XGBClassifier(max_depth=5, n_estimators=100, objective='reg:logistic')
lgbm = LGBMClassifier(max_depth=5,n_estimators=100, objective='binary',num_boost_round=10)
ext = ExtraTreesClassifier(n_estimators=100)
rf =  RandomForestClassifier(max_depth=5, n_estimators=100)


models = [('xgb',xgb),('lgbm',lgbm),('ext',ext),('rf',rf)]
stack= StackingTransformer(estimators = models, regression=False, n_folds=5,stratified=True, needs_proba= True,
       shuffle = True, verbose=2)
stack = stack.fit(train_combined, y_train.churn)
s_train = stack.transform(train_combined)

task:         [classification]
n_classes:    [2]
metric:       [log_loss]
variant:      [A]
n_estimators: [4]

estimator  0: [xgb: XGBClassifier]
    fold  0:  [0.48997722]
    fold  1:  [0.49024277]
    fold  2:  [0.49583155]
    fold  3:  [0.50193583]
    fold  4:  [0.48615832]
    ----
    MEAN:     [0.49282914] + [0.00550034]

estimator  1: [lgbm: LGBMClassifier]




    fold  0:  [0.56074173]




    fold  1:  [0.56050706]




    fold  2:  [0.56293608]




    fold  3:  [0.56615312]




    fold  4:  [0.56003864]
    ----
    MEAN:     [0.56207533] + [0.00226960]

estimator  2: [ext: ExtraTreesClassifier]
    fold  0:  [0.48251265]
    fold  1:  [0.46923722]
    fold  2:  [0.47633424]
    fold  3:  [0.47334179]
    fold  4:  [0.47254335]
    ----
    MEAN:     [0.47479385] + [0.00447218]

estimator  3: [rf: RandomForestClassifier]
    fold  0:  [0.53923080]
    fold  1:  [0.53580899]
    fold  2:  [0.54259671]
    fold  3:  [0.54461350]
    fold  4:  [0.53673624]
    ----
    MEAN:     [0.53979725] + [0.00336541]

Train set was detected.
Transforming...

estimator  0: [xgb: XGBClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    model from fold  4: done
    ----
    DONE

estimator  1: [lgbm: LGBMClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    model from fold  4: done
    ----
    DONE

estimator  2: [ext: E

### 2.1.2 stacking 2-level 함수 실행.




In [0]:
churn_yn_model = XGBClassifier(max_depth=5, learning_rate=0.08, n_estimators=1000, objective = 'reg:logistic',nthread=4).fit(s_train, y_train.churn)

### 2.1.3stacking 함수 저장

In [0]:
steps = [('stack', stack),
         ('final_estimator', churn_yn_model)]
pipe = Pipeline(steps)
pipe = pipe.set_params(stack__verbose=0)
_ = joblib.dump(pipe, 'model/churn_yn_model.pkl')


## 2.2 생존일자 Multi-class 분류 & 회귀
* 실제 이탈한 사람들만 학습데이터로 사용.

In [9]:
## 실제 이탈한 사람들을 대상으로 input을 구성
y_train = a_1[['acc_id','churn_week','survival_time']]
x_train = a_1.iloc[:,6:]

train_combined = encoding_cate(x_train)

There were 129 columns before encoding categorical features
There are 140 columns after encoding categorical features


### 2.2.1 multi-class 분류
* 이탈 주차(1주~9주)를 분류하는 모델 학습

In [0]:
rg_xgb_model = XGBClassifier(max_depth= 5,nthread=4, learning_rate=0.08, objective= 'multi:softmax',n_estimators=1000,gamma=5).fit(train_combined, y_train.churn_week)

### 2.2.2 신경망 회귀
* 신경망으로 이탈날짜를 회귀모형으로 구성
* 0~1 사이로 MinMax scaling을 input에 적용
* loss함수는 실제 score function에서 사용하는, 날짜 오차에 대한 감소폭을 이용하여 구성

In [13]:
#def custom_loss(y_true,y_pred):
 # d = y_true-y_pred
  #score_max = 30
  #score = 30*K.exp(-(K.square(d)/450))
  #loss = score_max - score
  #return K.sum(loss)
  
scaler =MinMaxScaler()
scaler.fit(train_combined)
nn_x_train = scaler.transform(train_combined)
  
model_st = Sequential()
model_st.add(Dense(90,input_dim = 140 ,activation='relu'))
model_st.add(Dense(90, activation='relu'))
model_st.add(Dense(1))
model_st.compile(optimizer='Adam',
              loss= 'mean_squared_error',
              metrics=['mae'])

nn_model_st = model_st.fit(nn_x_train, y_train.survival_time, epochs=300, batch_size=10)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

### 2.2.3 모델 저장
* 학습된 모델 저장.

In [14]:
import joblib
filename1 = 'model/churn_rg_model_1.sav'
joblib.dump(rg_xgb_model, filename1)

filename2 = 'model/churn_rg_model_2.sav'
joblib.dump(model_st, filename2)

['/content/drive/My Drive/churn_rg_model_2_mse.sav']

# 3. 과금 모형 구성.
## 3.1과금유무 binary 모델 생성
* 데이터의 불균형 해소를 위한 undersampling (과금 vs 비과금)


In [0]:
a_1 = data.loc[data['label_spent']==1].sample(data.loc[data['label_spent']==0].shape[0])
a_0 = data.loc[data['label_spent']==0]
b = pd.concat([a_1,a_0])

#%% label / input 분리
y_train = b[['acc_id','label_spent','amount_spent','survival_time']]
x_train = b.iloc[:,6:]

* 범주형 변수 encoding
* 과금 예측시에는 survival_time을 변수로 사용.

In [16]:
train_combined = encoding_cate(x_train)
train_combined['survival_time'] = y_train['survival_time']

There were 129 columns before encoding categorical features
There are 140 columns after encoding categorical features


### 3.1.1 stacking에 적용할 1-level 함수 사용
* XGBoost, LightGBM, ExtraTree, RandomForest 를 기본함수로 stacking 첫번째 단계에 사용

In [0]:
models = [('xrg' ,XGBClassifier(max_depth=5, n_estimators=100, objective='reg:logistic')),
       ('lgbm',LGBMClassifier(max_depth=5,n_estimators=100, objective='binary',num_boost_round=10)),
       ('ext', ExtraTreesClassifier(n_estimators=100)),
       ('rf', RandomForestClassifier(max_depth=5, n_estimators=100)) ]

stack= StackingTransformer(estimators = models, regression=False, n_folds=5,stratified=True, needs_proba= True,
       shuffle = True, verbose=2)
stack = stack.fit(train_combined, y_train.label_spent)
s_train = stack.transform(train_combined)



task:         [classification]
n_classes:    [2]
metric:       [log_loss]
variant:      [A]
n_estimators: [4]

estimator  0: [xrg: XGBClassifier]
    fold  0:  [0.32471779]
    fold  1:  [0.30348733]
    fold  2:  [0.32839493]
    fold  3:  [0.31110876]
    fold  4:  [0.31700718]
    ----
    MEAN:     [0.31694320] + [0.00901878]

estimator  1: [lgbm: LGBMClassifier]




    fold  0:  [0.42345955]




    fold  1:  [0.41603846]




    fold  2:  [0.43077548]




    fold  3:  [0.42107366]




    fold  4:  [0.42226754]
    ----
    MEAN:     [0.42272294] + [0.00475386]

estimator  2: [ext: ExtraTreesClassifier]
    fold  0:  [0.40037844]
    fold  1:  [0.33023435]
    fold  2:  [0.36050331]
    fold  3:  [0.35145404]
    fold  4:  [0.34701353]
    ----
    MEAN:     [0.35791673] + [0.02339508]

estimator  3: [rf: RandomForestClassifier]
    fold  0:  [0.38289725]
    fold  1:  [0.37066163]
    fold  2:  [0.39117292]
    fold  3:  [0.38396551]
    fold  4:  [0.38629735]
    ----
    MEAN:     [0.38299893] + [0.00679481]

Train set was detected.
Transforming...

estimator  0: [xrg: XGBClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    model from fold  4: done
    ----
    DONE

estimator  1: [lgbm: LGBMClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    model from fold  4: done
    ----
    DONE

estimator  2: [ext: E

### 3.1.2 stacking 2-level 함수 실행.

In [0]:
spend_yn_model = XGBClassifier(max_depth=5, learning_rate=0.08, n_estimators=1000, objective = 'reg:logistic',nthread=4).fit(s_train, y_train.label_spent)

### 3.1.3 과금 유무 모델 저장

In [0]:
steps = [('stack', stack),
         ('final_estimator', spend_yn_model)]
pipe = Pipeline(steps)
_ = joblib.dump(pipe, 'model/spend_yn_model.pkl')

## 3.2 과금 회귀 모형 구성


In [0]:
y_train = a_1[['acc_id','label_spent','amount_spent','survival_time']]
x_train = a_1.iloc[:,6:]

train_combined = encoding_cate(x_train)
train_combined['survival_time'] = y_train['survival_time']

There were 129 columns before encoding categorical features
There are 140 columns after encoding categorical features


### 3.2.1 신경망 학습
* custom-loss 함수 적용. 과소평가시 loss가 더 큰 비대칭적인 loss함수.

In [0]:
def custom_loss(y_true,y_pred):
    d = y_true-y_pred
    bool_idx_1 = K.greater(d,0)
    score_max = (20*y_true) - 0.3*y_true
    score_under= (((10/9)*(y_pred/y_true)-1/9)*20*y_true) - 0.3*y_pred
    score_over = (20*y_true) - 0.3*y_pred
    loss1 = score_max - score_under
    loss2 = score_max - score_over
    loss = K.switch(bool_idx_1,loss1,loss2)
    return K.sum(loss)

scaler =MinMaxScaler()
scaler.fit(train_combined)
nn_x_train = scaler.transform(train_combined)

model_as = Sequential()
model_as.add(Dense(100, input_dim=141, activation='relu'))
model_as.add(Dense(100, activation='relu'))
model_as.add(Dense(1))
model_as.compile(optimizer='adam',
              loss=custom_loss, 
              metrics=['mae'])
model_as.summary()

nn_model = model_as.fit(nn_x_train, y_train.amount_spent, epochs=150, batch_size=10)


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 100)               14200     
_________________________________________________________________
dense_8 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 101       
Total params: 24,401
Trainable params: 24,401
Non-trainable params: 0
_________________________________________________________________
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150

### 3.2.2신경망 모델 저장.

In [0]:
filename = 'model/spend_rg_model.sav'
joblib.dump(model_as, filename)

['/content/drive/My Drive/spend_rg_model.sav']