In [222]:
import pandas as pd
import os
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
from gensim.models import Word2Vec
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import TimeSeriesSplit
import warnings
warnings.filterwarnings('ignore')

In [223]:
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
import pylab as pl
from itertools import cycle

# 1.数据预处理

In [224]:
# Read the labelled training file and the unlabelled test file, then stack
# them (training rows first) into one frame with a fresh 0..n-1 index.
train, test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))
data = pd.concat([train, test], axis=0, ignore_index=True)

In [225]:
train

Unnamed: 0,客户ID,地理区域,是否双频,是否翻新机,当前手机价格,手机网络功能,婚姻状况,家庭成人人数,信息库匹配,预计收入,...,客户生命周期内平均月费用,客户生命周期内的平均每月使用分钟数,客户整个生命周期内的平均每月通话次数,过去三个月的平均每月使用分钟数,过去三个月的平均每月通话次数,过去三个月的平均月费用,过去六个月的平均每月使用分钟数,过去六个月的平均每月通话次数,过去六个月的平均月费用,是否流失
0,0,7,0,-1,181,0,2,0,0,3,...,24,286,91,351,121,23,303,101,25,0
1,1,13,1,0,1399,0,3,0,0,0,...,44,447,190,483,199,40,488,202,44,1
2,2,14,1,0,927,0,2,4,0,6,...,48,183,79,271,95,71,209,77,54,0
3,3,1,0,0,232,0,3,-1,1,-1,...,42,303,166,473,226,72,446,219,65,1
4,4,0,-1,0,699,0,1,2,0,3,...,36,119,24,88,15,35,106,21,37,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,149995,10,1,0,1350,0,3,0,0,0,...,156,474,160,239,80,74,346,122,83,1
149996,149996,6,1,0,542,0,3,-1,1,-1,...,52,968,208,1158,257,58,1307,261,57,0
149997,149997,15,1,0,1300,0,1,2,0,6,...,39,504,205,544,203,45,531,205,47,1
149998,149998,12,1,0,1399,0,4,1,0,-1,...,91,685,249,233,140,94,432,236,97,1


# 2.训练数据/测试数据准备

In [226]:
# Model inputs: every column except the target ('是否流失') and the customer
# identifier ('客户ID'), kept in their original column order.
non_feature_columns = ('是否流失', '客户ID')
features = [column for column in data.columns if column not in non_feature_columns]
features

['地理区域',
 '是否双频',
 '是否翻新机',
 '当前手机价格',
 '手机网络功能',
 '婚姻状况',
 '家庭成人人数',
 '信息库匹配',
 '预计收入',
 '信用卡指示器',
 '当前设备使用天数',
 '在职总月数',
 '家庭中唯一订阅者的数量',
 '家庭活跃用户数',
 '新手机用户',
 '信用等级代码',
 '平均月费用',
 '每月平均使用分钟数',
 '平均超额使用分钟数',
 '平均超额费用',
 '平均语音费用',
 '数据超载的平均费用',
 '平均漫游呼叫数',
 '当月使用分钟数与前三个月平均值的百分比变化',
 '当月费用与前三个月平均值的百分比变化',
 '平均掉线语音呼叫数',
 '平均丢弃数据呼叫数',
 '平均占线语音呼叫数',
 '平均占线数据调用次数',
 '平均未接语音呼叫数',
 '未应答数据呼叫的平均次数',
 '尝试拨打的平均语音呼叫次数',
 '尝试数据调用的平均数',
 '平均接听语音电话数',
 '平均完成的语音呼叫数',
 '完成数据调用的平均数',
 '平均客户服务电话次数',
 '使用客户服务电话的平均分钟数',
 '一分钟内的平均呼入电话数',
 '平均三通电话数',
 '已完成语音通话的平均使用分钟数',
 '平均呼入和呼出高峰语音呼叫数',
 '平均峰值数据调用次数',
 '使用高峰语音通话的平均不完整分钟数',
 '平均非高峰语音呼叫数',
 '非高峰数据呼叫的平均数量',
 '平均掉线或占线呼叫数',
 '平均尝试调用次数',
 '平均已完成呼叫数',
 '平均呼叫转移呼叫数',
 '平均呼叫等待呼叫数',
 '账户消费限额',
 '客户生命周期内的总通话次数',
 '客户生命周期内的总使用分钟数',
 '客户生命周期内的总费用',
 '计费调整后的总费用',
 '计费调整后的总分钟数',
 '计费调整后的呼叫总数',
 '客户生命周期内平均月费用',
 '客户生命周期内的平均每月使用分钟数',
 '客户整个生命周期内的平均每月通话次数',
 '过去三个月的平均每月使用分钟数',
 '过去三个月的平均每月通话次数',
 '过去三个月的平均月费用',
 '过去六个月的平均每月使用分钟数',
 '过去六个月的平均每月通话次数',
 '过去六个月的平均月费用']

In [227]:
# Split the stacked frame back apart: rows with a target value are training
# rows, rows whose target is missing are test rows. Both get a fresh index.
has_label = data['是否流失'].notnull()
train = data[has_label].reset_index(drop=True)
test = data[~has_label].reset_index(drop=True)

In [228]:
train

Unnamed: 0,客户ID,地理区域,是否双频,是否翻新机,当前手机价格,手机网络功能,婚姻状况,家庭成人人数,信息库匹配,预计收入,...,客户生命周期内平均月费用,客户生命周期内的平均每月使用分钟数,客户整个生命周期内的平均每月通话次数,过去三个月的平均每月使用分钟数,过去三个月的平均每月通话次数,过去三个月的平均月费用,过去六个月的平均每月使用分钟数,过去六个月的平均每月通话次数,过去六个月的平均月费用,是否流失
0,0,7,0,-1,181,0,2,0,0,3,...,24,286,91,351,121,23,303,101,25,0.0
1,1,13,1,0,1399,0,3,0,0,0,...,44,447,190,483,199,40,488,202,44,1.0
2,2,14,1,0,927,0,2,4,0,6,...,48,183,79,271,95,71,209,77,54,0.0
3,3,1,0,0,232,0,3,-1,1,-1,...,42,303,166,473,226,72,446,219,65,1.0
4,4,0,-1,0,699,0,1,2,0,3,...,36,119,24,88,15,35,106,21,37,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,149995,10,1,0,1350,0,3,0,0,0,...,156,474,160,239,80,74,346,122,83,1.0
149996,149996,6,1,0,542,0,3,-1,1,-1,...,52,968,208,1158,257,58,1307,261,57,0.0
149997,149997,15,1,0,1300,0,1,2,0,6,...,39,504,205,544,203,45,531,205,47,1.0
149998,149998,12,1,0,1399,0,4,1,0,-1,...,91,685,249,233,140,94,432,236,97,1.0


In [229]:
test

Unnamed: 0,客户ID,地理区域,是否双频,是否翻新机,当前手机价格,手机网络功能,婚姻状况,家庭成人人数,信息库匹配,预计收入,...,客户生命周期内平均月费用,客户生命周期内的平均每月使用分钟数,客户整个生命周期内的平均每月通话次数,过去三个月的平均每月使用分钟数,过去三个月的平均每月通话次数,过去三个月的平均月费用,过去六个月的平均每月使用分钟数,过去六个月的平均每月通话次数,过去六个月的平均月费用,是否流失
0,150000,15,1,0,909,0,3,-1,1,-1,...,57,253,104,68,39,41,141,59,40,
1,150001,8,1,0,1173,0,1,1,0,5,...,94,886,285,1036,305,92,1064,325,94,
2,150002,7,1,0,1049,0,2,0,0,2,...,69,513,195,673,221,78,584,212,79,
3,150003,-1,0,0,209,1,0,1,0,7,...,32,47,33,72,53,33,76,54,33,
4,150004,-1,0,0,209,2,-1,-1,1,-1,...,16,59,37,75,36,19,69,38,17,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,179995,4,0,0,173,1,2,2,0,2,...,49,309,95,457,163,97,377,136,73,
29996,179996,9,0,0,150,1,1,3,0,8,...,37,106,66,52,12,28,46,15,28,
29997,179997,6,1,0,1049,0,1,3,0,6,...,37,299,179,243,143,39,300,179,38,
29998,179998,16,0,0,209,1,-1,-1,1,-1,...,22,58,30,12,8,13,33,16,19,


In [230]:
# Feature matrices for fitting and for final prediction (same columns, same order).
x_train, x_test = train[features], test[features]

In [231]:
x_train

Unnamed: 0,地理区域,是否双频,是否翻新机,当前手机价格,手机网络功能,婚姻状况,家庭成人人数,信息库匹配,预计收入,信用卡指示器,...,计费调整后的呼叫总数,客户生命周期内平均月费用,客户生命周期内的平均每月使用分钟数,客户整个生命周期内的平均每月通话次数,过去三个月的平均每月使用分钟数,过去三个月的平均每月通话次数,过去三个月的平均月费用,过去六个月的平均每月使用分钟数,过去六个月的平均每月通话次数,过去六个月的平均月费用
0,7,0,-1,181,0,2,0,0,3,1,...,855,24,286,91,351,121,23,303,101,25
1,13,1,0,1399,0,3,0,0,0,1,...,1936,44,447,190,483,199,40,488,202,44
2,14,1,0,927,0,2,4,0,6,0,...,1963,48,183,79,271,95,71,209,77,54
3,1,0,0,232,0,3,-1,1,-1,1,...,3917,42,303,166,473,226,72,446,219,65
4,0,-1,0,699,0,1,2,0,3,0,...,248,36,119,24,88,15,35,106,21,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,10,1,0,1350,0,3,0,0,0,1,...,3605,156,474,160,239,80,74,346,122,83
149996,6,1,0,542,0,3,-1,1,-1,1,...,4171,52,968,208,1158,257,58,1307,261,57
149997,15,1,0,1300,0,1,2,0,6,0,...,8513,39,504,205,544,203,45,531,205,47
149998,12,1,0,1399,0,4,1,0,-1,0,...,3495,91,685,249,233,140,94,432,236,97


In [232]:
x_train.isnull().sum()

地理区域               0
是否双频               0
是否翻新机              0
当前手机价格             0
手机网络功能             0
                  ..
过去三个月的平均每月通话次数     0
过去三个月的平均月费用        0
过去六个月的平均每月使用分钟数    0
过去六个月的平均每月通话次数     0
过去六个月的平均月费用        0
Length: 67, dtype: int64

In [233]:
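# Pairwise feature combinations (disabled experiment).
# NOTE: the original note said "pairwise multiply", but the code below ADDS (+)
# each column to every later column.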
# a=pd.DataFrame()
# for i in range(x_train.shape[1]-1):
# 	a=pd.concat([a,pd.DataFrame(np.array(x_train.iloc[:,i])+
# 	np.array(x_train.iloc[:,(i+1):x_train.shape[1]]).T)],axis=0) 

# a=a.T
# a

In [234]:
# pca = PCA(n_components=10, whiten=True).fit(a) # reduce to 10 principal components (the original note said 2D)
# pca

In [235]:
# X_pca = pca.transform(a)
# X_pca

In [236]:
# X_pca

In [237]:
# len(X_pca[0])

In [238]:
x_test

Unnamed: 0,地理区域,是否双频,是否翻新机,当前手机价格,手机网络功能,婚姻状况,家庭成人人数,信息库匹配,预计收入,信用卡指示器,...,计费调整后的呼叫总数,客户生命周期内平均月费用,客户生命周期内的平均每月使用分钟数,客户整个生命周期内的平均每月通话次数,过去三个月的平均每月使用分钟数,过去三个月的平均每月通话次数,过去三个月的平均月费用,过去六个月的平均每月使用分钟数,过去六个月的平均每月通话次数,过去六个月的平均月费用
0,15,1,0,909,0,3,-1,1,-1,1,...,3563,57,253,104,68,39,41,141,59,40
1,8,1,0,1173,0,1,1,0,5,0,...,5328,94,886,285,1036,305,92,1064,325,94
2,7,1,0,1049,0,2,0,0,2,0,...,2392,69,513,195,673,221,78,584,212,79
3,-1,0,0,209,1,0,1,0,7,0,...,599,32,47,33,72,53,33,76,54,33
4,-1,0,0,209,2,-1,-1,1,-1,-1,...,967,16,59,37,75,36,19,69,38,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,4,0,0,173,1,2,2,0,2,0,...,2735,49,309,95,457,163,97,377,136,73
29996,9,0,0,150,1,1,3,0,8,0,...,2712,37,106,66,52,12,28,46,15,28
29997,6,1,0,1049,0,1,3,0,6,0,...,1075,37,299,179,243,143,39,300,179,38
29998,16,0,0,209,1,-1,-1,1,-1,-1,...,634,22,58,30,12,8,13,33,16,19


In [239]:
# Target vector: the '是否流失' column of the labelled training rows.
y_train = train['是否流失']
y_train

0         0.0
1         1.0
2         0.0
3         1.0
4         1.0
         ... 
149995    1.0
149996    0.0
149997    1.0
149998    1.0
149999    0.0
Name: 是否流失, Length: 150000, dtype: float64

# 3.构建模型

In [240]:
def _select_rows(obj, positions):
    """Return the rows of ``obj`` at integer ``positions``.

    pandas objects are indexed positionally via ``.iloc`` so that a
    non-default index cannot be mistaken for row positions; anything else
    (e.g. a NumPy array) falls back to plain ``obj[positions]``.
    """
    return obj.iloc[positions] if hasattr(obj, "iloc") else obj[positions]


def _fit_predict_lgb(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Train one LightGBM fold; return ``(val_pred, test_pred)``.

    ``clf`` is the ``lightgbm`` module itself.
    """
    train_matrix = clf.Dataset(trn_x, label=trn_y)
    valid_matrix = clf.Dataset(val_x, label=val_y)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'min_child_weight': 5,
        'num_leaves': 2 ** 5,
        'lambda_l2': 10,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 10,
        'learning_rate': 0.2,
        'seed': 2022,
        'n_jobs': -1
    }

    # NOTE(review): the `verbose_eval` / `early_stopping_rounds` keyword
    # arguments were replaced by callbacks in newer LightGBM releases —
    # confirm against the installed version before upgrading.
    model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix],
                      categorical_feature=[], verbose_eval=3000, early_stopping_rounds=200)
    val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    test_pred = model.predict(test_x, num_iteration=model.best_iteration)

    # Top-20 features by gain. Names come from the frame that was actually
    # fitted rather than from a notebook-level global.
    gain_by_feature = zip(trn_x.columns, model.feature_importance("gain"))
    print(sorted(gain_by_feature, key=lambda pair: pair[1], reverse=True)[:20])

    return val_pred, test_pred


def _fit_predict_xgb(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Train one XGBoost fold; return ``(val_pred, test_pred)``.

    ``clf`` is the ``xgboost`` module itself.
    """
    train_matrix = clf.DMatrix(trn_x, label=trn_y)
    valid_matrix = clf.DMatrix(val_x, label=val_y)
    test_matrix = clf.DMatrix(test_x)

    params = {'booster': 'gbtree',
              'objective': 'binary:logistic',
              'eval_metric': 'auc',
              'gamma': 1,
              'min_child_weight': 1.5,
              'max_depth': 5,
              'lambda': 10,
              'subsample': 0.7,
              'colsample_bytree': 0.7,
              'colsample_bylevel': 0.7,
              'eta': 0.2,
              'tree_method': 'exact',
              'seed': 2020,
              'nthread': 36,
              "silent": True,
              }

    watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

    # NOTE(review): `silent`, `ntree_limit` and `best_ntree_limit` are legacy
    # XGBoost names — confirm they exist in the installed version.
    model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist,
                      verbose_eval=3000, early_stopping_rounds=200)
    val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
    test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)

    return val_pred, test_pred


def _fit_predict_cat(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Train one CatBoost fold; return ``(val_pred, test_pred)``.

    ``clf`` is an estimator class that is instantiated here (the notebook
    passes ``CatBoostRegressor``).
    """
    params = {'learning_rate': 0.2, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
              'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}

    model = clf(iterations=20000, **params)
    model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
              cat_features=[], use_best_model=True, verbose=3000)

    val_pred = model.predict(val_x)
    test_pred = model.predict(test_x)

    return val_pred, test_pred


def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Run 5-fold cross-validation and return out-of-fold and test predictions.

    Parameters
    ----------
    clf : module or class
        The ``lightgbm`` module for ``"lgb"``, the ``xgboost`` module for
        ``"xgb"``, or an estimator class for ``"cat"``.
    train_x : pandas.DataFrame
        Training features (rows are selected with ``.iloc``).
    train_y : pandas.Series or array-like
        Binary target aligned row-for-row with ``train_x``.
    test_x : pandas.DataFrame
        Features to predict once per fold.
    clf_name : str
        One of ``"lgb"``, ``"xgb"``, ``"cat"``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``oof_pred``: for each training row, the prediction from the fold in
        which that row was held out. ``test_pred_avg``: the mean of the
        per-fold predictions on ``test_x``.

    Raises
    ------
    ValueError
        If ``clf_name`` is not a supported model name.
    """
    fold_fitters = {
        "lgb": _fit_predict_lgb,
        "xgb": _fit_predict_xgb,
        "cat": _fit_predict_cat,
    }
    if clf_name not in fold_fitters:
        # Previously an unknown name fell through every branch and failed
        # later with an unrelated NameError on `val_pred`.
        raise ValueError(
            "Unsupported clf_name %r; expected one of %s" % (clf_name, sorted(fold_fitters))
        )
    fit_predict = fold_fitters[clf_name]

    folds = 5
    seed = 2022
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    oof_pred = np.zeros(train_x.shape[0])
    test_pred_avg = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        # The split yields row positions, so the target must be selected
        # positionally as well.
        trn_y, val_y = _select_rows(train_y, train_index), _select_rows(train_y, valid_index)

        val_pred, test_pred = fit_predict(clf, trn_x, trn_y, val_x, val_y, test_x)

        oof_pred[valid_index] = val_pred
        # Accumulate so that the result is the average over ALL folds; plain
        # assignment here would keep only the last fold's contribution.
        test_pred_avg += test_pred / kf.n_splits

        print("val_y:", val_y)
        print("val_pred:", val_pred)

        cv_scores.append(roc_auc_score(val_y, val_pred))

        print(cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return oof_pred, test_pred_avg

In [241]:
def lgb_model(x_train, y_train, x_test):
    """Cross-validate LightGBM via ``cv_model``.

    Returns the pair that ``cv_model`` produces for the ``"lgb"`` branch:
    predictions for the training rows and predictions for ``x_test``.
    """
    return cv_model(lgb, x_train, y_train, x_test, "lgb")

In [242]:
def xgb_model(x_train, y_train, x_test):
    """Cross-validate XGBoost via ``cv_model``.

    Returns the pair that ``cv_model`` produces for the ``"xgb"`` branch:
    predictions for the training rows and predictions for ``x_test``.
    """
    return cv_model(xgb, x_train, y_train, x_test, "xgb")

In [243]:
def cat_model(x_train, y_train, x_test):
    """Cross-validate CatBoost (``CatBoostRegressor``) via ``cv_model``.

    Returns the pair that ``cv_model`` produces for the ``"cat"`` branch:
    predictions for the training rows and predictions for ``x_test``.
    """
    return cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")

In [244]:
# a=pd.DataFrame(X_pca)
# a

In [245]:
# Fit LightGBM with cross-validation: `lgb_train` receives the predictions for
# the training rows, `lgb_test` the predictions for `x_test`.
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)

************************************ 1 ************************************
[LightGBM] [Info] Number of positive: 60072, number of negative: 59928
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10515
[LightGBM] [Info] Number of data points in the train set: 120000, number of used features: 67
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500600 -> initscore=0.002400
[LightGBM] [Info] Start training from score 0.002400
Training until validation scores don't improve for 200 rounds


KeyboardInterrupt: 

In [None]:
# Fill the target column of the test frame with the LightGBM test predictions
# (this overwrites the column on `test` in place), then export only the
# customer ID and the predicted value, without the index, to data/lgb.csv.
# NOTE(review): requires `lgb_test` from the LightGBM run cell; that cell's saved
# output ends in KeyboardInterrupt, so re-run it to completion first.
test['是否流失'] = lgb_test
test[['客户ID', '是否流失']].to_csv('data/lgb.csv', index=False)