In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model

In [None]:
# utils
## chinese decode 
def cn(s):
    """Return the (Chinese) column label ``s`` unchanged.

    This is a pass-through hook wrapped around every Chinese column name in
    the notebook. An earlier variant decoded the label with
    ``s.decode('utf8')``; that step is disabled, so the argument comes back
    exactly as it was given.
    """
    return s
## Convert string values into integer category codes
# Shared registries, in the spirit of a URL shortener: every distinct value
# seen so far is recorded once together with the integer code assigned to it.
dict_short_url = {'init': 0}        # value -> integer code ('init' holds code 0)
dict_short_url_index = {0: 'init'}  # integer code -> value (reverse lookup)
def getShortUrl(s):
    """Return the integer code for ``s``, registering it on first sight.

    Parameters
    ----------
    s : hashable
        The raw category value (used as a dictionary key).

    Returns
    -------
    int
        The code already stored for ``s``, or a freshly assigned one. New
        codes are handed out consecutively (1, 2, 3, ...) in first-seen order.

    Side effects
    ------------
    Adds the new value to both ``dict_short_url`` and
    ``dict_short_url_index``. The registry is shared by every column and by
    both the train and the test frame, so codes depend on call order.
    """
    # `in` works on both Python 2 and 3 (dict.has_key was removed in Python 3).
    if s not in dict_short_url:
        # Codes are dense (0..n-1) as long as only this function writes to the
        # registry, so len() equals max(values) + 1 without rescanning every
        # stored value on each insertion.
        code = len(dict_short_url)
        dict_short_url[s] = code
        dict_short_url_index[code] = s
    return dict_short_url[s]

In [None]:
# Read the raw data.
# trainData: raw training features joined with their labels on the user-id
#            column ('用户标识'); pandas' default inner join is used.
# testData:  raw test features.
# (The original note says "X_train means cross_validation data"; no X_train is
#  defined in this part of the notebook.)
trainData = pd.read_csv('/data/topic1/ensemble_train_x.csv').merge(
    pd.read_csv('/data/topic1/ensemble_train_y.csv'),
    on='用户标识',
)
testData = pd.read_csv('/data/topic1/ensemble_test_x.csv')

In [None]:
# Load the external (pre-computed) feature files.
# Every file is joined onto both the training and the test frame on the
# user-id column. Each CSV is read once and reused for both joins.
# NOTE(review): pd.merge defaults to an inner join, so a user missing from any
# feature file is dropped from the frame (and hence from the submission) --
# confirm that every test user appears in every feature file.
feature_files = [
    '/data/topic1/feature_countapppv.csv',
    '/data/topic1/feature_countapp.csv',
    '/data/topic1/feature_countoutprovince.csv',
    '/data/topic1/feature_countwebpv.csv',
    # '/data/topic1/feature_countweb.csv',  # disabled in the original notebook
]
for feature_path in feature_files:
    feature_frame = pd.read_csv(feature_path)
    trainData = pd.merge(trainData, feature_frame, on='用户标识')
    testData = pd.merge(testData, feature_frame, on='用户标识')
# Parenthesised form prints the same text on Python 2 and is valid on Python 3.
print('march_external_feature')

In [None]:
# Internal features.
# Integer-encode the categorical text columns with getShortUrl, then drop the
# raw text columns (including the two roaming-province columns, which are
# dropped without being encoded).
# Order matters: getShortUrl assigns codes in first-seen order from a registry
# shared by all columns, so the training frame is encoded first, column by
# column, and the test frame second.
encoded_columns = [
    # (new integer column, raw source column)
    ('phone_brand', cn('手机品牌')),
    ('phone_model_number', cn('手机终端型号')),
    ('if_cross_province', cn('是否有跨省行为')),
    ('if_cross_conuntry', cn('是否有出境行为')),  # (sic) spelling is relied on by the categorical feature list
]
raw_columns_to_drop = [cn('手机品牌'), cn('是否有跨省行为'), cn('是否有出境行为'),
                       cn('漫入省份'), cn('漫出省份'), cn('手机终端型号')]
for frame in (trainData, testData):
    for new_name, source_name in encoded_columns:
        frame[new_name] = frame[source_name].apply(getShortUrl)
for frame in (trainData, testData):
    frame.drop(raw_columns_to_drop, axis=1, inplace=True)
# Parenthesised form prints the same text on Python 2 and is valid on Python 3.
print('march_inner_feature')

In [None]:
# Missing-value handling: replace every NaN with 0, in place, in both frames.
for frame in (trainData, testData):
    frame.fillna(0, inplace=True)

In [None]:
# One-hot encoding of the categorical variables (marked "not done" by the
# author): the encoder is fitted and the transformed matrix is only shown as
# the cell output. It is not assigned, and the models below are trained on the
# integer-coded columns instead.
categorical_features = [
    cn('性别'),
    cn('大致消费水平'),
    'phone_brand',
    'if_cross_province',
    'phone_model_number',
    'if_cross_conuntry',
]
enc = OneHotEncoder()
enc.fit_transform(trainData[categorical_features])

## gbdt生成交叉特征

In [None]:
# The GBDT and the downstream learner are fitted on the same rows (no separate
# hold-out for the feature generator), which the author notes can overfit.
# Model inputs are every column except the user id and the label.
gbdt_inputs = trainData.drop([cn('用户标识'), cn('是否去过迪士尼')], axis=1)
# subsample < 1 and max_features='sqrt' make the fit stochastic. Seed it so the
# leaf features, and every submission built from them, are reproducible across
# Restart & Run All.
grd = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=8,
                                 max_features='sqrt', subsample=0.9, random_state=42)
grd.fit(gbdt_inputs, trainData[cn('是否去过迪士尼')])
# One-hot encoder over the leaf index that each sample reaches in each tree.
# grd.apply(...) is indexed [:, :, 0] to obtain a 2-D (sample, tree) array of
# leaf indices.
grd_enc = OneHotEncoder()
grd_enc.fit(grd.apply(gbdt_inputs)[:, :, 0])

## 分类器训练

### lgb做分类器

In [None]:
# Use the fitted GBDT as a feature generator: map every row to the leaf it
# reaches in each tree, one-hot encode those leaf indices, and train LightGBM
# on the encoded matrix.
label_col = cn('是否去过迪士尼')
id_col = cn('用户标识')
train_leaves = grd_enc.transform(grd.apply(trainData.drop([id_col, label_col], axis=1))[:, :, 0])
train_data = lgb.Dataset(train_leaves, label=trainData[label_col])
# NOTE(review): no 'objective' is set, so LightGBM's default objective applies
# (regression, per its documentation) -- confirm this is intended; a variant
# with 'objective': 'binary' was tried and left disabled by the author.
# NOTE(review): 'num_trees' is an alias for the number of boosting iterations
# and disagrees with num_boost_round=100 below; which one takes effect may
# depend on the LightGBM version -- confirm.
param = {'num_leaves':63, 'num_trees':1000, 'metric':'auc'}
bst = lgb.train(param, train_data, num_boost_round=100)
test_leaves = grd_enc.transform(grd.apply(testData.drop([id_col], axis=1))[:, :, 0])
ypred = bst.predict(test_leaves)

In [None]:
# Write the result CSV for the GBDT + LightGBM model: one row per test user,
# with the user id under 'IMEI' and the prediction under 'SCORE' (5 decimals).
submission = {'IMEI': testData[cn('用户标识')].values, 'SCORE': ypred}
output = pd.DataFrame(submission)
output.to_csv(
    '/data/topic1/ensemble_result_gbdt_lgb.csv',
    index=False,
    float_format='%.5f',
    encoding="gb2312",
)

### xgb做分类器

In [None]:
import xgboost as xgb
# XGBoost on the one-hot GBDT leaf features.
# The original call passed `objective` twice ('reg:linear' and
# 'rank:pairwise'). A repeated keyword argument is a SyntaxError, so the cell
# could not run at all. Only one value is kept.
# NOTE(review): 'rank:pairwise' is kept on the assumption that the later line
# was the intended override of the 'reg:linear' template value -- confirm.
xlf = xgb.XGBRegressor(max_depth=8,
                        learning_rate=0.1,
                        n_estimators=100,
                        silent=False,
                        objective='rank:pairwise',
                        nthread=-1,
                        gamma=0,
                        min_child_weight=1,
                        max_delta_step=0,
                        subsample=0.9,
                        colsample_bytree=0.7,
                        colsample_bylevel=1,
                        reg_alpha=0,
                        reg_lambda=1,
                        scale_pos_weight=1,
                        missing=None)
# Model inputs: one-hot encoded GBDT leaf indices of every training row
# (all columns except the user id and the label go through the GBDT).
xgb_train_leaves = grd_enc.transform(grd.apply(trainData.drop([cn('用户标识'),cn('是否去过迪士尼')],axis=1))[:, :, 0])
xlf.fit(xgb_train_leaves, trainData[cn('是否去过迪士尼')], eval_metric='auc', verbose = True)
# NOTE(review): with a ranking objective the predictions are presumably
# ranking scores rather than probabilities -- confirm that is acceptable for
# the 'SCORE' column written afterwards.
xgb_test_leaves = grd_enc.transform(grd.apply(testData.drop([cn('用户标识')],axis=1))[:, :, 0])
ypred = xlf.predict(xgb_test_leaves)

In [None]:
# Write the result CSV for the GBDT + XGBoost model: one row per test user,
# with the user id under 'IMEI' and the prediction under 'SCORE' (5 decimals).
submission = {'IMEI': testData[cn('用户标识')].values, 'SCORE': ypred}
output = pd.DataFrame(submission)
output.to_csv(
    '/data/topic1/ensemble_result_gbdt_xgb.csv',
    index=False,
    float_format='%.5f',
    encoding="gb2312",
)

### lr做分类器

In [None]:
# Logistic regression (L2 penalty, regularisation strength chosen by 3-fold
# cross-validation over 10 candidate values) on the one-hot GBDT leaf features.
label_col = cn('是否去过迪士尼')
id_col = cn('用户标识')
grd_lm = linear_model.LogisticRegressionCV(Cs=10, penalty='l2', tol=1e-6, n_jobs=-1, cv=3)
lr_train_leaves = grd_enc.transform(grd.apply(trainData.drop([id_col, label_col], axis=1))[:, :, 0])
grd_lm.fit(lr_train_leaves, trainData[label_col])
lr_test_leaves = grd_enc.transform(grd.apply(testData.drop([id_col], axis=1))[:, :, 0])
# Column 1 of predict_proba is kept as the score (the second entry of
# grd_lm.classes_).
ypred = grd_lm.predict_proba(lr_test_leaves)[:, 1]

In [None]:
# Write the result CSV for the GBDT + logistic-regression model: one row per
# test user, with the user id under 'IMEI' and the prediction under 'SCORE'
# (5 decimals).
submission = {'IMEI': testData[cn('用户标识')].values, 'SCORE': ypred}
output = pd.DataFrame(submission)
output.to_csv(
    '/data/topic1/ensemble_result_gbdt_lr.csv',
    index=False,
    float_format='%.5f',
    encoding="gb2312",
)