In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score

In [None]:
# utils
## chinese decode 
def cn(s):
    """Return ``s`` unchanged.

    Hook for decoding Chinese column names. The UTF-8 decoding step
    (``s.decode('utf8')``) is disabled, so this is currently an identity
    function.
    """
    return s
# Shared label encoder: converts arbitrary hashable values (strings in practice)
# into integer category codes. One table is shared by every caller, so a code
# identifies a value globally rather than per column.
dict_short_url = {'init': 0}        # value -> integer code ('init' reserves code 0)
dict_short_url_index = {0: 'init'}  # integer code -> value (reverse lookup)


def getShortUrl(s):
    """Return the integer code for ``s``, registering it on first sight.

    Codes are handed out in first-seen order (1, 2, 3, ...). Both
    ``dict_short_url`` and ``dict_short_url_index`` are updated when a new
    value is registered.

    Parameters
    ----------
    s : hashable
        Value to encode.

    Returns
    -------
    int
        The code assigned to ``s``.
    """
    # `in` works on Python 2 and 3; dict.has_key() exists only on Python 2.
    if s not in dict_short_url:
        # Codes are contiguous from 0, so the next free code equals the table
        # size. This replaces max(values()) + 1, which rescanned the whole
        # table for every new value. Assumes the tables are only ever modified
        # through this function.
        code = len(dict_short_url)
        dict_short_url[s] = code
        dict_short_url_index[code] = s
    return dict_short_url[s]

In [None]:
# Load the raw training data.
# `trainData` is the raw merged table (features from train_x.csv joined with the
# labels from train_y.csv on the user-id column '用户标识'); X_train / X_test are
# the hold-out split derived from it in a later cell.
# Earlier Windows / GB2312 variant, kept for reference:
# trainData = pd.merge(pd.read_csv(cn('E:\\work\\联通+旅游\\data\\1\\train_x.csv'),encoding="gb2312"), pd.read_csv(cn('E:\\work\\联通+旅游\\data\\1\\train_y.csv'),encoding="gb2312"), on=cn('用户标识'))
trainData = pd.merge(
    pd.read_csv('/data/topic1/train_x.csv'),
    pd.read_csv('/data/topic1/train_y.csv'),
    on='用户标识',
)
# Progress marker; the parenthesised form prints the same text on Python 2 and 3.
print('march_load_data')

In [None]:
# Join the externally engineered per-user feature files (app click counts and
# others) onto trainData, one file at a time, keyed on the user-id column '用户标识'.
# pd.merge defaults to an inner join, so a user missing from any one file is
# dropped from trainData.
FEATURE_DIR = '/data/topic1/'

# Merge order is preserved because it determines the column order of trainData.
# NOTE: the five feature_countgroup* files are referenced without a '.csv'
# extension, exactly as in the original paths.
feature_files = [
    'feature_countapppv.csv',
    'feature_countapp.csv',
    'feature_countoutprovince.csv',
    'feature_countwebpv.csv',
    'feature_countweb.csv',
    'feature_countgroupticket',
    'feature_countgrouptraffic',
    'feature_countgroupfff',
    'feature_countgrouphotel',
    'feature_countgroupweather',
    'feature_cluster.csv',
    'feature_outprovince.csv',
    'feature_topweb.csv',
    'feature_webcluster.csv',
    'feature_maxgroup.csv',
    # Disabled feature files, kept for reference:
    # 'feature_groupticket',
    # 'feature_grouptraffic',
    # 'feature_groupfff',
    # 'feature_grouphotel',
    # 'feature_groupweather',
    # 'feature_hashweb.csv',
    # 'feature_hashapp.csv',
]

for feature_file in feature_files:
    trainData = pd.merge(trainData, pd.read_csv(FEATURE_DIR + feature_file), on='用户标识')

# Progress marker; the parenthesised form prints the same text on Python 2 and 3.
print('march_external_feature')

In [None]:
# 去掉app特征
# trainData.drop(trainData.columns[8:308], axis=1, inplace=True)

In [None]:
# # app二值化
# for i in trainData.columns[8:308]:
#     print i
#     trainData['_' + i] = trainData[i].apply(lambda x: int(x>0))

In [None]:
# # apppv / count_app_pv，得到用户对app的偏好(舍去)
# for i in trainData.columns[8:308]:
#     print i
#     trainData[i] = 1000 * trainData[i] / (trainData['count_app_pv']+1)#避免除以0的情况 #*1000是为了减少决策树分箱颗粒过大的问题

In [None]:
# # tfidf
# ## 获取每个词的idf
# dict_idf = {}
# for i in trainData.columns[8:308]:
#     idf = 1 / float(len(trainData[trainData[i]>0])+1)
#     dict_idf[i] = idf
#     print i + ':' + str(idf)
# for i in trainData.columns[8:308]:
#     print i
#     trainData[i] = 1000 * trainData[i] * dict_idf[i]

In [None]:
# Integer-encode the string-valued columns with the shared getShortUrl table.
# Each entry is (new column name, source column). The order is preserved because
# getShortUrl hands out codes in first-seen order across all columns.
encoded_columns = [
    ('phone_brand', cn('手机品牌')),
    ('phone_model_number', cn('手机终端型号')),
    ('if_cross_province', cn('是否有跨省行为')),
    # (sic) the misspelt name is referenced by the categorical-feature list later on.
    ('if_cross_conuntry', cn('是否有出境行为')),
    ('topweb', 'topweb'),  # encoded in place
]
for new_col, src_col in encoded_columns:
    trainData[new_col] = trainData[src_col].apply(getShortUrl)

# Drop the original string columns now that encoded copies exist. The roam-in /
# roam-out province columns ('漫入省份', '漫出省份') are discarded without being encoded.
trainData = trainData.drop(
    [cn('手机品牌'), cn('是否有跨省行为'), cn('是否有出境行为'),
     cn('漫入省份'), cn('漫出省份'), cn('手机终端型号')],
    axis=1,
)
# Progress marker; the parenthesised form prints the same text on Python 2 and 3.
print('march_inner_feature')

In [None]:
# Hold out 30% of the rows for evaluation. Features are every column except the
# user id ('用户标识') and the label ('是否去过迪士尼').
# random_state is fixed so the split, and therefore every AUC reported below, is
# reproducible across kernel restarts; the original split changed on every run.
X_train, X_test, y_train, y_test = train_test_split(
    trainData.drop([cn('用户标识'), cn('是否去过迪士尼')], axis=1),
    trainData[cn('是否去过迪士尼')],
    test_size=0.3,
    random_state=42,
)

## 一阶特征，lgb做分类器

In [None]:
# Rename the columns: LightGBM does not accept Chinese feature names, so every
# column name (Chinese or not) is replaced by the string form of its getShortUrl
# code. The original names stay recoverable through dict_short_url_index.
dict_new_columns = {col: str(getShortUrl(col)) for col in X_train.columns}
for frame in (X_train, X_test):
    frame.rename(columns=dict_new_columns, inplace=True)

### lgb

In [None]:
# Declare the categorical features by their original column names, then translate
# each name into the numeric-string column name assigned during the rename step
# (looked up in dict_short_url).
temp_categorical_feature = [
    cn('性别'), cn('大致消费水平'),
    'phone_brand', 'phone_model_number', 'if_cross_province', 'if_cross_conuntry',
    'hainan', 'shanghai', 'neimenggu', 'xizang', 'gansu', 'henan',
    'cluster', 'webcluster', 'topweb', 'maxgroup',
]
list_categorical_feature = []
for name in temp_categorical_feature:
    list_categorical_feature.append(str(dict_short_url[name]))

In [None]:
# Train the LightGBM binary classifier on the hold-out split.
train_data = lgb.Dataset(X_train.values, label=y_train, feature_name=list(X_train.columns),
                         categorical_feature=list_categorical_feature)
# The validation set references the training set so both use the same feature
# binning, as the LightGBM Dataset documentation recommends for validation data.
test_data = lgb.Dataset(X_test.values, label=y_test, feature_name=list(X_test.columns),
                        categorical_feature=list_categorical_feature, reference=train_data)

# Each setting is given once, under its primary LightGBM name. The original dict
# also carried aliases of the same settings:
#   * 'application' duplicated 'objective' (both 'binary');
#   * 'subsample': 0.9 is an alias of 'bagging_fraction', which was also set to 1.
#     Per LightGBM's alias handling the primary name takes precedence, so the 0.9
#     was ignored. bagging_fraction=1 is kept to preserve the effective behaviour,
#     which means no row subsampling despite bagging_freq=1.
#     NOTE(review): confirm whether 0.9 was the intended value.
#   * 'num_trees': 1000 is an alias of the iteration count and overrode the
#     num_boost_round=800 argument; the effective 1000 is now passed explicitly.
param = {
    'objective': 'binary',
    'boosting': 'dart',
    'metric': 'auc',
    'num_leaves': 127,
    'max_depth': -1,
    'min_child_samples': 400,
    'max_bin': 250,
    'min_data_in_bin': 100,
    'feature_fraction': 1,
    'bagging_fraction': 1,
    'bagging_freq': 1,
    'lambda_l1': 1.5,
    'lambda_l2': 20,
    'learning_rate': 0.05,
}
num_boost_round = 1000

# Alternative settings tried earlier, kept for reference:
# param = {'application': 'binary', 'boosting': 'gbdt', 'num_leaves':155, 'num_trees':2000, 'objective':'binary', 'min_child_samples':400, 'max_bin': 250, 'max_depth':-1,
#          'feature_fraction':1, 'metric':'auc','subsample':0.9, 'lambda_l1':1.5, 'lambda_l2':20, 'min_data_in_bin':100,
#         'bagging_fraction':1, 'bagging_freq':10, 'learning_rate':0.05}
# param = {'application': 'binary', 'boosting': 'gbdt', 'num_leaves':127, 'num_trees':1000, 'objective':'binary', 'min_child_samples':400, 'max_bin': 250, 'max_depth':-1,
#          'feature_fraction':1, 'metric':'auc','subsample':0.9, 'lambda_l1':1.5, 'lambda_l2':20, 'min_data_in_bin':100,
#         'bagging_fraction':1, 'bagging_freq':1, 'learning_rate':0.025,'is_unbalance':True}
# bst=lgb.cv(param,train_data,num_boost_round=350,nfold=5)
estimators = lgb.train(param, train_data, valid_sets=[test_data], num_boost_round=num_boost_round)
# print len(bst['auc-mean'])
# Progress marker; the parenthesised form prints the same text on Python 2 and 3.
print('march_training_done')

### xgb

In [None]:
import xgboost as xgb

# XGBoost model trained on the same split.
# NOTE: this cell rebinds train_data, test_data and param, which the LightGBM
# cell above also defines.
train_data = xgb.DMatrix(X_train.values, label=y_train)
test_data = xgb.DMatrix(X_test.values, label=y_test)
# xgb.train uses the LAST entry of `evals` for early stopping, so the held-out
# set must come last. The original order ended with the training set, which made
# early stopping track training AUC.
watch_list = [(train_data, 'train'), (test_data, 'eval')]
# 'max_leaf_nodes' was removed: it appeared twice in the dict literal (127, then 1,
# so only 1 survived) and is not an XGBoost parameter name (XGBoost's is 'max_leaves').
# NOTE(review): 'rank:pairwise' yields ranking scores rather than probabilities.
# That is fine for AUC, but not for the 0.5 threshold applied to ypred in the
# classification-report cell. Confirm the intended objective.
param = {
    'objective': 'rank:pairwise',
    'eval_metric': 'auc',
    'max_depth': 8,
    'eta': 0.3,
    'subsample': 0.9,
    'min_child_weight': 1,
    'alpha': 0,
    'lambda': 1,
    'gamma': 0,
    'scale_pos_weight': 0.1,
    'silent': 1,
}
# NOTE(review): early_stopping_rounds equals num_boost_round, so early stopping
# can never cut training short. Lower it if early stopping is actually wanted.
bst = xgb.train(param, train_data, evals=watch_list, num_boost_round=100, early_stopping_rounds=100)
ypred = bst.predict(test_data)

## 评估AUC

In [None]:
# Classification report (precision / recall / F1) with `ypred` cut at 0.5.
# NOTE(review): `ypred` is whatever the previously executed cell left behind. Run
# top to bottom, that is the XGBoost output trained with objective 'rank:pairwise',
# which is a ranking score rather than a probability, so the 0.5 threshold may not
# be meaningful. Confirm.
from sklearn import metrics
# The parenthesised form prints the same text on Python 2 and 3.
print(metrics.classification_report(y_test, ypred > 0.5))

In [None]:
# AUC of the LightGBM model (`estimators`) on the held-out split.
# Rebinds `ypred`, replacing the XGBoost predictions.
ypred = estimators.predict(X_test.values)
auc = roc_auc_score(y_test, ypred)
# %-formatting keeps the output 'AUC: <value>' on both Python 2 and 3
# (print('AUC:', auc) would print a tuple on Python 2).
print('AUC: %s' % auc)

In [None]:
# AUC of the LightGBM model on the training split, for comparison with the
# held-out AUC. Rebinds `ypred` again.
ypred = estimators.predict(X_train.values)
# The parenthesised form prints the same text on Python 2 and 3.
print(roc_auc_score(y_train, ypred))

In [None]:
# importance = pd.DataFrame({'feature':trainData.drop([cn('用户标识'),cn('是否去过迪士尼')],axis=1).columns, 'importance':bst.feature_importance()})
# importance = importance.sort_values(by='importance', ascending=False)