In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

In [2]:
sms_df = pd.read_csv('sms_df_all.csv')
voice_df = pd.read_csv('voice_df_all.csv')
all_data = pd.merge(sms_df,voice_df,how='outer',on='uid')
label = pd.read_csv('../uid_train.txt',delimiter='\t')
label.columns = ['uid','label']
all_data = pd.merge(all_data,label,on='uid',how='left')
all_data = all_data.sort_values('uid')

In [3]:
train_size = all_data[~all_data.label.isnull()].shape[0]

In [4]:
fea_col = [fea for fea in all_data.columns if fea not in ['uid','label']]

In [5]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(all_data[fea_col].values[:train_size],all_data['label'].values[:train_size],test_size=0.2)

In [6]:
param = {
    'objective':'binary',
    'num_leaves':16,
    'scale_pos_weight':5,
#     'learning_rate':0.08,
    'subsample':0.5,
    'colsample_bytree':0.5
    
}
re_re = []
re = []
from sklearn.model_selection import StratifiedKFold
for tr,va in StratifiedKFold(n_splits=10,random_state=2018).split(X_train,y_train):
    d_train = lgb.Dataset(X_train[tr],y_train[tr])
    d_test = lgb.Dataset(X_train[va],y_train[va])
    model = lgb.train(param,d_train,valid_sets=[d_train,d_test],num_boost_round=300,verbose_eval=150)
    re.append(model.predict(X_test))
    re_re.append(model.predict(all_data[fea_col].values[train_size:]))

[150]	training's binary_logloss: 0.0498705	valid_1's binary_logloss: 0.236459
[300]	training's binary_logloss: 0.0103616	valid_1's binary_logloss: 0.258572
[150]	training's binary_logloss: 0.0515886	valid_1's binary_logloss: 0.212702
[300]	training's binary_logloss: 0.0108445	valid_1's binary_logloss: 0.23535
[150]	training's binary_logloss: 0.0490382	valid_1's binary_logloss: 0.252424
[300]	training's binary_logloss: 0.0101768	valid_1's binary_logloss: 0.282687
[150]	training's binary_logloss: 0.0517521	valid_1's binary_logloss: 0.224594
[300]	training's binary_logloss: 0.0109286	valid_1's binary_logloss: 0.238454
[150]	training's binary_logloss: 0.050594	valid_1's binary_logloss: 0.23328
[300]	training's binary_logloss: 0.0102528	valid_1's binary_logloss: 0.281501
[150]	training's binary_logloss: 0.0496237	valid_1's binary_logloss: 0.240752
[300]	training's binary_logloss: 0.0102392	valid_1's binary_logloss: 0.276749
[150]	training's binary_logloss: 0.0496264	valid_1's binary_logloss

In [7]:
re_test = np.sum(re,axis=0)/10
re_avg = np.sum(re_re,axis=0)/10
from sklearn.metrics import log_loss,f1_score,roc_auc_score

In [8]:
test_bin = np.zeros(len(re_test))
for i,v in enumerate(re_test):
    if v >0.6:
        test_bin[i]=1

In [9]:
print('log_loss',log_loss(y_test,re_test))
print('f1_score',f1_score(y_test,test_bin))
print('auc',roc_auc_score(y_test,re_test))
print('auc_bin',roc_auc_score(y_test,test_bin))

log_loss 0.2543781834786565
f1_score 0.7
auc 0.941205224604812
auc_bin 0.8238744599636261


In [10]:
re_df = pd.DataFrame({'uid':all_data['uid'].values[train_size:],'re':re_avg})
re_df = re_df.sort_values('re',ascending=False)
re_df['label'] = re_df['re'].map(lambda x:1 if x>0.6 else 0)
re_df[['uid','label']].to_csv('re.csv',index=False)