In [None]:
import pickle
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook
import time
import gc
import numpy as np
import lightgbm as lgb
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import torch
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
import torch.nn.functional as F
import sklearn
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import roc_curve 
import time
import os
import itertools
import random
import matplotlib.pyplot as plt
from collections import OrderedDict
from scipy.special import erfinv
from collections import OrderedDict
from math import sqrt
import numpy as np
from torch.optim import lr_scheduler
from sklearn.ensemble import GradientBoostingRegressor
import catboost as cbt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [None]:
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder as Encoder
from category_encoders.sum_coding import SumEncoder
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders import CountEncoder
from category_encoders.one_hot import OneHotEncoder

In [None]:
def seed_everything(seed=42):
    """Seed the random number generators used by this notebook.

    Covers Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus the current CUDA device), exports ``PYTHONHASHSEED`` and
    switches cuDNN to deterministic mode.

    Parameters
    ----------
    seed : int, default 42
        Value handed to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [None]:
def reduce_mem(df):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds them.

    The frame is modified in place and also returned. Columns whose dtype is
    not listed in ``numerics`` (including ``int8`` and unsigned integers) and
    columns whose min or max is null are left untouched.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    range contains the column's min and max; the limits themselves are
    included, so a column with max 127 becomes int8.

    Float columns are cast to float16 when the range allows, otherwise
    float32, otherwise float64. Only the value *range* is checked, so the
    float16/float32 casts can lose precision.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isnull(c_min) or pd.isnull(c_max):
            continue
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: values equal to a type's limits still fit that type.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a frame that reports zero bytes.
    reduction = 100*(start_mem-end_mem)/start_mem if start_mem > 0 else 0.0
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,
                                                                                                           reduction,
                                                                                                           (time.time()-starttime)/60))
    return df

In [None]:

def lower_sample_data_by_sample(df,percent=1,rs=42):
    """Down-sample the majority class (``label == 0``) relative to the minority class.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``label`` column holding 0 (majority) and 1 (minority).
    percent : float, default 1
        Number of majority rows to keep per minority row.
    rs : int, default 42
        ``random_state`` forwarded to ``DataFrame.sample``.

    Returns
    -------
    pandas.DataFrame
        Sampled majority rows followed by all minority rows; the original
        index labels are kept.
    """
    most_data = df[df['label'] == 0]  # majority-class rows
    minority_data = df[df['label'] == 1]  # minority-class rows
    # Sampling without replacement cannot return more rows than exist, so cap
    # the request instead of letting ``sample`` raise ValueError.
    n_keep = min(int(percent*len(minority_data)), len(most_data))
    lower_data = most_data.sample(n=n_keep,replace=False,random_state=rs,axis=0)
    return (pd.concat([lower_data,minority_data]))

In [None]:
def get_mask_train(df,samp):
  """Return -1 with probability ``samp``; otherwise return ``df`` unchanged.

  ``df`` is a single value here (the function is applied element-wise), and
  one draw from the global ``random`` generator is consumed per call.
  """
  return -1 if random.random() < samp else df

In [None]:
#--------------------------------------------------Data preprocessing--------------------------------------------------#

In [None]:
# Full list of raw input columns, in file order.
columns = [
    'uid',
    'task_id',
    'adv_id',
    'creat_type_cd',
    'adv_prim_id',
    'dev_id',
    'inter_type_cd',
    'slot_id',
    'spread_app_id',
    'tags',
    'app_first_class',
    'app_second_class',
    'age',
    'city',
    'city_rank',
    'device_name',
    'device_size',
    'career',
    'gender',
    'net_type',
    'residence',
    'his_app_size',
    'his_on_shelf_time',
    'app_score',
    'emui_dev',
    'list_time',
    'device_price',
    'up_life_duration',
    'up_membership_grade',
    'membership_life_duration',
    'consume_purchase',
    'communication_onlinerate',
    'communication_avgonline_30d',
    'indu_name',
    'pt_d',
]

In [None]:
# Load the datasets ('|'-separated CSV files).
# Only the training frame is passed through reduce_mem; the test frame keeps
# the dtypes pandas infers on read.
train_df = reduce_mem(pd.read_csv('train_data.csv',sep='|'))

test_df = pd.read_csv('test_data_B.csv',sep='|')

In [None]:
def _grouped_documents(frame, key1, key2):
    """Group ``key2`` by ``key1`` and render every group as one space-joined string."""
    grouped = pd.DataFrame(frame[[key1, key2]].groupby([key1])[key2].apply(list))
    grouped.reset_index(inplace=True)
    documents = [' '.join(str(word) for word in seq) for seq in grouped[key2].values.tolist()]
    return grouped, documents


def get_tfidf(train,test,key1,key2,token_pattern=r'(?u)\S+'):
    """Summarise, per ``key1`` group, the TF-IDF weights of its ``key2`` values.

    Each distinct ``key1`` becomes one document made of the ``key2`` values
    seen with it. The vectorizer is fitted on the train documents and then
    applied to the test documents.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Both must contain the columns ``key1`` and ``key2``.
    key1 : str
        Grouping column.
    key2 : str
        Column whose values are the "words" of each document.
    token_pattern : str, default r'(?u)\\S+'
        Regular expression forwarded to ``TfidfVectorizer``. The default treats
        every whitespace-separated value as one token. scikit-learn's own
        default (``r'(?u)\\b\\w\\w+\\b'``) drops one-character ids and strips
        the sign from values such as ``-1``; pass that pattern to reproduce the
        earlier behaviour.

    Returns
    -------
    tuple
        ``(train_tif, test_tif, train_argmax, train_max, train_mean, train_std,
        test_argmax, test_max, test_mean, test_std)``. The two frames hold
        ``key1`` plus the grouped ``key2`` lists; each array has one entry per
        row of the matching frame.
    """
    train_tif, train_key2_list = _grouped_documents(train, key1, key2)

    tfidf_vec = TfidfVectorizer(token_pattern=token_pattern)
    # NOTE(review): the matrix is densified; memory grows with
    # (number of groups) x (vocabulary size).
    train_tfidf_matrix = tfidf_vec.fit_transform(train_key2_list).toarray()

    test_tif, test_key2_list = _grouped_documents(test, key1, key2)
    test_tfidf_matrix = tfidf_vec.transform(test_key2_list).toarray()
    assert train_tfidf_matrix.shape[1]==test_tfidf_matrix.shape[1]

    train_tfidf_agmax = np.argmax(train_tfidf_matrix,axis=1)
    train_tfidf_max = np.max(train_tfidf_matrix,axis=1)
    train_tfidf_mean = np.mean(train_tfidf_matrix,axis=1)
    train_tfidf_std = np.std(train_tfidf_matrix,axis=1)

    test_tfidf_agmax = np.argmax(test_tfidf_matrix,axis=1)
    test_tfidf_max = np.max(test_tfidf_matrix,axis=1)
    test_tfidf_mean = np.mean(test_tfidf_matrix,axis=1)
    test_tfidf_std = np.std(test_tfidf_matrix,axis=1)

    print('train_tfidf_agmax.shape:')
    print(train_tfidf_agmax.shape)

    print('train_tfidf_mean.shape:')
    print(train_tfidf_mean.shape)

    print('test_tfidf_agmax.shape:')
    print(test_tfidf_agmax.shape)

    print('test_tfidf_mean.shape:')
    print(test_tfidf_mean.shape)
    return train_tif,test_tif,train_tfidf_agmax,train_tfidf_max,train_tfidf_mean,train_tfidf_std,test_tfidf_agmax,test_tfidf_max,test_tfidf_mean,test_tfidf_std

In [None]:
# Columns that are never used as model features.
drop_cols = ['pt_d', 'label', 'communication_onlinerate', 'index', 'id', 'K']

# Columns treated as categorical features.
cat_cols = [
    'uid', 'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id',
    'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id', 'tags',
    'app_first_class', 'app_second_class', 'age', 'city', 'city_rank',
    'device_name', 'device_size', 'career', 'gender', 'net_type',
    'residence', 'his_app_size', 'his_on_shelf_time', 'app_score',
    'emui_dev', 'list_time', 'device_price', 'up_life_duration',
    'up_membership_grade', 'membership_life_duration', 'consume_purchase',
    'communication_avgonline_30d', 'indu_name',
]

MASK = 'MASK'
# Id columns that get randomly masked in training.
miss_col1 = ['task_id', 'adv_id', 'uid']
# Previously also considered: 'device_size', 'spread_app_id', 'indu_name'.
miss_col2 = ['adv_prim_id', 'dev_id']

In [None]:
# Randomly overwrite a share of the training ids with the sentinel -1:
# 10% for the columns in miss_col1, 5% for those in miss_col2.
for cols, rate in ((miss_col1, 0.1), (miss_col2, 0.05)):
  for col in tqdm_notebook(cols):
    train_df[col] = train_df[col].apply(lambda x, rate=rate: get_mask_train(x, rate))

In [None]:
# Replace every test id that never occurs in the (masked) training column with
# the sentinel -1, printing the share of distinct test values affected.
for cols in (miss_col1, miss_col2):
  for col in tqdm_notebook(cols):
    test_values = set(test_df[col].values)
    mask_list = list(test_values - set(train_df[col].values))
    print(len(mask_list)/len(test_values))
    test_df[col] = test_df[col].replace(mask_list,-1)

In [None]:
# Give train_df a fresh 0..n-1 index.
train_df = train_df.reset_index(drop=True)

In [None]:
# Feature groups by entity: user, advertisement, phone, app.
user_col = [
    'uid', 'age', 'city', 'city_rank', 'career', 'gender', 'residence',
    'communication_avgonline_30d', 'consume_purchase', 'membership_life_duration',
    'up_membership_grade', 'up_life_duration',
]
ad_col = [
    'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id', 'dev_id', 'slot_id',
    'spread_app_id', 'tags', 'app_first_class', 'app_second_class', 'indu_name',
    'inter_type_cd',
]
phone_col = ['device_name', 'device_size', 'net_type', 'emui_dev', 'device_price']
app_col = ['his_app_size', 'his_on_shelf_time', 'app_score', 'list_time']

In [None]:

def _uid_tfidf_features(key2):
    """Build the per-uid TF-IDF summary frame for ``key2`` from the training data.

    Returns a frame with ``uid`` plus four columns named
    ``'uid' + key2 + {'tf_argmax', 'max', 'mean', 'std'}``. The test-side
    outputs of ``get_tfidf`` are not used; the later merge cell joins these
    train-derived frames onto the test frames as well.
    """
    grouped, _, tf_argmax, tf_max, tf_mean, tf_std = get_tfidf(train_df, test_df, 'uid', key2)[:6]
    features = grouped.drop(key2, axis=1)
    prefix = 'uid' + key2
    features[prefix + 'tf_argmax'] = tf_argmax
    features[prefix + 'max'] = tf_max
    features[prefix + 'mean'] = tf_mean
    features[prefix + 'std'] = tf_std
    return features


# One helper call per id column replaces four copy-pasted stanzas. The
# intermediate train_tfidf_* / test_tfidf_* / test_tif globals those stanzas
# left behind are not referenced by any later cell and are no longer created.
train_tif_uid1 = _uid_tfidf_features('task_id')
train_tif_uid2 = _uid_tfidf_features('adv_id')
train_tif_uid3 = _uid_tfidf_features('slot_id')
train_tif_uid4 = _uid_tfidf_features('adv_prim_id')



In [None]:
def add_noise(series, noise_level):
    """Scale each element by ``1 + noise_level * z`` with ``z`` drawn from N(0, 1)."""
    jitter = 1 + noise_level * np.random.randn(len(series))
    return series * jitter

def target_encode(trn_series=None, 
                  tst_series1=None, 
                  tst_series2 = None,
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothed mean-target encoding of one categorical feature.

    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : categorical feature used to compute the statistics (pd.Series)
    tst_series1 : first series to encode with those statistics (pd.Series)
    tst_series2 : second series to encode with those statistics (pd.Series)
    target : target values aligned with trn_series (pd.Series)
    min_samples_leaf : multiplied by (rows / distinct categories) of trn_series
        to obtain the count at which category mean and prior weigh equally
    smoothing : steepness divisor of the sigmoid that blends mean and prior
    noise_level : relative scale of the multiplicative Gaussian noise
    Returns three numpy arrays: encodings of trn_series, tst_series1, tst_series2.
    Categories without statistics receive the prior (overall target mean).
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series1.name
    assert trn_series.name == tst_series2.name
    key = trn_series.name
    # Turn the relative threshold into an absolute row count.
    min_samples_leaf = min_samples_leaf*(len(trn_series)/trn_series.nunique())
    print(min_samples_leaf)
    # Per-category target mean and row count.
    stats = pd.concat([trn_series, target], axis=1).groupby(by=key)[target.name].agg(["mean", "count"])
    # Sigmoid weight: approaches 1 as the category count grows past the threshold.
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()
    # The bigger the count, the less the prior contributes.
    stats[target.name] = prior * (1 - weight) + stats["mean"] * weight
    lookup = (stats.drop(["mean", "count"], axis=1)
                   .reset_index()
                   .rename(columns={'index': target.name, target.name: 'average'}))

    def encode(series):
        # Left-merge keeps the row order of `series`; the index is restored afterwards.
        merged = pd.merge(series.to_frame(series.name), lookup, on=series.name, how='left')
        encoded = merged['average'].rename(key + '_mean').fillna(prior)
        encoded.index = series.index
        return encoded

    ft_trn_series = encode(trn_series)
    ft_tst_series1 = encode(tst_series1)
    ft_tst_series2 = encode(tst_series2)
    # Noise is drawn in the order train, first test series, second test series.
    noisy_trn = add_noise(ft_trn_series, noise_level).values
    noisy_tst1 = add_noise(ft_tst_series1, noise_level).values
    noisy_tst2 = add_noise(ft_tst_series2, noise_level).values
    return noisy_trn, noisy_tst1, noisy_tst2


In [None]:
# NOTE(review): 'floder' (sic) is not referenced by any other cell visible in
# this notebook; the encoding loops build their own StratifiedKFold with
# random_state=2020.
floder = StratifiedKFold(n_splits=5,random_state=42,shuffle=True)


In [None]:
# One test frame per CV fold: the original frame plus four independent copies.
test_df2, test_df3, test_df4, test_df5 = (test_df.copy() for _ in range(4))
test_df_list = [test_df, test_df2, test_df3, test_df4, test_df5]

In [None]:
# Reset to a 0..n-1 index (same call as in an earlier cell). The encoding loop
# below uses positional fold indices as .loc labels, which relies on this.
train_df.reset_index(drop=True,inplace=True)

In [None]:
# Out-of-fold target encoding of every categorical column.
# For fold k the statistics come from the other four folds and are written to
# (a) the held-out rows of train_df and (b) the k-th test frame.
# 'K' records the fold each training row was held out in.
for col in tqdm_notebook(cat_cols):
  # Initialise as float: the encodings are floats, and writing floats into an
  # integer column through .loc is a deprecated silent upcast in recent pandas.
  train_df[col + 'tar_enco'] = 0.0
  train_df['K'] = 0
  splitter = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
  for k ,(tr_idx, oof_idx) in enumerate(splitter.split(train_df, train_df['label'])):
    print('fold{}'.format(k + 1))
    trn_series = train_df.iloc[tr_idx][col]
    tst_series1 = train_df.iloc[oof_idx][col]
    tst_series2 = test_df_list[k][col]
    target = train_df.iloc[tr_idx].label
    # The in-fold encoding (first return value) is not used.
    _, oof_targetencoding, test_targetencoding = target_encode(trn_series,
                  tst_series1,
                  tst_series2,
                  target,
                  min_samples_leaf=0.2,
                  smoothing=1,
                  noise_level=0.0001)
    # Positional indices double as labels because train_df has a 0..n-1 index.
    train_df.loc[oof_idx,col + 'tar_enco'] = oof_targetencoding
    train_df.loc[oof_idx,'K'] = k
    test_df_list[k][col + 'tar_enco'] = test_targetencoding
  # reduce_mem may downcast the new float column (possibly to float16).
  train_df = reduce_mem(train_df)
  gc.collect()

In [None]:
# Downcast once more; the encoding loop above already calls reduce_mem after every column.
train_df = reduce_mem(train_df)

In [None]:
# Rebind test_df to the fold-0 test frame, i.e. the one carrying fold-0 target encodings.
test_df = test_df_list[0]

In [None]:
# Display train_df as the cell output.
train_df

In [None]:
# Down-sample label==0 rows to 3x the number of label==1 rows (random_state=303)
# and renumber the index. The 'K' fold column computed above is carried along.
train_df = lower_sample_data_by_sample(train_df , 3,303).reset_index(drop=True)

In [None]:
# Attach the per-uid TF-IDF summary features. The same four train-derived
# frames are left-joined onto train_df, test_df and every per-fold test frame.
tfidf_frames = [train_tif_uid1, train_tif_uid2, train_tif_uid3, train_tif_uid4]

for tfidf_frame in tfidf_frames:
    train_df = train_df.merge(tfidf_frame, on='uid', how='left')

for tfidf_frame in tfidf_frames:
    test_df = test_df.merge(tfidf_frame, on='uid', how='left')

for i in range(5):
    for tfidf_frame in tfidf_frames:
        test_df_list[i] = test_df_list[i].merge(tfidf_frame, on='uid', how='left')




In [None]:
# Display test_df as the cell output.
test_df

In [None]:
# Count-encode every categorical column: the encoder is fitted on train_df and
# the resulting '<col>_count' column is added to train_df and to each per-fold
# test frame.
for col in tqdm_notebook(cat_cols):
    cl = CountEncoder(cols=col)
    cl.fit(train_df[col])
    train_df[col + '_count']  = (cl.transform(train_df[col])).values
    # Transform the test column once and reuse it for all fold frames
    # (it was recomputed for each of the five frames before).
    test_counts = cl.transform(test_df[col]).add_suffix('_count')
    for i in range(len(test_df_list)):
        # join aligns on the index.
        test_df_list[i] = test_df_list[i].join(test_counts)
    

In [None]:
# Register the cluster-id columns (created in later cells) and one TF-IDF
# argmax column as categorical features. Only names not yet present are
# appended, so re-running this cell does not add duplicates.
cat_cols = cat_cols + [c for c in ['user_kme','ad_kme','uidtask_idtf_argmax'] if c not in cat_cols]

In [None]:
# Every train_df column that is neither dropped nor categorical.
# Recomputed in a later cell once more columns have been added.
dense_feature = [col for col in train_df.columns if col not in drop_cols+cat_cols]

In [None]:
# Downcast the columns added since the last reduce_mem call.
train_df = reduce_mem(train_df)

In [None]:
# Per-fold test frames: drop the unused raw column, fill missing values with 0
# and tag each frame with its fold number.
for i in range(5):
    # errors='ignore' keeps the cell re-runnable after the column is gone.
    fold_test = test_df_list[i].drop(columns=['communication_onlinerate'], errors='ignore').fillna(0)
    fold_test['K'] = i
    test_df_list[i] = fold_test

In [None]:
# Model feature list: categorical columns first, then dense ones.
# Rebuilt in a later cell before training.
feature = cat_cols+dense_feature

In [None]:
# Fresh, unfitted KMeans estimators. Both names are rebound to unpickled
# models in the next two cells.
estimator_ad= KMeans(n_clusters=500, random_state=42)
estimator_user= KMeans(n_clusters=500, random_state=42)

# user_col is narrowed to six columns here (no 'uid', no usage/membership
# columns); the cluster-feature cells below match against these lists.
user_col = ['age','city','city_rank','career','gender','residence']
ad_col = ['task_id','adv_id','creat_type_cd','adv_prim_id','dev_id','slot_id','spread_app_id','tags','app_first_class','app_second_class','indu_name','inter_type_cd']


In [None]:
# Load the user clustering model from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('estimator.pickle', 'rb') as f:
    estimator_user = pickle.load(f)
    # (placeholder in the original: test the loaded model)

In [None]:
# Load the ad clustering model from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('estimator_ad.pickle', 'rb') as f:
    estimator_ad = pickle.load(f)
    # (placeholder in the original: test the loaded model)

In [None]:
# Target-encoded ad columns, in train_df column order.
# The match is by substring (c + 'tar_enco' contained in the column name).
ad_features = [col for col in train_df.columns for c in ad_col if c+'tar_enco' in col]

In [None]:
# Target-encoded user columns, in train_df column order.
# The match is by substring (c + 'tar_enco' contained in the column name).
user_features = [col for col in train_df.columns for c in user_col if c+'tar_enco' in col]

In [None]:
# Ad cluster id for the training rows and for every per-fold test frame.
ad_pred = estimator_ad.predict(train_df[ad_features])
train_df['ad_kme'] = ad_pred
for fold_test in test_df_list:
    fold_test['ad_kme'] = estimator_ad.predict(fold_test[ad_features])

In [None]:
# User cluster id for the training rows and for every per-fold test frame.
user_pred = estimator_user.predict(train_df[user_features])
train_df['user_kme'] = user_pred
for fold_test in test_df_list:
    fold_test['user_kme'] = estimator_user.predict(fold_test[user_features])

In [None]:
# Write estimator_ad back to the file it was loaded from.
# (pickle is already imported in the first cell.)
import pickle
with open('estimator_ad.pickle', 'wb') as f:
    pickle.dump(estimator_ad, f)




In [None]:
# Cluster ids for the stand-alone test_df frame. Since the merge cell, test_df
# is a separate object from test_df_list[0]; it was derived from the fold-0
# test frame, so these ids are based on fold-0 target encodings.
test_df['user_kme'] = estimator_user.predict(test_df[user_features])
test_df['ad_kme'] = estimator_ad.predict(test_df[ad_features])

In [None]:
# Display test_df as the cell output.
test_df

In [None]:
# Count-encode the cluster ids and the TF-IDF argmax columns.
# Each per-fold test frame is encoded from its OWN column: the cluster ids are
# predicted from fold-specific target encodings and can differ between frames,
# so looking counts up from test_df (fold-0 based) would mislabel folds 1-4.
for col in tqdm_notebook(['user_kme','ad_kme','uidtask_idtf_argmax','uidadv_idtf_argmax','uidslot_idtf_argmax','uidadv_prim_idtf_argmax']):
    cl = CountEncoder(cols=col)
    cl.fit(train_df[col])
    train_df[col + '_count']  = (cl.transform(train_df[col])).values
    for i in range(len(test_df_list)):
        fold_counts = cl.transform(test_df_list[i][col]).add_suffix('_count')
        test_df_list[i] = test_df_list[i].join(fold_counts)

In [None]:
# Out-of-fold target encoding of the cluster ids and TF-IDF argmax columns.
# NOTE(review): this re-splits the (now down-sampled) train_df and overwrites
# 'K', so the fold labels used for training below are not the ones under which
# the first round of '<col>tar_enco' features was computed. Confirm this is intended.
for col in tqdm_notebook(['user_kme','ad_kme','uidtask_idtf_argmax','uidadv_idtf_argmax','uidslot_idtf_argmax','uidadv_prim_idtf_argmax']):
  # Initialise as float: the encodings are floats, and writing floats into an
  # integer column through .loc is a deprecated silent upcast in recent pandas.
  train_df[col + 'tar_enco'] = 0.0
  train_df['K'] = 0
  splitter = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
  for k ,(tr_idx, oof_idx) in enumerate(splitter.split(train_df, train_df['label'])):
    print('fold{}'.format(k + 1))
    trn_series = train_df.iloc[tr_idx][col]
    tst_series1 = train_df.iloc[oof_idx][col]
    tst_series2 = test_df_list[k][col]
    target = train_df.iloc[tr_idx].label
    # The in-fold encoding (first return value) is not used.
    _, oof_targetencoding, test_targetencoding = target_encode(trn_series,
                  tst_series1,
                  tst_series2,
                  target,
                  min_samples_leaf=0.2,
                  smoothing=1,
                  noise_level=0.0001)
    # Positional indices double as labels because train_df has a 0..n-1 index.
    train_df.loc[oof_idx,col + 'tar_enco'] = oof_targetencoding
    train_df.loc[oof_idx,'K'] = k
    test_df_list[k][col + 'tar_enco'] = test_targetencoding
  # reduce_mem may downcast the new float column (possibly to float16).
  train_df = reduce_mem(train_df)
  gc.collect()

In [None]:
# NOTE(review): neither name is referenced by any other cell visible in this notebook.
seed=1080
is_shuffle=True

In [None]:
# Restore the full feature groups for the random-forest stacking below.
# user_col had been narrowed to six columns for the clustering step.
user_col = ['uid','age','city','city_rank','career','gender','residence','communication_avgonline_30d','consume_purchase','membership_life_duration','up_membership_grade','up_life_duration']
ad_col = ['task_id','adv_id','creat_type_cd','adv_prim_id','dev_id','slot_id','spread_app_id','tags','app_first_class','app_second_class','indu_name','inter_type_cd']
phone_col = ['device_name','device_size','net_type','emui_dev','device_price']
app_col = ['his_app_size','his_on_shelf_time','app_score','list_time']

In [None]:
# Display train_df as the cell output.
train_df

In [None]:
#--------------------------------------------------Model training----------------------------------------#

In [None]:
from sklearn.ensemble import RandomForestClassifier


def _add_rf_oof_feature(cols, out_col, n_folds=5):
    """Add an out-of-fold random-forest probability column to the data.

    For every fold ``k`` a forest is fitted on the ``train_df`` rows with
    ``K != k`` using only ``cols``. Its positive-class probability is written
    to ``train_df[out_col]`` for the rows with ``K == k`` and to
    ``test_df_list[k][out_col]``.

    Parameters
    ----------
    cols : list of str
        Input columns for the forest.
    out_col : str
        Name of the column to create.
    n_folds : int, default 5
        Number of fold labels (0..n_folds-1) expected in ``train_df.K``.

    Returns
    -------
    RandomForestClassifier
        The forest fitted for the last fold.
    """
    for k in tqdm_notebook(range(n_folds)):
        fit_rows = train_df[train_df.K != k]
        held_out = train_df[train_df.K == k]

        model = RandomForestClassifier(n_estimators=10, criterion='gini',n_jobs=-1, random_state=42, verbose=1)
        model.fit(fit_rows.reset_index(drop=True)[cols], fit_rows.reset_index(drop=True).label.values)
        train_df.loc[held_out.index, out_col] = model.predict_proba(held_out.reset_index(drop=True)[cols])[:,1]
        test_df_list[k][out_col] = model.predict_proba(test_df_list[k][cols])[:,1]
    return model


# One stacked feature per column group; each name keeps the last fold's model,
# as the four original loops did.
RF_user = _add_rf_oof_feature(user_col, 'rf_user')
RF_ad = _add_rf_oof_feature(ad_col, 'rf_ad')
RF_phone = _add_rf_oof_feature(phone_col, 'rf_phone')
RF_app = _add_rf_oof_feature(app_col, 'rf_app')

In [None]:
# AUC of each out-of-fold random-forest feature against the training label.
for stacked_col in ('rf_user', 'rf_ad', 'rf_phone', 'rf_app'):
    print(sklearn.metrics.roc_auc_score(train_df.label, train_df[stacked_col]))

In [None]:
# Display the categorical feature list as the cell output.
cat_cols

In [None]:
# Recompute the dense feature list now that train_df has all engineered columns.
dense_feature = [col for col in train_df.columns if col not in drop_cols+cat_cols]

In [None]:
# Final model feature list: categorical columns first, then dense ones.
feature = cat_cols+dense_feature

In [None]:
  # 5-fold CatBoost training. For each fold k the model is fitted on the rows
  # with K != k, evaluated on the rows with K == k, and its positive-class
  # probabilities for test_df_list[k] are ADDED to `pred` (a sum, not yet a mean).
  # NOTE(review): feature_importance_df, predicts, true and begin are
  # initialised here but only referenced from commented-out lines in this cell.
  feature_importance_df = pd.DataFrame()
  predicts = np.zeros(len(train_df))
  pred = np.zeros(len(test_df_list[0]))
  true = np.zeros(len(train_df))


  begin = 0
  for fold,k in enumerate(range(5)):
    
    #train = train_df[train_df.K!=k].reset_index(drop=True)[feature]
    # Labels of the fitting rows (all folds except k).
    t_label = train_df[train_df.K!=k].reset_index(drop=True).label
    #valid = train_df[train_df.K==k].reset_index(drop=True)[feature]
    # Labels of the held-out fold k.
    te_label = train_df[train_df.K==k].reset_index(drop=True).label
    

  
    # task_type='GPU': requests GPU training.
    # use_best_model=True is combined with the eval_set passed to fit() below.
    clf = cbt.CatBoostClassifier(iterations = 150, learning_rate = 0.3, depth =7, one_hot_max_size=5,use_best_model =True,
                                 loss_function = 'Logloss', eval_metric= "AUC",logging_level='Verbose',task_type='GPU',
                               cat_features=cat_cols,)#counter_calc_method='Full'，l2_leaf_reg = 10,)
      


    # cat_features is passed both to the constructor and to fit().
    clf.fit(train_df[train_df.K!=k].reset_index(drop=True)[feature],t_label.astype('int32'),
              eval_set=(train_df[train_df.K==k].reset_index(drop=True)[feature], te_label.astype('int32'))
          ,plot=True,verbose=1,cat_features=cat_cols)
#     predicts[begin:over] = clf.predict_proba(train_df[train_df.K==k].reset_index(drop=True)[feature])[:,1]

#     true[begin:over] = te_label.values
    # Accumulate this fold's test probabilities.
    pred += (clf.predict_proba(test_df_list[fold][feature])[:,1])
#     begin+=len(train_df[train_df.K==k].reset_index(drop=True)[feature])
    gc.collect()
  print('--------------------')
  
  #print(sklearn.metrics.roc_auc_score(true,predicts))


In [None]:
# Feature importances of `clf`, i.e. the model fitted in the last CV fold only.
feature_importance_df = pd.DataFrame({
    "importance": clf.feature_importances_,
    "feature": feature,
})

In [None]:
# Show the features ordered from most to least important.
feature_importance_df.sort_values('importance',ascending=False)

In [None]:
#------------------------------Model prediction----------------------------------------#

In [None]:
# Turn the sum of the five fold predictions into their mean.
# NOTE: not idempotent; re-running this cell divides by 5 again.
pred = pred/5

In [None]:
# NOTE(review): bare literal with no effect on the pipeline; presumably a
# recorded score (AUC?) that should be confirmed and moved to a markdown note.
0.8320243359

In [None]:
# Number of test rows whose averaged probability exceeds 0.5.
(pred>0.5).sum()

In [None]:
# Display the averaged test probabilities.
pred

In [None]:
# Min-max scaled view of pred, displayed only; the result is not assigned or written out.
((pred-np.min(pred))/(np.max(pred)-np.min(pred)))

In [None]:
# Build the submission: test ids (from the fold-0 test frame) with the averaged
# probability, written as CSV without the index.
res = pd.DataFrame({
    'id': test_df_list[0]['id'].astype('int32'),
    'probability': pred,
})
res.to_csv('5cv_catboost_baseline_target_encoding_.csv',index = False)

In [None]:
# Display the submission frame.
res