In [None]:
#-------------------------------调用相关依赖包------------------------------#

In [None]:
!pip install category_encoders
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook
import time
import gc
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
import torch.nn.functional as F
import sklearn
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import roc_curve 
import time
import os
import itertools
import random
import matplotlib.pyplot as plt
from collections import OrderedDict
from scipy.special import erfinv
from collections import OrderedDict
from math import sqrt
import numpy as np
import category_encoders as ce
import tqdm


In [None]:
def reduce_mem(df):
    """Downcast the numeric columns of ``df`` in place and report the memory saved.

    Columns whose dtype is int16/int32/int64 or float16/float32/float64 are
    cast to the narrowest dtype whose (strict) range contains the column's
    min and max. Columns whose min or max is null are left untouched.

    Args:
        df: DataFrame to shrink; it is modified in place.

    Returns:
        The same DataFrame object, after downcasting.
    """
    started = time.time()
    eligible = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate target dtypes, narrowest first.
    int_ladder = (np.int8, np.int16, np.int32, np.int64)
    float_ladder = (np.float16, np.float32)
    mem_before = df.memory_usage().sum() / 1024 ** 2

    for name in df.columns:
        dtype = df[name].dtypes
        if dtype not in eligible:
            continue
        lo = df[name].min()
        hi = df[name].max()
        if pd.isnull(lo) or pd.isnull(hi):
            continue

        if str(dtype).startswith('int'):
            # Bounds are strict, so a value equal to a limit moves to the next wider type;
            # if no candidate fits, the column keeps its dtype.
            for candidate in int_ladder:
                limits = np.iinfo(candidate)
                if lo > limits.min and hi < limits.max:
                    df[name] = df[name].astype(candidate)
                    break
        else:
            for candidate in float_ladder:
                limits = np.finfo(candidate)
                if lo > limits.min and hi < limits.max:
                    df[name] = df[name].astype(candidate)
                    break
            else:
                # Nothing narrower fits: fall back to float64.
                df[name] = df[name].astype(np.float64)

    mem_after = df.memory_usage().sum() / 1024 ** 2
    elapsed_min = (time.time() - started) / 60
    saved_pct = 100 * (mem_before - mem_after) / mem_before
    print('-- Mem. usage decreased to {:5.2f} Mb '
          '({:.1f}% reduction),'
          'time spend:{:2.2f} min'.format(mem_after, saved_pct, elapsed_min))
    return df


In [None]:
# ------------------------------------------ 数据预处理----------------------------------------------#

In [None]:
def load_data(file_path, chunk=0, sep_tag=','):
    """Read a delimited text file into a single DataFrame.

    Args:
        file_path: Path of the file to read.
        chunk: Rows per chunk. ``0`` (default) reads the file in one call;
            any other value reads it chunk by chunk.
        sep_tag: Field separator passed to ``pd.read_csv``.

    Returns:
        A DataFrame with all rows. Chunked reads keep the running row index,
        so both modes return the same frame.
    """
    if chunk == 0:
        return pd.read_csv(file_path, sep=sep_tag)

    reader = pd.read_csv(file_path, sep=sep_tag, chunksize=chunk)
    try:
        pieces = list(reader)
    finally:
        # Release the file handle even if parsing a chunk fails.
        reader.close()
    if not pieces:
        return pd.DataFrame()
    # Concatenate once: growing a frame inside the loop re-copies every earlier chunk.
    return pd.concat(pieces)


In [None]:
# Read the three '|'-separated input files; the training file is read in
# 2,000,000-row chunks, the two test files in a single call each.
train_data = load_data('train_data.csv', sep_tag='|', chunk=2000000)
test_data_A = load_data('test_data_A.csv', sep_tag='|')
test_data_B = load_data('test_data_B.csv', sep_tag='|')
# Stamp every test-B row with pt_d = 8.
# NOTE(review): test_data_A is not stamped here; presumably it already carries pt_d - confirm.
test_data_B['pt_d'] = 8

In [None]:
# Stack train, test A and test B (in that order) into one frame with a fresh 0..n-1 index.
data_all = pd.concat([train_data, test_data_A, test_data_B]).reset_index(drop=True)

# Columns treated as categorical features.
cat_cols = ['task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id', 'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id',
            'tags',
            'app_first_class', 'app_second_class', 'age', 'city', 'city_rank',
            'device_name', 'device_size', 'career', 'gender', 'net_type',
            'residence', 'his_app_size', 'his_on_shelf_time', 'app_score',
            'emui_dev', 'list_time', 'device_price', 'up_life_duration',
            'up_membership_grade', 'membership_life_duration', 'consume_purchase',
            'communication_avgonline_30d', 'indu_name']


def Label_enco(df, cols):
    """Replace each listed column of ``df`` with dense int32 codes, in place.

    Distinct non-missing values are numbered 0, 1, 2, ... in order of first
    appearance; missing values become -1.

    The previous implementation zipped ``unique()`` (which includes NaN) with
    ``range(nunique())`` (which does not), so in any column containing NaN the
    last distinct value was silently dropped from the mapping and encoded as -1.

    Args:
        df: DataFrame to encode; modified in place.
        cols: Iterable of column names to encode.

    Returns:
        The same DataFrame object.
    """
    for f in tqdm.tqdm(cols):
        # factorize numbers values by first appearance and marks missing values with -1.
        codes, _ = pd.factorize(df[f])
        df[f] = codes.astype(np.int32)
    return df


# Encode the categorical columns on the combined frame so that train and test rows share one code space.
data_all = Label_enco(data_all, cat_cols)

In [None]:
# Split the encoded frame back by pt_d: values below 8 form the training set, pt_d == 8 the test set.
train_data = data_all.loc[data_all['pt_d'] < 8].reset_index(drop=True)
test_data = data_all.loc[data_all['pt_d'] == 8].reset_index(drop=True)

In [None]:
# ---------------------------------------模型训练----------------------------------------------#

In [None]:
class CTR_dataset(Dataset):
    """Map-style dataset serving categorical features plus two id-history sequences.

    Each item is a dict holding one int64 scalar per name in ``cate_cols`` and
    the rows of ``temp_adv_id`` / ``temp_adv_prim_id`` under those two keys.
    When built with ``is_train=True`` the item also carries ``'label'`` and
    ``'weight'`` (float32).

    Args:
        df: DataFrame with one column per name in ``cate_cols`` (and a
            ``label`` column when ``is_train`` is true).
        cate_cols: Names of the categorical columns to expose.
        temp_adv_id: Array-like of per-row id history, converted to int64.
        temp_adv_prim_id: Array-like of per-row id history, converted to int64.
        weight_T: Weight given to rows whose label equals 1; other rows get 1.
        is_train: Whether labels and weights are available.
    """

    def __init__(self,df,cate_cols,temp_adv_id,temp_adv_prim_id,weight_T,is_train=False):
        self.weight_T = weight_T
        self.cate_cols = cate_cols
        # Remember the row count so __len__ does not depend on a particular column name.
        self.num_rows = len(df)
        if is_train:
            self.label = torch.tensor(df.label.values, dtype=torch.float32)
            self.weight = torch.tensor(self.get_weight(self.label), dtype=torch.float32)
        else:
            self.label = None
            # Defined in both modes so the attribute always exists.
            self.weight = None
        self.col_dict = {
            col: torch.tensor(df[col].values, dtype=torch.int64) for col in cate_cols
        }
        self.col_dict['temp_adv_id'] = torch.tensor(temp_adv_id, dtype=torch.int64)
        self.col_dict['temp_adv_prim_id'] = torch.tensor(temp_adv_prim_id, dtype=torch.int64)

    def __len__(self):
        """Return the number of rows of the DataFrame the dataset was built from."""
        return self.num_rows

    def __getitem__(self,item):
        """Return the feature dict (plus label/weight in training mode) for row ``item``."""
        output = {}
        if self.label is not None:
            output['label'] = self.label[item]
            output['weight'] = self.weight[item]
        for col in self.cate_cols:
            output[col] = self.col_dict[col][item]
        output['temp_adv_id'] = self.col_dict['temp_adv_id'][item]
        output['temp_adv_prim_id'] = self.col_dict['temp_adv_prim_id'][item]
        return output

    def get_weight(self,label):
        """Return a list with ``weight_T`` where ``label`` equals 1 and 1 elsewhere."""
        # Compare once on the whole tensor; iterating a tensor element by element is very slow.
        is_positive = (torch.as_tensor(label) == 1).tolist()
        return [self.weight_T if flag else 1 for flag in is_positive]

In [None]:
class Mish(nn.Module):
    """Mish activation: ``x * tanh(softplus(x))``, applied element-wise."""

    def __init__(self):
        super(Mish, self).__init__()
        # Announce construction on stdout.
        print("Mish activation loaded...")

    def forward(self, x):
        """Return the activated tensor, same shape as ``x``."""
        gate = torch.tanh(F.softplus(x))
        return x * gate

In [None]:
class DNN(nn.Module):
    """Stack of gated fully-connected layers.

    Every layer is ``Linear(in, 2 * out) -> GLU -> [BatchNorm1d] -> ReLU -> Dropout``;
    the GLU halves the doubled linear output back to ``out`` features.

    Args:
        inputs_dim: Width of the input features.
        hidden_units: Output width of each layer; must not be empty.
        dropout_rate: Dropout probability applied after every layer.
        use_bn: Insert a BatchNorm1d after each GLU when true.
        init_std: Standard deviation used to initialise the linear weights.
        dice_dim: Accepted but not used by this implementation.
        seed: Stored on the instance; not otherwise used here.

    Raises:
        ValueError: If ``hidden_units`` is empty.
    """

    def __init__(self, inputs_dim, hidden_units, dropout_rate=0, use_bn=False,
                 init_std=0.0001, dice_dim=3, seed=1024):
        super().__init__()
        self.dropout_rate = dropout_rate
        self.dropout = nn.Dropout(dropout_rate)
        self.seed = seed
        self.use_bn = use_bn
        if not len(hidden_units):
            raise ValueError("hidden_units is empty!!")

        widths = [inputs_dim, *hidden_units]
        # (fan_in, fan_out) of every layer, in order.
        layer_dims = list(zip(widths[:-1], widths[1:]))

        self.Glu = nn.ModuleList([nn.GLU() for _ in layer_dims])
        # Linear layers emit twice the target width because GLU halves it again.
        self.linears = nn.ModuleList(
            [nn.Linear(fan_in, 2 * fan_out) for fan_in, fan_out in layer_dims])
        if self.use_bn:
            self.bn = nn.ModuleList(
                [nn.BatchNorm1d(fan_out) for _, fan_out in layer_dims])
        self.activation_layers = nn.ModuleList([nn.ReLU() for _ in layer_dims])

        # Re-initialise only the weight matrices; biases keep the nn.Linear default.
        for param_name, param in self.linears.named_parameters():
            if 'weight' in param_name:
                nn.init.normal_(param, mean=0, std=init_std)

    def forward(self, inputs):
        """Run ``inputs`` through every layer and return the last layer's output."""
        hidden = inputs
        for layer_idx, linear in enumerate(self.linears):
            hidden = self.Glu[layer_idx](linear(hidden))
            if self.use_bn:
                hidden = self.bn[layer_idx](hidden)
            hidden = self.activation_layers[layer_idx](hidden)
            hidden = self.dropout(hidden)
        return hidden


In [None]:
class temo_model(nn.Module):
    """CTR network: ad-by-user embedding interaction plus a deep tower.

    The logit is the sum of
      * the dot product between the summed ad-side and summed user-side
        field embeddings, and
      * a DNN over the concatenated field embeddings and the final hidden
        states of two bidirectional GRUs that read the ``temp_adv_id`` and
        ``temp_adv_prim_id`` sequences.

    Args:
        dict_id_word: ``{column: {'lenth': vocabulary_size}}`` for every name in ``cate_cols``.
        cate_cols: Categorical column names; must include every ad-side and
            user-side column listed in ``forward``.
        emb_dim: Embedding width shared by all columns.
        inputs_dim: Input width of the DNN. ``forward`` feeds it
            ``len(cate_cols) * emb_dim + 512`` features (2 GRUs x 2 directions x 128).
        hid_dim: Accepted but unused; the GRU hidden size is fixed at 128.
        dnn_hidden_units: Layer widths of the DNN.
        dnn_dropout: Dropout rate of the DNN.
        use_bn: Whether the DNN uses batch normalisation.
    """

    def __init__(self,dict_id_word,cate_cols,emb_dim,inputs_dim,hid_dim, dnn_hidden_units=(512, 128),dnn_dropout=0.3,use_bn=True):
        super(temo_model, self).__init__()
        self.emb_dim = emb_dim
        self.cate_cols = cate_cols
        self.emb_dict = nn.ModuleDict(
            {col: nn.Embedding(dict_id_word[col]['lenth'], emb_dim) for col in cate_cols})

        self.rnn1 = nn.GRU(emb_dim,128,batch_first=True,bidirectional =True)
        self.rnn2 = nn.GRU(emb_dim,128,batch_first=True,bidirectional =True)
        self.dnn = DNN(inputs_dim, dnn_hidden_units, dnn_dropout, use_bn)
        self.linear_out = nn.Linear(dnn_hidden_units[-1],1)

    def _encode_history(self, rnn, seq_emb):
        """Run ``rnn`` over ``seq_emb`` and flatten its final hidden states per sample."""
        _, last_hidden = rnn(seq_emb)
        # (directions, batch, hidden) -> (batch, directions * hidden)
        last_hidden = last_hidden.permute(1, 0, 2)
        return last_hidden.contiguous().view(last_hidden.size()[0], -1)

    def forward(self,x):
        """Return one logit per sample for the batch dict ``x``."""
        ad_col = ['adv_id','task_id','creat_type_cd','adv_prim_id','dev_id',
                  'inter_type_cd','spread_app_id','tags','app_first_class','app_second_class',
                  'his_app_size','his_on_shelf_time','app_score','indu_name']
        user_col = ['age','city','city_rank','device_name','device_size',
                    'career','gender','net_type','residence','emui_dev','list_time',
                    'device_price','up_life_duration','up_membership_grade',
                    'membership_life_duration','consume_purchase','communication_avgonline_30d']
        # Use the device this module's own weights live on instead of a notebook-level global.
        # The weight attribute is read directly so the lookup does not go through parameters().
        dev = self.linear_out.weight.device

        # Look every field up once; the interaction term and the deep tower share the result.
        field_emb = {col: self.emb_dict[col](x[col].to(dev)) for col in self.cate_cols}

        ad_sum = torch.stack([field_emb[col] for col in ad_col], dim=1).sum(dim=1)
        user_sum = torch.stack([field_emb[col] for col in user_col], dim=1).sum(dim=1)
        fm_out = torch.sum(ad_sum * user_sum, dim=1)

        embeding = torch.cat([field_emb[col] for col in self.cate_cols], dim=-1)

        # History sequences reuse the embedding tables of their base columns.
        temp_adv_id = self._encode_history(
            self.rnn1, self.emb_dict['adv_id'](x['temp_adv_id'].to(dev)))
        temp_adv_prim_id = self._encode_history(
            self.rnn2, self.emb_dict['adv_prim_id'](x['temp_adv_prim_id'].to(dev)))

        dnn_input = torch.cat((embeding,temp_adv_id,temp_adv_prim_id),dim=-1)
        dnn_output = self.linear_out(self.dnn(dnn_input)).squeeze(1)
        logit = fm_out+dnn_output

        return logit

In [None]:
def train_fn(model,train_loader,train_shape,batch_size,optimizer,is_smooth,eps):
    """Train ``model`` for one pass over ``train_loader``.

    Uses the notebook-level ``device`` for labels and prediction buffers, and
    plots the per-batch loss with ``plt.plot`` before returning.

    Args:
        model: Callable taking a batch dict and returning one logit per sample.
        train_loader: Sized iterable of batch dicts containing ``'label'``.
        train_shape: Total number of samples the loader yields.
        batch_size: Loader batch size; used to place each batch in the buffers.
        optimizer: Optimizer stepped once per batch.
        is_smooth: ``'double'`` smooths both classes by ``eps``; ``'one'`` lifts
            only the negatives to ``eps``; any other value disables smoothing.
        eps: Label-smoothing amount.

    Returns:
        ``(mean batch loss, AUC)``. The AUC is computed against the original
        0/1 labels; smoothing affects the loss target only, because
        ``roc_auc_score`` does not accept continuous targets.
    """
    model.train()
    train_loss = []
    avg_loss = 0
    num_batches = 0
    pred = torch.zeros((train_shape)).to(device)
    true_label = torch.zeros((train_shape)).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer.zero_grad()
    tk0 = tqdm.tqdm(train_loader, total=len(train_loader))
    for idx, batch in enumerate(tk0):
        hard_label = batch['label'].view(-1).to(device)

        if is_smooth == 'double':
            target = (1-eps)*hard_label + (1-hard_label)*eps
        elif is_smooth == 'one':
            target = hard_label + (1-hard_label)*eps
        else:
            target = hard_label

        logits = model(batch).view(-1)
        rows = slice(idx*batch_size, (idx+1)*batch_size)
        # Detach so the buffer does not keep every batch's autograd graph alive.
        pred[rows] = logits.detach()
        true_label[rows] = hard_label

        loss = criterion(logits, target)
        avg_loss += loss.item()
        train_loss.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        num_batches += 1
        tk0.set_postfix(loss=avg_loss/num_batches)
    auc = sklearn.metrics.roc_auc_score(true_label.cpu().numpy(), torch.sigmoid(pred).cpu().numpy())
    plt.plot(train_loss)
    return avg_loss/num_batches,auc

In [None]:
def val_fn(model,valid_loader,val_shape,batch_size):
    """Evaluate ``model`` on ``valid_loader`` without gradient tracking.

    Uses the notebook-level ``device`` for the labels.

    Args:
        model: Callable taking a batch dict and returning one logit per sample.
        valid_loader: Sized iterable of batch dicts containing ``'label'``.
        val_shape: Total number of samples the loader yields.
        batch_size: Loader batch size; used to place each batch in the buffers.

    Returns:
        ``(mean batch loss, AUC of the sigmoid scores, raw logits as a NumPy array)``.
    """
    model.eval()
    avg_loss = 0
    num_batches = 0
    my_loss = nn.BCEWithLogitsLoss()
    pred = np.zeros((val_shape))
    predict = np.zeros((val_shape))
    true_label = np.zeros((val_shape))
    with torch.no_grad():
        tk0 = tqdm.tqdm(valid_loader, total=len(valid_loader))
        for idx, batch in enumerate(tk0):
            label = batch['label'].to(device).view(-1)
            logits = model(batch).view(-1)
            rows = slice(idx*batch_size, (idx+1)*batch_size)
            pred[rows] = torch.sigmoid(logits).cpu().numpy()
            predict[rows] = logits.cpu().numpy()
            true_label[rows] = label.cpu().numpy()
            loss = my_loss(logits, label)
            avg_loss += loss.item()
            num_batches += 1
            tk0.set_postfix(loss=avg_loss/num_batches)

        auc = sklearn.metrics.roc_auc_score(true_label, pred)
    return avg_loss/num_batches,auc,predict

In [None]:
def test_fn(model,test_loader,test_shape,batch_size):
    model.eval()
    pred = torch.zeros((test_shape))
    with torch.no_grad():
        for idx, batch in tqdm_notebook(enumerate(test_loader),mininterval=2,desc='--testing',leave=False):
            output_train= model(batch)
            pred[idx*batch_size:(idx+1)*batch_size] = output_train.view(-1)
    return pred

In [None]:
def _carry_forward(rows, last_day):
    """Repeat the newest history row once per following day, up to ``last_day``."""
    while rows[-1][1] < last_day:
        uid, day, history = rows[-1]
        rows.append([uid, day + 1, history])


def _pad_recent(histories, max_len):
    """Convert histories to fixed-width rows: newest id first, left-padded with 0."""
    padded = []
    for history in tqdm.tqdm(histories):
        if isinstance(history, list):
            padded.append([0] * (max_len - len(history)) + history[::-1][:max_len])
        else:
            # Placeholder left by fillna(0): this row has no history.
            padded.append([0] * max_len)
    return np.array(padded)


def process_data(train_data, test_data, cols, total_days=8, max_len=20):
    """Attach each user's history of positively-labelled ``cols`` ids to every row.

    For a row with day ``pt_d``, the history contains the ``cols`` values of
    that user's ``label == 1`` training rows from earlier days only (a day's
    own clicks become visible from the next day on).

    The previous implementation never extended the final user's history up to
    ``total_days``, so that user could lose their history on the last day; it
    also raised IndexError when no row had ``label == 1``.

    Args:
        train_data: Training frame with ``uid``, ``pt_d``, ``label`` and ``cols``.
        test_data: Test frame with ``uid`` and ``pt_d``.
        cols: Name of the id column to collect (a single column name).
        total_days: Last day up to which each user's history is carried forward.
        max_len: Width of the padded history arrays.

    Returns:
        ``(train_data, test_data, train_history, test_history)``. Both frames
        gain ``'his_' + cols`` and ``'his_' + cols + '_len'`` and have all NaN
        replaced by 0; the two arrays hold one padded history row per frame row.
        NOTE(review): 0 is used as the padding value; whether it can also be a
        real id depends on how ``cols`` was encoded upstream - confirm.
    """
    his_col = 'his_' + cols
    clicks = (train_data[train_data['label'] == 1]
              .groupby(['uid', 'pt_d']).agg({cols: list}).reset_index()
              .sort_values(['uid', 'pt_d']))

    # One row per (uid, day): the ids that user clicked on any earlier day.
    rows = []
    per_day = zip(clicks['uid'].tolist(), clicks['pt_d'].tolist(), clicks[cols].tolist())
    for uid, day, ids in tqdm.tqdm(per_day, total=len(clicks)):
        ids = [int(v) for v in ids]
        if rows and rows[-1][0] == uid:
            # Same user: fill the gap days, then extend the history from the next day on.
            _carry_forward(rows, day)
            rows.append([uid, day + 1, rows[-1][2] + ids])
        else:
            if rows:
                _carry_forward(rows, total_days)
            rows.append([uid, day + 1, ids])
    if rows:
        # The final user needs the same trailing fill as every other user.
        _carry_forward(rows, total_days)

    history = pd.DataFrame(rows, columns=['uid', 'pt_d', his_col])
    history[his_col + '_len'] = [len(h) for h in history[his_col]]
    if history.empty:
        # Keep the merge keys dtype-compatible when there is nothing to merge.
        history = history.astype({'uid': train_data['uid'].dtype,
                                  'pt_d': train_data['pt_d'].dtype})

    train_data = train_data.merge(history, on=['uid', 'pt_d'], how='left').fillna(0)
    test_data = test_data.merge(history, on=['uid', 'pt_d'], how='left').fillna(0)

    his_adv_id = _pad_recent(train_data[his_col].tolist(), max_len)
    test_his_adv_id = _pad_recent(test_data[his_col].tolist(), max_len)
    return train_data, test_data,his_adv_id,test_his_adv_id

In [None]:
# Build the click-history arrays for adv_id, then for adv_prim_id. The second call
# receives the frames returned by the first, so both history columns end up on them.
train_data,test_data,train_his_advid,test_his_advid = process_data(train_data, test_data, 'adv_id')
train_data,test_data,train_his_adv_prim_id,test_his_adv_prim_id = process_data(train_data, test_data, 'adv_prim_id')

In [None]:
# Cross-validation and training configuration.
floder = StratifiedKFold(n_splits=5,random_state=42,shuffle=True)  # 5 stratified, shuffled folds
predicts = np.zeros(len(train_data))       # one slot per training row
Batch_size = 8192*2
# Fall back to the CPU when no GPU is visible instead of failing on the first .to(device).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
NUM_EPOCH = 5
test_predicts = np.zeros(len(test_data))   # one slot per test row

In [None]:
# Categorical feature columns fed to the dataset and the model.
# NOTE(review): this repeats the cat_cols list assigned earlier in the notebook with the
# same names in the same order - consider keeping a single definition.
cat_cols = ['task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id',
       'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id', 'tags',
       'app_first_class', 'app_second_class', 'age', 'city', 'city_rank',
       'device_name', 'device_size', 'career', 'gender', 'net_type',
       'residence', 'his_app_size', 'his_on_shelf_time', 'app_score',
       'emui_dev', 'list_time', 'device_price', 'up_life_duration',
       'up_membership_grade', 'membership_life_duration', 'consume_purchase',
        'communication_avgonline_30d', 'indu_name']

In [None]:
# Vocabulary size of every categorical column, measured over train and test together.
df = pd.concat([train_data, test_data]).reset_index(drop=True)
dict_id_word = {}
for col in cat_cols:
    # An embedding table needs a row for the largest code, not merely one row per distinct
    # code; the two agree only while the codes are a gap-free 0..n-1 range.
    dict_id_word[col] = {'lenth': max(df[col].nunique(), int(df[col].max()) + 1)}

In [None]:

for fold,(train, test) in enumerate(floder.split(train_data,train_data.label)):
    # Datasets and loaders for this fold; `train` / `test` are the fold's row positions.
    train_dataset = CTR_dataset(train_data.iloc[train].reset_index(drop=True),cat_cols,train_his_advid[train],train_his_adv_prim_id[train],1,True)
    valid_dataset = CTR_dataset(train_data.iloc[test].reset_index(drop=True),cat_cols,train_his_advid[test],train_his_adv_prim_id[test],1,True)
    train_dataloader = DataLoader(train_dataset , batch_size = Batch_size,num_workers=4, shuffle=True)
    valid_dataloader = DataLoader(valid_dataset , batch_size = Batch_size,num_workers=4, shuffle=False)

    # A fresh model per fold.
    model = temo_model(dict_id_word,cat_cols,16,1024,128, dnn_hidden_units=(1024, 512),dnn_dropout=0.3,use_bn=True)
    model.zero_grad()
    model = nn.DataParallel(model)
    model.to(device)

    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.01,weight_decay=0.1)

    best_score = -1
    # Loop over NUM_EPOCH so the epoch count matches what the log line below reports.
    for epoch in range(NUM_EPOCH):
        torch.cuda.empty_cache()
        start_time = time.time()
        train_loss, train_auc = train_fn(model  ,train_dataloader ,len(train_dataset), Batch_size,optimizer,'no',0.1)
        valid_loss, val_auc, predict = val_fn(model ,valid_dataloader ,len(valid_dataset), Batch_size)

        elapsed_time = time.time() - start_time
        print('epoch{}/{} , train_loss: {}  ,train_auc: {} \n \n val_loss :{} ,  val_auc:{} ,time: {}\n'.format(epoch+1,NUM_EPOCH,train_loss,train_auc,valid_loss,val_auc,elapsed_time))
        if val_auc > best_score:
            # Keep only the checkpoint with the best validation AUC of this fold.
            # The out-of-fold logits in `predict` are not written into `predicts`.
            best_score = val_auc
            best_param_score = model.state_dict()
            torch.save(best_param_score,'temp__best_param_score_{}'.format(fold+1))

    # Free the fold's data before the next fold allocates its own.
    del train_dataset
    del valid_dataset
    del train_dataloader
    del valid_dataloader
    gc.collect()



In [None]:
#--------------------------------------------预测代码-----------------------------------------------#

In [None]:
# Inference dataset over the test rows; is_train=False, so items carry no label or weight.
test_dataset = CTR_dataset(test_data,cat_cols,test_his_advid,test_his_adv_prim_id,1,False)

In [None]:
# shuffle=False keeps predictions in the same order as the rows of test_dataset.
test_dataloader = DataLoader(test_dataset , batch_size = Batch_size,num_workers=4, shuffle=False)


In [None]:


def load_fold_model(fold):
    """Rebuild the network and load the best checkpoint saved for fold number ``fold``.

    Args:
        fold: 1-based fold number used in the checkpoint file name.

    Returns:
        The ``nn.DataParallel``-wrapped model on ``device`` with the weights loaded.
    """
    fold_model = temo_model(dict_id_word,cat_cols,16,1024,128, dnn_hidden_units=(1024, 512),dnn_dropout=0.3,use_bn=True)
    fold_model.zero_grad()
    # Wrap before loading: the checkpoints were saved from a DataParallel model,
    # so their keys carry the wrapper's prefix.
    fold_model = nn.DataParallel(fold_model)
    fold_model.to(device)
    # map_location lets checkpoints load on a machine whose devices differ from training.
    state = torch.load('temp__best_param_score_{}'.format(fold), map_location=device)
    fold_model.load_state_dict(state)
    return fold_model


# One model per fold, built and loaded in fold order.
model1, model2, model3, model4, model5 = (load_fold_model(fold) for fold in range(1, 6))

In [None]:
# Score the test set with each fold's model; test_fn returns raw logits.
test_predict1 = test_fn(model1 ,test_dataloader ,len(test_dataset), Batch_size)
# Number of rows with a positive logit (sanity check for the first two folds).
print((test_predict1>0).sum())
test_predict2 = test_fn(model2 ,test_dataloader ,len(test_dataset), Batch_size)
print((test_predict2>0).sum())

test_predict3 = test_fn(model3 ,test_dataloader ,len(test_dataset), Batch_size)
test_predict4 = test_fn(model4 ,test_dataloader ,len(test_dataset), Batch_size)
test_predict5 = test_fn(model5 ,test_dataloader ,len(test_dataset), Batch_size)

In [None]:
# Drop the first 1,000,000 predictions of every fold.
# NOTE(review): presumably those rows are test set A (concatenated before test set B) and
# only B is submitted - confirm that test A has exactly 1,000,000 rows with pt_d == 8.
# NOTE(review): each variable is overwritten with its own slice, so running this cell
# twice removes another 1,000,000 rows.
test_predict1 = test_predict1[1000000:]
test_predict2 = test_predict2[1000000:]
test_predict3 = test_predict3[1000000:]
test_predict4 = test_predict4[1000000:]
test_predict5 = test_predict5[1000000:]

In [None]:
# Ensemble: unweighted mean of the five folds' logits.
test_predict = (test_predict1+test_predict2 +test_predict3 +test_predict4 +test_predict5)/5

In [None]:
# Convert the averaged logits to probabilities, then to a NumPy array.
test_predict_sig = torch.sigmoid(test_predict)
test_predict_sig = test_predict_sig.detach().cpu().squeeze().numpy()

In [None]:
# Number of rows whose predicted probability exceeds 0.5.
(test_predict_sig>0.5).sum()

In [None]:
# Build the submission table: reset_index() supplies a 0-based row number as the id column.
stack_test = pd.DataFrame(test_predict_sig).reset_index()
stack_test.columns=['id','probability']
# Shift ids to start at 1.
stack_test['id']+=1
stack_test.to_csv('temp_submission.csv',index=False)