In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook
import time
import gc
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
import torch.nn.functional as F
import sklearn
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import roc_curve 
import time
import os
import itertools
import random
import matplotlib.pyplot as plt
from collections import OrderedDict
from scipy.special import erfinv
from collections import OrderedDict
from math import sqrt
import numpy as np
import category_encoders as ce

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy and PyTorch
            (CPU generator and all CUDA devices).
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Seed the remaining GPUs as well when more than one is visible.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning can select different cuDNN kernels from run to run.
    torch.backends.cudnn.benchmark = False
seed_everything()

In [None]:
# Placeholder token; not referenced by any other visible cell.
MASK = 'MASK'
# Two groups of id-like columns; neither list is used in the visible cells
# (presumably left over from a masking experiment — TODO confirm).
miss_col1 = ['task_id', 'adv_id','uid']
miss_col2 = ['adv_prim_id','dev_id' , 'device_size']

In [None]:
def reduce_mem(df):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Integer columns are cast to the narrowest of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive). Float columns
    are cast to float16 or float32 when the value range fits, otherwise to
    float64; note that this checks the range only, not the precision.
    Columns that are not numeric, or whose min/max is null, are left alone.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.

    Returns:
        The same DataFrame object, after printing a memory usage summary.
    """
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isnull(c_min) or pd.isnull(c_max):
            continue
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                # Inclusive comparison: a value equal to the dtype limit still fits.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                bounds = np.finfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Infinite or very large values: keep full double precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a frame that reports zero bytes.
    reduction = 100*(start_mem-end_mem)/start_mem if start_mem else 0.0
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,
                                                                                                           reduction,
                                                                                                           (time.time()-starttime)/60))
    return df

In [None]:

def lower_sample_data_by_sample(df,percent=1,rs=42):
    """Down-sample the negative class (``label == 0``) of ``df``.

    Args:
        df: DataFrame with a binary ``label`` column.
        percent: Number of negatives to keep per positive row.
        rs: Random state passed to ``DataFrame.sample``.

    Returns:
        A new DataFrame holding the sampled negatives followed by all
        positives. The original index values are preserved.
    """
    most_data = df[df['label'] == 0]      # majority class
    minority_data = df[df['label'] == 1]  # minority class
    # Sampling without replacement cannot draw more rows than exist, so cap the
    # request at the number of available negatives instead of raising.
    n_keep = min(int(percent*len(minority_data)), len(most_data))
    lower_data = most_data.sample(n=n_keep,replace=False,random_state=rs,axis=0)
    return (pd.concat([lower_data,minority_data]))

In [None]:
#------------------------------ Data preprocessing ------------------------------#

In [None]:
# Column names of the data set; presumably the raw feature schema of the CSV
# files — TODO confirm. The list is not referenced by the visible cells.
columns = [ 'uid', 'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id',
       'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id', 'tags',
       'app_first_class', 'app_second_class', 'age', 'city', 'city_rank',
       'device_name', 'device_size', 'career', 'gender', 'net_type',
       'residence', 'his_app_size', 'his_on_shelf_time', 'app_score',
       'emui_dev', 'list_time', 'device_price', 'up_life_duration',
       'up_membership_grade', 'membership_life_duration', 'consume_purchase',
       'communication_onlinerate', 'communication_avgonline_30d', 'indu_name',
       'pt_d']

In [None]:
%%time
# Load the '|'-separated training file and immediately downcast its numeric columns.
train_df = reduce_mem(pd.read_csv('train_data.csv',sep='|'))



In [None]:
# Columns treated as categorical; each one receives a CatBoost target encoding below.
cate_cols = ['city_rank','creat_type_cd','dev_id','device_size','gender','indu_name','inter_type_cd','residence','slot_id','net_type','task_id','adv_id','adv_prim_id','age','app_first_class','app_second_class','career','city','consume_purchase','uid','tags']
# User-side and ad-side column groups; neither list is used by the visible cells.
user_feature1 = ['age','city_rank','career','gender','consume_purchase']
adv_feature = ['creat_type_cd','app_second_class','app_first_class','tags']

In [None]:

# Load the '|'-separated test file (kept at its original dtypes, no downcast here).
test_df = pd.read_csv('test_data_B.csv',sep='|')

In [None]:
# Make the index a clean 0..n-1 range so the positional fold indices used
# below can also be used as labels with `.loc`.
train_df.reset_index(drop=True,inplace=True)

In [None]:
# One test frame per CV fold: the loaded frame itself plus four independent copies.
test_df1, test_df2, test_df3, test_df4 = (test_df.copy() for _ in range(4))
test_df_list = [test_df, test_df1, test_df2, test_df3, test_df4]

In [None]:
# Out-of-fold CatBoost target encoding of every categorical column.
# For each of the 5 stratified folds an encoder is fitted on the training part
# and applied to (a) the held-out rows of train_df and (b) the complete test
# set, whose encoded copy for this fold lives in test_df_list[kfold].
for kfold ,(tr_idx, oof_idx) in tqdm_notebook(enumerate(StratifiedKFold(n_splits=5, random_state=2020, shuffle=True).split(train_df, train_df['label']))):
  for k,col in enumerate(tqdm_notebook(cate_cols)):
      # Fit on the in-fold rows only, so held-out rows never see their own label.
      target_enc = ce.CatBoostEncoder(cols=col)
      target_enc.fit(train_df.iloc[tr_idx][col], train_df.iloc[tr_idx]['label'])
      # oof_idx holds positions; using them with .loc relies on the index
      # having been reset to 0..n-1 in an earlier cell.
      train_df.loc[oof_idx,col + '_cb']  = (target_enc.transform(train_df.iloc[oof_idx][col])).values
      # Record the fold each row was held out in (re-assigned for every column).
      train_df.loc[oof_idx,'K'] = kfold
      # Append this fold's encoding of the test column as '<col>_cb'.
      test_df_list[kfold] = test_df_list[kfold].join(target_enc.transform(test_df[col]).add_suffix('_cb'))
      
      # Downcast periodically (every 7th column) to keep memory in check while
      # the new columns accumulate.
      if k%7==0:
        train_df = reduce_mem(train_df)

In [None]:
# Names of the target-encoded columns created by the fold loop.
cb_feature = ['{}_cb'.format(name) for name in cate_cols]

In [None]:
# Notebook display: inspect the frame after target encoding.
train_df

In [None]:
# Final downcast after all '<col>_cb' columns have been added.
train_df = reduce_mem(train_df)

In [None]:
# Notebook display: inspect the frame after downcasting.
train_df

In [None]:
def cdfinv(y):
    """Return the standard normal quantile of probability ``y``.

    Simplified formula, equivalent to the original NormalCDFInverse and more
    accurate. For the derivation see
    https://www.cnblogs.com/htj10/p/8621771.html
    """
    inverse_erf = erfinv(2 * y - 1)
    return sqrt(2) * inverse_erf
# Quantile helper: for the standard normal distribution the 0.5 quantile is 0
# and the 0 and 1 quantiles are infinite.
def rankGaussTrafo(dataIn, scale_factor=0.7):
    """Map the values of ``dataIn`` to a roughly Gaussian shape by rank.

    Every distinct value is replaced by the normal quantile of its cumulative
    frequency, scaled and then centred with the frequency-weighted mean.
    Columns with one distinct value map to 0.0, columns with two distinct
    values map to 0.0 / 1.0 (smaller / larger value).

    Args:
        dataIn: Indexable sequence of hashable, sortable values supporting
            ``copy()`` (the notebook passes a plain list).
        scale_factor: Multiplier applied to the quantiles. The default 0.7
            reproduces the original C++ result and shrinks the distribution;
            1.0 is suggested to keep the standard deviation at 1.

    Returns:
        A copy of ``dataIn`` with each element replaced by its mapped value.
        An empty input is returned as an empty copy.
    """
    # Nothing to rank; also avoids dividing by N == 0 below.
    if len(dataIn) == 0:
        return dataIn.copy()

    hist = dict()       # occurrence count of each distinct value
    for value in dataIn:
        hist[value] = hist.get(value, 0) + 1

    hist = OrderedDict(sorted(hist.items(), key=lambda d: d[0]))    # ordered by value
    keys = list(hist.keys())

    trafoMap = dict()
    if len(keys) == 1:      # unary column: trafo all to 0
        trafoMap[keys[0]] = 0.0
    elif len(keys) == 2:    # binary column: trafo to 0 / 1
        trafoMap[keys[0]] = 0.0
        trafoMap[keys[1]] = 1.0
    else:                   # more than 2 unique values
        mean = 0.0
        cnt = 0
        N = len(dataIn)

        for key, value in hist.items():
            # Share of elements strictly smaller than key: in [0, 1) and
            # increasing with key, like a distribution function.
            rankV = cnt * 1.0 / N

            # cdfinv(0) is -inf and cdfinv(1) is +inf, so squeeze the rank
            # into [0.001, 0.999]; the quantile then stays within about +-3.09.
            rankV = rankV * 0.998 + 1e-3

            # Treat the rank as a CDF value and take the N(0, 1) quantile.
            rankV = cdfinv(rankV) * scale_factor

            mean += value * rankV   # weight by the number of occurrences
            trafoMap[key] = rankV
            cnt += value            # running number of elements seen

        mean /= N

        for key in trafoMap.keys():
            trafoMap[key] -= mean   # centre so the weighted mean is 0

    dataOut = dataIn.copy()
    for i in range(len(dataIn)):    # write the mapped values into the copy
        dataOut[i] = trafoMap[dataIn[i]]
    return dataOut

In [None]:
# Keep 3 negatives per positive (random_state=0) and renumber the rows.
train_df = lower_sample_data_by_sample(train_df , 3,0).reset_index(drop=True)


In [None]:
# Tag every per-fold test frame with the number of the fold that encoded it.
for fold_id, fold_frame in enumerate(test_df_list):
    fold_frame['K'] = fold_id


In [None]:
# Stack the training rows and the five per-fold test copies into one frame so
# that the feature engineering below is applied to all of them together.
frames_to_stack = [train_df] + list(test_df_list)
df = pd.concat(frames_to_stack, axis=0)
# Submission ids, taken from the first test copy.
first_fold_ids = test_df_list[0]['id']
test_id = first_fold_ids.copy().reset_index(drop=True)

In [None]:
# The training rows now live in `df`; drop the separate frame to free memory.
del train_df
# del test_df
gc.collect()

In [None]:
%%time
# Replace infinities and missing values with 0 across the whole frame. This
# also zero-fills columns that exist only in train or only in test (for
# example 'label' on the test rows).
df=df.replace([np.inf, -np.inf],0)
df=df.fillna(0)

In [None]:
# Notebook display: rows with pt_d == 9 (later used as the test set) of fold copy 0.
df[(df.pt_d==9) & ((df.K)==0)]

In [None]:
# Raw feature columns grouped by entity (user, ad, phone, app). The
# concatenation of the four lists is what gets integer-encoded and counted below.
user_col = ['uid','age','city','city_rank','career','gender','residence','communication_avgonline_30d','consume_purchase','membership_life_duration','up_membership_grade','up_life_duration']
ad_col = ['task_id','adv_id','creat_type_cd','adv_prim_id','dev_id','slot_id','spread_app_id','tags','app_first_class','app_second_class','indu_name','inter_type_cd']
phone_col = ['device_name','device_size','net_type','emui_dev','device_price']
app_col = ['his_app_size','his_on_shelf_time','app_score','list_time']

In [None]:
# Encode the categorical features.
cate_cols = ['city_rank','creat_type_cd','dev_id','device_size','gender','indu_name','inter_type_cd','residence','slot_id','net_type','task_id','adv_id','adv_prim_id','age','app_first_class','app_second_class','career','city','consume_purchase','uid','tags']
# Per feature: raw value -> integer code.
cate_dict = {}
# Rows that contribute to the occurrence counts: every row with pt_d != 9 plus
# the K == 0 copy of the pt_d == 9 rows, so the five fold copies of those rows
# are counted once. The parentheses around `df.K == 0` are required: `&` binds
# tighter than `==`, so without them the expression compares
# `(pt_d == 9) & K` with 0 instead of testing K. Neither pt_d nor K is modified
# by the loop, so the mask is computed once.
count_rows = ((df.pt_d != 9) | ((df.pt_d == 9) & (df.K == 0))).to_numpy()
for f in tqdm_notebook(user_col+ad_col+phone_col+app_col):
    # Integer-encode the feature in order of first appearance.
    map_dict = dict(zip(df[f].unique(), range(df[f].nunique())))
    cate_dict[f] = map_dict
    df[f] = df[f].map(map_dict).fillna(-1).astype('int32')
    # Frequency feature: how often each code occurs among the counted rows.
    df[f + '_count'] = df[f].map(df.loc[count_rows, f].value_counts())
df = reduce_mem(df)

In [None]:
# Columns that are never fed to the model (target, bookkeeping and raw ids).
drop_fea = ['pt_d','label','communication_onlinerate','index','dev_id','id','K']
# Every remaining column of df.
feature= [x for x in df.columns if x not in drop_fea]
print(len(feature))
print(feature)

In [None]:
# Sparse inputs are the categorical columns; everything else that is not
# dropped is a dense input (tree-model features could be added to this list).
sparse_features = cate_cols
not_dense = drop_fea + cate_cols
dense_features = [name for name in df.columns if name not in not_dense]
print('sparse_feature: {}'.format(sparse_features))
print('dense_feature: {}'.format(dense_features))

In [None]:
# Apply the RankGauss transform to every dense column.
for dense_col in tqdm_notebook(dense_features):
  raw_values = df[dense_col].values.tolist()
  df[dense_col] = np.array(rankGaussTrafo(raw_values))

In [None]:
# Vocabulary size of every sparse feature, stored under the key 'lenth'
# (the spelling the model constructor looks up).
dict_id_word = {
  name: {'lenth': len(set(df[name].values))}
  for name in sparse_features
}

In [None]:
# Downcast again now that the dense columns hold transformed floats.
df = reduce_mem(df)

In [None]:
# Split the combined frame back apart: pt_d == 9 is the test set, pt_d < 8 the
# training set. Rows with pt_d == 8, if any, end up in neither frame.
# reset_index() without drop=True adds an 'index' column, which is listed in drop_fea.
test_df = df[df["pt_d"]==9].copy().reset_index()
train_df = df[df["pt_d"]<8].reset_index()
del df
gc.collect()

In [None]:
# Downcast the training split (this also covers the new 'index' column).
train_df = reduce_mem(train_df)

In [None]:
# Separate the five per-fold copies of the test set again:
# test_df1 holds K == 0, ..., test_df5 holds K == 4.
test_df1, test_df2, test_df3, test_df4, test_df5 = [
    test_df[test_df.K == fold_id].reset_index(drop=True)
    for fold_id in range(5)
]

In [None]:
#------------------------------ Model training ------------------------------#

In [None]:
class Mish(nn.Module):
    """Mish activation, ``x * tanh(softplus(x))``; prints a notice when built."""

    def __init__(self):
        super(Mish, self).__init__()
        print("Mish activation loaded...")

    def forward(self,x):
        gate = torch.tanh(F.softplus(x))
        return x * gate

In [None]:
class DNN(nn.Module):
    """Stack of gated fully connected layers.

    Each layer applies Linear(d_in, 2 * d_out) -> GLU (halving the width back
    to d_out) -> optional BatchNorm1d -> Mish -> Dropout.

    Args:
        inputs_dim: Width of the input vector.
        hidden_units: Output width of each layer; must not be empty.
        dropout_rate: Dropout probability shared by all layers.
        use_bn: Whether to apply batch normalisation after the GLU.
        init_std: Standard deviation of the normal init of the linear weights.
        dice_dim: Accepted but not used.
        seed: Stored on the instance but not otherwise used.

    Raises:
        ValueError: If ``hidden_units`` is empty.
    """

    def __init__(self, inputs_dim, hidden_units, dropout_rate=0, use_bn=False,
                 init_std=0.0001, dice_dim=3, seed=1024):
        super().__init__()
        self.dropout_rate = dropout_rate
        self.dropout = nn.Dropout(dropout_rate)
        self.seed = seed
        self.use_bn = use_bn
        if len(hidden_units) == 0:
            raise ValueError("hidden_units is empty!!")

        layer_dims = [inputs_dim, *hidden_units]
        n_layers = len(layer_dims) - 1
        self.Glu = nn.ModuleList([nn.GLU() for _ in range(n_layers)])
        # Twice the target width: the GLU consumes one half as the gate.
        self.linears = nn.ModuleList(
            [nn.Linear(d_in, 2 * d_out) for d_in, d_out in zip(layer_dims[:-1], layer_dims[1:])])

        if self.use_bn:
            self.bn = nn.ModuleList([nn.BatchNorm1d(width) for width in layer_dims[1:]])
        self.activation_layers = nn.ModuleList([Mish() for _ in range(n_layers)])

        # Biases keep their default initialisation; only weights are re-drawn.
        for linear in self.linears:
            nn.init.normal_(linear.weight, mean=0, std=init_std)

    def forward(self, inputs):
        hidden = inputs
        for layer_no, linear in enumerate(self.linears):
            hidden = self.Glu[layer_no](linear(hidden))
            if self.use_bn:
                hidden = self.bn[layer_no](hidden)
            hidden = self.dropout(self.activation_layers[layer_no](hidden))
        return hidden



In [None]:
class CIN(nn.Module):
    """Compressed Interaction Network block used by the xDeepFM model below.

    Every layer crosses the previous hidden feature maps with the input fields
    (element-wise along the embedding axis) and compresses the result with a
    kernel-size-1 Conv1d. The feature maps collected from all layers are
    concatenated and summed over the embedding axis.

    Args:
        field_size: Number of input fields (second dimension of the input).
        layer_size: Number of Conv1d output maps per layer; must not be empty.
        activation: Accepted but not used — the activation is always Mish.
        split_half: If True, every layer except the last splits its output in
            two halves: one is passed to the next layer, the other goes to the
            final result.
        l2_reg: Stored on the instance only.
        seed: Stored on the instance only.
        device: Device the module is moved to on construction.

    Raises:
        ValueError: If ``layer_size`` is empty, or if ``split_half`` is True
            and a non-final layer size is odd.
    """

    def __init__(self, field_size, layer_size=(128, 128), activation='relu', split_half=True, l2_reg=1e-5, seed=1024,
                 device='cpu'):
        super(CIN, self).__init__()
        if len(layer_size) == 0:
            raise ValueError(
                "layer_size must be a list(tuple) of length greater than 1")

        self.layer_size = layer_size
        # field_nums[k] is the number of feature maps fed into layer k.
        self.field_nums = [field_size]
        self.split_half = split_half
        # self.activation = nn.ReLU()
        self.activation = Mish()

        self.l2_reg = l2_reg
        self.seed = seed

        self.conv1ds = nn.ModuleList()
        for i, size in enumerate(self.layer_size):
            # Input channels: (maps of previous layer) x (input fields).
            self.conv1ds.append(
                nn.Conv1d(self.field_nums[-1] * self.field_nums[0], size, 1))

            if self.split_half:
                if i != len(self.layer_size) - 1 and size % 2 > 0:
                    raise ValueError(
                        "layer_size must be even number except for the last layer when split_half=True")

                # Only half of the maps are forwarded to the next layer.
                self.field_nums.append(size // 2)
            else:
                self.field_nums.append(size)

        #         for tensor in self.conv1ds:
        #             nn.init.normal_(tensor.weight, mean=0, std=init_std)
        self.to(device)

    def forward(self, inputs):
        # Worked example with inputs of shape 6 x 3 x 8 (batch, fields, dim)
        # and two layers of 128 maps with split_half=True:
        #   pairwise element-wise products:      6 x 9 x 8
        #   conv: 6 x 128 x 8, split into two 6 x 64 x 8 halves, one is collected
        #   pairwise products with the inputs:   6 x (3*64) x 8
        #   conv: 6 x 128 x 8 (last layer, collected as a whole)
        #   result = torch.cat(collected, dim=1), then summed over the last axis
        if len(inputs.shape) != 3:
            raise ValueError(
                "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (len(inputs.shape)))
        batch_size = inputs.shape[0]
        dim = inputs.shape[-1]
        hidden_nn_layers = [inputs]
        final_result = []

        for i, size in enumerate(self.layer_size):
            # x^(k-1) * x^0
            x = torch.einsum(
                'bhd,bmd->bhmd', hidden_nn_layers[-1], hidden_nn_layers[0]) # product of every (hidden map, input field) pair, element-wise along d
            # x.shape = (batch_size , hi * m, dim)
            x = x.reshape(
                batch_size, hidden_nn_layers[-1].shape[1] * hidden_nn_layers[0].shape[1], dim)
            # x.shape = (batch_size , hi, dim)
            x = self.conv1ds[i](x)

            # self.activation is always a Mish module (set in __init__), so the
            # else branch is the one taken.
            if self.activation is None or self.activation == 'linear':
                curr_out = x
            else:
                curr_out = self.activation(x)

            if self.split_half:
                if i != len(self.layer_size) - 1:
                    next_hidden, direct_connect = torch.split(
                        curr_out, 2 * [size // 2], 1)
                else:
                    direct_connect = curr_out
                    # Placeholder: no layer follows, so this value is never read.
                    next_hidden = 0
            else:
                direct_connect = curr_out
                next_hidden = curr_out

            final_result.append(direct_connect)
            hidden_nn_layers.append(next_hidden)

        # Concatenate the collected maps and sum-pool over the embedding axis:
        # the result has shape (batch_size, number of collected maps).
        result = torch.cat(final_result, dim=1)
        result = torch.sum(result, -1)

        return result

In [None]:
class xDeepFM(nn.Module):
    """xDeepFM-style CTR model: field embeddings feeding a CIN and a DNN branch.

    Every sparse feature gets its own embedding of width ``int(8 * ln(vocab))``
    which a per-feature linear layer projects to ``emb_dim``. The stacked field
    embeddings are fed to the CIN; flattened and concatenated with the dense
    features they are fed to the DNN. The final logit is the sum of the logits
    of the enabled branches (there is no separate linear term).

    An auxiliary head maps each field embedding to a mean and a scale and
    returns ``mean + eps * scale`` with ``eps ~ N(0, 1)`` as second output.

    Args:
        dict_id_word: ``{feature: {'lenth': vocabulary size}}``.
        cate_cols: Ordered names of the sparse features; also the keys read
            from the batch dict in ``forward``.
        emb_dim: Common embedding width after projection.
        filed_size: Number of sparse fields (should equal ``len(cate_cols)``).
        inputs_dim: Input width of the DNN
            (``filed_size * emb_dim`` plus the number of dense features).
        dnn_hidden_units: Layer widths of the DNN; empty disables the branch.
        cin_layer_size: Layer sizes of the CIN; empty disables the branch.
        cin_split_half, cin_activation, l2_reg_cin, seed: Passed to ``CIN``.
        dnn_dropout, use_bn: Passed to ``DNN``.
        l2_reg_linear, l2_reg_embedding, l2_reg_dnn, init_std, dnn_activation,
        task: Accepted for signature compatibility; not used.
    """

    def __init__(self, dict_id_word,cate_cols,emb_dim,filed_size,inputs_dim, dnn_hidden_units=(512, 128),
                 cin_layer_size=(256, 128,), cin_split_half=True, cin_activation='relu', l2_reg_linear=0.00001,
                 l2_reg_embedding=0.00001, l2_reg_dnn=0, l2_reg_cin=0, init_std=0.0001, seed=1024, dnn_dropout=0,
                 dnn_activation='relu', use_bn=True, task='binary' ):

        super().__init__()
        self.filed_size = filed_size
        self.emb_dim = emb_dim
        # Keep the feature order on the instance so forward() does not depend
        # on a notebook-level global of the same name.
        self.cate_cols = list(cate_cols)
        self.embed_list = nn.ModuleList()
        self.fc_share = nn.Linear(emb_dim,emb_dim)
        self.fc_mean = nn.Linear(emb_dim,1)
        self.fc_std =nn.Linear(emb_dim,1)
        self.linearlist = nn.ModuleList()
        for col in self.cate_cols:
            num_word = dict_id_word[col]['lenth']
            # Embedding width grows with the logarithm of the vocabulary size.
            raw_dim = int(8*torch.log(torch.Tensor([num_word])))
            self.embed_list.append(nn.Embedding(num_word, raw_dim))
            self.linearlist.append(nn.Linear(raw_dim, emb_dim))
        # All layers keep their default initialisation. (An earlier loop over a
        # freshly created, empty nn.ModuleList initialised nothing and is gone.)

        self.dnn_hidden_units = dnn_hidden_units
        self.use_dnn =  len(dnn_hidden_units) > 0
        if self.use_dnn:
            self.dnn = DNN(inputs_dim, dnn_hidden_units, dnn_dropout, use_bn)
            self.dnn_linear = nn.Linear(dnn_hidden_units[-1], 1, bias=False)

        self.cin_layer_size = cin_layer_size
        self.use_cin = len(self.cin_layer_size) > 0
        if self.use_cin:
            field_num = filed_size
            if cin_split_half == True:
                self.featuremap_num = sum(
                    cin_layer_size[:-1]) // 2 + cin_layer_size[-1]
            else:
                self.featuremap_num = sum(cin_layer_size)
            self.cin = CIN(field_num, cin_layer_size,
                           cin_activation, cin_split_half, l2_reg_cin, seed)
            self.cin_linear = nn.Linear(self.featuremap_num, 1, bias=False)

        # Not used in forward(); kept so that existing checkpoints, which
        # contain 'out.weight', still load. Skipped when there is no DNN
        # because dnn_hidden_units[-1] does not exist then.
        if self.use_dnn:
            self.out = nn.Linear(dnn_hidden_units[-1],1,bias=False)

    def forward(self, x):
        """Compute the CTR logit for a batch.

        Args:
            x: Dict with one integer index tensor per name in ``cate_cols`` and
                a float tensor under ``'dense'``.

        Returns:
            Tuple ``(final_logit, temp_target)``: the summed branch logits of
            shape (batch, 1) and the auxiliary samples of shape
            (batch, filed_size).

        Raises:
            NotImplementedError: If both the DNN and the CIN branch are disabled.
        """
        # Work on whatever device the model currently lives on.
        device = next(self.parameters()).device
        dense_fea = x['dense'].to(device)

        batch_size = len(x[self.cate_cols[0]])
        embeding = torch.zeros(batch_size, self.filed_size, self.emb_dim, device=device)
        for i, col in enumerate(self.cate_cols):
            embeding[:,i,:] = self.linearlist[i](self.embed_list[i](x[col].to(device)))

        # Auxiliary head: per-field mean and scale, sampled with Gaussian noise.
        temp_double = self.fc_share(embeding)
        temp_men = self.fc_mean(temp_double).squeeze(2)
        temp_std = self.fc_std(temp_double).squeeze(2)
        eps = torch.randn_like(temp_std)
        temp_target = temp_men + eps * temp_std

        if self.use_cin:
            cin_output = self.cin(embeding)
            cin_logit = self.cin_linear(cin_output)
        if self.use_dnn:
            dnn_input = torch.cat((embeding.contiguous().view(batch_size,-1) , dense_fea),-1)
            dnn_output = self.dnn(dnn_input)
            dnn_logit = self.dnn_linear(dnn_output)

        # The model has no linear term, so the logit is the sum of whichever
        # branches exist.
        if self.use_dnn and self.use_cin:
            final_logit = dnn_logit + cin_logit
        elif self.use_dnn:
            final_logit = dnn_logit
        elif self.use_cin:
            final_logit = cin_logit
        else:
            raise NotImplementedError(
                "xDeepFM needs at least one of the DNN and CIN branches")

        return final_logit,temp_target

In [None]:
def my_loss(input,target,weight=None):
  """Binary cross-entropy on logits, optionally weighted per sample.

  Args:
    input: Logits; the sigmoid is applied here.
    target: Targets in [0, 1], same shape as ``input``.
    weight: Optional per-sample weights, same shape as ``input``.

  Returns:
    Scalar tensor. With ``weight``: the sum over samples of
    ``weight * BCE``. Without: the mean BCE over samples.
  """
  prob = torch.sigmoid(input)
  if weight is not None:
    # The weights have to be applied to the unreduced per-sample losses;
    # multiplying the already averaged scalar by the weight vector does not
    # weight individual samples.
    return F.binary_cross_entropy(prob, target, weight=weight, reduction='sum')
  return F.binary_cross_entropy(prob, target)

In [None]:
class CTR_dataset(Dataset):
  """Dataset yielding one dict per row for the CTR model.

  Each item holds one int64 index tensor per sparse feature, a float32 vector
  under ``'dense'`` and, for training data, ``'label'`` and ``'weight'``.

  Args:
    df: Source DataFrame; needs a ``label`` column when ``is_train`` is True.
    cate_cols: Names of the sparse (categorical) columns.
    dense_features: Names of the dense columns.
    weight_T: Sample weight of positive rows; negatives get weight 1.
    is_train: Whether labels and weights are attached to the items.
  """
  def __init__(self,df,cate_cols,dense_features,weight_T,is_train=False):
    self.weight_T =weight_T
    self.cate_cols = cate_cols
    if is_train:
      self.label = torch.tensor(df.label.values,dtype = torch.float32)
      self.weight = torch.tensor(self.get_weight(self.label) , dtype = torch.float32)
    else:
      self.label = None
      self.weight = None
    self.col_dict = {}
    for col in cate_cols:
      self.col_dict[col] = torch.tensor(df[col].values, dtype = torch.int64)
    self.dens_fea = torch.tensor(df[dense_features].values ,dtype = torch.float32)
  def __len__(self):
    # The dense matrix always has one row per sample, whichever sparse
    # columns were requested.
    return len(self.dens_fea)
  def __getitem__(self,item):
    output  = {}
    if self.label is not None:
      output= {'label': self.label[item],
            'weight':self.weight[item],
            }
    for col in self.cate_cols:
      output[col] =  self.col_dict[col][item]
    output['dense'] = self.dens_fea[item]
    return output
  def get_weight(self,label):
    """Return a list with ``weight_T`` for labels equal to 1 and 1 otherwise."""
    # One vectorised comparison instead of one tensor comparison per row.
    is_positive = (torch.as_tensor(label) == 1).tolist()
    return [self.weight_T if flag else 1 for flag in is_positive]

In [None]:
class FocalLoss(nn.Module):
    """Binary focal loss: ``alpha * (1 - pt) ** gamma * BCE`` with ``pt = exp(-BCE)``.

    Args:
        alpha: Constant factor applied to every sample.
        gamma: Focusing exponent; larger values down-weight easy samples more.
        logits: If True the inputs are logits, otherwise probabilities.
        reduce: If True return the mean over all elements, otherwise the
            element-wise loss.
    """
    def __init__(self, alpha=0.1, gamma=2, logits=True, reduce=True):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.logits = logits
        self.reduce = reduce

    def forward(self, inputs, targets):
        # reduction='none' replaces the deprecated reduce=False argument.
        if self.logits:
            BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        else:
            BCE_loss = F.binary_cross_entropy(inputs, targets, reduction='none')
        # exp(-BCE) is the probability the model assigns to the true class.
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss

        if self.reduce:
            return torch.mean(F_loss)
        else:
            return F_loss

In [None]:
class ScheduledOptimn():
    """Optimizer wrapper that sets the learning rate before every step.

    The rate is ``init_lr * min(step ** -0.5, step * n_warmup_steps ** -1.5)``:
    it rises linearly during warm-up and then decays with the inverse square
    root of the step number.
    """
    def __init__(self,optimizer,init_lr,n_warmup_steps):
        self.optimizer, self.init_lr = optimizer, init_lr
        self.n_warmup_steps = n_warmup_steps
        self.current_step = 0

    def zero_grad(self):
        """Clear the gradients of the wrapped optimizer."""
        self.optimizer.zero_grad()

    def step_and_update_lr(self):
        """Advance the schedule by one step, then take an optimizer step."""
        self._update_learning_rate()
        self.optimizer.step()

    def _update_learning_rate(self):
        # Count the step first so that the very first rate uses step == 1.
        self.current_step += 1
        new_lr = self.init_lr * self._get_lr_scale()
        for group in self.optimizer.param_groups:
            group['lr'] = new_lr

    def _get_lr_scale(self):
        decay = np.power(self.current_step, -0.5)
        warmup = np.power(self.n_warmup_steps, -1.5) * self.current_step
        return min(decay, warmup)



In [None]:
class EMA():
    """Exponential moving average of a model's trainable parameters.

    Usage: ``register()`` once, ``update()`` after every optimizer step,
    ``apply_shadow()`` to swap the averaged weights in and ``restore()`` to
    swap the live weights back.
    """
    def __init__(self, model, decay):
        self.model = model
        self.decay = decay
        self.shadow = {}
        self.backup = {}

    def _trainable(self):
        """Return (name, parameter) pairs of the parameters that require grad."""
        return [(name, param) for name, param in self.model.named_parameters()
                if param.requires_grad]

    def register(self):
        """Start the averages from a copy of the current weights."""
        for name, param in self._trainable():
            self.shadow[name] = param.data.clone()

    def update(self):
        """Blend the current weights into the averages."""
        for name, param in self._trainable():
            assert name in self.shadow
            blended = (1.0 - self.decay) * param.data + self.decay * self.shadow[name]
            self.shadow[name] = blended.clone()

    def apply_shadow(self):
        """Swap the averaged weights into the model, remembering the live ones."""
        for name, param in self._trainable():
            assert name in self.shadow
            self.backup[name] = param.data
            # The parameter now shares storage with the shadow tensor.
            param.data = self.shadow[name]

    def restore(self):
        """Put the live weights back and forget the backup."""
        for name, param in self._trainable():
            assert name in self.backup
            param.data = self.backup[name]
        self.backup = {}

In [None]:
class FGM():
  """Fast Gradient Method: perturb selected parameters along their gradient.

  ``attack`` backs up the selected parameters and adds
  ``epsilon * grad / ||grad||`` to them; ``restore`` undoes it.
  """
  def __init__(self,model):
    self.model=model
    self.backup={}
  def attack(self,epsilon=1,emb_name='emb'):
    """Perturb every trainable parameter whose name contains ``emb_name``.

    Parameters without a gradient, or with a zero or NaN gradient norm, are
    backed up but left unchanged.
    """
    for name,param in self.model.named_parameters():
      if param.requires_grad and emb_name in name:
        self.backup[name]=param.data.clone()
        if param.grad is None:
          continue
        norm=torch.norm(param.grad)
        if norm!= 0 and not torch.isnan(norm):
          r_at=epsilon * param.grad/norm
          param.data.add_(r_at)
  def restore(self,emb_name='word_emb'):
    """Restore every parameter backed up by ``attack`` and clear the backup.

    ``emb_name`` is kept for signature compatibility only. The backup itself
    decides what is restored, so a name filter that differs from the one given
    to ``attack`` (as the two defaults do) can no longer leave the
    perturbation in place.
    """
    for name,param in self.model.named_parameters():
      if name in self.backup:
        param.data=self.backup[name]
    self.backup={}

In [None]:
def train_fn(model,ema,fgm,train_loader,train_shape,batch_size,optimizer,is_smooth,eps):
  """Train ``model`` for one epoch.

  Args:
    model: Network returning ``(logit, auxiliary_output)`` for a batch dict.
    ema: EMA tracker, updated after every optimizer step.
    fgm: Adversarial helper; accepted but not used.
    train_loader: DataLoader yielding batch dicts with a ``'label'`` entry.
    train_shape: Total number of training samples.
    batch_size: Batch size of ``train_loader``; used to place each batch in
      the epoch-level prediction buffer.
    optimizer: Wrapper providing ``zero_grad()`` and ``step_and_update_lr()``.
    is_smooth: ``'double'`` smooths both classes, ``'one'`` only the negative
      class; any other value disables label smoothing.
    eps: Label smoothing strength.

  Returns:
    Tuple ``(mean batch loss, AUC)``. The loss curve is also plotted.

  Note: the AUC only lines up with the samples if every batch except the last
  one holds exactly ``batch_size`` rows.
  """
  model.train()
  train_loss = []
  avg_loss = 0
  n_batches = 0
  pred = torch.zeros((train_shape)).to(device)
  true_label = torch.zeros((train_shape)).to(device)
  criterion = FocalLoss()
  optimizer.zero_grad()

  for idx, batch in tqdm_notebook(enumerate(train_loader),mininterval=2,desc='--Training',leave=False):
    hard_label = batch['label'].view(-1).to(device)
    if is_smooth =='double':
      label = (1-eps)*hard_label+(1-hard_label)*eps/1
    elif is_smooth =='one':
      label = hard_label+(1-hard_label)*eps/1
    else:
      label = hard_label

    output_train,losses_output = model(batch)
    # Detach before storing: otherwise the epoch-level buffer keeps the
    # autograd history of every batch until the epoch ends.
    pred[idx*batch_size:(idx+1)*batch_size] = output_train.view(-1).detach()
    # The AUC needs the original 0/1 labels, not the smoothed ones.
    true_label[idx*batch_size:(idx+1)*batch_size] = hard_label
    loss1 = criterion(output_train.view(-1),label)
    # Broadcast the label to the width of the auxiliary output, whatever the
    # number of fields is.
    loss2 = criterion(losses_output,label.unsqueeze(1).expand_as(losses_output))

    loss = loss1+loss2

    avg_loss += loss.item()
    train_loss.append(loss.item())
    loss.backward()
    optimizer.step_and_update_lr()

    ema.update()
    optimizer.zero_grad()
    n_batches = idx + 1

  auc = sklearn.metrics.roc_auc_score( (true_label).detach().cpu().squeeze().numpy(),torch.sigmoid(pred).detach().cpu().squeeze().numpy())
  plt.plot(train_loss)
  return avg_loss/max(n_batches, 1),auc

In [None]:
def val_fn(model,ema,valid_loader,val_shape,batch_size):
  """Evaluate the EMA weights of ``model`` on the validation data.

  Args:
    model: Network returning ``(logit, auxiliary_output)`` for a batch dict.
    ema: EMA tracker whose averaged weights are swapped in for the evaluation.
    valid_loader: DataLoader yielding batch dicts with ``'label'`` and
      ``'weight'`` entries; must not shuffle.
    val_shape: Total number of validation samples.
    batch_size: Batch size of ``valid_loader``.

  Returns:
    Tuple ``(mean batch loss, AUC, logits)`` where ``logits`` is a NumPy
    array with one raw model output per validation sample.
  """
  model.eval()
  avg_loss = 0
  n_batches = 0
  pred = np.zeros((val_shape))        # probabilities, for the AUC
  predict = np.zeros((val_shape))     # raw logits, returned to the caller
  true_label = np.zeros((val_shape))
  ema.apply_shadow()
  try:
    with torch.no_grad():
      for idx, batch in tqdm_notebook(enumerate(valid_loader),mininterval=2,desc='--valing',leave=False):

        label = batch['label'].to(device).view(-1)
        weight = batch['weight'].to(device).view(-1)
        output_train,_ = model(batch)
        pred[idx*batch_size:(idx+1)*batch_size] = torch.sigmoid(output_train.view(-1)).detach().cpu().squeeze().numpy()
        predict[idx*batch_size:(idx+1)*batch_size] = output_train.view(-1).detach().cpu().squeeze().numpy()
        true_label[idx*batch_size:(idx+1)*batch_size] = label.view(-1).detach().cpu().squeeze().numpy()
        loss = my_loss(output_train.view(-1),label,weight)
        avg_loss += loss.item()
        n_batches = idx + 1
  finally:
    # Always swap the live weights back, even if the evaluation failed.
    ema.restore()
  auc = sklearn.metrics.roc_auc_score( true_label,pred)
  return avg_loss/max(n_batches, 1),auc,predict

In [None]:
# Notebook display: inspect the test frame of the first fold.
test_df1

In [None]:
def test_fn(model,test_loader,test_shape,batch_size):
  """Run ``model`` over ``test_loader`` and return the raw logits.

  Args:
    model: Network returning ``(logit, auxiliary_output)`` for a batch dict.
    test_loader: DataLoader over the test set; must not shuffle.
    test_shape: Total number of test samples.
    batch_size: Batch size of ``test_loader``.

  Returns:
    Tensor of length ``test_shape`` with one logit per sample.
  """
  model.eval()
  logits = torch.zeros((test_shape))
  progress = tqdm_notebook(enumerate(test_loader),mininterval=2,desc='--testing',leave=False)
  with torch.no_grad():
    for step, batch in progress:
      start = step*batch_size
      batch_logits,_ = model(batch)
      logits[start:start+batch_size] = batch_logits.view(-1)
  return logits

In [None]:
# Training configuration.
# `floder` is only referenced by a commented-out line in the training loop;
# the folds actually used come from the 'K' column.
floder = StratifiedKFold(n_splits=5,random_state=42,shuffle=True)
# Out-of-fold prediction buffer; only referenced by a commented-out line below.
predicts = np.zeros(len(train_df))
Batch_size = 7000
# Requires a CUDA-capable GPU.
device = torch.device('cuda')
NUM_EPOCH = 5
# Test prediction buffer; only referenced by a commented-out line below.
test_predicts = np.zeros(len(test_df))

In [None]:
# Train one model per fold. The folds are the ones assigned during target
# encoding (column 'K'): rows with K == k are the validation set of model k.
# for fold,(train, test) in enumerate(floder.split(train_df,train_df.label)):
for k in range(5):

  # Data: both datasets are built with is_train=True so that labels and
  # weights are available; the positive weight is 1.
  train_dataset = CTR_dataset(train_df[train_df.K!=k].reset_index(drop=True),sparse_features,dense_features,1,True)
  valid_dataset = CTR_dataset(train_df[train_df.K==k].reset_index(drop=True),sparse_features,dense_features,1,True)
  train_dataloader = DataLoader(train_dataset , batch_size = Batch_size,num_workers=4, shuffle=True)
  valid_dataloader = DataLoader(valid_dataset , batch_size = Batch_size,num_workers=4, shuffle=False)

  # Model: 21 sparse fields with 32-dim embeddings; inputs_dim presumably adds
  # 33+33 dense features to the 21*32 flattened embeddings — TODO confirm
  # against len(dense_features).
  model  = xDeepFM(dict_id_word,sparse_features,32,filed_size=21,inputs_dim=21*32+33+33, dnn_hidden_units=(512,128),cin_layer_size=(256, 128,),dnn_dropout=0.3,use_bn=True)
  model.zero_grad();
  model.to(device)
  fgm = FGM(model)
  ema=EMA(model,0.99)
  ema.register()
  # Optimizer: the lr given to AdamW is overwritten on every step by the
  # schedule wrapper (init_lr=5e-2, 500 warm-up steps).
  optimizer = torch.optim.AdamW(model.parameters(), lr=5e-3,weight_decay=0.1)
  ScheduledOptim = ScheduledOptimn(optimizer,5e-2,500)
  
  
  best_score = -1
  for epoch in range(NUM_EPOCH):
    torch.cuda.empty_cache()
    start_time = time.time()
    train_loss, train_auc = train_fn(model ,ema ,fgm,train_dataloader ,len(train_dataset), Batch_size,ScheduledOptim,'no',0.1)
    valid_loss,val_auc,predict, = val_fn(model ,ema,valid_dataloader ,len(valid_dataset), Batch_size)

    elapsed_time = time.time() - start_time
    print('epoch{}/{} , train_loss: {}  ,train_auc: {} \n \n val_loss :{} ,  val_auc:{} ,time: {}\n'.format(epoch+1,NUM_EPOCH,train_loss,train_auc,valid_loss,val_auc,elapsed_time))
    # Checkpoint on the best validation AUC. The state dict is taken while the
    # EMA weights are applied, so the saved file contains the averaged weights.
    if val_auc>best_score:
      best_score = val_auc
      ema.apply_shadow()
      best_param_score = model.state_dict()
      ema.restore()
      #predicts[test] = predict
      torch.save(best_param_score,'xdeepfm_glo_rank__best_param_score_{}'.format(k+1))
      
      #test_predict = test_fn(model ,test_dataloader ,len(test_dataset), Batch_size)
  #test_predicts+=test_predict
  # Release the fold's data before building the next one.
  del train_dataset
  del valid_dataset
  del train_dataloader
  del valid_dataloader
  gc.collect()



In [None]:
#------------------------------ Model prediction ------------------------------#

In [None]:
# One unlabeled dataset and one non-shuffling loader per fold-specific test frame.
test_frames = (test_df1, test_df2, test_df3, test_df4, test_df5)
test_dataset1, test_dataset2, test_dataset3, test_dataset4, test_dataset5 = [
    CTR_dataset(frame, sparse_features, dense_features, 1, False)
    for frame in test_frames
]
test_dataloader1, test_dataloader2, test_dataloader3, test_dataloader4, test_dataloader5 = [
    DataLoader(dataset, batch_size=Batch_size, num_workers=2, shuffle=False)
    for dataset in (test_dataset1, test_dataset2, test_dataset3, test_dataset4, test_dataset5)
]

In [None]:


def load_fold_model(fold):
    """Rebuild the training-time architecture and load the checkpoint of ``fold``.

    Args:
        fold: 1-based fold number, as used in the checkpoint file name.

    Returns:
        The model on ``device`` with the saved weights loaded.
    """
    fold_model = xDeepFM(dict_id_word,sparse_features,32,filed_size=21,inputs_dim=21*32+33+33, dnn_hidden_units=(512,128),cin_layer_size=(256, 128,),dnn_dropout=0.3,use_bn=True)
    fold_model.zero_grad()
    fold_model.to(device)
    # map_location lets the checkpoint load regardless of the device it was saved from.
    state = torch.load('xdeepfm_glo_rank__best_param_score_{}'.format(fold), map_location=device)
    fold_model.load_state_dict(state)
    return fold_model


model1, model2, model3, model4, model5 = [load_fold_model(fold) for fold in range(1, 6)]
# As before this cell, `model` ends up referring to the last loaded fold model.
model = model5

In [None]:
# Each fold model predicts on the test copy that carries its own target encoding.
test_predict1 = test_fn(model1 ,test_dataloader1 ,len(test_dataset1), Batch_size)
# Sanity check: number of positive logits (probability above 0.5).
print((test_predict1>0).sum())
test_predict2 = test_fn(model2 ,test_dataloader2 ,len(test_dataset2), Batch_size)
print((test_predict2>0).sum())

test_predict3 = test_fn(model3 ,test_dataloader3 ,len(test_dataset3), Batch_size)
test_predict4 = test_fn(model4 ,test_dataloader4 ,len(test_dataset4), Batch_size)
test_predict5 = test_fn(model5 ,test_dataloader5 ,len(test_dataset5), Batch_size)

In [None]:
# Ensemble by averaging the five models' logits (the sigmoid is applied afterwards).
test_predict = (test_predict1+test_predict2 +test_predict3 +test_predict4 +test_predict5)/5



In [None]:
# Convert the averaged logits to probabilities and move them to NumPy.
test_predict_sig = torch.sigmoid(test_predict)
test_predict_sig = test_predict_sig.detach().cpu().squeeze().numpy()

In [None]:
# Notebook display: number of test rows predicted with probability above 0.5.
(test_predict_sig>0.5).sum()

In [None]:
# Submission frame: ids of the first test copy with the ensemble probability.
# This assumes all five test copies share the row order of the first — they
# are built as copies of the same frame.
res = pd.DataFrame()
res['id'] = test_id
res['probability'] = (test_predict_sig)

In [None]:
# Write the submission file without the index column.
res.to_csv('xdeep_submission.csv',index=False)

In [None]:
# Notebook display: inspect the submission.
res

