In [None]:
import os
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, StandardScaler, QuantileTransformer
from sklearn.model_selection import StratifiedKFold, KFold
import tensorflow as tf
from tensorflow.keras.layers import Dense, Layer, Dropout
from tensorflow.keras import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Embedding, Dense, Input, LSTM
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.simplefilter('ignore')
# Use the fully qualified option names. The short patterns 'max_columns' and
# 'max_rows' also match 'styler.render.max_columns' / 'styler.render.max_rows'
# on pandas versions that register those options, and set_option raises
# OptionError when a pattern matches more than one key.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

from dcn import DCN
from utils import *
from feature_engineer import *
from gen_similarity_fea import generate_similarity_fea
from gen_kmeans_fea import generate_kmeans_fea

In [None]:
def train_kfold_model(x_train, y_train, test, features, feature_columns, model_path):
    """Train one DCN per stratified fold and return the fold-averaged test prediction.

    For each of the 5 folds a fresh ``DCN`` is compiled with binary
    cross-entropy, fitted on the training split with early stopping monitored
    on the validation split, used to score ``test[features]``, and its weights
    are written to ``model_path + 'DCN_fold_{k}.h5'``.

    Args:
        x_train: Array of training features; rows are selected with the fold indices.
        y_train: Array of training labels aligned with ``x_train``; also used
            to stratify the folds.
        test: DataFrame holding the rows to score; only ``test[features]`` and
            ``test.shape[0]`` are read.
        features: Column names selecting the model inputs from ``test``.
        feature_columns: Feature specification passed through to ``DCN``.
        model_path: Prefix that the per-fold weight file name is appended to
            (normally a directory path ending with a separator).

    Returns:
        1-D numpy array of length ``test.shape[0]`` holding the mean over folds
        of the squeezed model predictions.
    """

    # ========================= Hyper Parameters =======================
    dnn_dropout = 0.3
    hidden_units = [256, 128, 64]
    LR = 1e-3
    BATCH_SIZE = 8192 # 1024/2048/4096/8192/16384
    epochs = 2
    # ========================= END =======================

    # Create the weights directory up front so that save_weights cannot fail
    # on a missing directory after a whole fold has already been trained.
    weights_dir = os.path.dirname(model_path)
    if weights_dir:
        os.makedirs(weights_dir, exist_ok=True)

    # The test matrix is identical for every fold: build it once instead of
    # re-extracting and re-casting it from the DataFrame inside the loop.
    x_test = test[features].values.astype('int32')

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    test_pred = np.zeros(shape=(test.shape[0],))

    for i, (train_index, valid_index) in enumerate(kfold.split(x_train, y_train)):
        print('************************************ {} ************************************'.format(str(i + 1)))

        trn_x, trn_y = x_train[train_index], y_train[train_index]
        val_x, val_y = x_train[valid_index], y_train[valid_index]

        model = DCN(feature_columns, hidden_units=hidden_units, dnn_dropout=dnn_dropout)
        model.compile(loss=binary_crossentropy, optimizer=Adam(learning_rate=LR), metrics=[binary_crossentropy,])
        model.fit(
            trn_x,
            trn_y,
            # Use the hyper-parameter declared above rather than a second literal,
            # so changing `epochs` actually changes the training length.
            epochs=epochs,
            callbacks=[EarlyStopping(monitor='val_binary_crossentropy', patience=1, restore_best_weights=True, mode='min')],
            batch_size=BATCH_SIZE,
            validation_data=(val_x, val_y),
        )
        # Release the fold copies before predicting to keep peak memory down.
        del trn_x, trn_y, val_x, val_y
        gc.collect()

        print(f'fold_{i+1} model predict')
        test_pred_fold = model.predict(x_test, batch_size=BATCH_SIZE)
        # Accumulate the running mean over folds.
        test_pred += (test_pred_fold.squeeze() / kfold.n_splits)
        del test_pred_fold; gc.collect()

        model.save_weights(model_path+f'DCN_fold_{i+1}.h5')
        print(f'save fold_{i+1} model ok!')
        del model; gc.collect()

        print(f'fold_{i+1} train finish!')

    return test_pred

In [None]:
# Global configuration for the run: seed, column roles, embedding size, and the
# parameters handed to the feature-file generators below.
# seed_everything is not defined in this notebook (presumably from utils);
# NOTE(review): confirm which RNGs it actually seeds.
seed_everything(seed=2022)
id_col = 'ID'
target = 'is_finish'  # label column; rows where it is NaN are treated as test rows later on

# NOTE(review): useless_cols is not referenced anywhere else in the visible notebook.
useless_cols = [id_col, target, 'is_like','is_favourite','is_share']
# Starting feature lists; both are rebound by make_feature_engineer in a later cell.
dense_features = []
sparse_features = ['userid','videoid','tag']
# Embedding dimension passed to every sparseFeature when feature_columns is built.
EMBED_DIM = 16

# ==== feature preparation =====
N_COMPONENT = 32
USER_CLUSTER_NUM = 16
VIDEO_CLUSTER_NUM = 48
# Both helpers are project-local and called only for their side effects (return
# values are discarded); presumably they write feature files that the feature
# engineering step reads back - TODO confirm.
generate_similarity_fea(n_component=N_COMPONENT)
generate_kmeans_fea(N_COMPONENT, USER_CLUSTER_NUM, VIDEO_CLUSTER_NUM)

In [None]:
# Load the train/test frames, build the train-only statistic features, and stack
# both frames into one `data` frame for joint feature engineering.
# NOTE: read_pickle executes arbitrary code on load; only use it on files
# produced by this project.

# ==== load test data =====
test_path = '../temp_data/df_test_sp.pkl'
test = pd.read_pickle(test_path)
print(test.shape)

# ==== load train data =====
train_data_path = '../temp_data/df_train_sp.pkl'
train = pd.read_pickle(train_data_path)
train = reduce_mem_usage(train)
print(train.shape)
print(train['tag'].isnull().sum())

# Feedback statistic features per video
video_fb_path = '../temp_data/video_stat_v1.pkl'
make_feedback_stat_fea(train, save_path=video_fb_path, group_fea='videoid')

# Statistic features of each user's feedback per tag
usertag_path = '../temp_data/user_tag_stat_v1.pkl'
make_user_tag_stat_fea(train, save_path=usertag_path)

# ==== concat train & test data =====
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# along axis 0 yields the same stacked frame and works on every version.
data = pd.concat([train, test], axis=0)
data.drop(['is_like','is_favourite','is_share'], axis=1, inplace=True)
print('all data shape: ', data.shape)
data = reduce_mem_usage(data)
del train, test
gc.collect()

# The second reset_index materialises the stacked row order as an 'index'
# column, which the submission cell later sorts by to restore the original
# test order after the sort below.
data = data.reset_index(drop=True).reset_index()
data.sort_values(by=['userid','videoid'], inplace=True)

In [None]:
# Build the final feature set, encode it, split the stacked frame back into
# train/test and run the k-fold training.
# NOTE: this cell rebinds sparse_features/dense_features and deletes `data`,
# so it cannot be re-run without re-running the loading cell first.

# ===================================ALL Feature Engineer=========================================
data, sparse_features, dense_features = make_feature_engineer(data, sparse_features, dense_features)

# ===================================Feature Columns=========================================
# dense encode
print('dense features NUM: ', len(dense_features))
print('dense features:', dense_features)
data = dense_fea_encode(data, dense_features=dense_features)
data = bin_encode(data, bin_cols=dense_features, bin_num=32)

# sparse encode
print('sparse feature NUM: ', len(sparse_features))
print('sparse features:', sparse_features)

# Every feature is described as a sparse feature whose vocabulary size is its
# maximum value + 1.
features = sparse_features + dense_features
feature_columns = [sparseFeature(feat, int(data[feat].max())+1, embed_dim=EMBED_DIM) for feat in features]
print('feature num:', len(features))
print('features:', features)

# ===================================SPLIT DATA=========================================
# Compute the labelled-row mask once and select only the needed columns, rather
# than materialising the labelled subset of the full-width frame twice.
has_label = data[target].notna()
x_train = data.loc[has_label, features].values.astype('int32')
y_train = data.loc[has_label, target].values.astype('int32')
test = data[~has_label]
del has_label
print(x_train.shape, test.shape)
del data; gc.collect()
# test_fe_path = '../temp_data/df_test_fe.pkl'
# test.to_pickle(test_fe_path)
# print('save feature engineer test data!')

# ===================================TRAIN MODEL=========================================
model_path = './model/'
test_pred = train_kfold_model(x_train, y_train, test, features, feature_columns, model_path)

In [None]:
# Write the fold-averaged predictions as the submission CSV.
submit_path = '../result/'
file_name = 'result.csv'

# 'index' holds each row's position in the stacked train+test frame; sorting by
# it puts the test rows back in their original order before IDs are assigned.
sub = test[['index']].copy()
sub['is_finish'] = test_pred
sub.sort_values(by=['index'], inplace=True)
# Replace the stacked position with a 0-based running ID.
sub['index'] = list(range(sub.shape[0]))
sub.columns = ['ID','is_finish']
print(sub.shape)

# Create the output directory if needed so the run does not fail on the very
# last step after training has finished.
os.makedirs(submit_path, exist_ok=True)
sub.to_csv(submit_path+file_name, index=False)
print(f'generate submitfile: {file_name} ok!')

## b榜
- 【0.4683】五折DCN 最好版本的特征 19个 
    - mean=4567 | 4561 4564 4567 4577 4567(lr=2e-3,bs=8192,drop=0.4)【0.4683】
    - mean=4554 | 4553 4555 4551 4564 4551(lr=1e-3,bs=8192,drop=0.4)
    - mean=4555 | 4552 4557 4559 4552 4555(lr=1e-3,bs=8192,drop=0.5,kfold=2022)
- 【0.4670】五折DCN 增加聚类特征(user 16类/video 64类) 21个 
    - mean=4548 | 4549 4544 4543 4562 4545(lr=1e-3,bs=8192,drop=0.3)
- 【0.4640】五折DCN 增加聚类特征(user 16类/video 64类) 21个 
    - mean=4505 | 4506 4501 4506 4507 4507(lr=1e-3,bs=8192,drop=0.3)
    - mean=4502 | 4502 4498 4503 4505 4502(lr=1e-3,bs=8192,drop=0.3)