In [None]:
import os
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from sklearn import metrics
from scipy.stats import entropy, kurtosis
import xgboost as xgb
import seaborn as sns
from xgboost import plot_importance
import warnings
import matplotlib.pyplot as plt
import pandas as pd
from math import *
import numpy as np
import tensorflow as tf
import time
import gc

pd.set_option('display.max_columns', None)
from IPython.display import display

warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Data directory; checkpoints and TensorBoard logs share the same "model" folder.
pathf = os.path.join("..", "data", "particles")
model_path = os.path.join(pathf, "model")
log_path = os.path.join(pathf, "model")

# Training table: show the first row and remember its shape.
trainpd = pd.read_csv(os.path.join(pathf, "train.csv"))
print(trainpd.head(1))
trainshape = trainpd.shape
print(trainshape)

# Event table, joined onto the hit tables through `event_id` below.
eventpd = pd.read_csv(os.path.join(pathf, "event.csv"))
print(eventpd.head(1))
print(eventpd.shape)

# Test table, same treatment as the training table.
testpd = pd.read_csv(os.path.join(pathf, "test.csv"))
testshape = testpd.shape
print(testpd.head(1))
print(testshape)

# Stack train on top of test and attach the event columns to every row.
data = pd.concat([trainpd, testpd], ignore_index=True).merge(eventpd, on='event_id', how='left')

In [None]:
# Timing relation the features below are built around:
#   (k(q,mc)*(t0+L))^2 + dis^2 - dis*cos(phi)*sin(thmc)*(t0+L) = (t+L)^2
# Read as a quadratic equation in t0:
#   a = k(q,mc)^2
#   b = 2*L*k(q,mc)^2 - dis*cos(phi)*sin(thmc)
#   c = L^2*k(q,mc)^2 + dis^2 - dis*cos(phi)*sin(thmc)*L - (t+L)^2
#   t0 = (-b +- (b^2 - 4*a*c)^(1/2)) / (2*a)

# Rebuild the combined frame here so that re-running this cell does not apply
# the in-place degree -> radian conversions twice.
data = pd.concat([trainpd, testpd], ignore_index=True).merge(eventpd, on='event_id', how='left')

# Offset of each row's (x, y) from (xcmc, ycmc).
data['fx'] = data['x'] - data['xcmc']
data['fy'] = data['y'] - data['ycmc']

# Angles are converted from degrees to radians in place.
data['phimc'] = data['phimc'] * np.pi / 180.
data['fphi'] = np.arctan2(data['fy'], data['fx']) - data['phimc']
data['fdis'] = np.sqrt(data['fx'] ** 2 + data['fy'] ** 2)
data['thetamc'] = data['thetamc'] * np.pi / 180.

# Each trigonometric feature is immediately followed by its reciprocal ("_v").
# Reciprocals can be +-inf where the trig value is 0.
for _col, _fn, _angle in [
    ('fsinthmc', np.sin, 'thetamc'),
    ('fcosphi', np.cos, 'fphi'),
    ('fcosthmc', np.cos, 'thetamc'),
    ('fsinphi', np.sin, 'fphi'),
]:
    data[_col] = _fn(data[_angle])
    data[_col + '_v'] = 1.0 / data[_col]

# sin(fphi) itself is dropped (only its reciprocal is kept), as is `nhitreal`.
data = data.drop(columns=['fsinphi', 'nhitreal'])
gc.collect()

In [None]:
# Per-event statistics of `fdis` and `t`, computed from one shared groupby handle.
_by_event = data.groupby(["event_id"])
_fdis, _t = _by_event["fdis"], _by_event["t"]

info_new = pd.DataFrame({"event_id": _by_event["event_id"].mean()})
info_new["fdis_mean"] = _fdis.mean()
info_new["fdis_std"] = _fdis.std()
info_new["fdis_stdmean"] = info_new["fdis_std"] / info_new["fdis_mean"]
info_new["ft_min"] = _t.min()
info_new["ft_max"] = _t.max()
info_new["t_mean"] = _t.mean()
info_new["ft_std"] = _t.std()
info_new["ft_stdmean"] = info_new["ft_std"] / info_new["t_mean"]
# Position of the mean time inside the event's [min, max] time window.
info_new["ft_mean"] = (info_new['t_mean'] - info_new['ft_min']) / (info_new['ft_max'] - info_new['ft_min'])

# Release the groupby handles so they do not keep the pre-merge frame alive.
del _by_event, _fdis, _t
info_new = info_new.reset_index(drop=True)
data = data.merge(info_new, on='event_id', how='left')

# Hit time scaled by the event's time spread, plus square and reciprocals.
data['ft_rel'] = data['t'] / data['ft_std']
data['ft2_rel'] = data['ft_rel'] ** 2
data['ft_rel_v'] = 1.0 / data['ft_rel']
data['ft2_rel_v'] = 1.0 / data['ft2_rel']

# Within each event, order rows by relative time and take lagged differences
# at several lags; rows without a predecessor at that lag get 0.
data = data.sort_values(by=['event_id', 'ft_rel']).reset_index(drop=True)
for _lag in (1, 5, 7, 11):
    data[f'ft_{_lag}diff'] = data.groupby('event_id')['ft_rel'].diff(_lag).fillna(0)

# The raw time column is not used as a feature.
del data['t']
gc.collect()

In [None]:
print(trainshape[0])
print(data.shape)

# Rows are split on whether `flag` is missing rather than by position:
# `data` was re-sorted earlier, so the first trainshape[0] rows are no longer
# the training rows. reset_index() keeps the old row label in an `index` column.
_is_test = data['flag'].isna()
testpd = data[_is_test].reset_index()
trainpd = data[~_is_test].reset_index()
trainpd['flag'] = trainpd['flag'].astype('int')

del data, _is_test
gc.collect()

In [None]:
print(trainpd.columns)

# Every column except the target and the identifier columns is a model input.
_non_features = {'flag', 'index', 'hit_id', 'event_id'}
feature = [col for col in trainpd.columns if col not in _non_features]

# Detach the target from the training frame; the test frame's `flag` column
# is removed as well so both frames end up with the same columns.
labels = trainpd.pop('flag')
del testpd['flag']

In [None]:
print(trainpd.head())

# Work on a float copy so the cleaning below never writes through to trainpd.
npx = trainpd.values.astype(np.float64)

# Zero out every non-finite value. The reciprocal features produce +-inf on
# division by zero, and the per-event std / range features produce NaN (for
# example std of a single-hit event, or 0/0). MinMaxScaler passes NaN through
# unchanged, so any NaN left here would reach the network and make the loss NaN.
npx[~np.isfinite(npx)] = 0

# Fit the scaler on ALL columns of trainpd (ids included) so that the same
# fitted `min_max_scaler` can later transform a frame with identical columns.
min_max_scaler = preprocessing.MinMaxScaler()
trainnormalpd = pd.DataFrame(min_max_scaler.fit_transform(npx), columns=trainpd.columns)
print(trainnormalpd.head())

In [None]:
def batch_iter_list(data_list, batch_size, num_epochs, shuffle=True):
    """Yield aligned mini-batches from several equally long arrays.

    Args:
        data_list: sequence of arrays sharing the same first-axis length. They
            are indexed with an integer index array when shuffling and sliced
            otherwise, so numpy arrays are expected.
        batch_size: maximum number of rows per batch.
        num_epochs: number of full passes over the data.
        shuffle: if True, draw a fresh permutation (numpy global RNG) for every
            epoch and apply it to all arrays so rows stay aligned.

    Yields:
        A list with one slice per input array, all covering the same rows.
        The last batch of an epoch may be shorter than ``batch_size``; no
        empty batch is ever produced.
    """
    data_size = len(data_list[0])
    # Ceiling division. The previous `data_size // batch_size + 1` produced an
    # extra, empty batch whenever data_size was an exact multiple of batch_size.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for _ in range(num_epochs):
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            epoch_arrays = [array[shuffle_indices] for array in data_list]
        else:
            epoch_arrays = data_list

        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            yield [array[start_index:end_index] for array in epoch_arrays]

class AbstractModeltensor:
    """Minimal base class: stores a configuration and declares ``buildModel``."""

    def __init__(self, config=None):
        # The configuration object is kept as-is for subclasses to read.
        self.config = config

    def buildModel(self):
        """Construct the model; the base implementation always raises.

        Raises:
            NotImplementedError: subclasses are expected to override this.
        """
        raise NotImplementedError("You need to implement your own model.")


class NeurousNet(AbstractModeltensor):
    """TensorFlow 1.x dense binary classifier with its own private graph.

    The class relies on names defined at notebook level: ``model_path`` and
    ``log_path`` (output folders), ``feature`` (list of input column names),
    ``batch_iter_list`` and ``KFold``.

    Expected ``config`` keys (all read in this class): ``dropout`` (keep
    probability used while training), ``modelname``, ``tailname``,
    ``modelfile``, ``learn_rate``, ``retrain`` and ``early_stop``.
    """

    def __init__(self, xlenth, config=None):
        """Create the graph and its input/target placeholders.

        Args:
            xlenth: number of input features per row.
            config: configuration dict (see class docstring).
        """
        super(NeurousNet, self).__init__(config)
        self.graph = tf.Graph()  # every instance owns a separate graph
        # Only register builders that actually exist on this class (or on a
        # subclass). Referencing missing methods directly raised AttributeError
        # as soon as an instance was created.
        builder_attrs = {
            "cnn_dense_less": "_cnn_dense_less_model",
            "mul4_model": "_mul4_model",
            "nomul_model": "_nomul_model",
        }
        self.modeldic = {
            name: getattr(self, attr) for name, attr in builder_attrs.items() if hasattr(self, attr)
        }
        self.ydim = 1
        # Keep probability applied to the dropout layers during training only.
        self.keep_prob_train = config["dropout"]
        self.input_dim = xlenth
        self.out_dim = 1
        with self.graph.as_default():
            with tf.name_scope('Inputs'):
                self.input_p = tf.placeholder(tf.float32, [None, self.input_dim])
                self.learn_rate_p = tf.placeholder(dtype=tf.float32, shape=[], name="lr")
                self.lr_decay = tf.placeholder(dtype=tf.float32, shape=[])
                # Defaults to 1.0 (dropout disabled) so that validation and
                # prediction are deterministic; training feeds keep_prob_train.
                self.keep_prob_ph = tf.placeholder_with_default(1.0, shape=[], name="keep_prob")
            with tf.name_scope('Outputs'):
                self.target_y = tf.placeholder(dtype=tf.float32, shape=[None, self.out_dim])

    def buildModel(self):
        """Build the network selected by ``config["modelname"]`` plus training ops.

        Raises:
            KeyError: if the configured model name has no builder.
        """
        tf.reset_default_graph()
        modelname = self.config["modelname"]
        if modelname not in self.modeldic:
            raise KeyError("model %r is not implemented; available: %s"
                           % (modelname, sorted(self.modeldic)))
        with self.graph.as_default():
            # Build the selected architecture (sets train/valid/pred lists and AUC ops).
            self.modeldic[modelname]()
            # Bundle all summaries declared by the builder.
            self.merged = tf.summary.merge_all()
            # Adam on gradients whose global norm is clipped to 1e-3.
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.train_list, tvars), 1e-3)
            grads_and_vars = tuple(zip(grads, tvars))
            self.train_op = tf.train.AdamOptimizer(self.learn_rate_p).apply_gradients(grads_and_vars)
            # Created once here (after the optimizer slots and the metric
            # accumulators exist) so training does not add ops to the graph.
            self.init_op = tf.global_variables_initializer()
            self.reset_metrics_op = tf.local_variables_initializer()
            # One saver for all global variables.
            self.saver = tf.train.Saver(tf.global_variables())

    def _mul4_model(self):
        """Build the "mul4" network: four multiplicative stages feeding a dense head."""
        with self.graph.as_default():
            # Part 1: parallel projections of the raw input at 4x/8x/16x/32x width.
            base0 = tf.layers.dense(inputs=self.input_p, units=self.input_dim*4, activation=tf.nn.elu,
                                     name="base0")
            base1 = tf.layers.dense(inputs=self.input_p, units=self.input_dim*8, activation=tf.nn.elu,
                                     name="base1")
            base2 = tf.layers.dense(inputs=self.input_p, units=self.input_dim*16, activation=tf.nn.elu,
                                     name="base2")
            base3 = tf.layers.dense(inputs=self.input_p, units=self.input_dim*32, activation=tf.nn.elu,
                                     name="base3")
            mult0 = tf.layers.dense(inputs=self.input_p, units=self.input_dim*4, activation=tf.nn.elu,
                                     name="mult0")
            # Each stage multiplies the running branch with a base projection
            # of matching width, then widens the product for the next stage.
            mult_o1 = tf.nn.elu(mult0 * base0, name='mult_o1')  # width 4x
            mult_e1 = tf.layers.dense(inputs=mult_o1, units=self.input_dim*8, activation=tf.nn.elu,
                                     name="mult_e1")
            mult_o2 = tf.nn.elu(mult_e1 * base1, name='mult_o2')  # width 8x
            mult_e2 = tf.layers.dense(inputs=mult_o2, units=self.input_dim*16, activation=tf.nn.elu,
                                     name="mult_e2")
            mult_o3 = tf.nn.elu(mult_e2 * base2, name='mult_o3')  # width 16x
            mult_e3 = tf.layers.dense(inputs=mult_o3, units=self.input_dim*32, activation=tf.nn.elu,
                                     name="mult_e3")
            mult_o4 = tf.nn.elu(mult_e3 * base3, name='mult_o4')  # width 32x

            concat1 = tf.concat([self.input_p, mult_o1, mult_o2, mult_o3, mult_o4], 1, name='concat1')
            denseo1 = tf.nn.dropout(concat1, keep_prob=self.keep_prob_ph)
            denseo2 = tf.layers.dense(inputs=denseo1, units=self.input_dim * 16, activation=tf.nn.elu,
                                      name="denseo2")
            denseo2 = tf.nn.dropout(denseo2, keep_prob=self.keep_prob_ph)
            denseo3 = tf.layers.dense(inputs=denseo2, units=self.input_dim, activation=tf.nn.elu,
                                      name="denseo3")
            # max(1, ...) keeps the layer valid when there are fewer than 8 inputs.
            denseo4 = tf.layers.dense(inputs=denseo3, units=max(1, self.input_dim // 8),
                                      activation=tf.nn.elu, name="denseo4")
            y_res_t = tf.layers.dense(inputs=denseo4, units=self.out_dim, activation=None)
            y_res_v = tf.nn.sigmoid(y_res_t, name="y_res_v")
            tf.summary.histogram('y_res_v', y_res_v)
            # Loss: mean sigmoid cross-entropy on the logits.
            y_los = tf.nn.sigmoid_cross_entropy_with_logits(logits=y_res_t, labels=self.target_y, name="y_los")
            y_loss_t = tf.reduce_mean(y_los, name="y_loss_t")
            y_loss_v = tf.add(y_loss_t, 0, name="y_loss_v")

            # Streaming AUC on the predicted probabilities. The earlier version
            # thresholded the *logits* at 0.5 and fed the resulting 0/1 values,
            # which is neither the intended cut nor a usable ranking for AUC.
            self.auc_value, self.auc_op = tf.metrics.auc(self.target_y, y_res_v, num_thresholds=4000)
            self.train_list = [y_loss_t]
            self.valid_list = [y_loss_v]
            self.pred_list = [y_res_v]
            # Summaries.
            tf.summary.scalar('y_loss_t', y_loss_t)
            tf.summary.scalar('y_loss_v', y_loss_v)
            tf.summary.histogram('concat1', concat1)
            tf.summary.histogram('denseo4', denseo4)

            tf.summary.scalar('lr', self.learn_rate_p)
            return None

    def _checkpoint_dir(self):
        """Folder holding this model's checkpoints."""
        return os.path.join(model_path, "modelevery_%s" % self.config["tailname"])

    def _save_checkpoint(self, sess, global_n):
        """Write a checkpoint tagged with the global step, creating the folder if needed."""
        ckpt_dir = self._checkpoint_dir()
        os.makedirs(ckpt_dir, exist_ok=True)
        self.saver.save(sess, os.path.join(ckpt_dir, self.config["modelfile"]), global_step=global_n)

    def _init_or_restore(self, sess):
        """Restore the latest checkpoint when ``retrain`` is 1 and one exists, else initialise."""
        latest_ckpt = None
        if self.config["retrain"] == 1:
            latest_ckpt = tf.train.latest_checkpoint(self._checkpoint_dir())
        if latest_ckpt is not None and os.path.isfile("{}.index".format(latest_ckpt)):
            self.saver.restore(sess, latest_ckpt)
            print("retraining {}".format(latest_ckpt))
        else:
            # Also reached when retrain != 1; previously that case left every
            # variable uninitialised and the first training step failed.
            sess.run(self.init_op)
            if self.config["retrain"] == 1:
                print("no old model, training new----")
            else:
                print("retrain disabled, training new----")
        sess.run(self.reset_metrics_op)

    def _feed(self, inputs, targets, training):
        """Assemble the feed dict for one batch; dropout is enabled only when training."""
        feed = {
            self.input_p: inputs,
            self.target_y: targets,
            self.learn_rate_p: self.config["learn_rate"],
            self.lr_decay: 1,
        }
        if training:
            feed[self.keep_prob_ph] = self.keep_prob_train
        return feed

    def _decay_learn_rate(self, factor):
        """Multiply ``config["learn_rate"]`` by ``factor`` while it is above 1e-9.

        Note: this updates the shared config dict in place, so the decayed
        rate carries over to later training calls on the same config.
        """
        if self.config["learn_rate"] > 1e-9:
            self.config["learn_rate"] *= factor

    def _train_fold(self, sess, writer, train_batches, epoch, global_n, starte):
        """Run one optimisation step per batch; log and checkpoint every 200 batches.

        Returns the updated global step counter.
        """
        sess.run(self.reset_metrics_op)  # start the streaming AUC afresh for this fold
        for batch_num, (r_inputs_t, r_output_t) in enumerate(train_batches):
            if len(r_inputs_t) == 0:
                continue  # an empty batch would turn the mean loss into NaN
            feed_dict_t = self._feed(r_inputs_t, r_output_t, training=True)
            _, losslist_t, _ = sess.run([self.train_op, self.train_list, self.auc_op], feed_dict_t)
            global_n += 1
            if batch_num % 200 == 0:
                accu, result = sess.run([self.auc_value, self.merged], feed_dict_t)
                writer.add_summary(result, global_n)
                self._save_checkpoint(sess, global_n)
                print("epocht {}, batch_num {}, step {}, time: {} s, accu: {}, loss_yt: {}".format(
                    epoch, batch_num, global_n, time.time() - starte, accu, *losslist_t))
        return global_n

    def _validate_fold(self, sess, writer, valid_batches, global_n):
        """Evaluate without dropout; return (mean batch loss, AUC over the fold)."""
        sess.run(self.reset_metrics_op)  # do not mix training batches into the AUC
        loss_sum = 0.0
        n_batches = 0
        feed_dict_v = None
        for r_inputs_v, r_output_v in valid_batches:
            if len(r_inputs_v) == 0:
                continue
            feed_dict_v = self._feed(r_inputs_v, r_output_v, training=False)
            losslist_v, _ = sess.run([self.valid_list, self.auc_op], feed_dict_v)
            loss_sum += losslist_v[0]
            n_batches += 1
        if n_batches == 0:
            return float("nan"), float("nan")
        # The summary is taken on the last validation batch.
        accu_va, result = sess.run([self.auc_value, self.merged], feed_dict_v)
        writer.add_summary(result, global_n)
        return loss_sum / n_batches, accu_va

    def _fit(self, make_folds, num_epochs, epoch_decay=None, fold_decay=None):
        """Shared training driver used by ``batch_train`` and ``batch_train2``.

        Args:
            make_folds: zero-argument callable returning an iterable of
                ``(train_batches, n_train, valid_batches, n_valid)`` where the
                batch objects iterate over ``(inputs, targets)`` pairs and the
                counts are only printed.
            num_epochs: number of passes over all folds.
            epoch_decay: optional learning-rate factor applied at every epoch start.
            fold_decay: optional learning-rate factor applied at every fold start.
        """
        log_dir = os.path.join(log_path, "logsevery_%s" % self.config["tailname"])
        # The session is closed on exit; previously it was never released.
        with tf.Session(graph=self.graph) as sess, self.graph.as_default():
            self._init_or_restore(sess)
            writer = tf.summary.FileWriter(log_dir, sess.graph)
            try:
                global_n = 0
                stop_n = 0  # consecutive folds without a new best validation loss
                best_valid_loss = float("inf")
                stopped = False
                startt = time.time()
                for epoch in range(num_epochs):
                    if epoch_decay is not None:
                        self._decay_learn_rate(epoch_decay)
                    for train_batches, n_train, valid_batches, n_valid in make_folds():
                        if fold_decay is not None:
                            self._decay_learn_rate(fold_decay)
                        starte = time.time()
                        print("iter_trainnum", n_train)
                        global_n = self._train_fold(sess, writer, train_batches, epoch, global_n, starte)
                        print("iter_validnum", n_valid)
                        losslist_va, accu_va = self._validate_fold(sess, writer, valid_batches, global_n)
                        # Early stopping counts folds that do NOT improve the
                        # best validation loss (a NaN loss never improves).
                        # The earlier logic counted improving folds instead.
                        if losslist_va < best_valid_loss:
                            best_valid_loss = losslist_va
                            stop_n = 0
                        else:
                            stop_n += 1
                        self._save_checkpoint(sess, global_n)
                        print("epochv {}, step {}, stop_n {}, time: {} s, accu_va: {}, loss_yv: {}".format(
                            epoch, global_n, stop_n, time.time() - starte, accu_va, losslist_va))
                        if stop_n > self.config["early_stop"]:
                            stopped = True
                            break
                    if stopped:
                        break  # leave the epoch loop too, not only the fold loop
                print("total time: %s s" % (time.time() - startt))
            finally:
                writer.close()

    def batch_train(self, trainpd, labels, batch_size=8, num_epochs=1, retrain=True):
        """Train with row-level 5-fold splits and fixed-size shuffled mini-batches.

        Args:
            trainpd: DataFrame containing at least the ``feature`` columns.
            labels: targets aligned row-by-row (by position) with ``trainpd``.
            batch_size: rows per mini-batch.
            num_epochs: number of passes over the five folds.
            retrain: unused; restoring is controlled by ``config["retrain"]``.

        The learning rate in ``config`` is multiplied by 0.7 at the start of
        every fold (while above 1e-9). Returns None.
        """
        inputs_all = np.array(trainpd[feature])
        output_all = np.expand_dims(np.array(labels), -1)
        kf = KFold(n_splits=5, shuffle=True, random_state=4389)

        def n_batches(n_rows):
            return (n_rows + batch_size - 1) // batch_size

        def make_folds():
            for train_index, valid_index in kf.split(inputs_all):
                train_batches = batch_iter_list(
                    [inputs_all[train_index], output_all[train_index]], batch_size, 1)
                valid_batches = batch_iter_list(
                    [inputs_all[valid_index], output_all[valid_index]], batch_size, 1, shuffle=False)
                yield train_batches, n_batches(len(train_index)), valid_batches, n_batches(len(valid_index))

        self._fit(make_folds, num_epochs, fold_decay=0.7)
        print("train finished!")
        return None

    def batch_train2(self, trainpd, labels, batch_size=8, num_epochs=1, retrain=True):
        """Train with event-level 5-fold splits, one event per optimisation step.

        Args:
            trainpd: DataFrame with an ``event_id`` column and the ``feature`` columns.
            labels: targets aligned row-by-row (by position) with ``trainpd``.
            batch_size: unused; every batch is the set of rows of one event.
            num_epochs: number of passes over the five folds.
            retrain: unused; restoring is controlled by ``config["retrain"]``.

        The learning rate in ``config`` is multiplied by 0.3 at the start of
        every epoch (while above 1e-9). Returns None.
        """
        inputs_all = np.array(trainpd[feature])
        output_all = np.expand_dims(np.array(labels), -1)
        # Row positions of every event, computed once instead of scanning the
        # whole frame for each event on each step.
        event_rows = trainpd.groupby('event_id').indices
        trainevenidlist = list(event_rows)
        kf = KFold(n_splits=5, shuffle=True, random_state=4389)

        def event_batches(event_positions):
            for eventindex in event_positions:
                rows = event_rows[trainevenidlist[eventindex]]
                yield inputs_all[rows], output_all[rows]

        def make_folds():
            for train_index, valid_index in kf.split(trainevenidlist):
                np.random.shuffle(train_index)
                np.random.shuffle(valid_index)
                yield (event_batches(train_index), len(train_index),
                       event_batches(valid_index), len(valid_index))

        self._fit(make_folds, num_epochs, epoch_decay=0.3)
        print("train finished!")
        return None

    def predict(self, inputs):
        """Score ``inputs`` with the latest checkpoint, 2000 rows at a time.

        Args:
            inputs: DataFrame containing the ``feature`` columns.

        Returns:
            Array of shape ``(len(pred_list), n_rows, out_dim)`` holding the
            sigmoid outputs (dropout is disabled).

        Raises:
            Exception: if no checkpoint is found in the model folder.
        """
        model_dir = self._checkpoint_dir()
        print("loading model...")
        latest_ckpt = tf.train.latest_checkpoint(model_dir)
        if latest_ckpt is None or not os.path.isfile("{}.index".format(latest_ckpt)):
            raise Exception("没有找到模型:{}".format(latest_ckpt))

        nplist = []
        oneiter = 2000
        with tf.Session(graph=self.graph) as sess, self.graph.as_default():
            self.saver.restore(sess, latest_ckpt)
            # iloc slicing clips at the end, so the last chunk may be shorter.
            for startindex in range(0, inputs.shape[0], oneiter):
                tmppd = inputs.iloc[startindex:startindex + oneiter][feature]
                feed_dict = {
                    self.input_p: np.array(tmppd),
                }
                nplist.append(sess.run(self.pred_list, feed_dict))
        if not nplist:
            # No rows: return an empty result instead of failing in concatenate.
            return np.empty((len(self.pred_list), 0, self.out_dim), dtype=np.float32)
        return np.concatenate(nplist, axis=1)


# Training configuration consumed by NeurousNet.
#   dropout    - value handed to tf.nn.dropout as keep_prob
#   early_stop - threshold compared against the early-stop counter
#   tailname   - suffix of the checkpoint / log folder names
#   modelname  - key selecting the network builder
#   modelfile  - checkpoint file prefix
#   learn_rate - initial learning rate (1e-3 and 1e-6 were also tried)
#   retrain    - 1 to resume from the latest checkpoint when one exists
# Other model/tail pairs that were tried:
#   "nomul_model" / "nomul_modeltail", "cnn_dense_less" / "mul_verse"
trainconfig = dict(
    dropout=0.5,
    early_stop=100,
    tailname="mul4_modeltailv2",
    modelname="mul4_model",
    modelfile="v2",
    learn_rate=5e-4,
    retrain=1,
)
modelcrnn = NeurousNet(len(feature), config=trainconfig)
modelcrnn.buildModel()

In [None]:
# Mini-batch size and number of epochs (a batch size of 4096 was also tried).
batch_size = 512
num_epochs = 1000
globalstep = modelcrnn.batch_train(trainnormalpd, labels, batch_size=batch_size, num_epochs=num_epochs)

In [None]:
# The network was trained on cleaned, min-max-scaled features, so the test
# rows must go through the same treatment with the scaler fitted on the
# training data. Feeding the raw test frame (as before) gives the model
# inputs on a different scale, possibly including +-inf / NaN.
test_values = testpd[trainnormalpd.columns].values.astype(np.float64)
test_values[~np.isfinite(test_values)] = 0
testnormalpd = pd.DataFrame(min_max_scaler.transform(test_values), columns=trainnormalpd.columns)

y_pred = modelcrnn.predict(testnormalpd[feature])
fy_submission = np.squeeze(y_pred)

# Stretch the scores to [0, 1]; skipped when all scores are identical, which
# would otherwise divide by zero and turn every score into NaN.
pred_range = fy_submission.max() - fy_submission.min()
if pred_range > 0:
    fy_submission = (fy_submission - fy_submission.min()) / pred_range

In [None]:
# Original note: the best threshold is roughly in the 0.2-0.4 range; the task
# is sensitive to recall, so the threshold may be lowered somewhat.
# NOTE(review): the value actually used below is 0.5 - confirm this is intended.
thre = 0.5

# Build the submission file: one row per test hit with a 0/1 prediction.
sub = pd.DataFrame()
sub['hit_id'] = testpd['hit_id']
# Vectorised threshold; NaN scores compare False and therefore map to 0.
sub['flag_pred'] = (np.asarray(fy_submission) >= thre).astype(int)
sub['event_id'] = testpd['event_id']
# The former `.format(mean)` call on this path was a no-op (the name has no
# placeholder) and would have raised for a directory name containing braces.
sub.to_csv(os.path.join(pathf, "subsample.csv"), index=False)