In [None]:
import warnings
warnings.simplefilter('ignore')
import gc
import re
import time
import os
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from scipy.stats import entropy, kurtosis
import xgboost as xgb
import seaborn as sns
from xgboost import plot_importance
import matplotlib.pyplot as plt
import pandas as pd
from math import *
from sklearn import datasets
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.datasets.samples_generator import make_blobs
import numpy as np
import lightgbm as lgb
from IPython.display import display

# from tqdm import tqdm
# tqdm.pandas()

# Notebook display settings: show every column and row of a frame and print
# floats with six decimals.
# Full 'display.*' keys are used on purpose: short patterns such as
# 'max_columns' / 'max_rows' can match more than one registered option
# (e.g. the styler.render.* options on recent pandas) and then raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.6f' % x)

%matplotlib inline

In [None]:
# Directory that holds train.csv / event.csv / test.csv, relative to the notebook.
pathf = os.path.join("..", "data", "particles")

# Training table; its shape is remembered so the stacked frame can be cut
# back into train/test after feature engineering.
trainpd = pd.read_csv(os.path.join(pathf, "train.csv"))
trainshape = trainpd.shape
print(trainpd.head(1))
print(trainshape)

# Event table, joined onto the rows below through event_id.
eventpd = pd.read_csv(os.path.join(pathf, "event.csv"))
print(eventpd.head(1))
print(eventpd.shape)

# Test table.
testpd = pd.read_csv(os.path.join(pathf, "test.csv"))
testshape = testpd.shape
print(testpd.head(1))
print(testshape)

# Stack train on top of test (fresh 0..n-1 index), then attach the event
# columns to every row with a left join on event_id.
data = pd.concat([trainpd, testpd], ignore_index=True).merge(eventpd, on='event_id', how='left')

In [None]:
# Row-level features. Relation noted by the original author:
# (k(q,mc)*(t0+l))^2 + dis^2 - dis*cos(phi)*sin(thmc)*(t0+l) = (t+l)^2

# Offset of (x, y) from (xcmc, ycmc); used only as temporaries, never stored
# as columns.
dx = data['x'] - data['xcmc']
dy = data['y'] - data['ycmc']

# phimc and thetamc are overwritten in place with their degree -> radian
# conversion, so this cell must not be run twice on the same frame.
data['phimc'] = data['phimc'] * np.pi / 180.
data['fphi'] = np.arctan2(dy, dx) - data['phimc']
data['fdis'] = np.sqrt(dx ** 2 + dy ** 2)
data['thetamc'] = data['thetamc'] * np.pi / 180.
data['fsinthmc'] = np.sin(data['thetamc'])
data['fcosphi'] = np.cos(data['fphi'])

# Squared terms and the cross term dis*cos(phi)*sin(theta) of the relation above.
data['ft2'] = data['t'] ** 2
data['fdis2'] = data['fdis'] ** 2
data['fsencond'] = data['fdis'] * data['fcosphi'] * data['fsinthmc']

# Simple ratios.
data['fttrue'] = data['t'] / data['terror']
data['nhitratio'] = data['nhit'] / data['nhitreal']

data['fenergymc2'] = data['energymc'] ** 2

# Release the temporaries and drop the raw coordinates in place.
del dx, dy
data.drop(['x', 'y', 'z'], axis=1, inplace=True)
gc.collect()

In [None]:
# Per-event mean / std of fdis and t.
# One GroupBy object is reused for all four aggregates, and event_id is taken
# from the group index instead of averaging the key column, so the key keeps
# its original dtype for the merge below.
event_groups = data.groupby('event_id')
info_new = pd.DataFrame({
    'fdis_mean': event_groups['fdis'].mean(),
    'fdis_std': event_groups['fdis'].std(),
    'ft_mean': event_groups['t'].mean(),
    'ft_std': event_groups['t'].std(),
})
# The GroupBy object references the current frame; release it before `data`
# is rebound by the merge.
del event_groups

info_new['fdis_stdmean'] = info_new['fdis_std'] / info_new['fdis_mean']
info_new['ft_stdmean'] = info_new['ft_std'] / info_new['ft_mean']
info_new['ft_mean2'] = info_new['ft_mean'] ** 2

# Column order is fixed explicitly: every merged column later becomes a model
# feature, so the order is kept stable.
info_new = info_new.reset_index()[[
    'event_id',
    'fdis_mean', 'fdis_std', 'fdis_stdmean',
    'ft_mean', 'ft_std', 'ft_stdmean', 'ft_mean2',
]]
data = pd.merge(data, info_new, on='event_id', how='left')

# Row values relative to their event's mean, and that ratio divided by the event's std.
data['fdis_rel'] = data['fdis'] / data['fdis_mean']
data['fdis_rel_std'] = data['fdis_rel'] / data['fdis_std']
data['ft_rel'] = data['t'] / data['ft_mean']
data['ft_rel_std'] = data['ft_rel'] / data['ft_std']

# Squares of the relative quantities.
data['fdis2_rel'] = data['fdis_rel'] ** 2
data['fdis2_rel_std'] = data['fdis_rel_std'] ** 2
data['ft2_rel'] = data['ft_rel'] ** 2
data['ft2_rel_std'] = data['ft_rel_std'] ** 2

# Cross terms built from the squared relative distances.
data['fsencond_rel'] = data['fdis2_rel'] * data['fcosphi'] * data['fsinthmc']
data['fsencond_rel_std'] = data['fdis2_rel_std'] * data['fcosphi'] * data['fsinthmc']

data['fsencond_ful'] = data['fsencond'] * data['ft_mean']



In [None]:
n_train = trainshape[0]
print(n_train)
print(data.shape)

# Rows were stacked train-first, so a positional cut at n_train recovers both
# parts. reset_index() without drop=True keeps the previous row position as an
# 'index' column in each part.
trainpd = data.iloc[:n_train].reset_index()
testpd = data.iloc[n_train:].reset_index()
del data
gc.collect()

In [None]:
print(trainpd.columns)

# Every column is a model input except the target, the identifiers and the
# 'index' column left behind by reset_index().
non_feature_cols = ('flag', 'index', 'hit_id', 'event_id')
feature = [col for col in trainpd.columns if col not in non_feature_cols]

# Move the target out of the training frame and drop it from the test frame.
labels = trainpd.pop('flag')
del testpd['flag']

In [None]:
if "xgb" != "xgb":  # always False: the XGBoost run is switched off; edit the comparison to enable it
    # K-fold XGBoost: each fold's model scores its held-out rows and contributes
    # 1/n_splits of the averaged test-set probability in y_pp_xgb.
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=4399)
    y_pp_xgb = np.zeros(testshape[0])
#     y_pp_xgb_stacking = np.zeros(len(labels))
    for train_index, test_index in kf.split(trainpd):
        print(">>>", train_index)
        # KFold yields row positions, so both features and labels are selected with .iloc.
        X_fit, y_fit = trainpd[feature].iloc[train_index], labels.iloc[train_index]
        X_val, y_val = trainpd[feature].iloc[test_index], labels.iloc[test_index]

        clf = xgb.XGBClassifier(tree_method='gpu_hist', max_depth=8, learning_rate=0.1, verbosity=3,
                                eval_metric='auc', n_estimators=2000)
#                                 eval_metric='auc', n_estimators=2000, predictor='cpu_predictor')
        clf.fit(
            X_fit, y_fit,
            eval_set=[(X_fit, y_fit), (X_val, y_val)],
            early_stopping_rounds=50,
            verbose=True,
        )

        # Only probabilities are needed; the hard-label prediction was never used.
        y_predprob = clf.predict_proba(X_val)[:, 1]
#         y_pp_xgb_stacking[test_index] = y_predprob

        # This AUC is measured on the held-out fold, not on the training rows.
        auc = metrics.roc_auc_score(y_val, y_predprob)
        print("AUC Score (Valid): %f" % auc)

        y_pp_xgb += clf.predict_proba(testpd[feature])[:, 1] / n_splits


In [None]:
if "lgbt" == "lgbt":  # always True: the LightGBM run is switched on
    def run_lgb(df_train, df_test, use_features):
        """Train LightGBM with event-grouped cross-validation.

        Parameters
        ----------
        df_train : pandas.DataFrame
            Training rows. Must contain the columns in ``use_features`` plus
            'flag' (binary target) and 'event_id' (cross-validation group key).
        df_test : pandas.DataFrame
            Rows to score. Must contain the columns in ``use_features``.
        use_features : list of str
            Names of the feature columns.

        Returns
        -------
        y_pred : numpy.ndarray, shape (len(df_test),)
            Test predictions averaged over the folds.
        oof_pred : numpy.ndarray, shape (len(df_train),)
            Out-of-fold predictions for the training rows.
        """
        target = 'flag'
        # Booster settings are the same for every fold, so they are built once.
        params = {
            'learning_rate': 0.2,
            'metric': 'auc',
            'objective': 'binary',
            'feature_fraction': 0.80,
            'bagging_fraction': 0.75,
            'bagging_freq': 2,
            'n_jobs': -1,
            'seed': 1029,
            'max_depth': 8,
            'num_leaves': 64,
            'lambda_l1': 0.5,
            'lambda_l2': 0.5
        }
        oof_pred = np.zeros((len(df_train),))
        y_pred = np.zeros((len(df_test),))
        # Original author's note: 6 folds scored slightly better than 5, and more
        # folds are worth trying given time and hardware. The value used here is 2.
        folds = GroupKFold(n_splits=2)
        # Split the frame that was passed in (not a notebook-level global).
        # Grouping by event_id keeps all rows of one event in the same fold.
        fold_iter = folds.split(df_train, df_train[target], df_train['event_id'])
        for fold, (tr_ind, val_ind) in enumerate(fold_iter):
            start_time = time.time()
            print(f'Fold {fold + 1}')
            x_train, x_val = df_train[use_features].iloc[tr_ind], df_train[use_features].iloc[val_ind]
            y_train, y_val = df_train[target].iloc[tr_ind], df_train[target].iloc[val_ind]
            train_set = lgb.Dataset(x_train, y_train)
            val_set = lgb.Dataset(x_val, y_val)
            # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of
            # older lightgbm.train signatures; newer releases expect callbacks - confirm
            # against the installed version.
            model = lgb.train(params,
                              train_set,
                              num_boost_round=5000,
                              early_stopping_rounds=100,
                              valid_sets=[train_set, val_set],
                              verbose_eval=100)
            oof_pred[val_ind] = model.predict(x_val)
            # Each fold contributes an equal share of the final test prediction.
            y_pred += model.predict(df_test[use_features]) / folds.n_splits

            print("Features importance...")
            gain = model.feature_importance('gain')
            feat_imp = pd.DataFrame({'feature': model.feature_name(),
                                     'split': model.feature_importance('split'),
                                     'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
            display(feat_imp)
            used_time = (time.time() - start_time) / 3600
            print(f'used_time: {used_time:.2f} hours')
            # Free the per-fold copies before the next fold is materialised.
            del x_train, x_val, y_train, y_val, train_set, val_set
            gc.collect()
        return y_pred, oof_pred


In [None]:
# run_lgb reads the target ('flag') and the group key ('event_id') from the
# training frame, but 'flag' was moved out of trainpd into `labels` earlier,
# so it is re-attached here. labels and trainpd share the same row index.
train = trainpd.assign(flag=labels)
test = testpd
use_features = feature

y_pred, oof_pred = run_lgb(train, test, use_features)
score = roc_auc_score(train['flag'], oof_pred)
print('auc: ', score)

# The submission cell reads `fy_submission`. With the stacking / blending cells
# switched off, the LightGBM test predictions are what gets submitted; those
# cells overwrite this name when they are enabled.
fy_submission = y_pred


In [1]:
if "stack" != "stack":  # always False: stacking is switched off; edit the comparison to enable it
    # Base models whose predictions become the inputs of the second-level model.
    clfs = [
        RandomForestClassifier(n_estimators=1000, max_depth=8, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=1000, max_depth=8, n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=1000, max_depth=8, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=1000, max_depth=8, n_jobs=-1, criterion='entropy'),
        # probability=True is required for SVC to provide predict_proba, which
        # is called on every base model below.
        svm.SVC(C=0.1, kernel='linear', decision_function_shape='ovr', probability=True),
        xgb.XGBClassifier(tree_method='gpu_hist', max_depth=8, learning_rate=0.1, verbosity=3,
                                eval_metric='auc', n_estimators=1000),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=8, n_estimators=1000)
    ]
    # Hold out a third of the training rows to validate the stack.
    train1pd, train2pd, label1s, label2s = train_test_split(trainpd[feature], labels, test_size=0.33, random_state=2017)
    dataset_blend_1train = np.zeros((train1pd.shape[0], len(clfs)))
    dataset_blend_2train = np.zeros((train2pd.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((testpd.shape[0], len(clfs)))
    # K-fold stacking (the original note said 5 folds; n_folds is 2 here).
    n_folds = 2
    # NOTE(review): StratifiedKFold(y, n_folds) is the legacy sklearn.cross_validation
    # signature, which is the one bound by the last StratifiedKFold import at the top
    # of the notebook. With sklearn.model_selection this would be
    # StratifiedKFold(n_splits=n_folds).split(train1pd, label1s).
    skf = list(StratifiedKFold(label1s, n_folds))
    for j, clf in enumerate(clfs):
        # Train each base model in turn.
        # print(j, clf)
        dataset_blend_2train_j = np.zeros((train2pd.shape[0], len(skf)))
        dataset_blend_test_j = np.zeros((testpd.shape[0], len(skf)))
        for i, (train_index, test_index) in enumerate(skf):
            # Fit on every fold except i and predict fold i; those predictions are
            # the new feature for the rows of fold i.
            # print("Fold", i)
            # Fold indices are row positions inside train1pd / label1s, whose index
            # labels are no longer 0..n-1 after train_test_split, hence .iloc. Both
            # the fitting part and the predicted part come from train1pd.
            X_d1, y_d1 = train1pd.iloc[train_index], label1s.iloc[train_index]
            X_d2 = train1pd.iloc[test_index]
            clf.fit(X_d1, y_d1)
            dataset_blend_1train[test_index, j] = clf.predict_proba(X_d2)[:, 1]
            dataset_blend_2train_j[:, i] = clf.predict_proba(train2pd)[:, 1]
            dataset_blend_test_j[:, i] = clf.predict_proba(testpd[feature])[:, 1]
        # For the held-out and test rows, the new feature is the mean of the
        # k fold-models' predictions.
        dataset_blend_2train[:, j] = dataset_blend_2train_j.mean(1)
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
        print("val auc Score: %f" % roc_auc_score(label2s, dataset_blend_2train[:, j]))


In [None]:
if "stack" != "stack":  # always False: stacking is switched off; edit the comparison to enable it
    # Second-level model, fitted on the base models' out-of-fold predictions.
    # clf = LogisticRegression()
    clf2 = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=30)
    clf2.fit(dataset_blend_1train, label1s)
    y_submission = clf2.predict_proba(dataset_blend_2train)[:, 1]
    fy_submission = clf2.predict_proba(dataset_blend_test)[:, 1]

    print("Linear stretch of predictions to [0,1]")
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
    print("blend result")
    # y_submission scores the held-out rows (dataset_blend_2train), whose true
    # labels are label2s.
    print("val auc Score: %f" % (roc_auc_score(label2s, y_submission)))
    fy_submission = (fy_submission - fy_submission.min()) / (fy_submission.max() - fy_submission.min())


In [None]:
if "blend" != "blend":  # always False: blending is switched off; edit the comparison to enable it
    # Base models whose predictions become the inputs of the blending model.
    clfs = [
        RandomForestClassifier(n_estimators=1000, max_depth=8, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=1000, max_depth=8, n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=1000, max_depth=8, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=1000, max_depth=8, n_jobs=-1, criterion='entropy'),
        # probability=True is required for SVC to provide predict_proba, which
        # is called on every base model below.
        svm.SVC(C=0.1, kernel='linear', decision_function_shape='ovr', probability=True),
        xgb.XGBClassifier(tree_method='gpu_hist', max_depth=8, learning_rate=0.1, verbosity=3,
                                eval_metric='auc', n_estimators=1000),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=8, n_estimators=1000)
    ]
    # Split the training data into two parts, d1 and d2.
    X_d1, X_d2, y_d1, y_d2 = train_test_split(trainpd[feature], labels, test_size=0.33, random_state=2017)
    # dataset_d1 receives predictions for the X_d2 rows and dataset_d2 receives
    # predictions for the test rows, so they are sized from those frames (not
    # from the stacking cell's train1pd / train2pd).
    dataset_d1 = np.zeros((X_d2.shape[0], len(clfs)))
    dataset_d2 = np.zeros((testpd.shape[0], len(clfs)))
#     dataset_test = np.zeros((testpd.shape[0], len(clfs)))
    for j, clf in enumerate(clfs):
        # Train each base model in turn on part d1.
        print(j, clf)
        clf.fit(X_d1, y_d1)
        # Predictions on part d2 are the new features the blending model is fitted on.
        dataset_d1[:, j] = clf.predict_proba(X_d2)[:, 1]
        # For the test set, each base model's prediction is used directly as a new feature.
        dataset_d2[:, j] = clf.predict_proba(testpd[feature])[:, 1]
#         dataset_test[:, j] = clf.predict_proba(testpd[feature])[:, 1]
        print("val auc Score: %f" % roc_auc_score(y_d2, dataset_d1[:, j]))
    

In [None]:
if "blend" != "blend":
    '''模型融合中使用到的各个单模型'''
    '''融合使用的模型'''
    # The two string literals above are the original notes ("base models used in
    # the ensemble" / "model used for blending"); they have no runtime effect.
    # clf = LogisticRegression()
    clf2 = GradientBoostingClassifier(n_estimators=30, max_depth=6, subsample=0.5, learning_rate=0.02)
    # Fit the blending model on the base models' predictions for part d2.
    clf2.fit(dataset_d1, y_d2)
#     y_submission = clf2.predict_proba(dataset_d2)[:, 1]
    blended_scores = clf2.predict_proba(dataset_d2)[:, 1]

#     print("Linear stretch of predictions to [0,1]")
#     y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
#     print("blend result")
#     print("val auc Score: %f" % (roc_auc_score(y_d2, y_submission)))
    # Min-max stretch of the test scores onto [0, 1].
    score_min, score_max = blended_scores.min(), blended_scores.max()
    fy_submission = (blended_scores - score_min) / (score_max - score_min)
    

In [2]:
# Decision threshold: roughly 0.2-0.4 works; the task is sensitive to recall,
# so the threshold can be lowered a little.
thre = 0.25

# Build the submission file: one row per test hit with its binary prediction.
sub = pd.DataFrame()
sub['hit_id'] = testpd['hit_id']
# Vectorised thresholding: 1 where the score reaches the threshold, else 0.
sub['flag_pred'] = (np.asarray(fy_submission) >= thre).astype(int)
sub['event_id'] = testpd['event_id']
# The file name has no placeholder, so the former .format(...) call on the
# path did nothing and has been removed.
sub.to_csv(os.path.join(pathf, "subsample.csv"), index=False)

NameError: name 'pd' is not defined