In [None]:
import os
import numpy as np
import random
import pandas as pd
import datatable as dt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tabnet import TabNet, TabNetClassifier, TabNetRegression
from tabnet import StackedTabNetClassifier
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Input/output locations, all resolved relative to one base data directory.
bpath = os.path.join("..", "nocode", "fin")
trainfile = os.path.join(bpath, "train.csv")
testfile = os.path.join(bpath, "example_test.csv")
featfile = os.path.join(bpath, "features.csv")
resfile = os.path.join(bpath, "example_sample_submission.csv")
modelfile = os.path.join(bpath, 'weights')


def set_all_seeds(seed):
    """Seed NumPy, Python's `random` module and TensorFlow with the same value.

    Parameters
    ----------
    seed : int
        Seed passed unchanged to each library's global seeding function.
    """
    np.random.seed(seed)
    random.seed(seed)
    # TensorFlow 2.x exposes the global seed as `tf.random.set_seed`; the
    # TensorFlow 1.x spelling `tf.set_random_seed` no longer exists there.
    # Use whichever one the installed version provides.
    if hasattr(tf.random, "set_seed"):
        tf.random.set_seed(seed)
    else:
        tf.set_random_seed(seed)


SEED = 33
set_all_seeds(SEED)

### 1. 数据处理

In [None]:
# Memory-optimisation helper
def reduce_mem_usage(df):
    """Downcast signed-integer and float columns to the narrowest dtype that fits.

    For every signed-integer column the smallest of int8/int16/int32/int64
    whose range contains the column's min and max is chosen. For every float
    column float16 is tried first, then float32, falling back to float64.
    Columns of any other dtype (object, category, bool, unsigned integers,
    datetimes, ...) are left untouched.

    The frame is modified in place and also returned. Memory usage before
    and after is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be downcast.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype.name
        is_int = col_type[:3] == 'int'
        is_float = col_type[:5] == 'float'
        if not (is_int or is_float):
            # Only signed ints and floats are downcast; sending other dtypes
            # through the float branch would turn e.g. bool into float16.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if is_int:
            # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Strict bounds on purpose: +/-inf and NaN extremes fail both
            # comparisons and therefore end up in float64.
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
# Data-splitting helper
class PurgedGroupTimeSeriesSplitStacking(_BaseKFold):
    """Group-based time-series splitter with gaps and an optional stacking hold-out.

    Samples are split by whole groups, taken in order of first appearance in
    `groups`. In standard mode each split yields ``(train, val)``; in stacking
    mode each split yields ``(train, val, test)``. A configurable number of
    groups is skipped between consecutive windows.

    Parameters
    ----------
    n_splits : int, default=5
        Number of splits to generate.
    stacking_mode : bool, default=True
        If True, `split` delegates to `split_ensemble` (3-way split),
        otherwise to `split_standard` (2-way split).
    max_train_group_size : int, default=np.inf
        Maximum number of groups in a training window. In stacking mode this
        must be finite: it is added to the gap and used as a slice bound, and
        slicing with ``inf`` raises ``TypeError``.
    max_val_group_size : int, default=np.inf
        Maximum number of groups in a validation window.
    max_test_group_size : int, default=np.inf
        Maximum number of groups in a test window (stacking mode only).
        ``None`` falls back to `max_val_group_size`.
    val_group_gap : int, default=None
        Number of groups skipped between the training and validation
        windows. ``None`` is treated as 0.
    test_group_gap : int, default=None
        Number of groups skipped between the validation and test windows.
        ``None`` falls back to `val_group_gap`.
    verbose : bool, default=False
        Stored on the instance; not used by the splitting logic.
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 stacking_mode=True,
                 max_train_group_size=np.inf,
                 max_val_group_size=np.inf,
                 max_test_group_size=np.inf,
                 val_group_gap=None,
                 test_group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.max_val_group_size = max_val_group_size
        self.max_test_group_size = max_test_group_size
        self.val_group_gap = val_group_gap
        self.test_group_gap = test_group_gap
        self.verbose = verbose
        self.stacking_mode = stacking_mode

    @staticmethod
    def _index_groups(groups):
        """Return (groups in first-appearance order, {group label: [row positions]})."""
        groups = np.asarray(groups)
        labels, first_pos = np.unique(groups, return_index=True)
        unique_groups = labels[np.argsort(first_pos)]
        group_dict = {}
        # Positions (not index labels) are recorded, because the yielded
        # indices are used for positional indexing by callers.
        for pos, label in enumerate(groups):
            group_dict.setdefault(label, []).append(pos)
        return unique_groups, group_dict

    @staticmethod
    def _gather_indices(group_dict, group_labels):
        """Return the sorted, de-duplicated row positions of the given groups as a list of int."""
        chunks = [group_dict[label] for label in group_labels]
        if not chunks:
            # An empty window yields an empty list rather than raising.
            return []
        return np.unique(np.concatenate(chunks)).tolist()

    def split(self, X, y=None, groups=None):
        """Dispatch to `split_ensemble` or `split_standard` depending on `stacking_mode`."""
        if self.stacking_mode:
            return self.split_ensemble(X, y, groups)
        else:
            return self.split_standard(X, y, groups)

    def split_standard(self, X, y=None, groups=None):
        """Generate indices to split data into training and validation set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/validation set.

        Yields
        ------
        train : list of int
            The training set indices for that split.
        val : list of int
            The validation set indices for that split.

        Raises
        ------
        ValueError
            If `groups` is None or there are fewer groups than ``n_splits + 1``.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_splits = self.n_splits
        group_gap = 0 if self.val_group_gap is None else self.val_group_gap
        max_val_group_size = self.max_val_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1

        unique_groups, group_dict = self._index_groups(groups)
        n_groups = _num_samples(unique_groups)
        if n_folds > n_groups:
            raise ValueError(("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds, n_groups))

        group_val_size = min(n_groups // n_folds, max_val_group_size)
        group_val_starts = range(n_groups - n_splits * group_val_size, n_groups, group_val_size)
        for group_val_start in group_val_starts:
            train_stop = group_val_start - group_gap
            train_start = max(0, train_stop - max_train_group_size)
            train_array = self._gather_indices(
                group_dict, unique_groups[train_start:train_stop])

            val_array = self._gather_indices(
                group_dict, unique_groups[group_val_start: group_val_start + group_val_size])
            # Drops the first `group_gap` sample indices (not groups) of the
            # validation window, exactly as the original implementation did.
            val_array = val_array[group_gap:]

            yield train_array, val_array

    def split_ensemble(self, X, y=None, groups=None):
        """Generate indices to split data into training, validation and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : list of int
            The training set indices for that split.
        val : list of int
            The validation set indices for that split (testing indices for base classifiers).
        test : list of int
            The testing set indices for that split (testing indices for final classifier)

        Raises
        ------
        ValueError
            If `groups` is None or there are fewer groups than ``n_splits + 1``.
        """
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None")

        X, y, groups = indexable(X, y, groups)
        n_splits = self.n_splits
        val_group_gap = 0 if self.val_group_gap is None else self.val_group_gap
        test_group_gap = self.test_group_gap
        if test_group_gap is None:
            test_group_gap = val_group_gap
        max_train_group_size = self.max_train_group_size
        max_val_group_size = self.max_val_group_size
        max_test_group_size = self.max_test_group_size
        if max_test_group_size is None:
            max_test_group_size = max_val_group_size

        n_folds = n_splits + 1
        unique_groups, group_dict = self._index_groups(groups)
        n_groups = _num_samples(unique_groups)
        if n_folds > n_groups:
            raise ValueError(("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds, n_groups))

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size, n_groups, group_test_size)

        for group_test_start in group_test_starts:
            val_stop = group_test_start - test_group_gap
            # The validation window never starts before a full training
            # window plus its gap could have been placed ahead of it.
            val_group_st = max(max_train_group_size + val_group_gap,
                               val_stop - max_val_group_size)
            train_stop = val_group_st - val_group_gap
            train_group_st = max(0, train_stop - max_train_group_size)

            train_array = self._gather_indices(
                group_dict, unique_groups[train_group_st:train_stop])

            val_array = self._gather_indices(
                group_dict, unique_groups[val_group_st:val_stop])
            # As in the original: trims sample indices, not groups.
            val_array = val_array[val_group_gap:]

            test_array = self._gather_indices(
                group_dict, unique_groups[group_test_start:(group_test_start + group_test_size)])
            test_array = test_array[test_group_gap:]

            yield train_array, val_array, test_array


In [None]:
# 1. Load the training file
trainpd = pd.read_csv(trainfile, header=0, encoding="utf8")
trainpd = trainpd.reset_index(drop=True)

# 2. Missing-value handling
# Only the columns whose name contains 'feature' are model inputs. Using
# every column would feed `resp` (from which the label is derived below)
# straight into X.
features = [c for c in trainpd.columns if 'feature' in c]
assert features, "no column containing 'feature' found in the training file"
trainpd[features] = trainpd[features].bfill().fillna(0)
# Element-wise `&` is required here: `and` between two Series raises ValueError.
trainpd['action'] = ((trainpd['resp'] > 0) & (trainpd['weight'] != 0)).astype('int8')

# 3. Shrink memory footprint
trainpd = reduce_mem_usage(trainpd)

# 4. Inputs, label and grouping key for training
X = trainpd.loc[:, features]
y = trainpd.loc[:, 'action']
g = trainpd.loc[:, 'date']
X = np.array(X)
y = np.array(y)

N_SPLITS = 5
STACKING_MODE = True
VAL_GROUP_GAP = 20  # Days between end of training set and start of validation set
TEST_GROUP_GAP = 20  # Days between end of validation set and start of testing/stacking set
MAX_DAYS_TRAIN = 120
# A value of 0 makes every validation window in stacking mode empty, which
# breaks the per-fold min()/max() reporting in the training cell.
MAX_DAYS_VAL = 60
MAX_DAYS_TEST = 60
RANDOM_SEED = 28

# 5. Random hold-out split, used by the ensemble/voting section
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_SEED)

cv = PurgedGroupTimeSeriesSplitStacking(n_splits=N_SPLITS,
                                        stacking_mode=STACKING_MODE,
                                        max_train_group_size=MAX_DAYS_TRAIN, max_val_group_size=MAX_DAYS_VAL,
                                        max_test_group_size=MAX_DAYS_TEST, val_group_gap=VAL_GROUP_GAP,
                                        test_group_gap=TEST_GROUP_GAP)


### 数据集绘图

In [None]:
# Plotting helper
def plot_cv_indices_stacking(cv, X, y, group, ax, n_splits, lw=10):
    """Create a sample plot for indices of a cross-validation object.

    One horizontal band is drawn per split produced by `cv.split`, coloured
    by role (train = 1, validation = 0, and in stacking mode test = -1;
    samples in no set are NaN). Two further bands show `y` and `group`.

    Parameters
    ----------
    cv : splitter
        Object exposing `split(X=..., y=..., groups=...)` and a boolean
        `stacking_mode` attribute.
    X, y, group : array-like of length n_samples
        Data, target and group labels passed to `cv.split`; `y` and `group`
        are also used to colour the two summary bands.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    n_splits : int
        Number of CV iterations, used for tick labels and the y-limits.
    lw : int, default=10
        Line width of the markers.

    Returns
    -------
    matplotlib.axes.Axes
        The axes that was passed in.
    """
    cmap_cv = plt.cm.coolwarm

    # `plt.cm.get_cmap` was removed from recent matplotlib releases;
    # `plt.get_cmap` accepts the same (name, lut) arguments.
    jet = plt.get_cmap('jet', 256)
    seq = np.linspace(0, 1, 256)
    np.random.shuffle(seq)  # in place, so neighbouring groups get distinct colours
    cmap_data = ListedColormap(jet(seq))

    n_samples = len(X)
    sample_pos = range(n_samples)
    n_rows = 0  # number of CV bands drawn; stays 0 if the splitter yields nothing

    # Generate the training/validation/test visualizations for each CV split
    for ii, indices_split in enumerate(cv.split(X=X, y=y, groups=group)):
        indices = np.full(n_samples, np.nan)
        indices[indices_split[0]] = 1
        indices[indices_split[1]] = 0
        if cv.stacking_mode:
            indices[indices_split[2]] = -1

        ax.scatter(sample_pos, [ii + .5] * n_samples, c=indices, marker='_', lw=lw,
                   cmap=cmap_cv, vmin=-.2, vmax=1.2)
        n_rows = ii + 1

    # Plot the data classes and groups at the end. The original drew the
    # group band a second time in stacking mode, but that band fell outside
    # the y-limits set below, so it is not drawn here.
    ax.scatter(sample_pos, [n_rows + .5] * n_samples, c=y, marker='_', lw=lw, cmap=plt.cm.Set3)
    ax.scatter(sample_pos, [n_rows + 1.5] * n_samples, c=group, marker='_', lw=lw, cmap=cmap_data)

    # Formatting
    yticklabels = list(range(n_splits)) + ['target', 'day']
    ax.set(yticks=np.arange(n_splits + 2) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits + 2.2, -.2], xlim=[0, len(y)])

    # The original looked the title up in an undefined `name_dict`; derive it
    # from the splitter itself instead.
    mode_label = 'stacking mode' if cv.stacking_mode else 'standard mode'
    ax.set_title('{} ({})'.format(type(cv).__name__, mode_label), fontsize=15)
    return ax

# Draw the CV layout for the training frame. `trainpd`, `features`, `cv` and
# `N_SPLITS` are defined in the data-preparation cell.
fig, ax = plt.subplots(1, 1, figsize=(20, 12))
plot_cv_indices_stacking(cv, trainpd[features], trainpd['action'], trainpd['date'], ax, N_SPLITS, lw=20);

In [None]:
# testpd = pd.read_csv(testfile, header=0, encoding="utf8")
# testpd

In [None]:
# featpd = pd.read_csv(featfile, header=0, encoding="utf8")
# featpd

In [None]:
# respd = pd.read_csv(resfile, header=0, encoding="utf8")
# respd

### 2. 模型训练

In [None]:
# Build the TabNet classifier.
# NOTE(review): the original cell held README placeholders
# (`TabNetClassifier(feature_list, num_classes, ..., dynamic=True)`, a second
# `StackedTabNetClassifier(...)` assignment and `print(model.tabnet.*)`), which
# do not parse or run. The arguments below follow the tf-TabNet README;
# confirm them against the installed tabnet version.
NUM_CLASSES = 2  # `action` is a 0/1 label
model = TabNetClassifier(feature_columns=None, num_classes=NUM_CLASSES,
                         num_features=X.shape[1], dynamic=True)
# Integer class labels, so the sparse variant of cross-entropy is used.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


def _day_span(row_idx):
    """Return the 'date' at the smallest and largest row index, or (None, None) if empty."""
    if len(row_idx) == 0:
        return None, None
    return trainpd.loc[min(row_idx), 'date'], trainpd.loc[max(row_idx), 'date']


# Training: one fit per purged fold (stacking mode yields 3-tuples).
for fold, (train_idx, val_idx, test_idx) in enumerate(cv.split(X, y, g)):
    print("FOLD: {}\n".format(fold))
    print("First train_day: {}\t Last train_day: {} \n".format(*_day_span(train_idx)))
    print("First val_day: {}\t Last val_day: {} \n".format(*_day_span(val_idx)))
    print("First test_day: {}\t Last test_day: {} \n\n\n".format(*_day_span(test_idx)))

    if len(val_idx) > 0:
        # Validate on the purged validation window produced by the splitter.
        fit_kwargs = {'validation_data': (X[val_idx].astype(np.float32), y[val_idx])}
    else:
        # No validation window for this fold: hold out part of the training rows.
        fit_kwargs = {'validation_split': 0.2}
    model.fit(X[train_idx].astype(np.float32), y[train_idx], **fit_kwargs)

# Persist the trained weights, then reload them so the checkpoint is known to round-trip.
model.save_weights(modelfile)
model.load_weights(modelfile)

# Model prediction on the hold-out rows.
# NOTE(review): X_test comes from a random split of the same rows used for
# the folds above, so this is not an out-of-sample estimate.
test_pred = model.predict(X_test.astype(np.float32))
# clf = TabNetClassifier()
# clf.fit(
#     X_train, y_train,
#     eval_set=[(X_test, y_test)], max_epochs=2  # Change this to increase the accuracy
# )

# Visualisation: write the TabNet feature-selection masks as image summaries
# under "logs/" so they can be inspected with TensorBoard.
writer = tf.summary.create_file_writer("logs/")
with writer.as_default():
    # One image per entry of `feature_selection_masks`, numbered from 1.
    step_masks = model.tabnet.feature_selection_masks
    for mask_no, mask in enumerate(step_masks, start=1):
        print("Saving mask {} of shape {}".format(mask_no, mask.shape))
        summary_tag = 'mask_at_iter_{}'.format(mask_no)
        tf.summary.image(summary_tag, data=mask, step=0, max_outputs=1)
        writer.flush()

    # The aggregate mask is written once, after the per-step masks.
    agg_mask = model.tabnet.aggregate_feature_selection_mask
    print("Saving aggregate mask of shape", agg_mask.shape)
    tf.summary.image("Aggregate Mask", data=agg_mask, step=0, max_outputs=1)
    writer.flush()
writer.close()

### 集成学习 voting

In [None]:
# Candidate models, each paired with the label printed next to its accuracy.
# NOTE(review): XGBClassifier and CatBoostClassifier are not imported anywhere
# in this notebook; add `from xgboost import XGBClassifier` and
# `from catboost import CatBoostClassifier` to the import cell before running.
classifiers = [['Neural Network :', MLPClassifier(max_iter=1000)],
               ['LogisticRegression :', LogisticRegression(max_iter=1000)],
               ['ExtraTreesClassifier :', ExtraTreesClassifier()],
               ['DecisionTree :', DecisionTreeClassifier()],
               ['RandomForest :', RandomForestClassifier()],
               ['Naive Bayes :', GaussianNB()],
               ['KNeighbours :', KNeighborsClassifier()],
               ['SVM :', SVC()],
               ['AdaBoostClassifier :', AdaBoostClassifier()],
               ['GradientBoostingClassifier: ', GradientBoostingClassifier()],
               ['XGB :', XGBClassifier()],
               ['CatBoost :', CatBoostClassifier(logging_level='Silent')]]

# One column per model, next to the true label, for side-by-side comparison.
predictions_df = pd.DataFrame({'action': y_test})

for name, classifier in classifiers:
    classifier.fit(X_train, y_train.ravel())
    # Flatten in case an estimator returns a column vector.
    predictions = np.ravel(classifier.predict(X_test))
    predictions_df[name.strip(" :")] = predictions
    print(name, accuracy_score(y_test, predictions))

# Soft-voting ensemble of three of the candidates.
clf1 = ExtraTreesClassifier()
clf2 = CatBoostClassifier(logging_level='Silent')
clf3 = RandomForestClassifier()
eclf1 = VotingClassifier(estimators=[('ExTrees', clf1), ('CatBoost', clf2), ('RF', clf3)], voting='soft')
eclf1.fit(X_train, y_train)
predictions = eclf1.predict(X_test)
print(accuracy_score(y_test, predictions))

# Mean 10-fold accuracy of each ensemble member on the training split.
c = [cross_val_score(member, X_train, y_train, scoring='accuracy', cv=10).mean()
     for member in (clf1, clf2, clf3)]
print(c)

### 3. 预测评分

### 线上预测

In [None]:
import janestreet
# Create the environment object; the submission cell below drives it through
# `env.iter_test()` and `env.predict(...)`.
env = janestreet.make_env()

In [None]:
%%time
# Set to False to skip the submission loop entirely.
I_WANT_TO_SUBMIT = True
PROGRESS_EVERY = 10000  # print a progress line roughly every this many rows
rcount = 0
score_u = 0
if I_WANT_TO_SUBMIT:
    next_report = PROGRESS_EVERY
    for (test_df, prediction_df) in env.iter_test():
        # TODO: replace the constant below with the trained model's prediction
        # for `test_df`; every row is currently submitted with action 0.
        prediction_df.action = 0
        env.predict(prediction_df)
        rcount += len(test_df.index)
        # Report progress occasionally instead of dumping both frames and the
        # counter on every iteration.
        if rcount >= next_report:
            print(rcount)
            next_report += PROGRESS_EVERY
    print(f'Finished processing {rcount} rows.')