Import modules

In [2]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier, StackingClassifier
from sklearn.ensemble._hist_gradient_boosting import loss
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder


Custom Loss Function for HistGradientBoostingClassifier

In [3]:
class MyBinaryCrossEntropy(loss.BinaryCrossEntropy):
    """Binary cross-entropy loss whose reported value is ``1 - F1``.

    Only ``__call__`` is overridden; every other method is inherited from
    ``loss.BinaryCrossEntropy``.

    NOTE(review): presumably the gradients/hessians used to grow the trees
    still come from the parent cross-entropy, so this class only changes the
    value reported when the loss is evaluated -- confirm against the
    scikit-learn version in use.
    """

    def __init__(self):

        super().__init__()

    def __call__(self, y_true, raw_predictions, average=True):
        """Return ``1 - F1`` for the class labels implied by the raw scores.

        Parameters
        ----------
        y_true : array-like
            True binary labels.
        raw_predictions : ndarray
            Raw (pre-sigmoid) scores; flattened to 1-D before use.
        average : bool, default=True
            Accepted for signature compatibility only. F1 is an aggregate
            metric, so a single scalar is returned whatever the value.

        Returns
        -------
        float
            ``1 - f1_score(y_true, predicted_labels)``.
        """
        # Flatten to (n_samples,); the upstream implementation notes the
        # incoming shape is (1, n_samples).
        raw_predictions = raw_predictions.reshape(-1)
        # f1_score only accepts discrete labels, so the raw scores must be
        # thresholded first. Assuming they are log-odds (as in the parent
        # loss), score > 0 is the same as predicted probability > 0.5.
        predicted_labels = (raw_predictions > 0).astype(int)
        return 1 - f1_score(y_true, predicted_labels)

def f1_loss(y_true, y_pred):
    """Return the complement of the F1 score, ``1 - F1`` (lower is better)."""
    score = f1_score(y_true, y_pred)
    return 1 - score

class MyHistGradBoost(HistGradientBoostingClassifier):
    """HistGradientBoostingClassifier that always uses ``MyBinaryCrossEntropy``.

    The constructor takes the same keyword arguments it forwards to
    ``HistGradientBoostingClassifier``. ``_get_loss`` is overridden to return
    a ``MyBinaryCrossEntropy`` instance unconditionally, so the ``loss``
    argument does not influence which loss object is returned.

    NOTE(review): because the loss is hard-wired to the binary variant, this
    class is presumably only meaningful for two-class targets -- confirm.
    """

    def __init__(self, loss='auto', learning_rate=0.1, max_iter=100,
                 max_leaf_nodes=31, max_depth=None, min_samples_leaf=20,
                 l2_regularization=0., max_bins=255, warm_start=False,
                 scoring=None, validation_fraction=0.1, n_iter_no_change=None,
                 tol=1e-7, verbose=0, random_state=None):
        # Delegate to the direct parent rather than skipping over it with
        # super(HistGradientBoostingClassifier, self): the parent's own
        # initializer then always runs, and this class no longer depends on
        # the signature of the private base class.
        super().__init__(
            loss=loss, learning_rate=learning_rate, max_iter=max_iter,
            max_leaf_nodes=max_leaf_nodes, max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            l2_regularization=l2_regularization, max_bins=max_bins,
            warm_start=warm_start, scoring=scoring,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose,
            random_state=random_state)

    def _get_loss(self):
        """Return the custom loss object, ignoring ``self.loss``."""
        return MyBinaryCrossEntropy()


Process data

In [4]:
def _split_chars(X):
    """Turn every sequence string in ``X`` into a list of its characters."""
    return [list(seq) for seq in X]


def encode_seq(X, enc=None):
    """One-hot encode sequences, one categorical feature per character position.

    Parameters
    ----------
    X : iterable of str
        Sequences to encode (e.g. a pandas Series of strings).
    enc : fitted encoder, optional
        An already-fitted encoder exposing ``transform``. When given, it is
        used as-is (not refitted), so the output columns match the data it was
        fitted on. When omitted, a new
        ``OneHotEncoder(handle_unknown='ignore')`` is fitted on ``X`` itself,
        which is the original behaviour.

    Returns
    -------
    ndarray
        Dense one-hot matrix produced by ``enc.transform(...).toarray()``.
    """
    rows = _split_chars(X)

    if enc is None:
        enc = OneHotEncoder(handle_unknown='ignore')
        enc.fit(rows)

    return enc.transform(rows).toarray()


def read_data():
    """Load ``train.csv`` / ``test.csv`` and one-hot encode their sequences.

    The encoder is fitted on the training sequences only and then reused for
    the test sequences. Fitting a second encoder on the test set could give
    the two matrices different columns (or a different column order) whenever
    the sets of characters seen at some position differ.

    Returns
    -------
    tuple
        ``(train_X, train_y, test_X)``: encoded training features, the
        ``'Active'`` column of the training file, and encoded test features.
    """
    # Read data
    train_raw = pd.read_csv('train.csv')
    test_raw = pd.read_csv('test.csv')

    # Extract data
    train_X_raw = train_raw['Sequence']
    train_y = train_raw['Active']
    test_X_raw = test_raw['Sequence']

    # Fit once on the training sequences; characters that only occur in the
    # test set are encoded as all-zeros thanks to handle_unknown='ignore'.
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(_split_chars(train_X_raw))

    # Process sequences with the shared encoder
    train_X = encode_seq(train_X_raw, enc)
    test_X = encode_seq(test_X_raw, enc)

    return train_X, train_y, test_X

In [5]:
# Load the encoded training features, the training labels and the encoded
# test features in one call.
(train_X, train_y, test_X) = read_data()

Train Model

In [9]:
# Fixed seed so that a fresh "Restart & Run All" trains the same model.
RANDOM_STATE = 0

# Stack estimators: an MLP and the custom-loss gradient-boosting classifier.
estimators = [
    ('mlp', MLPClassifier(hidden_layer_sizes=(80,), random_state=RANDOM_STATE)),
    ('hgb', MyHistGradBoost(max_iter=500, random_state=RANDOM_STATE)),
]

clf = StackingClassifier(estimators=estimators, n_jobs=2)
clf.fit(train_X, train_y)

# f1_score takes (y_true, y_pred) in that order.
# NOTE: this is computed on the training data itself, so it is an optimistic
# estimate rather than a held-out score.
print('Score is: %.3f\n' % f1_score(train_y, clf.predict(train_X)))

Score is: 0.986



Predict

In [10]:
# Predict labels for the test features and write them to output.csv using
# integer formatting.
test_y = clf.predict(test_X)
np.savetxt(fname='output.csv', X=test_y, fmt='%d')