Import modules

In [78]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

Process data

In [79]:
def encode_seq(X):
    # Encode sequence with one-hot

    enc = OneHotEncoder(handle_unknown='ignore')
    new_X = list(X.copy().apply(list))

    enc.fit(new_X)
    enc_X = enc.transform(new_X).toarray()

    return enc_X


def read_data():

    # Read data
    train_raw = pd.read_csv('train.csv')
    test_raw = pd.read_csv('test.csv')

    # Extract data
    train_X_raw = train_raw['Sequence']
    train_y = torch.tensor(train_raw['Active'])

    test_X_raw = test_raw['Sequence']

    # Process sequence
    train_X = torch.tensor(encode_seq(train_X_raw))
    test_X = torch.tensor(encode_seq(test_X_raw))

    return train_X, train_y, test_X

In [80]:
train_X, train_y, test_X = read_data()

Using F1 Loss from [this gist](https://gist.github.com/SuperShinyEyes/dcc68a08ff8b615442e3bc6a9b55a354)

In [81]:
class F1_Loss(nn.Module):
    '''Calculate F1 score. Can work with gpu tensors
    
    The original implmentation is written by Michal Haltuf on Kaggle.
    
    Returns
    -------
    torch.Tensor
        `ndim` == 1. epsilon <= val <= 1
    
    Reference
    ---------
    - https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric
    - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
    - https://discuss.pytorch.org/t/calculating-precision-recall-and-f1-score-in-case-of-multi-label-classification/28265/6
    - http://www.ryanzhang.info/python/writing-your-own-loss-function-module-for-pytorch/
    '''
    def __init__(self, epsilon=1e-7):
        super().__init__()
        self.epsilon = epsilon
        
    def forward(self, y_pred, y_true):
        assert y_pred.ndim == 2
        assert y_true.ndim == 1
        y_true = F.one_hot(y_true, 2).to(torch.float32)
        y_pred = F.softmax(y_pred, dim=1)
        
        tp = (y_true * y_pred).sum(dim=0).to(torch.float32)
        tn = ((1 - y_true) * (1 - y_pred)).sum(dim=0).to(torch.float32)
        fp = ((1 - y_true) * y_pred).sum(dim=0).to(torch.float32)
        fn = (y_true * (1 - y_pred)).sum(dim=0).to(torch.float32)

        precision = tp / (tp + fp + self.epsilon)
        recall = tp / (tp + fn + self.epsilon)

        f1 = 2* (precision*recall) / (precision + recall + self.epsilon)
        f1 = f1.clamp(min=self.epsilon, max=1-self.epsilon)
        
        return 1 - f1.mean()

Train the Neural Network

In [104]:
def train_NN(X, y, lr):
    
    net = nn.Sequential(nn.Linear(80, 40),
                        nn.ReLU(),
                        nn.Dropout(0.2),
                        nn.Linear(40, 1))
    
    loss_fn = F1_Loss()
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.5)
    
    # Set batch size
    batch_size = 100
    
    for epoch in range(10):  # loop over the dataset multiple times
    
        running_loss = 0.0
        for batch_idx in range(X.shape[0] // batch_size):

            # Reset gradient
            optimizer.zero_grad()

            # Forward
            y_pred = net(X[batch_size * batch_idx: batch_size * (batch_idx + 1) - 1, :].float())

            # Compute loss
            loss = loss_fn(y_pred, y[batch_size * batch_idx: batch_size * (batch_idx + 1) - 1])

            # Backward
            loss.backward()

            # Optimize
            optimizer.step()

            running_loss += loss.item()
            if batch_idx % 100 == 99:    # print every 100 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, batch_idx + 1, running_loss / 100))
                running_loss = 0.0

    return net

In [105]:
net = train_NN(train_X, train_y, 0.001)
torch.save(net.state_dict(), './model.pth')

[1,   100] loss: 0.475
[1,   200] loss: 0.475
[1,   300] loss: 0.475
[1,   400] loss: 0.473
[1,   500] loss: 0.474
[1,   600] loss: 0.471
[1,   700] loss: 0.474
[1,   800] loss: 0.473
[1,   900] loss: 0.473
[1,  1000] loss: 0.473
[1,  1100] loss: 0.474
[2,   100] loss: 0.475
[2,   200] loss: 0.475
[2,   300] loss: 0.475
[2,   400] loss: 0.473
[2,   500] loss: 0.474
[2,   600] loss: 0.471
[2,   700] loss: 0.474
[2,   800] loss: 0.473
[2,   900] loss: 0.473
[2,  1000] loss: 0.473
[2,  1100] loss: 0.474
[3,   100] loss: 0.475
[3,   200] loss: 0.475
[3,   300] loss: 0.475
[3,   400] loss: 0.473
[3,   500] loss: 0.474
[3,   600] loss: 0.471
[3,   700] loss: 0.474
[3,   800] loss: 0.473
[3,   900] loss: 0.473
[3,  1000] loss: 0.473
[3,  1100] loss: 0.474
[4,   100] loss: 0.475
[4,   200] loss: 0.475
[4,   300] loss: 0.475
[4,   400] loss: 0.473
[4,   500] loss: 0.474
[4,   600] loss: 0.471
[4,   700] loss: 0.474
[4,   800] loss: 0.473
[4,   900] loss: 0.473
[4,  1000] loss: 0.473
[4,  1100] 

Predict

In [102]:
test_y = net(test_X.float())
# test_y.to_csv('output.csv', index=False)
np.savetxt('output.csv', test_y.detach().numpy() > 0.5)