In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Data

In [2]:
# Read the labelled training set and the unlabelled test set
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

# Show the first ten rows of each dataframe
display(train_data.head(10), test_data.head(10))

Unnamed: 0,Sequence,Active
0,DKWL,0
1,FCHN,0
2,KDQP,0
3,FNWI,0
4,NKRM,0
5,IWHA,0
6,GEFV,0
7,KARS,0
8,IMGW,0
9,KYRK,0


Unnamed: 0,Sequence
0,HWFK
1,MWPW
2,ALDV
3,NTLG
4,LHYY
5,AFGM
6,TPNY
7,EAKD
8,NDKE
9,GFIS


In [3]:
def create_char_df(df):
    """Split each four-character 'Sequence' into one column per character.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Sequence' column whose entries each hold exactly
        four characters.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns 'first', 'second', 'third' and 'fourth'
        (one character each) and a fresh 0..n-1 index, in the row order
        of ``df``.

    Raises
    ------
    ValueError
        If a sequence does not consist of exactly four characters.
    """
    columns = ["first", "second", "third", "fourth"]

    # Walk the column directly instead of df.iterrows(): iterrows builds a
    # Series for every row, which is needlessly slow for a plain string split.
    rows = []
    for sequence in df["Sequence"]:
        if len(sequence) != len(columns):
            raise ValueError(
                f"expected a sequence of {len(columns)} characters, got {sequence!r}"
            )
        rows.append(list(sequence))

    # Passing `columns` explicitly keeps the four columns even for empty input
    return pd.DataFrame(rows, columns=columns)

# Split the sequences of both datasets into per-position character columns
X_df, X_test_df = create_char_df(train_data), create_char_df(test_data)

# Preview the first five rows of each result
display(X_df.head(), X_test_df.head())

Unnamed: 0,first,second,third,fourth
0,D,K,W,L
1,F,C,H,N
2,K,D,Q,P
3,F,N,W,I
4,N,K,R,M


Unnamed: 0,first,second,third,fourth
0,H,W,F,K
1,M,W,P,W
2,A,L,D,V
3,N,T,L,G
4,L,H,Y,Y


## One-Hot Encoding

In [4]:
# Fit one encoder over the four position columns. With handle_unknown='ignore',
# a letter that was not seen during fitting is encoded as an all-zero vector
# at transform time instead of raising an error.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_df)

display(enc.categories_)

# Ensure every position learned the same alphabet, so a given letter maps to
# the same one-hot index in each column. np.array_equal is used rather than
# `(a == b).all()` because the latter does not fail the assertion cleanly when
# two positions have a different number of categories - it errors out instead.
assert all(
    np.array_equal(enc.categories_[0], position_categories)
    for position_categories in enc.categories_[1:]
), "one-hot categories differ between sequence positions"

[array(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
        'Q', 'R', 'S', 'T', 'V', 'W', 'Y'], dtype=object),
 array(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
        'Q', 'R', 'S', 'T', 'V', 'W', 'Y'], dtype=object),
 array(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
        'Q', 'R', 'S', 'T', 'V', 'W', 'Y'], dtype=object),
 array(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
        'Q', 'R', 'S', 'T', 'V', 'W', 'Y'], dtype=object)]

# Training

In [13]:
from sklearn.model_selection import train_test_split

# One-hot encode every position, then reshape the flat 80-wide rows into
# 4 positions x 20 letters per sample
X = enc.transform(X_df).toarray().reshape(-1, 4, 20)
y = train_data["Active"].values

# Hold out 4% of the samples for validation; stratifying on y keeps the
# class ratio the same in both parts
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.04,
    stratify=y,
    random_state=1337,
)

In [14]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall: rounded true positives divided by rounded actual positives.

    Values are clipped to [0, 1] and rounded before they are summed, and
    ``K.epsilon()`` is added to the denominator so the division stays
    defined when there are no positive labels.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision: rounded true positives divided by rounded predicted positives.

    Values are clipped to [0, 1] and rounded before they are summed, and
    ``K.epsilon()`` is added to the denominator so the division stays
    defined when nothing is predicted positive.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` in the denominator avoids a division by zero when both
    precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [45]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, GRU, Bidirectional
import keras
from sklearn.utils.class_weight import compute_class_weight

# Balanced class weights for the two labels. `classes` and `y` are passed by
# keyword: they are keyword-only in current scikit-learn, where the positional
# form raises a TypeError. The keyword form also works on older releases.
class_weight = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# Rescale the two weights so that they sum to one
class_weights = {
    0: class_weight[0] / np.sum(class_weight),
    1: class_weight[1] / np.sum(class_weight)
}

batch_size = 128

# Note: despite the variable name, the recurrent layer is a bidirectional GRU.
# Each input sample is 4 positions x 20 one-hot letters.
model_lstm = Sequential()
model_lstm.add(
    Bidirectional(GRU(20, activation='relu', input_shape=(4, 20)))
)
# Single sigmoid unit: probability of the positive class
model_lstm.add(Dense(1, activation = 'sigmoid'))
optimizer = keras.optimizers.Adam(learning_rate=0.01)
model_lstm.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=[f1_m]
)

history = model_lstm.fit(X_train, y_train, epochs=200, batch_size=batch_size, validation_data=(X_val, y_val), class_weight=class_weights)
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output
model_lstm.summary()

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200

KeyboardInterrupt: 

In [58]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with four hidden layers, wrapped in a pipeline and
# trained on the flattened (80-wide) one-hot vectors
clf = make_pipeline(
    MLPClassifier(
        hidden_layer_sizes=(250, 150, 80, 25),
        batch_size=128,
        tol=0.00001,
        random_state=1337,
        verbose=True,
    ),
    verbose=True,
)
clf.fit(X_train.reshape(-1, 80), y_train)

Iteration 1, loss = 0.06525055
Iteration 2, loss = 0.02815069
Iteration 3, loss = 0.02055373
Iteration 4, loss = 0.01588432
Iteration 5, loss = 0.01284418
Iteration 6, loss = 0.01119992
Iteration 7, loss = 0.00895968
Iteration 8, loss = 0.00693004
Iteration 9, loss = 0.00691008
Iteration 10, loss = 0.00509934
Iteration 11, loss = 0.00551264
Iteration 12, loss = 0.00426695
Iteration 13, loss = 0.00369342
Iteration 14, loss = 0.00396192
Iteration 15, loss = 0.00382146
Iteration 16, loss = 0.00316241
Iteration 17, loss = 0.00359102
Iteration 18, loss = 0.00280535
Iteration 19, loss = 0.00305275
Iteration 20, loss = 0.00281826
Iteration 21, loss = 0.00187638
Iteration 22, loss = 0.00264945
Iteration 23, loss = 0.00281540
Iteration 24, loss = 0.00263154
Iteration 25, loss = 0.00178901
Iteration 26, loss = 0.00235520
Iteration 27, loss = 0.00248415
Iteration 28, loss = 0.00233273
Iteration 29, loss = 0.00205619
Iteration 30, loss = 0.00200349
Iteration 31, loss = 0.00192928
Iteration 32, los

Pipeline(memory=None,
         steps=[('mlpclassifier',
                 MLPClassifier(activation='relu', alpha=0.0001, batch_size=128,
                               beta_1=0.9, beta_2=0.999, early_stopping=False,
                               epsilon=1e-08,
                               hidden_layer_sizes=(250, 150, 80, 25),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_fun=15000,
                               max_iter=200, momentum=0.9, n_iter_no_change=10,
                               nesterovs_momentum=True, power_t=0.5,
                               random_state=1337, shuffle=True, solver='adam',
                               tol=5e-05, validation_fraction=0.1, verbose=True,
                               warm_start=False))],
         verbose=True)

In [59]:
from sklearn.metrics import f1_score
# Score the fitted pipeline on the held-out validation split
y_pred = clf.predict(X_val.reshape(-1, 80))
f1_score(y_true=y_val, y_pred=y_pred)

0.9101796407185629

In [60]:
# One-hot encode the test sequences with the encoder fitted on the training
# data; the result is kept flat (one row per sequence, no reshape)
X_test = enc.transform(X_test_df).toarray()

In [61]:
# Predict a label for every test sequence and round the result to whole numbers
y_test = np.around(clf.predict(X_test))

In [62]:
# Write the predictions to the submission file, formatted as integers
np.savetxt("submission.txt", y_test, fmt="%d")

In [63]:
# Show the predicted labels as a final sanity check
display(y_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)