In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Data

In [2]:
# Read the training and test CSV files from the local data/ directory
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

# Show the first ten rows of each frame as a quick sanity check
display(train_data.head(10), test_data.head(10))

Unnamed: 0,Sequence,Active
0,DKWL,0
1,FCHN,0
2,KDQP,0
3,FNWI,0
4,NKRM,0
5,IWHA,0
6,GEFV,0
7,KARS,0
8,IMGW,0
9,KYRK,0


Unnamed: 0,Sequence
0,HWFK
1,MWPW
2,ALDV
3,NTLG
4,LHYY
5,AFGM
6,TPNY
7,EAKD
8,NDKE
9,GFIS


In [3]:
def create_char_df(df):
    """Split each four-character 'Sequence' into one column per character.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Sequence' column whose entries each consist of
        exactly four characters.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with the columns 'first',
        'second', 'third' and 'fourth' holding the individual characters.
        The result has a fresh default index.

    Raises
    ------
    ValueError
        If any sequence does not contain exactly four characters.
    """
    columns = ["first", "second", "third", "fourth"]

    # Iterate the column directly instead of using iterrows(), which builds a
    # full Series object for every row just to read a single field.
    chars = [list(sequence) for sequence in df["Sequence"]]

    # Validate lengths explicitly: the DataFrame constructor would silently
    # pad rows that are too short with missing values.
    for position, row_chars in enumerate(chars):
        if len(row_chars) != len(columns):
            raise ValueError(
                f"Sequence at position {position} has {len(row_chars)} "
                f"characters, expected {len(columns)}"
            )

    return pd.DataFrame(chars, columns=columns)

# Per-character feature frames for the training and the test sequences
X_df, X_test_df = (create_char_df(frame) for frame in (train_data, test_data))

# Peek at the first five rows of each frame
display(X_df.head(), X_test_df.head())

Unnamed: 0,first,second,third,fourth
0,D,K,W,L
1,F,C,H,N
2,K,D,Q,P
3,F,N,W,I
4,N,K,R,M


Unnamed: 0,first,second,third,fourth
0,H,W,F,K
1,M,W,P,W
2,A,L,D,V
3,N,T,L,G
4,L,H,Y,Y


## One-Hot Encoding

In [4]:
# Fit one one-hot encoder over the four character columns; categories that
# were not seen during fit are ignored at transform time instead of raising.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_df)

display(enc.categories_)

# Ensure every position learned the same set of letters.
# np.array_equal also compares shapes, so a position with a missing letter
# fails this assertion cleanly; an elementwise `==` on arrays of different
# lengths may raise or collapse to a scalar depending on the NumPy version.
reference_categories = enc.categories_[0]
assert all(
    np.array_equal(reference_categories, position_categories)
    for position_categories in enc.categories_[1:]
), "OneHotEncoder learned different category sets for different sequence positions"

[array(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
        'Q', 'R', 'S', 'T', 'V', 'W', 'Y'], dtype=object),
 array(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
        'Q', 'R', 'S', 'T', 'V', 'W', 'Y'], dtype=object),
 array(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
        'Q', 'R', 'S', 'T', 'V', 'W', 'Y'], dtype=object),
 array(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
        'Q', 'R', 'S', 'T', 'V', 'W', 'Y'], dtype=object)]

# Training

In [6]:
# Assemble the training inputs: the label column and the dense one-hot feature matrix
y_train = train_data["Active"].values
X_train = enc.transform(X_df).toarray()

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Standardise the one-hot features, then fit an MLP with a single hidden
# layer of 15 units on mini-batches of 128 samples.
# random_state fixes the weight initialisation and the batch shuffling so
# that re-running the notebook from a fresh kernel reproduces the same model.
clf = make_pipeline(
    StandardScaler(),
    MLPClassifier(
        hidden_layer_sizes=(15,),
        batch_size=128,
        random_state=42,
        verbose=True,
    ),
    verbose=True,
)
clf.fit(X_train, y_train)

[Pipeline] .... (step 1 of 2) Processing standardscaler, total=   0.2s
Iteration 1, loss = 0.22803785
Iteration 2, loss = 0.07649394
Iteration 3, loss = 0.05941810
Iteration 4, loss = 0.04904024
Iteration 5, loss = 0.04267791
Iteration 6, loss = 0.03871639
Iteration 7, loss = 0.03617555
Iteration 8, loss = 0.03418426
Iteration 9, loss = 0.03260710
Iteration 10, loss = 0.03152534
Iteration 11, loss = 0.03060759
Iteration 12, loss = 0.02972534
Iteration 13, loss = 0.02925880
Iteration 14, loss = 0.02875247
Iteration 15, loss = 0.02833662
Iteration 16, loss = 0.02785404
Iteration 17, loss = 0.02743803
Iteration 18, loss = 0.02697936
Iteration 19, loss = 0.02661015
Iteration 20, loss = 0.02636317
Iteration 21, loss = 0.02584833
Iteration 22, loss = 0.02557382
Iteration 23, loss = 0.02540533
Iteration 24, loss = 0.02479886
Iteration 25, loss = 0.02452182
Iteration 26, loss = 0.02423649
Iteration 27, loss = 0.02388542
Iteration 28, loss = 0.02354361
Iteration 29, loss = 0.02315945
Iteration 

In [None]:
# Encode the test sequences with the encoder fitted on the training data, then densify
X_test_sparse = enc.transform(X_test_df)
X_test = X_test_sparse.toarray()

In [None]:
# Predict a label for every encoded test sequence
y_test = clf.predict(X_test)

In [None]:
# Write the predictions to disk, formatted as integers
np.savetxt(fname="submission.txt", X=y_test, fmt="%d")