In [1]:
## OS
import os

## Maths
import numpy as np

## Graphics
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

## DataFrames
import pandas as pd

## Sklearn

# preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

# metrics
from sklearn.metrics import f1_score

# pipeline
from sklearn.pipeline import Pipeline

# models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Load dataset and preprocess it

In [2]:
## Load training dataset
train_df = pd.read_csv('dataset/train.csv', delimiter=',')

In [3]:
train_df

Unnamed: 0,Sequence,Active
0,DKWL,0
1,FCHN,0
2,KDQP,0
3,FNWI,0
4,NKRM,0
...,...,...
111995,GSME,0
111996,DLPT,0
111997,SGHC,0
111998,KIGT,0


In [16]:
## Encoder for the dataframe
amino_acids = ["R", "H", "K", "D", "E", "S", "T", "N", "Q", "C", "U", "G", "P", "A", "I", "L", "M", "F", "W", "Y", "V"]
amino_acids_sorted = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y']


def label_encoding(df, amino_acids):

    ## Columns to be used for training
    training_columns = []
    
    ## Label encoder for the amino_acid
    amino_acids_encoding = { amino_acid: idx for idx, amino_acid in enumerate(amino_acids) }
    
    ## Make a column for each mutation
    for site_number in range(4):
        site_column = "site" + str(site_number+1)
        df[site_column] = df["Sequence"].apply(lambda x: x[site_number])
    
    ## Label encode each mutation column
    for site_number in range(4):
        site_column = "site" + str(site_number+1)
        site_column_encoded = "site" + str(site_number+1) + "_encoded"
        df[site_column_encoded] = df[site_column].apply(lambda x: amino_acids_encoding[x])
        training_columns.append(site_column_encoded)
        
    return training_columns


def one_hot_encoding(df, amino_acids):    
    
    ## Columns to be used for training
    training_columns = []
    
    ## Make a column for each mutation
    for site_number in range(4):
        site_column = "site" + str(site_number+1)
        df[site_column] = df["Sequence"].apply(lambda x: x[site_number])
        
    ## One hot encode all columns
    for site_number in range(4):
        column = "site" + str(site_number+1)
        for amino_acid in amino_acids:
            new_column = column + "_" + amino_acid
            df[new_column] = (df[column] == amino_acid).astype(int)
            training_columns.append(new_column)
    
    return training_columns


# def one_hot_encoding(df, amino_acids):    
#     ## Make a column for each mutation
#     columns_to_transform = []
#     for site_number in range(4):
#         site_column = "site"+str(site_number+1)
#         df[site_column] = df["Sequence"].apply(lambda x: x[site_number])
#         columns_to_transform.append(site_column)

#     ## One hot encode all columns
#     encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
#     encoded = encoder.fit_transform(df[columns_to_transform])
    
#     ## One-hot encoding removed index, put it back
#     df = pd.DataFrame(encoded, index=df.index)
#     return df

In [17]:
train_df_label = train_df.copy()
training_columns_label = label_encoding(train_df_label, amino_acids)
train_df_label

Unnamed: 0,Sequence,Active,site1,site2,site3,site4,site1_encoded,site2_encoded,site3_encoded,site4_encoded
0,DKWL,0,D,K,W,L,3,2,18,15
1,FCHN,0,F,C,H,N,17,9,1,7
2,KDQP,0,K,D,Q,P,2,3,8,12
3,FNWI,0,F,N,W,I,17,7,18,14
4,NKRM,0,N,K,R,M,7,2,0,16
...,...,...,...,...,...,...,...,...,...,...
111995,GSME,0,G,S,M,E,11,5,16,4
111996,DLPT,0,D,L,P,T,3,15,12,6
111997,SGHC,0,S,G,H,C,5,11,1,9
111998,KIGT,0,K,I,G,T,2,14,11,6


In [18]:
train_df_one_hot = train_df.copy()
training_columns_one_hot = one_hot_encoding(train_df_one_hot, amino_acids)
train_df_one_hot

Unnamed: 0,Sequence,Active,site1,site2,site3,site4,site1_R,site1_H,site1_K,site1_D,...,site4_G,site4_P,site4_A,site4_I,site4_L,site4_M,site4_F,site4_W,site4_Y,site4_V
0,DKWL,0,D,K,W,L,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,FCHN,0,F,C,H,N,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,KDQP,0,K,D,Q,P,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,FNWI,0,F,N,W,I,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,NKRM,0,N,K,R,M,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111995,GSME,0,G,S,M,E,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
111996,DLPT,0,D,L,P,T,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
111997,SGHC,0,S,G,H,C,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
111998,KIGT,0,K,I,G,T,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
## Make features and predictions datasets
X_label = train_df_label[training_columns_label]
X_one_hot = train_df_one_hot[training_columns_one_hot]
y = train_df_one_hot["Active"]

# Quick test of some models

In [23]:
## Train test split and normalisation

# train test split
X_train_label, X_validation_label, X_train_one_hot, X_validation_one_hot, y_train, y_validation \
    = train_test_split(X_label, X_one_hot, y, train_size=0.10)
#, y_train_one_hot, y_validation_one_hot = train_test_split(X_one_hot, y, train_size=0.10)

# Normalise label encoded dataset
std_scaler = StandardScaler().fit(X_train_label)
X_train_label = std_scaler.transform(X_train_label)
X_validation_label = std_scaler.transform(X_validation_label)

# One hot encoded dataset is already normalised (min max normalisation) by construction

In [25]:
classifier = RandomForestClassifier()
classifier.fit(X_train_one_hot, y_train)
y_pred = classifier.predict(X_validation_one_hot)
print("F1 score: %.3f" %(f1_score(y_validation, y_pred)))

F1 score: 0.544


In [26]:
classifier = RandomForestClassifier()
classifier.fit(X_train_label, y_train)
y_pred = classifier.predict(X_validation_label)
print("F1 score: %.3f" %(f1_score(y_validation, y_pred)))

F1 score: 0.639
