In [22]:
import numpy as np
import sklearn as skl
import pandas as pd
import datetime
import sklearn.preprocessing
from sklearn import linear_model
from sklearn.metrics import f1_score, mean_squared_error
from scipy import special
from sklearn.impute import SimpleImputer
from sklearn.utils import class_weight
import keras
from keras.layers import Dense, BatchNormalization, Dropout, Layer
from keras.models import Sequential
import keras.backend as K
from tensorflow.python.client import device_lib
import tensorflow as tf

np.random.seed(69) #fixing seed for reproducability

In [4]:
df_train = pd.read_csv("train.csv")
df_train_split = df_train['Sequence'].apply(lambda x: pd.Series(list(x))) #splits the 4 mutators into individual columns.

In [5]:
df_test = pd.read_csv("test.csv")
df_test_split = df_test['Sequence'].apply(lambda x: pd.Series(list(x))) #splits the 4 mutators into individual columns.

In [6]:
# define universe of possible input values
mutators = 'ACDEFGHIKLMNPQRSTUVWY' #abcdefghijklmnopqrstuvwxyz
#bjoqxz <-- these letters are missing

# define a mapping of chars to integers
char_to_int = dict((c, i) for i, c in enumerate(mutators))
int_to_char = dict((i, c) for i, c in enumerate(mutators))

# integer encode input data
#integer_encoded = [char_to_int[item] for item in df_letters[0]]

# Define the label for the NN which is the activation of the protein.
labels = df_train['Active'].to_numpy()

In [7]:
#Defining some constants.

NUM_MUTATORS = len(mutators)

DF_TRAIN_SIZE = len(df_train.index)
DF_TEST_SIZE = len(df_test.index)

In [8]:
df_train_onehot = np.zeros((DF_TRAIN_SIZE, NUM_MUTATORS*4))

#We convert the mutators into the OneHot representation.
for i in range(0,4):
    for j in range(0, DF_TRAIN_SIZE):
        temp = char_to_int[df_train_split.loc[j,i]]
        df_train_onehot[j, i*NUM_MUTATORS + temp] = 1      #binary representation of the letters

In [9]:
#Do the same with the test dataframe
df_test_onehot = np.zeros((DF_TEST_SIZE, NUM_MUTATORS*4))

for i in range(0,4):
    for j in range(0, DF_TEST_SIZE):
        temp = char_to_int[df_test_split.loc[j,i]]
        df_test_onehot[j, i*NUM_MUTATORS + temp] = 1

In [10]:
#lets norm this shit (this is only necessary if we use the 4D representation
#df_nn = (df+1)/21
#df_cock_nn = (df_cock+1)/21

#for the binary version use this:
df_train_onehot_nn = df_train_onehot
df_test_onehot_nn = df_test_onehot

In [None]:
show_tensorboard = True

if show_tensorboard:
    # Load the TensorBoard notebook extension
    %load_ext tensorboard
    #%reload_ext tensorboard

    log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

    %tensorboard --logdir logs/fit

In [17]:
def f1_loss_simple(y_true, y_pred):
    loss_f1 = f1_score(y_true, y_pred)
    return loss_f1

In [15]:
""" class f1_loss(Layer):

    def __init__(self, rate=1e-2):
        super(f1_loss, self).__init__()
        self.rate = rate

    def custom_loss(self, y_true, y_pred):
        self.add_loss(self.rate * f1_score(y_true, y_pred))
        return  """

' class f1_loss(Layer):\n\n    def __init__(self, rate=1e-2):\n        super(f1_loss, self).__init__()\n        self.rate = rate\n\n    def custom_loss(self, y_true, y_pred):\n        self.add_loss(self.rate * f1_score(y_true, y_pred))\n        return  '

In [35]:
# Code from https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric
def f1(y_true, y_pred):
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.is_nan(f1), tf.zeros_like(f1), f1)
    return K.mean(f1)

def f1_loss(y_true, y_pred):
    
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)

In [30]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [32]:
#Define some callbacks to stop overfitting
callbacks = [
    keras.callbacks.EarlyStopping(
        # Stop training when val_loss is no longer improving
        monitor="val_f1_m",
        # Take the minimum of val_loss as a guideline.
        mode="max",
        # No longer improving" being further defined as "for at least 50 epochs.
        patience=300,
        # Since we have such a large paitience we restore back to the best weights after a long waittime.
        restore_best_weights = True,
        verbose = 1,
    )
]

## Some 
        #log_dir=log_dir,
        #histogram_freq=1,

In [16]:
number_1 = np.count_nonzero(labels)
print(number_1 , 'are a 1 from the label array.')
print(DF_TRAIN_SIZE - number_1, 'are a 0 that is ' , 100*(1-1/float(DF_TRAIN_SIZE)*number_1), '%.')

#Thus we need to ajust the weights used for punishment of the neural net. Or else a output of all 0 will lead to a "success" of 96%.

class_weights = class_weight.compute_class_weight('balanced', np.unique(labels), labels)
#Now we actualy want to give an increased weight to the ones. Here with 4 times more important.

class_weights = {i : class_weights[i] for i in range(len(np.unique(labels)))}
print(class_weights)

4213 are a 1 from the label array.
107787 are a 0 that is  96.23839285714286 %.
{0: 0.5195431731099298, 1: 53.16876335153098}


In [36]:
#lets construct the neural net:
neuralNetwork = Sequential()
neuralNetwork.add(Dense(128, activation='relu', input_dim=NUM_MUTATORS * 4))
neuralNetwork.add(BatchNormalization())
neuralNetwork.add(Dense(128, activation='relu'))
neuralNetwork.add(BatchNormalization())
neuralNetwork.add(Dropout(0.2))
neuralNetwork.add(Dense(128, activation='relu'))
neuralNetwork.add(BatchNormalization())
neuralNetwork.add(Dropout(0.2))
neuralNetwork.add(Dense(128, activation='relu'))
neuralNetwork.add(BatchNormalization())
neuralNetwork.add(Dropout(0.2))
neuralNetwork.add(Dense(128, activation='relu'))
neuralNetwork.add(BatchNormalization())
neuralNetwork.add(Dropout(0.2))
neuralNetwork.add(Dense(1, activation='sigmoid'))
#neuralNetwork.add(BatchNormalization())
 
neuralNetwork.compile(loss = f1_loss, optimizer = 'adam', metrics=['acc',f1, f1_m,precision_m, recall_m])

#fit the network to (for now the un-sparse matrix)
neuralNetwork.fit(df_train_onehot_nn, labels, epochs=1000, batch_size=1024, verbose=2, callbacks= [callbacks, tensorboard_callback], validation_split = 0.2, class_weight=class_weights)

#version without loss monitoring
""" if show_tensorboard:
    neuralNetwork.fit(df_train_onehot_nn, labels, epochs=1000, batch_size=1024, verbose=1, callbacks = tensorboard_callback, validation_split = 0.2, class_weight=class_weights)
else: neuralNetwork.fit(df_train_onehot_nn, labels, epochs=1000, batch_size=1024, verbose=1, validation_split = 0.2, class_weight=class_weights) """
#evaluation of the network prediction
predict_test_nn = neuralNetwork.predict(df_test_onehot_nn)
predict_train_nn = neuralNetwork.predict(df_train_onehot_nn)
print(predict_test_nn)
print(predict_train_nn)
predict_test_nn = (predict_test_nn >= 0.5)


neuralNetwork.summary()

Epoch 1/1000


TypeError: in user code:

    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:805 train_function  *
        return step_function(self, iterator)
    <ipython-input-35-23825604e934>:18 f1_loss  *
        tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py:1180 binary_op_wrapper
        raise e
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py:1164 binary_op_wrapper
        return func(x, y, name=name)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py:1496 _mul_dispatch
        return multiply(x, y, name=name)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py:518 multiply
        return gen_math_ops.mul(x, y, name)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_math_ops.py:6076 mul
        _, _, _op, _outputs = _op_def_library._apply_op_helper(
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py:555 _apply_op_helper
        raise TypeError(

    TypeError: Input 'y' of 'Mul' Op has type float32 that does not match type int64 of argument 'x'.


In [16]:
print(predict_train_nn.max())
print(predict_train_nn.min())

print("actual labels:", np.sum(labels))
print("predicted labels:", np.sum(predict_train_nn >0.5))
print("the f1 score is:", f1_score(labels.astype('bool'), (predict_train_nn >=0.5).astype('bool')))

1.0
2.907142e-20
actual labels: 4213
predicted labels: 4283
the f1 score is: 0.9724576271186441


In [17]:
predict_test_nn = pd.DataFrame(predict_test_nn.astype('int'))
predict_test_nn.to_csv('predictions.csv', header = False, index = False)
