In [19]:
import numpy as np
import sklearn as skl
import pandas as pd
import datetime
import sklearn.preprocessing
from sklearn import linear_model
from sklearn.metrics import f1_score
from scipy import special
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import keras
from keras.layers import Dense, BatchNormalization
from keras.models import Sequential
from tensorflow.python.client import device_lib

# NOTE(review): this seeds NumPy's global RNG only; Keras/TensorFlow presumably
# keep their own random state, so training may still vary between runs - confirm.
np.random.seed(69) # fix the seed for reproducibility

In [2]:
# Load the training data. The cells below read two columns from it:
# 'Sequence' (split into letters here) and 'Active' (used as the label).
df_train = pd.read_csv("train.csv")

# Split every sequence into one column per letter (column labels 0, 1, 2, ...).
# The frame is built from a list of lists in a single constructor call rather
# than with `.apply(lambda x: pd.Series(list(x)))`, which creates one pandas
# Series per row and is very slow on large frames. The row index of df_train
# is preserved.
df_train_split = pd.DataFrame(
    [list(sequence) for sequence in df_train['Sequence']],
    index=df_train.index,
)

In [3]:
# Load the test data; only its 'Sequence' column is read by the cells below.
df_test = pd.read_csv("test.csv")

# Split every sequence into one column per letter (column labels 0, 1, 2, ...).
# The frame is built from a list of lists in a single constructor call rather
# than with `.apply(lambda x: pd.Series(list(x)))`, which creates one pandas
# Series per row and is very slow on large frames. The row index of df_test
# is preserved.
df_test_split = pd.DataFrame(
    [list(sequence) for sequence in df_test['Sequence']],
    index=df_test.index,
)

In [4]:
# Alphabet of mutator letters that may appear in a sequence (21 symbols).
# Of the 26 Latin letters only B, J, O, X and Z are absent.
mutators = 'ACDEFGHIKLMNPQRSTUVWY'

# Lookup tables between each letter and its position in `mutators`.
char_to_int = {letter: position for position, letter in enumerate(mutators)}
int_to_char = dict(enumerate(mutators))

# Example of integer-encoding a column of letters:
#   integer_encoded = [char_to_int[item] for item in column]

# Training target for the network: the 'Active' column of the training frame
# (the activation of the protein), converted to a NumPy array.
labels = df_train['Active'].to_numpy()

In [5]:
# Sizes reused by the encoding and model cells.

# Number of distinct letters in the alphabet.
NUM_MUTATORS = len(mutators)

# Row counts of the training and test frames.
DF_TRAIN_SIZE = df_train.shape[0]
DF_TEST_SIZE = df_test.shape[0]

In [6]:
# One-hot encode the training sequences.
#
# Each of the 4 sequence positions owns a contiguous slice of NUM_MUTATORS
# columns; inside that slice the column at char_to_int[letter] is set to 1.
# The whole matrix is filled with one vectorised NumPy assignment instead of
# DF_TRAIN_SIZE * 4 scalar `.loc` lookups, which is far faster on large frames
# and does not depend on the row labels being 0..n-1.
train_letters = df_train_split[[0, 1, 2, 3]].to_numpy()

# Map every letter to its integer code; an unknown letter still raises KeyError.
train_codes = np.array(
    [[char_to_int[letter] for letter in row] for row in train_letters],
    dtype=np.intp,
).reshape(-1, 4)

df_train_onehot = np.zeros((DF_TRAIN_SIZE, NUM_MUTATORS * 4))

# Row indices of shape (n, 1) broadcast against column indices of shape (n, 4).
train_rows = np.arange(DF_TRAIN_SIZE)[:, np.newaxis]
train_cols = np.arange(4) * NUM_MUTATORS + train_codes
df_train_onehot[train_rows, train_cols] = 1      # binary representation of the letters

In [7]:
# One-hot encode the test sequences, using the same layout as the training
# matrix: each of the 4 sequence positions owns a contiguous slice of
# NUM_MUTATORS columns, and the column at char_to_int[letter] is set to 1.
# The whole matrix is filled with one vectorised NumPy assignment instead of
# DF_TEST_SIZE * 4 scalar `.loc` lookups, which is far faster on large frames
# and does not depend on the row labels being 0..n-1.
test_letters = df_test_split[[0, 1, 2, 3]].to_numpy()

# Map every letter to its integer code; an unknown letter still raises KeyError.
test_codes = np.array(
    [[char_to_int[letter] for letter in row] for row in test_letters],
    dtype=np.intp,
).reshape(-1, 4)

df_test_onehot = np.zeros((DF_TEST_SIZE, NUM_MUTATORS * 4))

# Row indices of shape (n, 1) broadcast against column indices of shape (n, 4).
test_rows = np.arange(DF_TEST_SIZE)[:, np.newaxis]
test_cols = np.arange(4) * NUM_MUTATORS + test_codes
df_test_onehot[test_rows, test_cols] = 1

In [8]:
# Normalisation is only necessary when the 4-column integer representation is used, e.g.:
#df_nn = (df+1)/21
#df_test_nn = (df_test+1)/21

# For the binary (one-hot) version the matrices are used as they are.
# Both names below refer to the same arrays as the originals (no copy is made).
df_train_onehot_nn = df_train_onehot
df_test_onehot_nn = df_test_onehot

In [20]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

# Timestamped directory name for this run's logs.
# NOTE(review): log_dir is not referenced anywhere else in the visible code, so
# nothing shown here writes to it - confirm whether a TensorBoard callback is
# meant to use it.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Open the TensorBoard UI on the parent log directory.
%tensorboard --logdir logs/fit

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 21484), started 0:14:01 ago. (Use '!kill 21484' to kill it.)

In [22]:
# Callback that limits overfitting by ending training once the validation loss stalls.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",         # watch the validation loss ...
    mode="min",                 # ... and treat its minimum as the best value
    patience=50,                # "no longer improving" means 50 epochs without a new best
    restore_best_weights=True,  # with such a long patience, roll back to the best weights
    verbose=1,
)
callbacks = [early_stopping]

# Leftover keyword arguments from an earlier experiment (presumably meant for a
# TensorBoard callback):
#   log_dir=log_dir,
#   histogram_freq=1,

In [26]:
# Build the classifier: five identical hidden stages (Dense(128, relu) followed
# by BatchNormalization) and a single sigmoid output unit.
neuralNetwork = Sequential()
for stage in range(5):
    if stage == 0:
        # Only the first layer declares the input width (length of the one-hot vector).
        neuralNetwork.add(Dense(128, activation='relu', input_dim=NUM_MUTATORS * 4))
    else:
        neuralNetwork.add(Dense(128, activation='relu'))
    neuralNetwork.add(BatchNormalization())

# Output layer; it is not followed by batch normalisation.
neuralNetwork.add(Dense(1, activation='sigmoid'))

neuralNetwork.compile(
    loss=keras.losses.BinaryCrossentropy(),
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit the network on the dense (non-sparse) one-hot training matrix.
# Training is capped at 10000 epochs; the callbacks passed in `callbacks` are
# what is expected to end it earlier. validation_split = 0.4 holds out 40% of
# the training data for validation.
neuralNetwork.fit(df_train_onehot_nn, labels, epochs=10000, batch_size=1024, verbose=2, callbacks= callbacks, validation_split = 0.4)

# Version without loss monitoring:
#neuralNetwork.fit(df_train_onehot_nn, labels, epochs=100, batch_size=1024, verbose=1, validation_split = 0.4)


# Evaluate the network: predict on the test and on the training inputs
# (the output layer is a single sigmoid unit) and print the raw predictions.
predict_test_nn = neuralNetwork.predict(df_test_onehot_nn)
predict_train_nn = neuralNetwork.predict(df_train_onehot_nn)
print(predict_test_nn)
print(predict_train_nn)
# Threshold the test predictions at 0.5. From here on predict_test_nn holds
# booleans instead of the raw outputs; predict_train_nn is left unthresholded.
predict_test_nn = (predict_test_nn >= 0.5)

Epoch 1/10000
66/66 - 2s - loss: 0.5189 - accuracy: 0.7940 - val_loss: 0.3090 - val_accuracy: 0.9622
Epoch 2/10000
66/66 - 1s - loss: 0.1703 - accuracy: 0.9797 - val_loss: 0.1704 - val_accuracy: 0.9622
Epoch 3/10000
66/66 - 1s - loss: 0.0612 - accuracy: 0.9890 - val_loss: 0.1744 - val_accuracy: 0.9622
Epoch 4/10000
66/66 - 1s - loss: 0.0327 - accuracy: 0.9935 - val_loss: 0.1820 - val_accuracy: 0.9622
Epoch 5/10000
66/66 - 1s - loss: 0.0223 - accuracy: 0.9951 - val_loss: 0.1618 - val_accuracy: 0.9631
Epoch 6/10000
66/66 - 1s - loss: 0.0153 - accuracy: 0.9967 - val_loss: 0.1281 - val_accuracy: 0.9679
Epoch 7/10000
66/66 - 1s - loss: 0.0107 - accuracy: 0.9977 - val_loss: 0.0926 - val_accuracy: 0.9751
Epoch 8/10000
66/66 - 1s - loss: 0.0082 - accuracy: 0.9984 - val_loss: 0.0639 - val_accuracy: 0.9819
Epoch 9/10000
66/66 - 1s - loss: 0.0098 - accuracy: 0.9974 - val_loss: 0.0494 - val_accuracy: 0.9860
Epoch 10/10000
66/66 - 1s - loss: 0.0078 - accuracy: 0.9981 - val_loss: 0.0428 - val_accura

In [15]:
# Sanity checks of the predictions on the training set.
# NOTE: these figures are computed on the training data (which the network was
# fitted on), so they are not an estimate of performance on unseen data.
print(predict_train_nn.max())
print(predict_train_nn.min())

# Threshold once with `>= 0.5` - the rule already used for the F1 score and for
# the test predictions - so the printed count and the F1 score agree on
# predictions of exactly 0.5. Flatten to 1-D so the shape matches `labels`.
predicted_train_active = (np.asarray(predict_train_nn) >= 0.5).ravel()

print("actual labels:", np.sum(labels))
print("predicted labels:", np.sum(predicted_train_active))
print("the f1 score is:", f1_score(labels.astype('bool'), predicted_train_active))

1.0
2.6119238e-07
actual labels: 4213
predicted labels: 4101
the f1 score is: 0.9805147943228291


In [16]:
# Convert the thresholded test predictions to integers and write them to
# 'predictions.csv' without a header row or an index column.
# NOTE: predict_test_nn is rebound to a DataFrame here.
predict_test_nn = pd.DataFrame(predict_test_nn.astype('int'))
predict_test_nn.to_csv('predictions.csv', header = False, index = False)
