In [None]:
import numpy as np
import pandas as pd
import keras
from sklearn.preprocessing import Imputer

# Read the tab-separated feature table, shuffle the rows (optional) and
# renumber them 0..n-1 in a single chain.
data = (
    pd.read_csv('all_universal_animals_features.txt', sep='\t')
    .sample(frac=1)
    .reset_index(drop=True)
)
data.head()

Using TensorFlow backend.


#### Inspect first 5 rows of raw data across all columns (in 3 parts)

In [None]:
# Part-1: first five rows, columns 0-9
data.head().iloc[:, :10]

In [None]:
# Part-2: first five rows, columns 10-22
data.head().iloc[:, 10:23]

In [None]:
# Part-3: first five rows, columns 23 onwards
data.head().iloc[:, 23:]

<br><br><br><br>

#### Replace YES/NO flags with 1/0

In [None]:
# Encode the YES/NO target as 1/0 and take it OUT of the feature table.
# The target is re-attached further down as the 'TP' column, and only 'TP'
# is dropped when the model inputs are built. If 'hits_on_mature_miR' stayed
# in `data`, the model would receive the answer as an input feature (target
# leakage), and the label would also take part in normalisation/imputation.
label_mapping = {'YES': 1, 'NO': 0}
y = data['hits_on_mature_miR'].replace(label_mapping)
data = data.drop('hits_on_mature_miR', axis=1)

#### Normalise data

In [None]:
# Min-max scale every column to the [0, 1] range.
col_min = data.min()
col_range = data.max() - col_min
# A constant column has a range of zero; dividing by it would turn the whole
# column into NaN (0/0). Divide by 1 instead so such a column becomes all 0.
col_range = col_range.replace(0, 1)
data = (data - col_min) / col_range
data[:5]

#### Impute missing values (with median)

In [None]:
# Fill every missing value with the median of its own column (feature).
# Imputing along rows (Imputer(..., axis=1)) would instead use the median of
# the other features of the same sample, which mixes unrelated quantities.
column_medians = data.median()
imputed_DF = data.fillna(column_medians)
# A column without a single observed value has no median and stays all-NaN;
# it carries no information, so drop it rather than pass NaNs to the model.
imputed_DF = imputed_DF.dropna(axis=1, how='all')
data = imputed_DF

data[:5]

#### Re-integrate TP label with the original data

In [None]:
# Append the encoded target (aligned on the row index) as the 'TP' column.
data = data.assign(TP=y)
data.head()

<br><br><br><br>

### Split data into training and test sets

In [None]:
split_ratio = 0.8

# Draw the training rows by index label, without replacement.
n_train = int(len(data) * split_ratio)
sample = np.random.choice(data.index, size=n_train, replace=False)

# `sample` holds index *labels*, so select with .loc (label-based) to stay
# consistent with the label-based .drop; positional .iloc only gives the same
# rows while the index happens to be exactly 0..n-1.
train_data, test_data = data.loc[sample], data.drop(sample)

print("Number of training samples is", len(train_data))
print("Number of testing samples is", len(test_data))

#### Create np.arrays for X_train/test and y_train/test sets

In [None]:
def _features_and_onehot(frame):
    """Return (feature matrix, one-hot 2-class labels) for a frame with a 'TP' column."""
    features = np.array(frame.drop('TP', axis=1))
    onehot = np.array(keras.utils.to_categorical(frame['TP'], 2))
    return features, onehot


X_train, y_train = _features_and_onehot(train_data)
X_test, y_test = _features_and_onehot(test_data)


# Report the row count of each array.
for caption, array in (
    ("Number of rows in X_train: ", X_train),
    ("Number of rows in y_train: ", y_train),
    ("Number of rows in X_test", X_test),
    ("Number of rows in y_test", y_test),
):
    print(caption, len(array))

<br><br><br><br>

### Train model using optimal parameters <span style='font-size:12px'>(found via GridSearchCV)</span>

In [None]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.callbacks import EarlyStopping
from keras import regularizers
from keras.optimizers import SGD
from keras.optimizers import RMSprop

from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

# create the model
def create_model(optimizer='adam'):
    """Build and compile the two-hidden-layer classifier.

    Architecture: Dense(64, relu) -> Dropout(0.4) -> Dense(32, relu) ->
    Dropout(0.4) -> Dense(2, softmax), with L2 kernel regularisation on both
    hidden layers. The input width is read from the global ``X_train``.

    Args:
        optimizer: Optimizer name or instance handed to ``model.compile``.
            It is now actually used; before, 'adam' was hard-coded in the
            compile call and this argument was silently ignored.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        accuracy metric).
    """
    init_mode = 'he_normal'
    reglr = 0.01

    # Building the model
    model = Sequential()

    # NOTE(review): only the second hidden layer sets kernel_initializer; the
    # first one uses the layer default - confirm this is intended.
    model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=regularizers.l2(reglr)))
    model.add(Dropout(.4))
    model.add(Dense(32, kernel_initializer=init_mode, activation='relu', kernel_regularizer=regularizers.l2(reglr)))
    model.add(Dropout(.4))

    model.add(Dense(2, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

model = create_model()

#### Fit the model

In [None]:
# Stop once the loss on the held-out validation split has not improved for
# 2 epochs. Monitoring the training accuracy ('acc') says nothing about
# generalisation, which is what early stopping is meant to protect.
# Do not apply without checking first with no callbacks.
callbacks = [EarlyStopping(monitor='val_loss', patience=2)]

out = model.fit(X_train, y_train, epochs=30, batch_size=128, verbose=1, validation_split=0.2, callbacks=callbacks)

#### Plot training / validation loss and accuracy change at each epoch

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by model.fit.
history = out.history
epochs = np.array(out.epoch)
acc = np.array(history['acc'])
loss = np.array(history['loss'])
val_acc = np.array(history['val_acc'])
val_loss = np.array(history['val_loss'])

# Loss curves. Save this figure under the "loss" file name: saving through a
# handle that is re-bound to the accuracy figure would write the accuracy
# plot into train_validation_loss.pdf and never save the loss plot.
loss_fig, loss_ax = plt.subplots(figsize=(7, 5))
loss_ax.plot(epochs, loss, label='training')
loss_ax.plot(epochs, val_loss, label='validation')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend(loc='upper left')
loss_ax.set_title('Loss')
loss_fig.savefig("train_validation_loss.pdf", bbox_inches='tight')

plt.show()

# Accuracy curves, saved to their own file.
acc_fig, acc_ax = plt.subplots(figsize=(7, 5))
acc_ax.plot(epochs, acc, label='training')
acc_ax.plot(epochs, val_acc, label='validation')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='upper left')
acc_ax.set_title('Accuracy')
acc_fig.savefig("train_validation_accuracy.pdf", bbox_inches='tight')

plt.show()

#### Evaluating the model on the training and testing sets

In [None]:
# Evaluate on each split in turn and print its accuracy (index 1 of the
# [loss, accuracy] list returned by evaluate).
scores = []
for caption, features, targets in (
    ("\n Training Accuracy:", X_train, y_train),
    ("\n Testing Accuracy:", X_test, y_test),
):
    split_score = model.evaluate(features, targets, verbose=1)
    print(caption, split_score[1])
    scores.append(split_score)

train_score, test_score = scores

#### Get confusion matrix

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# One forward pass gives the class probabilities (`p`); the hard predictions
# are the arg-max over the two softmax outputs.
p = model.predict(X_test)
y_pred = np.argmax(p, axis=1)

# y_test is one-hot encoded; recover the integer class per row.
y_true = np.argmax(y_test, axis=1)

print(classification_report(y_true, y_pred))

# labels=[0, 1] keeps the matrix 2x2 even if one class is missing from the
# test split, so the 4-way unpacking below cannot fail.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
print(cm)
print()
TN, FP, FN, TP = cm.ravel()
print("TP:", TP)
print("FN:", FN)
print("TN:", TN)
print("FP:", FP)

#### Inspect predicted positives

In [None]:
# Put the one-hot ground truth next to the predicted class probabilities.
conc = pd.concat([pd.DataFrame(y_test), pd.DataFrame(p)], axis=1)
conc.columns = ['test_0', 'test_1', 'pred_0', 'pred_1']

# Rows that are actually positive ...
subdf = conc.loc[conc['test_1'] == 1]
# ... and that the model also scores as positive (P(class 1) >= 0.5).
# The result gets its own name: overwriting `p` (the probability array) would
# make this cell fail when it is run a second time.
res = subdf.loc[subdf['pred_1'] >= 0.5]
res[:5]