In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import itertools

import tensorflow as tf

from keras.models import *
from keras.layers import *
from keras.optimizers import *

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

In [None]:
# Exploratory peek: read the merged training CSV and display its first rows.
# `df` is only used for inspection in this cell and the next one; the training
# functions below re-read the file themselves.
df = pd.read_csv('../raw_data/gancy/merged_training.csv')
df.head()

In [None]:
# Show every column name of `df` as a NumPy array.
df.columns.values

In [None]:
def load_data(filename, features, random_state=None):
    """Load a labelled CSV, shuffle its rows and one-hot encode ``target``.

    Parameters
    ----------
    filename : str
        Path of the CSV file. It must contain every column named in
        ``features`` plus a ``target`` column.
    features : list of str
        Columns used as model inputs, in the order they appear in ``X``.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample``. The default (``None``) keeps
        the unseeded shuffle; pass an int to get a reproducible row order.

    Returns
    -------
    X : numpy.ndarray
        Float array of shape ``(n_rows, len(features))``.
    y : numpy.ndarray
        One-hot array of shape ``(n_rows, n_classes)``; column ``j``
        corresponds to ``targets[j]``.
    targets : array-like
        Sorted unique target labels as returned by ``pd.factorize``.

    Raises
    ------
    ValueError
        If any row has a missing ``target`` value.
    """
    df = pd.read_csv(filename)
    # Shuffle all rows; the fresh 0..n-1 index keeps X and y aligned.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    X = np.asarray(df[features], dtype='float')

    # Get targets to one-hot encoding
    codes, targets = pd.factorize(df['target'], sort=True)
    # factorize marks missing labels with -1. Used as a column index that
    # would silently label the row as the *last* class, so fail loudly instead.
    if (codes < 0).any():
        raise ValueError("'target' column of %s contains missing values" % filename)

    # Size the matrix from the label set itself rather than from the codes.
    y = np.zeros((codes.shape[0], len(targets)))
    y[np.arange(codes.shape[0]), codes] = 1

    return X, y, targets

def load_validation_data(filename, features):
    """Read an unlabelled CSV and return its shuffled features and cell codes.

    The rows are shuffled once and re-indexed from 0, so row ``k`` of the
    returned feature matrix belongs to the ``k``-th returned cell code.

    Parameters
    ----------
    filename : str
        Path of the CSV file; it must contain the ``features`` columns and a
        ``cell_code`` column.
    features : list of str
        Columns to extract, in output column order.

    Returns
    -------
    (numpy.ndarray, pandas.Series)
        Float feature matrix and the matching ``cell_code`` column.
    """
    shuffled = pd.read_csv(filename).sample(frac=1).reset_index(drop=True)

    cell_codes = shuffled['cell_code']
    feature_matrix = np.asarray(shuffled[features], dtype=float)

    return feature_matrix, cell_codes

In [None]:
def train_NN_model_no_test(X_train, y_train, param, targets):
    """Build and fit the 30-20 dense softmax classifier without a test set.

    Parameters
    ----------
    X_train : numpy.ndarray
        Training inputs; ``X_train.shape[1]`` sets the input width.
    y_train : numpy.ndarray
        One-hot training labels.
    param : dict
        Hyper-parameters. Keys read here: ``'num_epochs'``, ``'batch_size'``
        and ``'percent_to_valid'`` (passed to Keras as ``validation_split``).
    targets : sequence
        Class labels; only its length is used, to size the output layer.

    Returns
    -------
    The fitted Keras ``Sequential`` model.
    """
    # Hyper-parameters
    input_units = X_train.shape[1]
    output_units = len(targets)

    # Construct NN architecture
    model = Sequential()
    model.add(Dense(30, input_dim=input_units, kernel_initializer='glorot_normal'))
    # Use the layer name exported by `from keras.layers import *` directly
    # instead of reaching through the `normalization` submodule.
    model.add(BatchNormalization(axis=1))
    model.add(Activation('relu'))
    #model.add(Dropout(0.1))
    model.add(Dense(20, kernel_initializer='glorot_normal'))
    model.add(BatchNormalization(axis=1))
    model.add(Activation('relu'))
    #model.add(Dropout(0.1))
    model.add(Dense(output_units, kernel_initializer='glorot_normal'))
    model.add(Activation('softmax'))

    # Optimization method
    model.compile(loss='categorical_crossentropy', optimizer=Adadelta(), metrics=['accuracy'])

    # Fit training data. Read the hyper-parameters from the `param` argument,
    # not from a notebook-level `params` global, so the caller's values are
    # the ones actually used.
    model.fit(X_train, y_train, epochs=param['num_epochs'],
              batch_size=param['batch_size'], validation_split=param['percent_to_valid'])

    return model

In [None]:
def train_NN_model(X_train, X_test, y_train, y_test, param, targets):
    """Build, fit and evaluate the 30-20 dense softmax classifier.

    After fitting, the model is scored on the held-out test set: loss and
    accuracy from ``model.evaluate`` and the macro F1-score are printed, and a
    warning lists any true class that was never predicted.

    Parameters
    ----------
    X_train, y_train : numpy.ndarray
        Training inputs and one-hot training labels.
    X_test, y_test : numpy.ndarray
        Test inputs and one-hot test labels.
    param : dict
        Hyper-parameters. Keys read here: ``'num_epochs'``, ``'batch_size'``
        and ``'percent_to_valid'`` (passed to Keras as ``validation_split``).
    targets : sequence
        Class labels, indexable by class number; ``len(targets)`` sizes the
        output layer.

    Returns
    -------
    The fitted Keras ``Sequential`` model.
    """
    # Hyper-parameters
    input_units = X_train.shape[1]
    output_units = len(targets)

    # Construct NN architecture
    model = Sequential()
    model.add(Dense(30, input_dim=input_units, kernel_initializer='glorot_normal'))
    # Use the layer name exported by `from keras.layers import *` directly
    # instead of reaching through the `normalization` submodule.
    model.add(BatchNormalization(axis=1))
    model.add(Activation('relu'))
    #model.add(Dropout(0.1))
    model.add(Dense(20, kernel_initializer='glorot_normal'))
    model.add(BatchNormalization(axis=1))
    model.add(Activation('relu'))
    #model.add(Dropout(0.1))
    model.add(Dense(output_units, kernel_initializer='glorot_normal'))
    model.add(Activation('softmax'))

    # Optimization method
    model.compile(loss='categorical_crossentropy', optimizer=Adadelta(), metrics=['accuracy'])

    # Fit training data. Hyper-parameters come from the `param` argument, not
    # from a notebook-level `params` global.
    model.fit(X_train, y_train, epochs=param['num_epochs'],
              batch_size=param['batch_size'], validation_split=param['percent_to_valid'])

    # Evaluate model
    score = model.evaluate(X_test, y_test, batch_size=param['batch_size'])
    print('\nTest %s: %.2f' % (model.metrics_names[0], score[0]))
    print('Test %s: %.2f%%' % (model.metrics_names[1], score[1]*100))

    # Make predictions and map class indices back to their labels
    y_prob = model.predict(X_test, batch_size=param['batch_size'])
    y_pred = [targets[i] for i in np.argmax(y_prob, axis=1)]
    y_true = [targets[i] for i in np.argmax(y_test, axis=1)]

    # Print prediction performance on F1-score
    f1 = f1_score(y_true, y_pred, average='macro')
    print('F1-Score: %.2f' % f1)

    # Classes present in the test labels that the model never predicted
    unpredicted_classes = list(set(y_true) - set(y_pred))
    if unpredicted_classes:
        print('THESE CLASSES ARE NOT PREDICTED - SCORING WILL FAIL!')
        print(unpredicted_classes)

    # Plot confusion matrix (disabled)
    #cm = confusion_matrix(y_true, y_pred, labels=targets)
    #plt.figure(figsize=(10, 10))
    #plt.imshow(cm, interpolation='nearest', cmap=plt.cm.viridis)
    #tick_marks = np.arange(len(targets))
    #plt.xticks(tick_marks, targets, rotation=45, ha='right')
    #plt.yticks(tick_marks, targets)
    #for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    #    plt.text(j, i, cm[i, j], horizontalalignment='center', color='black')

    #plt.ylabel('True label')
    #plt.xlabel('Predicted label')

    return model

In [None]:
def save_predictions(validation_filename, features, output_prefix, targets, runn,
                     model=None, batch_size=None):
    """Predict class probabilities for the validation set and write them to CSV.

    The output file is ``<output_prefix>_probabilities_run-<runn>.csv`` with a
    ``cell_code`` column followed by one probability column per entry of
    ``targets``.

    Parameters
    ----------
    validation_filename : str
        CSV passed to ``load_validation_data``.
    features : list of str
        Feature columns passed to ``load_validation_data``.
    output_prefix : str
        Path prefix of the output file; its directory is created if missing.
    targets : sequence
        Class labels, in the column order of the model's output.
    runn : int
        Run number embedded in the file name.
    model : optional
        Object with a Keras-style ``predict(X, batch_size=...)`` method.
        When omitted, the notebook-level ``model`` variable is used.
    batch_size : int, optional
        Prediction batch size. When omitted, the notebook-level
        ``params['batch_size']`` is used.

    Raises
    ------
    NameError
        If ``model`` is omitted and no notebook-level ``model`` exists.
    """
    import os  # local import: `os` is not imported by the notebook's import cell

    # Fall back to the notebook globals so existing five-argument calls keep working.
    if model is None:
        model = globals().get('model')
        if model is None:
            raise NameError("no 'model' argument given and no global 'model' is defined")
    if batch_size is None:
        batch_size = params['batch_size']

    # Load validation data
    X_valid, valid_cell_codes = load_validation_data(validation_filename, features)

    # Predict target probabilities. Label the columns with the `targets`
    # argument rather than a `target_names` global.
    y_valid_prob = model.predict(X_valid, batch_size=batch_size)
    df_valid_prob = pd.DataFrame(columns=targets, data=y_valid_prob)
    df_valid_prob.insert(0, 'cell_code', valid_cell_codes)

    # Predictions on maximum probabilities (kept for the optional save below)
    df_valid_pred = pd.DataFrame(columns = ['cell_code', 'prediction'])
    df_valid_pred['cell_code'] = df_valid_prob['cell_code']
    df_valid_pred['prediction'] = [targets[i] for i in np.argmax(y_valid_prob, axis=1)]

    # Make sure the output directory exists so a finished run is not lost.
    out_dir = os.path.dirname(output_prefix)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # Save predictions
    df_valid_prob.to_csv('%s_probabilities_run-%d.csv' % (output_prefix, runn), index=False)
    #df_valid_pred.to_csv('%s_run-%d.csv' % (output_prefix, runn), index=False)

In [None]:
# Input options

# CSV locations, relative to the notebook's working directory.
# `testing_filename` is only referenced by the commented-out test path in the
# training loop.
training_filename = '../raw_data/gancy/merged_training.csv'
testing_filename = 'feature_data/testing_feature_select_robust.csv'
validation_filename = '../raw_data/gancy/merged_validation.csv'

# Columns fed to the network, in input order.
features = [
    # columns prefixed 'actin.'
    'actin.s.area',
    'actin.s.radius.mean',
    'actin.s.radius.sd',
    'actin.s.radius.min',
    'actin.b.sd',
    'actin.b.mad',
    'actin.b.q005',
    'actin.b.q01',
    'actin.b.q05',
    'actin.m.cx',
    'actin.m.eccentricity',
    'actin.m.theta',
    # columns prefixed 'DNA.'
    'DNA.s.area',
    'DNA.s.radius.sd',
    'DNA.s.radius.min',
    'DNA.b.sd',
    'DNA.b.mad',
    'DNA.b.q005',
    'DNA.b.q05',
    'DNA.m.cy',
    'DNA.m.majoraxis',
    'DNA.m.eccentricity',
    'DNA.m.theta',
    # remaining named columns
    'dist.10.nn',
    'dist.30.nn',
    'nuclear.displacement',
    # columns prefixed 'FC2_'
    'FC2_14', 'FC2_46', 'FC2_78', 'FC2_86', 'FC2_91',
    'FC2_125', 'FC2_142', 'FC2_157',
    'FC2_257', 'FC2_260', 'FC2_262', 'FC2_283',
    'FC2_308', 'FC2_335', 'FC2_352', 'FC2_398',
    'FC2_401', 'FC2_473', 'FC2_477',
]

# Training hyper-parameters read by the train_NN_model* functions.
params = {
    'percent_to_valid': 0.2,   # passed to Keras fit() as validation_split
    'percent_to_test': 0.1,    # not read by any code visible in this notebook
    'num_epochs': 50,
    'batch_size': 256,
}

In [None]:
# Train 100 independent networks; each run reloads (and reshuffles) the
# training data, fits a fresh model and writes that run's validation
# probabilities to its own CSV.
output_prefix = 'combined_data_predictions_NN/combined_features'

for run in range(100):
    # `model` and `target_names` stay notebook-level names on purpose:
    # save_predictions() reads them.
    X_train, y_train, target_names = load_data(training_filename, features)

    # Alternative path with a held-out test set (disabled):
    #X_test, y_test, target_names = load_data(testing_filename, features)
    #model = train_NN_model(X_train, X_test, y_train, y_test, params, target_names)
    model = train_NN_model_no_test(X_train, y_train, params, target_names)

    save_predictions(validation_filename, features, output_prefix, target_names, run)