In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import sklearn.decomposition
import sklearn.preprocessing
import pandas as pd
import seaborn as sb

import tsne
import multimodal_data

# Load data

In [None]:
# Input files for the two modalities (paths kept exactly as used by the project).
L1000_CSV = "replicate_level_all_alleles.csv"
CP_MORPHOLOGY_CSV = "/data1/luad/others/morphology.csv"
CP_WELL_PROFILES_CSV = "resnet18-validation-well_profiles.csv"

# L1000 profiles, loaded by the project helper.
l1k = multimodal_data.load_l1000(L1000_CSV)

# Cell Painting profiles; aggregate_replicates=False is passed through to the
# loader (presumably keeps one row per replicate -- confirm in multimodal_data).
cp = multimodal_data.load_cell_painting(
    CP_MORPHOLOGY_CSV,
    CP_WELL_PROFILES_CSV,
    aggregate_replicates=False,
)

In [None]:
# Align the two modalities with the project helper, then keep raw feature
# matrices with the first column dropped (assumed to be the "Allele" label
# column -- TODO confirm against multimodal_data).
l1k, cp = multimodal_data.align_profiles(l1k, cp, sample=0)
GE, MP = (np.asarray(frame)[:, 1:] for frame in (l1k, cp))

# Separate training and validation

In [None]:
# Split by allele (not by row) so that no allele appears in both partitions.
common_alleles = set(cp["Allele"].unique()).intersection(l1k["Allele"].unique())

# Sort before shuffling: set iteration order is not stable between kernels, so
# without this the split would differ from run to run even with a fixed seed.
genes = sorted(common_alleles, key=str)

# Seeded, local RNG keeps the split reproducible without touching the global
# NumPy random state.
SPLIT_SEED = 0
np.random.RandomState(SPLIT_SEED).shuffle(genes)

# 90% of the alleles go to training. Computing the cut as (9 * n) // 10 keeps
# the ratio for any n; the previous 9 * int(n / 10) rounded before multiplying
# (e.g. 19 alleles -> 9 train / 10 test, and < 10 alleles -> empty training set).
n_train = (9 * len(genes)) // 10
train = genes[:n_train]
test = genes[n_train:]

GE_train = l1k[l1k["Allele"].isin(train)]
MP_train = cp[cp["Allele"].isin(train)]

GE_test = l1k[l1k["Allele"].isin(test)]
MP_test = cp[cp["Allele"].isin(test)]

# Normalize inputs and outputs

In [None]:
def z_score(A, model, features):
    """Apply a fitted transformer to the feature columns of a profile table.

    Parameters
    ----------
    A : pandas.DataFrame
        Table with an "Allele" column plus the columns named in ``features``.
    model : object
        Anything exposing ``transform(frame)`` that returns one output column
        per entry of ``features`` (e.g. a fitted scaler).
    features : list of str
        Names of the columns to transform; must be a list because it is
        concatenated with ``["Allele"]`` to order the output.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index and columns
        ``["Allele"] + features``; the input frame is not modified.
    """
    labels = list(A["Allele"])
    transformed = model.transform(A[features])
    scaled = pd.DataFrame(data=transformed, columns=features)
    scaled["Allele"] = labels
    column_order = ["Allele"] + features
    return scaled[column_order]

# Feature columns are addressed by their stringified position: "0", "1", ...
ge_features = list(map(str, range(GE.shape[1])))
mp_features = list(map(str, range(MP.shape[1])))

# Gene expression: fit the scaler on the training rows only, then apply the
# same transform to both partitions so the test set does not leak into it.
sc_l1k = sklearn.preprocessing.StandardScaler().fit(GE_train[ge_features])
GE_train, GE_test = (
    z_score(part, sc_l1k, ge_features) for part in (GE_train, GE_test)
)

# Morphology: same procedure with its own scaler.
sc_cp = sklearn.preprocessing.StandardScaler().fit(MP_train[mp_features])
MP_train, MP_test = (
    z_score(part, sc_cp, mp_features) for part in (MP_train, MP_test)
)

# Create Neural Net

In [None]:
def mp2ge_net(in_size, out_size):
    """Build the fully connected network mapping morphology to gene expression.

    Three ReLU ``Dense`` blocks of widths ``in_size``, ``in_size // 2`` and
    ``out_size // 4`` are each followed by batch normalization; the second and
    third are additionally followed by dropout (rate 0.5). A final linear
    ``Dense`` layer of width ``out_size`` produces the output.

    Parameters
    ----------
    in_size : int
        Number of input features.
    out_size : int
        Number of output features.

    Returns
    -------
    tf.keras.Model
        Uncompiled model from an ``(in_size,)`` input to ``out_size`` outputs.
    """
    layers = tf.keras.layers
    inputs = layers.Input([in_size])

    # (units, followed_by_dropout) for each hidden block, in order.
    hidden_spec = (
        (in_size, False),
        (in_size // 2, True),
        (out_size // 4, True),
    )

    hidden = inputs
    for units, use_dropout in hidden_spec:
        hidden = layers.Dense(units, activation="relu")(hidden)
        hidden = layers.BatchNormalization()(hidden)
        if use_dropout:
            hidden = layers.Dropout(0.5)(hidden)

    outputs = layers.Dense(out_size, activation=None)(hidden)
    return tf.keras.Model(inputs, outputs)

In [None]:
# Input width = number of morphology features, output width = number of
# gene-expression features (both taken from the aligned raw matrices).
model = mp2ge_net(in_size=MP.shape[1], out_size=GE.shape[1])
model.summary()

# Prepare data generator

In [None]:
class MultimodalDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` yielding random batches of same-allele (A, B) pairs.

    Every row of ``modA`` is paired with every row of ``modB`` that carries the
    same "Allele" label. Batches are drawn at random from the full pair set.

    Attributes set by ``create_samples``:
        X -- float array, one row per pair, features from ``modA``
        Y -- float array, one row per pair, features from ``modB``
        Z -- list with the allele label of each pair
    """

    def __init__(self, modA, modB, batch_size=32):
        """Store the inputs and materialize all same-allele pairs.

        Parameters
        ----------
        modA, modB : pandas.DataFrame
            Tables with an "Allele" column; the first column is skipped when
            extracting features, so it is expected to be the label column.
        batch_size : int
            Maximum number of pairs returned per batch.
        """
        self.batch_size = batch_size
        self.modA = modA
        self.modB = modB
        shared = set(modA["Allele"].unique()).intersection(modB["Allele"].unique())
        self.classes = list(shared)
        self.create_samples()

    def create_samples(self):
        """Build ``X``, ``Y`` and ``Z`` with all same-allele combinations.

        Raises
        ------
        ValueError
            If the two tables share no allele (there would be nothing to pair).
        """
        blocksA = []
        blocksB = []
        labels = []
        for cl in self.classes:
            # Feature matrices for this allele, label column (first) dropped.
            # Casting to float avoids the object-dtype arrays that row-wise
            # extraction from mixed-type frames produces.
            rowsA = np.asarray(self.modA[self.modA["Allele"] == cl].iloc[:, 1:], dtype=float)
            rowsB = np.asarray(self.modB[self.modB["Allele"] == cl].iloc[:, 1:], dtype=float)
            # Cartesian product in "for a in A: for b in B" order:
            # each A row is repeated once per B row, the B block is tiled per A row.
            blocksA.append(np.repeat(rowsA, len(rowsB), axis=0))
            blocksB.append(np.tile(rowsB, (len(rowsA), 1)))
            labels.extend([cl] * (len(rowsA) * len(rowsB)))

        if not labels:
            raise ValueError("modA and modB share no 'Allele' values; no pairs to generate")

        self.X = np.concatenate(blocksA)
        self.Y = np.concatenate(blocksB)
        self.Z = labels
        print("Total pairs:", len(labels), self.X.shape, self.Y.shape)

    def __len__(self):
        """Number of batches per epoch.

        Based on the number of rows in ``modA`` (not on the number of pairs),
        and never less than one so that small tables still yield a batch.
        """
        return max(1, len(self.modA) // self.batch_size)

    def __getitem__(self, index):
        """Return one random batch ``(X, Y)``.

        ``index`` is accepted for the ``Sequence`` protocol but ignored: each
        call draws up to ``batch_size`` distinct pairs uniformly at random.
        """
        chosen = np.random.permutation(self.X.shape[0])[:self.batch_size]
        return self.X[chosen, :], self.Y[chosen, :]

# Train model

In [None]:
# Build a TF1 session that only sees the GPU listed as device "3" and enables
# the allow_growth GPU option (memory is not reserved in full up front).
configuration = tf.ConfigProto()
gpu_settings = configuration.gpu_options
gpu_settings.visible_device_list = "3"
gpu_settings.allow_growth = True
session = tf.Session(config=configuration)

# Make Keras use this session.
tf.keras.backend.set_session(session)

In [None]:
# Pair generators for the training and held-out alleles
# (input = morphology, target = gene expression).
dgen_train = MultimodalDataGenerator(MP_train, GE_train)
dgen_test = MultimodalDataGenerator(MP_test, GE_test)

# Mean absolute error with Adam, 100 epochs, validated on the held-out pairs.
model.compile(loss='mean_absolute_error', optimizer='adam')
model.fit_generator(dgen_train, validation_data=dgen_test, epochs=100)

# Make predictions

In [None]:
# Predict gene expression from the held-out morphology features
# (first column of MP_test is the "Allele" label and is skipped).
mp_test_inputs = np.asarray(MP_test)[:, 1:]
predicted_ge = pd.DataFrame(data=model.predict(mp_test_inputs), columns=ge_features)

# Put the allele label first so the layout matches GE_test.
predicted_ge.insert(0, "Allele", MP_test["Allele"])

# Flag the origin of every row, then stack measured and predicted profiles.
predicted_ge["Real"] = False
GE_test["Real"] = True

compare_ge = pd.concat([GE_test, predicted_ge], ignore_index=True)

In [None]:
# Compute tSNE on the gene-expression features of measured + predicted rows.
# Columns are selected by name rather than by position so the cell still works
# when re-run after the "X"/"Y" embedding columns have been added.
# dtype=float replaces np.float, an alias of the builtin that was removed in
# NumPy 1.24.
X = np.asarray(compare_ge[ge_features], dtype=float)
Y = tsne.tsne(X)

In [None]:
# Attach the two tSNE coordinates to the comparison table.
for axis_idx, axis_name in enumerate(("X", "Y")):
    compare_ge[axis_name] = Y[:, axis_idx]

# Scatter of the embedding, colored by measured (Real=True) vs predicted.
sb.lmplot(x="X", y="Y", hue="Real", data=compare_ge, fit_reg=False)

In [None]:
# Labelled scatter of the tSNE embedding: measured profiles in orange,
# predicted profiles in blue, both on the same axes.
fig, ax = plt.subplots(figsize=(12, 12))
p1 = sb.regplot(data=compare_ge[compare_ge["Real"]], x="X", y="Y", fit_reg=False,
                color="#FF983E", scatter_kws={'s': 50}, ax=ax)
p2 = sb.regplot(data=compare_ge[~compare_ge["Real"]], x="X", y="Y", fit_reg=False,
                color="#4B91C2", scatter_kws={'s': 50}, ax=ax)

# Annotate every other row with its allele name. This is done once: the
# previous version ran the same loop after each scatter call and therefore
# drew every label twice on top of itself.
labelled = compare_ge.iloc[::2]
for x_pos, y_pos, allele in zip(labelled["X"], labelled["Y"], labelled["Allele"]):
    ax.text(x_pos, y_pos, allele, horizontalalignment='left', size='small', color='black')