In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gc

# Load Data

In [None]:
# Read-count table downloaded from the GTEx bucket. The first two lines of the
# file are skipped, the first remaining column becomes the index, and the
# free-text "Description" column is discarded so only numeric sample columns remain.
# `columns=` is used because pandas >= 2.0 no longer accepts `axis` positionally in drop().
df = pd.read_csv(
    "https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz",
    sep="\t",
    compression="gzip",
    index_col=0,
    skiprows=2,
).drop(columns="Description")

# Scale every column by its own total so that each column sums to 1.
df = df.divide(df.sum(axis=0), axis="columns")

# The un-normalised frame is garbage now; reclaim its memory before moving on.
gc.collect()
df.head()

In [None]:
# Sample attribute table; rows are keyed by the "SAMPID" column so they can be
# looked up by sample identifier later on.
annotations_url = "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
df_files = pd.read_csv(annotations_url, sep="\t")
df_files = df_files.set_index("SAMPID")
df_files.head()

# Model

In [None]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from sklearn.model_selection import train_test_split

# Every Keras symbol is taken from the Keras that ships inside TensorFlow.
# Importing from the top-level `keras` package next to `tensorflow.keras` can
# silently mix two different Keras implementations in one model.
from tensorflow.keras.callbacks import Callback, CSVLogger, EarlyStopping
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.utils import to_categorical

# Show which devices TensorFlow can see (CPU/GPU) before training.
tf.config.experimental.list_physical_devices()

In [None]:
# Feature matrix: one row per column of `df` (i.e. per sample).
X = df.transpose().values

# "SMTS" label of every sample, in the same order as the rows of X.
sample_labels = df_files.reindex(index=df.columns)["SMTS"]
if sample_labels.isna().any():
    unlabeled = sample_labels.index[sample_labels.isna()]
    raise ValueError(
        f"{len(unlabeled)} sample(s) have no SMTS annotation, e.g. {list(unlabeled[:5])}"
    )

# Encode the labels against the complete, sorted list of SMTS values found in
# the annotation table. The class names used for the confusion matrix later in
# the notebook are derived with this very expression, so column i of Y is
# guaranteed to mean label_names[i] even when the annotation table lists a
# class that has no sample in the expression matrix (that column is then all zeros).
label_names = np.unique(df_files["SMTS"])
label_codes = pd.Categorical(sample_labels, categories=label_names).codes
Y = to_categorical(label_codes, num_classes=len(label_names))

# The DataFrame is no longer needed once X exists; free it to keep memory down.
del df
gc.collect()

In [None]:
# Hold out 20% of the samples for the final evaluation. The seed is fixed so
# that a fresh "Restart & Run All" reproduces the same partition.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
# Width of the feature matrix; passed to the first Dense layer as its input size.
train_shape = X_train.shape
inputs = train_shape[1]
print(train_shape, Y_train.shape)

In [None]:
# Per-epoch metrics are written to log.csv. The file is overwritten on every
# run (append=False): the model is rebuilt from scratch each time, so appending
# would mix the curves of unrelated runs in the loss plot that reads this file.
csv_logger = CSVLogger("log.csv", append=False, separator=",")

# Stop when the validation loss has not dropped by at least 0.1 for 5 epochs in a row.
# NOTE(review): min_delta=0.1 is a coarse threshold; confirm it is intentional.
es = EarlyStopping(monitor="val_loss", min_delta=1e-1, mode="min", patience=5)

In [None]:
# Fully connected classifier: two ReLU hidden layers (1000 and 250 units)
# followed by a softmax layer with one unit per column of Y_train.
model = Sequential()
# Only the first layer declares the input size. It has no bias term, so a
# bias initializer would be meaningless and is not passed.
model.add(Dense(units=1000, use_bias=False, input_dim=inputs, activation="relu"))
# Later layers infer their input size from the previous layer's output.
model.add(Dense(units=250, activation="relu"))
model.add(Dense(units=Y_train.shape[1], activation="softmax"))

# `learning_rate` replaces the deprecated `lr` argument of the optimizer.
model.compile(
    loss=categorical_crossentropy,
    optimizer=Adam(learning_rate=0.01),
    metrics=["accuracy"],
)

model.summary()

In [None]:
# Train on the GPU, holding back 20% of the training rows for validation
# (this drives the early-stopping callback and the val_loss curve in log.csv).
# `use_multiprocessing` / `workers` are not passed: they only apply to
# generator or Sequence inputs, are ignored for in-memory arrays, and are
# rejected by newer Keras releases.
with tf.device("GPU"):
    model.fit(
        X_train,
        Y_train,
        epochs=100,
        batch_size=500,
        verbose=1,
        validation_split=0.2,
        callbacks=[csv_logger, es],
        shuffle=True,
    )

In [None]:
# Score the trained model on the held-out split; the cell output shows the
# loss followed by the metrics given at compile time.
model.evaluate(x=X_test, y=Y_test)

In [None]:
# Persist the trained network as an HDF5 file at a fixed location.
# NOTE(review): the path looks like a mounted Google Drive folder — confirm it
# is mounted and the directory exists before running this cell.
model_path = "/content/drive/My Drive/GTex_ML/GTex_ML.h5"
model.save(model_path)

In [None]:
# Training vs. validation loss, read back from the CSV written during training.
fig, ax = plt.subplots()
pd.read_csv("log.csv", sep=",")[["loss", "val_loss"]].plot(ax=ax)

# Label the figure so it can be read on its own, outside the notebook.
ax.set_xlabel("Epoch (row of log.csv)")
ax.set_ylabel("Loss")
ax.set_title("Training and validation loss")

fig.savefig("/content/drive/My Drive/GTex_ML/GTex_ML_losses.pdf")

In [None]:
# Sorted, de-duplicated "SMTS" values from the whole annotation table; used as
# row/column names of the confusion matrix.
# NOTE(review): classes[i] is later paired with column i of Y. That only holds
# if the one-hot encoding of Y was built from this same set of values — confirm.
smts_column = df_files["SMTS"]
classes = np.unique(smts_column)

In [None]:
# Confusion matrix of raw counts: rows are the true class, columns the predicted class.
true_idx = Y_test.argmax(axis=1)
# `Sequential.predict_classes` no longer exists in current Keras; for a softmax
# output it was equivalent to taking the argmax of the predicted probabilities.
pred_idx = model.predict(X_test).argmax(axis=1)

# Accumulate all (true, predicted) pairs at once; np.add.at handles repeated
# index pairs correctly, unlike plain fancy-index assignment.
counts = np.zeros((len(classes), len(classes)), dtype=int)
np.add.at(counts, (true_idx, pred_idx), 1)

# Integer-typed frame (the fillna(0) route starts from an object-dtype frame).
results = pd.DataFrame(counts, index=classes, columns=classes)

In [None]:
# Normalise every row by its total so each row shows how the samples of one
# true class are distributed over the predicted classes. A class with no test
# samples has a zero total and therefore yields an empty (NaN) row.
row_totals = results.sum(axis=1)
normalised = results.divide(row_totals, axis=0)

fig, ax = plt.subplots()
sns.heatmap(normalised, annot=False, ax=ax)
ax.set_xlabel("Predicted class")
ax.set_ylabel("True class")
ax.set_title("Row-normalised confusion matrix (test set)")

# bbox_inches="tight" keeps the long tick labels from being clipped in the PDF.
fig.savefig("/content/drive/My Drive/GTex_ML/GTex_ML_classes.pdf", bbox_inches="tight")