<center><b>TensorFlow Keras model to train on the GTEx data</b></center>

#### __Aim__: To build a TensorFlow Keras model and train it on the available GTEx gene expression data to predict the age group of the person each expression sample came from.

In [5]:
#Import the required packages

import os
from pathlib import Path
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model
from sklearn.preprocessing import LabelEncoder

#### Extract the meta information from the data

In [10]:
def GTEx_sample_shrinker(meta,by_col,n=20,random_state=None):
    """Down-sample a metadata table to at most ``n`` rows per group.

    Rows are grouped by the distinct values of ``meta[by_col]`` (in order of
    first appearance) and up to ``n`` rows are drawn at random, without
    replacement, from each group.

    Parameters
    ----------
    meta : pandas.DataFrame
        Table to down-sample.
    by_col : str
        Name of the column whose values define the groups.
    n : int, default 20
        Maximum number of rows kept per group; smaller groups are kept whole.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for every group. Pass a fixed value
        to get the same selection on every run; ``None`` (the default) keeps
        the original, non-reproducible sampling.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of all groups concatenated, keeping their original
        index labels. An empty ``meta`` yields an empty frame with the same
        columns instead of raising.
    """
    sampled_groups=[]
    for value in meta[by_col].unique():
        group=meta[meta[by_col]==value]
        # Never ask for more rows than the group holds: sample() raises otherwise.
        n_keep=min(len(group),n)
        sampled_groups.append(group.sample(n_keep,random_state=random_state))
    if not sampled_groups:
        # pd.concat([]) raises ValueError; hand back an empty copy instead.
        return meta.iloc[0:0].copy()
    return pd.concat(sampled_groups)

# Resolve the data folder relative to the directory the notebook runs in.
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, "data")

# File names of the GTEx inputs that are looked up inside data_dir.
manifest = dict(
    data="All_Tissue_Site_Details.combined.reads.gct",
    sample_meta="GTEx_v7_Annotations_SampleAttributesDS.txt",
    subject_meta="GTEx_v7_Annotations_SubjectPhenotypesDS.txt",
    merged_meta="merged_meta.tsv",
)

# Load the merged metadata table; SMUBRID is read as a generic object column.
meta = pd.read_csv(
    os.path.join(data_dir, manifest["merged_meta"]),
    sep="\t",
    dtype={"SMUBRID": object},
)

# Keep at most 20 randomly drawn rows per SMTS value, then write both the
# full filtered table and the SAMPID column on its own next to the inputs.
y = GTEx_sample_shrinker(meta, "SMTS", n=20)
y.to_csv(os.path.join(data_dir, "filteredMeta.tsv"), sep="\t", index=False)
y["SAMPID"].to_csv(os.path.join(data_dir, "filteredSAMPID.tsv"), sep="\t", index=False)

#### Identify all the unique tissue types before building the DL model

In [16]:
# Path of the merged metadata table, taken from the manifest so the file name
# is defined in one place only.
fileName = os.path.join(data_dir, manifest['merged_meta'])
meta = pd.read_csv(fileName, sep="\t", dtype={'SMUBRID': object, 'SEX': object, 'DTHHRDY': object})

# Remove all samples without an age.
meta = meta[meta['AGE'].notnull()]

# Extract only the tissue types with count > 200.
# value_counts() names its result after the column on pandas < 2.0 but 'count'
# on pandas >= 2.0 (where the index is additionally named 'SMTS'). Pinning the
# column name and clearing the index name keeps `counts['SMTS']` valid on both.
counts = meta['SMTS'].value_counts().rename('SMTS').rename_axis(None).to_frame()
df = meta[meta['SMTS'].isin(counts[counts['SMTS'] > 200].index)]

# Identify the unique tissue types, in order of first appearance.
tissue_types = df['SMTS'].unique()

print("Unique tissue types in the GTEx data: ", list(tissue_types))

Unique tissue types in the GTEx data:  ['Adipose Tissue', 'Blood Vessel', 'Brain', 'Breast', 'Blood', 'Skin', 'Colon', 'Esophagus', 'Heart', 'Lung', 'Muscle', 'Nerve', 'Pancreas', 'Stomach', 'Testis', 'Thyroid']


#### Building the Keras model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model

class keras_model:
    """Small dense Keras classifier that is built, trained and saved per tissue.

    ``model_train`` is the entry point: it builds a fresh model for the given
    data, fits it with early stopping, evaluates it on the training data and
    saves it as an ``.h5`` file inside ``DATA_DIR``.
    """

    # Folder (relative to the working directory) the trained models are saved to.
    DATA_DIR = "keras_models"

    def __init__(self):
        # Stop fitting after 3 epochs without improvement of the quantity
        # EarlyStopping monitors by default.
        self.early_stopping_monitor = EarlyStopping(patience=3) #Hyperparameter tuning

    @staticmethod
    def _model_file_name(tissue):
        """Return the file name used to persist the model of ``tissue``."""
        return tissue + "_keras_model.h5"

    def construct_model(self, x_train, y_train):
        """Build and compile the network, storing it in ``self.model``.

        Parameters
        ----------
        x_train : array-like with a ``shape`` attribute
            Training inputs; ``x_train.shape[1]`` is passed as ``input_dim``.
        y_train : array-like with a ``shape`` attribute
            Training targets; ``y_train.shape[1]`` sets the number of output
            units, so the targets are presumably one-hot encoded age groups
            (TODO confirm against the caller).
        """
        self.model = tf.keras.models.Sequential() #Sequential model
        self.model.add(tf.keras.layers.Flatten())
        # NOTE(review): input_dim is given on the second layer, after Flatten;
        # confirm it has the intended effect with the Keras version in use.
        self.model.add(tf.keras.layers.Dense(1024, input_dim=x_train.shape[1], activation=tf.nn.relu))
        self.model.add(tf.keras.layers.Dense(512, activation=tf.nn.relu))
        #Note: Output layer is designed to hold the number of neurons equivalent to the number of classes of age groups
        self.model.add(tf.keras.layers.Dense(y_train.shape[1], activation=tf.nn.softmax))

        self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    def model_train(self, x_train, y_train, tissue=None):
        """Build, fit, evaluate and save a model for one tissue type.

        Parameters
        ----------
        x_train, y_train : array-like with a ``shape`` attribute
            Training inputs and targets; they must have the same number of rows.
        tissue : str, optional
            Tissue name used as the prefix of the saved file. When omitted, the
            notebook-level variable ``TISSUE`` is used, as in earlier versions
            of this method.

        Returns
        -------
        tuple
            ``(accuracy, file_name)`` on success, ``(accuracy, None)`` when the
            model was trained but could not be saved, and ``(-1, None)`` when
            the row counts differ or building/fitting/evaluating failed.
        """
        #Checking shape since there are tissues with missing gene expressions
        if x_train.shape[0] != y_train.shape[0]:
            print("Shape mismatch encountered!")
            return -1, None

        try:
            print("Training Keras model...")

            self.construct_model(x_train, y_train) #Constructing the model
            self.model.fit(x_train,
                           y_train,
                           batch_size=32, #Setting batch size for ease of processing in local machines
                           epochs=30, #Maximum of 30 epochs
                           validation_split=0.1,
                           callbacks=[self.early_stopping_monitor]) #Early stopping hyperparameter

            print("Evaluating training accuracy...")
            _loss, accuracy = self.model.evaluate(x_train, y_train)
        except Exception as exc:
            # Report the cause instead of hiding it; KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            print("Exception while processing!", exc)
            return -1, None

        try:
            #Persisting the model trained for the corresponding tissue type
            if tissue is None:
                # Legacy fallback: rely on the notebook-level TISSUE variable.
                # A missing TISSUE raises NameError, reported just below.
                tissue = TISSUE
            fileName = self._model_file_name(tissue)
            # The target folder may not exist yet on a fresh checkout.
            os.makedirs(self.DATA_DIR, exist_ok=True)
            filePath = os.path.join(self.DATA_DIR, fileName)
            self.model.save(filePath)
            return accuracy, fileName
        except Exception as exc:
            print("Exception while saving the model.", exc)
            return accuracy, None