<center><b>Tensorflow Keras model to train on the GTEx data</b></center>

#### __Aim__: To build a TensorFlow Keras model and train it on the available gene expression data to predict the age group of the donor each expression sample was taken from.

In [1]:
#Import the required packages

import os
from pathlib import Path
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

#### Extract the meta information from the data

In [2]:
def GTEx_sample_shrinker(meta,by_col,n=20,random_state=None):
    """Down-sample ``meta`` to at most ``n`` randomly chosen rows per value of ``by_col``.

    Parameters
    ----------
    meta : pandas.DataFrame
        Table to sample from.
    by_col : str
        Column whose distinct values define the groups.
    n : int, default 20
        Maximum number of rows kept per group. Groups with fewer than ``n``
        rows are kept whole.
    random_state : int, numpy.random.RandomState or None, default None
        Seed (or generator) for the row sampling. ``None`` keeps the previous
        behaviour of drawing from numpy's global random state, so results
        differ between runs; pass an int for a reproducible selection.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, grouped in order of first appearance of each
        ``by_col`` value. Rows whose ``by_col`` value is missing are not
        included (NaN never compares equal to itself). An empty frame with
        the columns of ``meta`` is returned when nothing can be sampled.
    """
    # Build ONE generator up front: passing the same int seed to every
    # per-group sample() call would restart the random stream for each group.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    sampled_groups = []
    for value in meta[by_col].unique():
        group = meta[meta[by_col] == value]
        if group.empty:
            # Only happens for missing values; nothing to sample from.
            continue
        sampled_groups.append(group.sample(min(len(group), n), random_state=rng))

    if not sampled_groups:
        # pd.concat([]) raises ValueError; return an empty frame instead.
        return meta.iloc[0:0].copy()
    return pd.concat(sampled_groups)

# All inputs/outputs of this notebook live in ./data below the working directory.
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, "data")

# Logical name -> file name of each GTEx input inside data_dir.
manifest = {
    "data": "All_Tissue_Site_Details.combined.reads.gct",
    "sample_meta": "GTEx_v7_Annotations_SampleAttributesDS.txt",
    "subject_meta": "GTEx_v7_Annotations_SubjectPhenotypesDS.txt",
    "merged_meta": "merged_meta.tsv",
}

# SMUBRID is read as object rather than letting pandas infer a dtype for it.
meta = pd.read_csv(
    os.path.join(data_dir, manifest['merged_meta']),
    sep="\t",
    dtype={'SMUBRID': object},
)

# Keep at most 20 randomly chosen rows per SMTS value (tissue type).
y = GTEx_sample_shrinker(meta, by_col='SMTS', n=20)

# Persist the reduced metadata table and, separately, just its sample ids.
y.to_csv(os.path.join(data_dir, "filteredMeta.tsv"), sep="\t", index=False)
y.loc[:, 'SAMPID'].to_csv(os.path.join(data_dir, "filteredSAMPID.tsv"), sep="\t", index=False)

#### Identify all the unique tissue types before building the DL model

In [3]:
# Path of the merged metadata table; resolved through the manifest so the
# file name is defined in exactly one place.
fileName = os.path.join(data_dir, manifest['merged_meta'])
meta = pd.read_csv(fileName, sep="\t", dtype={'SMUBRID': object, 'SEX': object, 'DTHHRDY': object})

# AGE is the prediction target, so samples without it cannot be used.
meta = meta[meta['AGE'].notnull()]

# Keep only the tissue types with more than 200 samples.
# to_frame(name='SMTS') pins the name of the counts column: wrapping
# value_counts() in pd.DataFrame() yields a column called 'SMTS' on older
# pandas but 'count' on pandas >= 2.0, which made counts['SMTS'] a KeyError.
counts = meta['SMTS'].value_counts().to_frame(name='SMTS')
df = meta[meta['SMTS'].isin(counts[counts['SMTS'] > 200].index)]

# Tissue types that survive the filter, in order of first appearance.
tissue_types = df['SMTS'].unique()

print("Unique tissue types in the GTEx data: ", list(tissue_types))

Unique tissue types in the GTEx data:  ['Adipose Tissue', 'Blood Vessel', 'Brain', 'Breast', 'Blood', 'Skin', 'Colon', 'Esophagus', 'Heart', 'Lung', 'Muscle', 'Nerve', 'Pancreas', 'Stomach', 'Testis', 'Thyroid']


#### Building the keras model

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model

class keras_model:
    """Fully connected softmax classifier that is built, trained and saved per tissue type.

    A fresh network is constructed on every ``model_train`` call; the trained
    model is kept on ``self.model`` and written to ``DATA_DIR`` as an ``.h5`` file.
    """

    # Folder (relative to the working directory) the trained models are saved in.
    DATA_DIR = "keras_models"

    def __init__(self):
        # Early stopping: end training after 3 epochs without improvement of
        # the monitored metric (hyperparameter).
        self.early_stopping_monitor = EarlyStopping(patience=3)

    def construct_model(self, x_train, y_train):
        """Build and compile the network, storing it on ``self.model``.

        Parameters
        ----------
        x_train : array-like with a ``shape`` attribute
            Only ``x_train.shape[1]`` (number of input features) is read.
        y_train : array-like with a ``shape`` attribute
            One-hot encoded targets; ``y_train.shape[1]`` is the number of
            classes and therefore the width of the softmax output layer.
        """
        self.model = tf.keras.models.Sequential()
        self.model.add(tf.keras.layers.Flatten())
        self.model.add(tf.keras.layers.Dense(1024, input_dim=x_train.shape[1], activation=tf.nn.relu))
        self.model.add(tf.keras.layers.Dense(512, activation=tf.nn.relu))
        # Output layer: one neuron per age-group class.
        self.model.add(tf.keras.layers.Dense(y_train.shape[1], activation=tf.nn.softmax))

        self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    def model_train(self, x_train, y_train, tissue=None):
        """Train a new model, evaluate it on the training data and save it.

        Parameters
        ----------
        x_train : numpy.ndarray
            Feature matrix, one row per sample.
        y_train : numpy.ndarray
            One-hot encoded targets, one row per sample.
        tissue : str, optional
            Tissue name used as prefix of the saved file
            (``<tissue>_keras_model.h5``). When omitted, the module-level
            variable ``TISSUE`` is used if it exists, which is how existing
            callers pass the name.

        Returns
        -------
        tuple
            ``(accuracy, file_name)`` on success, ``(accuracy, None)`` when the
            model was trained but could not be saved, and ``(-1, None)`` when
            the row counts of ``x_train`` and ``y_train`` differ or training
            failed.
        """
        # Row counts can disagree when expression data is missing for some
        # samples of a tissue; training on misaligned data is pointless.
        if x_train.shape[0] != y_train.shape[0]:
            print("Shape mismatch encountered!")
            return -1, None

        try:
            print("Training Keras model...")

            self.construct_model(x_train, y_train)
            self.model.fit(x_train,
                           y_train,
                           batch_size=32,  # small batches so it runs on local machines
                           epochs=30,  # upper bound; early stopping usually ends sooner
                           validation_split=0.1,
                           callbacks=[self.early_stopping_monitor])

            print("Evaluating training accuracy...")
            loss, accuracy = self.model.evaluate(x_train, y_train)
        except Exception as exc:
            # Best effort: report the failure (with its cause) and let the
            # caller continue with the next tissue. KeyboardInterrupt is not
            # an Exception subclass, so interrupting the kernel still works.
            print("Exception while processing!", exc)
            return -1, None

        # Backward compatibility: fall back to the notebook-level TISSUE variable.
        if tissue is None:
            tissue = globals().get("TISSUE")

        try:
            if tissue is None:
                raise ValueError("no tissue name available for the model file name")
            # Persist the model trained for the corresponding tissue type.
            fileName = str(tissue) + "_keras_model.h5"
            os.makedirs(self.DATA_DIR, exist_ok=True)  # saving fails if the folder is missing
            filePath = os.path.join(self.DATA_DIR, fileName)
            self.model.save(filePath)
            return accuracy, fileName
        except Exception as exc:
            print("Exception while saving the model.", exc)
            return accuracy, None

#### Train the model for each tissue type

In [5]:
tissue_specific_path = "tissue-specific"

# Parallel result lists: one entry per tissue whose model was trained.
tissue_type = []
tissue_model_persist = []
tissue_model_accuracy = []

# The directory listing does not change between tissues, so read it once.
# Sorted because os.listdir() order is unspecified and only the first
# matching file per tissue is used.
infiles = sorted(os.listdir(tissue_specific_path))

for tissue in tissue_types:
    k_model = keras_model()
    TISSUE = tissue  # keras_model.model_train reads this module-level name when saving

    # Tissue names can be substrings of one another ("Blood" is contained in
    # "Blood Vessel"), so a plain substring match may pick up another
    # tissue's file. Exclude files that match a longer tissue name.
    longer_names = [t for t in tissue_types if t != TISSUE and TISSUE in t]
    TISSUE_files = [f for f in infiles
                    if TISSUE in f and not any(other in f for other in longer_names)]

    for entry in TISSUE_files:
        # Only the file with the _cpm suffix is used; cpm stands for Counts Per Million.
        if "_cpm" not in entry:
            continue

        pdd = pd.read_csv(os.path.join(tissue_specific_path, entry), sep='\t')

        print("Tissue type: ", TISSUE)

        # Dropping the first (id) column since it plays no role in classification.
        pdd = pdd.drop(pdd.columns[0], axis='columns')

        # Min-max normalization of every remaining column.
        min_max_scaler = preprocessing.MinMaxScaler()
        np_scaled = min_max_scaler.fit_transform(pdd)
        pdd = pd.DataFrame(np_scaled)

        # DataFrame.as_matrix() was removed in pandas 1.0; .values is available in all versions.
        numpy_matrix = pdd.values

        # Categorizing the target column and performing one-hot encoding.
        # NOTE(review): rows of the expression matrix and rows of tissue_meta
        # are assumed to be in the same sample order; only the row COUNT is
        # checked (inside model_train). TODO confirm or align on sample id.
        tissue_meta = meta[meta['SMTS'] == TISSUE]
        encoder = LabelEncoder()
        encoded_Y = encoder.fit_transform(tissue_meta['AGE'])
        dummy_y = tf.keras.utils.to_categorical(encoded_Y)

        # Training the model for the current tissue type; acc == -1 signals
        # that nothing was trained (shape mismatch or training failure).
        acc, fileName = k_model.model_train(numpy_matrix, dummy_y)
        if acc != -1:
            acc = acc * 100  # fraction -> percent
            tissue_type.append(TISSUE)
            tissue_model_persist.append(fileName)
            tissue_model_accuracy.append(acc)
            print("Final accuracy:", acc)
        print("\n")
        break  # one _cpm file per tissue is enough

Tissue type:  Adipose Tissue


  return self.partial_fit(X, y)


Training Keras model...
Instructions for updating:
Colocations handled automatically by placer.




Train on 717 samples, validate on 80 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Evaluating training accuracy...
Final accuracy: 30.740275979042053


Tissue type:  Blood Vessel


  return self.partial_fit(X, y)


Training Keras model...
Train on 821 samples, validate on 92 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Evaluating training accuracy...
Final accuracy: 29.901424050331116


Tissue type:  Brain


  return self.partial_fit(X, y)


Training Keras model...
Train on 297 samples, validate on 34 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Evaluating training accuracy...
Final accuracy: 45.92145085334778


Tissue type:  Breast


  return self.partial_fit(X, y)


Training Keras model...
Train on 261 samples, validate on 29 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Evaluating training accuracy...
Final accuracy: 30.344828963279724


Tissue type:  Blood


  return self.partial_fit(X, y)


Shape mismatch encountered!


Tissue type:  Skin


  return self.partial_fit(X, y)


Training Keras model...
Train on 1081 samples, validate on 121 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Evaluating training accuracy...
Final accuracy: 33.1114798784256


Tissue type:  Colon


  return self.partial_fit(X, y)


Training Keras model...
Train on 456 samples, validate on 51 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Evaluating training accuracy...
Final accuracy: 27.613413333892822


Tissue type:  Esophagus


  return self.partial_fit(X, y)


Training Keras model...
Train on 918 samples, validate on 103 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Evaluating training accuracy...
Final accuracy: 34.867775440216064


Tissue type:  Heart


  return self.partial_fit(X, y)


Training Keras model...
Train on 540 samples, validate on 60 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Evaluating training accuracy...
Final accuracy: 36.666667461395264


Tissue type:  Lung


  return self.partial_fit(X, y)


Training Keras model...
Train on 384 samples, validate on 43 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Evaluating training accuracy...
Final accuracy: 32.55269229412079


Tissue type:  Muscle


  return self.partial_fit(X, y)


Training Keras model...
Train on 507 samples, validate on 57 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Evaluating training accuracy...
Final accuracy: 33.33333432674408


Tissue type:  Nerve


  return self.partial_fit(X, y)


Training Keras model...
Train on 372 samples, validate on 42 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Evaluating training accuracy...
Final accuracy: 32.12560415267944


Tissue type:  Pancreas


  return self.partial_fit(X, y)


Training Keras model...
Train on 223 samples, validate on 25 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Evaluating training accuracy...
Final accuracy: 38.306450843811035


Tissue type:  Stomach


  return self.partial_fit(X, y)


Training Keras model...
Train on 234 samples, validate on 27 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Evaluating training accuracy...
Final accuracy: 35.63218414783478


Tissue type:  Testis


  return self.partial_fit(X, y)


Training Keras model...
Train on 233 samples, validate on 26 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Evaluating training accuracy...
Final accuracy: 34.74903404712677


Tissue type:  Thyroid


  return self.partial_fit(X, y)


Training Keras model...
Train on 401 samples, validate on 45 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Evaluating training accuracy...
Final accuracy: 33.856502175331116




#### Persist the results of the model in a TSV file for evaluation later

In [6]:
# Assemble the per-tissue results (the three lists are filled in lockstep by
# the training loop) into one table, ordered alphabetically by tissue type.
keras_model_pd = pd.DataFrame({
    'tissue_type': tissue_type,
    'model_file': tissue_model_persist,
    'model_accuracy': tissue_model_accuracy,
}).sort_values(by='tissue_type')

# Tab-separated copy for later evaluation; the row index is not written.
keras_model_pd.to_csv("keras_model_results.tsv", sep="\t", index=False)

# Last expression of the cell: render the table as the cell output.
keras_model_pd

Unnamed: 0,tissue_type,model_file,model_accuracy
0,Adipose Tissue,Adipose Tissue_keras_model.h5,30.740276
1,Blood Vessel,Blood Vessel_keras_model.h5,29.901424
2,Brain,Brain_keras_model.h5,45.921451
3,Breast,Breast_keras_model.h5,30.344829
5,Colon,Colon_keras_model.h5,27.613413
6,Esophagus,Esophagus_keras_model.h5,34.867775
7,Heart,Heart_keras_model.h5,36.666667
8,Lung,Lung_keras_model.h5,32.552692
9,Muscle,Muscle_keras_model.h5,33.333334
10,Nerve,Nerve_keras_model.h5,32.125604
