In [1]:
from time import time
import re, sys, os
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt 
from keras.models import load_model
from CCLE_AE import r_square

# Each pre-trained model lives in its own HDF5 file. Pointing both variables at
# the bare 'Models/' directory made the two load_model() calls read the same
# location, so they could never yield two different models. The file names
# match the ones run_model() uses when saving ("autoencoder.h5" / "encoder.h5").
# NOTE(review): confirm these are the actual file names on disk.
autoencoder_path = 'Models/autoencoder.h5'
encoder_path = 'Models/encoder.h5'

Using TensorFlow backend.


# Use pre-trained CCLE Simple autoencoder 

In [2]:
# r_square is handed to Keras through custom_objects so the custom metric can
# be resolved by name while the saved models are deserialized.
custom_objects = {'r_square': r_square}
autoencoder = load_model(autoencoder_path, custom_objects=custom_objects)
encoder = load_model(encoder_path, custom_objects=custom_objects)

In [67]:
# Print the layer table (layer names, output shapes, parameter counts) of the loaded autoencoder.
autoencoder.summary()

Model: "model_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_17 (InputLayer)        (None, 56202)             0         
_________________________________________________________________
dense_84 (Dense)             (None, 256)               14387968  
_________________________________________________________________
dense_85 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_86 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_87 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_88 (Dense)             (None, 64)                2112      
_________________________________________________________________
dense_89 (Dense)             (None, 128)               832

In [68]:
# Print the layer table (layer names, output shapes, parameter counts) of the loaded encoder.
encoder.summary()

Model: "model_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_17 (InputLayer)        (None, 56202)             0         
_________________________________________________________________
dense_84 (Dense)             (None, 256)               14387968  
_________________________________________________________________
dense_85 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_86 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_87 (Dense)             (None, 32)                2080      
Total params: 14,431,200
Trainable params: 14,431,200
Non-trainable params: 0
_________________________________________________________________


`model.layers[idx].output`

The expression above is a tensor object, so any operation that works on a tensor can be applied to it.

For example, to get its shape, call `model.layers[idx].output.get_shape()`.

`idx` is the index of the layer and you can find it from `model.summary()`

## GI Scores
Load data from the [Mapping the Genetic Landscape of Human Cells](https://www.sciencedirect.com/science/article/pii/S0092867418307359?via%3Dihub#mmc2) paper. 
> [**Table S5.**](https://www.sciencedirect.com/science/article/pii/S0092867418307359?via%3Dihub#mmc5)

In [1]:
import matplotlib.pyplot as plt
from scipy.stats import fisher_exact, spearmanr
import plotly as py
import pandas as pd 
import plotly.tools as tls
import main_functions

In [4]:
# Display the imported module to confirm which main_functions.py is in use.
# The previous `main_functions.get_` was an incomplete attribute access that
# does not match the recorded output (the module repr).
main_functions

<module 'main_functions' from '/rumi/shams/abe/Projects/GI_predictor/main_functions.py'>

## Input data: cBioportal cohorts 

## 1st
- ### [Acute Myeloid Leukemia (OHSU, Nature 2018)](https://www.cbioportal.org/study?id=aml_ohsu_2018)

In [82]:
# Z-scored RNA-seq expression (CPM) table of the AML OHSU 2018 cohort.
f_path = 'cBioPortal/aml_ohsu_2018/data_RNA_Seq_expression_cpm_Zscores.txt'
# ls cBioPortal/aml_ohsu_2018

In [83]:
# Read the tab-separated expression table indexed by Hugo symbol, discard the
# Entrez ID column, and cast every remaining column to float ('NA' -> NaN).
data = (
    pd.read_csv(f_path, sep='\t', index_col='Hugo_Symbol', na_values='NA')
    .drop(columns='Entrez_Gene_Id')
    .astype(float)
)

gene pairs for **K562** GI pairs:

In [87]:
cell_line = 'K562'
# Expression profiles of the first and second gene of every GI pair.
# NOTE(review): get_gene_pairs and SL_dataset are not defined in the visible
# cells -- presumably they come from main_functions; confirm.
data_G1, data_G2 = get_gene_pairs(SL_dataset[cell_line], data)
# Stack the two transposed profile blocks on top of each other, then transpose
# back so each row holds gene-1 values followed by gene-2 values.
X = np.vstack((np.array(data_G1.T), np.array(data_G2.T))).T
# this is our input:
print(X.shape)
print(f'rows: {X.shape[0]} (#gene pairs)\ncolumns: {X.shape[1]} (2*#patients)')

(98790, 902)
rows: 98790 (#gene pairs)
columns: 902 (2*#patients)


In [None]:
def r_square(y_true, y_pred):
    """Coefficient of determination (R^2) expressed with Keras backend ops.

    Computes ``1 - SS_res / (SS_tot + eps)`` where ``SS_res`` is the summed
    squared error between ``y_true`` and ``y_pred``, ``SS_tot`` is the summed
    squared deviation of ``y_true`` from its overall mean, and ``eps`` is
    ``K.epsilon()``, which guards against division by zero.

    Relies on the backend module being available under the name ``K``.
    """
    residual = y_true - y_pred
    ss_residual = K.sum(K.square(residual))
    deviation = y_true - K.mean(y_true)
    ss_total = K.sum(K.square(deviation))
    return 1 - ss_residual / (ss_total + K.epsilon())

def create_model(input_size, dim=64):
    """Build and compile a single-hidden-layer binary classifier.

    Architecture: ``Input(input_size) -> Dense(dim, relu) -> Dense(1, sigmoid)``,
    compiled with binary cross-entropy loss, the Adam optimizer, and the
    ``accuracy`` and ``r_square`` metrics.

    Parameters
    ----------
    input_size : int
        Number of input features (columns of the design matrix).
    dim : int, optional
        Width of the hidden layer. It now has a default because ``run_model``
        calls ``create_model(X.shape[1])`` with a single argument, which used
        to raise ``TypeError``. The value 64 is a placeholder -- tune as needed.

    Returns
    -------
    The compiled Keras ``Model``.

    Relies on ``Input``, ``Dense``, ``Model`` and ``r_square`` being in scope.
    """
    # this is our input placeholder
    myinput = Input(shape=(input_size,))
    # hidden layer
    deep = Dense(dim, activation='relu')(myinput)
    # TODO: add cell-line features from the encoder model here. This is not
    # implemented yet; the former placeholder was a bare `deep` expression
    # that had no effect.
    # output layer
    last = Dense(1, activation='sigmoid')(deep)
    model = Model(myinput, last)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', r_square])
    return model

In [88]:
def run_model(X, y, dim=64):
    """Train the classifier from ``create_model`` on a hold-out split and save the results.

    Steps: split ``X``/``y`` into train and test parts, fit the model with
    early stopping on the validation loss, predict on the test part, then
    write the trained model, the test inputs and the predictions under
    ``model_path``.

    Parameters
    ----------
    X : array-like
        Feature matrix; ``X.shape[1]`` is used as the model's input size.
    y : array-like
        Target values aligned with the rows of ``X``.
    dim : int, optional
        Hidden-layer width forwarded to ``create_model``. Previously
        ``create_model`` was called without it, which raised ``TypeError``.
        The default 64 is a placeholder.

    Notes
    -----
    Reads the notebook-level names ``test_size``, ``epochs``, ``batch_size``
    and ``model_path``, and calls ``train_test_split``, ``History``,
    ``EarlyStopping``, ``create_model`` and ``plot_by_epochs``. None of these
    are defined in the visible cells, so they must exist before this is called.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42)
    # callbacks
    history = History()
    early_stopping = EarlyStopping(monitor='val_loss', patience=20)
    # build and fit the model
    model = create_model(X.shape[1], dim)
    # summary() prints the table itself and returns None, so do not wrap it in print().
    model.summary()
    model.fit(X_train, y_train,
              epochs=epochs,
              batch_size=batch_size,
              shuffle=False, # changed to false to keep val cell equal for comparing models
              callbacks=[history, early_stopping],
              validation_data=(X_test, y_test)
             )
    print("fitting has just been finished")
    y_pred = model.predict(X_test, batch_size=batch_size, verbose=2)
    print("prediction process has just been finished")
    # Save the model trained here. The previous code re-saved the pre-loaded
    # autoencoder/encoder globals and never persisted the trained classifier.
    model.save(filepath=model_path + "model.h5")
    # Save the test inputs and the predictions for every test row. The former
    # `X_pred[0]` wrote only the first row's prediction.
    np.savetxt(X=X_test, fname=model_path + "X_test.csv", delimiter=",")
    np.savetxt(X=y_pred, fname=model_path + "X_pred.csv", delimiter=",")
    if early_stopping.stopped_epoch == 0:
        # stopped_epoch stays 0 when early stopping never triggered, i.e. all
        # epochs ran.
        # NOTE(review): plot_by_epochs is not visible here; it is now given the
        # trained model rather than the unrelated pre-loaded autoencoder -- confirm.
        plot_by_epochs(model, epochs)
    elif early_stopping.stopped_epoch > 0:
        print(f'model stopped training at epoch {early_stopping.stopped_epoch}')
        # plot_by_epochs(model, early_stopping.stopped_epoch)
    print("model objects and metrics plots saved")


(98790, 902)

gene pairs for **Jurkat** GI pairs:

## 2nd
- ### [Pediatric Acute Lymphoid Leukemia - Phase II (TARGET, 2018)](https://www.cbioportal.org/study?id=all_phase2_target_2018_pub)

In [44]:
# ls cBioPortal/all_phase2_target_2018_pub

In [43]:
# Print the study-level metadata file of the cohort.
filepath = 'cBioPortal/all_phase2_target_2018_pub/meta_study.txt'
with open(filepath) as fp:
    meta_text = fp.read()
print(meta_text)

type_of_cancer: bll
cancer_study_identifier: all_phase2_target_2018_pub
name: Pediatric Acute Lymphoid Leukemia - Phase II (TARGET, 2018)
short_name: ALL-Phase II (TARGET, 2018)
description: Comprehensive profiling of ALL Phase 2 samples. <p>TARGET data is intended exclusively for biomedical research using pediatric data (i.e., the research objectives cannot be accomplished using data from adults) that focus on the development of more effective treatments, diagnostic tests, or prognostic markers for childhood cancers. Moreover, TARGET data can be used for research relevant to the biology, causes, treatment and late complications of treatment of pediatric cancers, but is not intended for the sole purposes of methods and/or tool development (please see <a href="https://ocg.cancer.gov/programs/target/using-target-data">Using TARGET Data</a> section of the OCG website). If you are interested in using TARGET data for publication or other research purposes, you must follow the <a href="https

In [51]:
f_path = 'cBioPortal/all_phase2_target_2018_pub/' + 'data_RNA_Seq_mRNA_median_Zscores.txt'
# Load the same way as the AML cohort: index by Hugo symbol and drop the Entrez
# ID column so it is not cast to float and mistaken for a sample.
# errors='ignore' keeps the cell working if this file has no such column.
# na_filter=True was dropped because it is the read_csv default.
data = (
    pd.read_csv(f_path, sep='\t', index_col='Hugo_Symbol', na_values='NA')
    .drop(columns='Entrez_Gene_Id', errors='ignore')
    .astype(float)
)

In [None]:
# Total number of missing values in the expression table. The original
# `data.isna` only referenced the method without calling it.
data.isna().sum().sum()

gene pairs for **K562** GI pairs:

gene pairs for **Jurkat** GI pairs: