# CCLE Simple autoencoder 

Build the autoencoder model that will be used later in training:

In [51]:
from time import time
import re, sys, os
from keras.layers import Input, Dense
from keras.models import Model
from keras.callbacks import CSVLogger, History
from sklearn.preprocessing import LabelEncoder, normalize
import numpy as np
import pandas as pd
# Root directory that the data and results paths in this notebook are built from
# (read_data appends 'Datasets/CCLE/', simple_autoencoder appends 'Project/GI_predictor/Results').
# NOTE(review): hard-coded absolute path; adjust for the machine the notebook runs on.
user = '/rumi/shams/abe/'

In [49]:
def read_data(user=user, dtype = 'counts'):
    """Load a CCLE RNA-seq expression table and max-normalize it.

    Parameters
    ----------
    user : str
        Root directory; the file is read from ``user + 'Datasets/CCLE/'``.
    dtype : {'counts', 'rpkm'}
        Which expression file to load.

    Returns
    -------
    dict
        ``'df'``: numpy array of the expression values (all columns except
        ``Name`` and ``Description``) after max-normalization along axis 0.
        ``'meta'``: dict with ``'m_rna'`` (the ``Name``/``Description``
        columns) and ``'cell_lines'`` (the remaining column names, as a list).

    Raises
    ------
    ValueError
        If ``dtype`` is not one of the supported formats.
    """
    data_folder = user + 'Datasets/CCLE/'
    expression_files = {
        'counts': data_folder + 'CCLE_RNAseq_genes_counts_20180929.gct.gz',
        'rpkm': data_folder + 'CCLE_RNAseq_genes_rpkm_20180929.gct.gz',
    }

    # select input format; fail early with a clear message instead of hitting
    # an unbound local variable further down
    if dtype not in expression_files:
        raise ValueError(
            f"unknown dtype {dtype!r}; expected one of {sorted(expression_files)}"
        )
    m_rna = expression_files[dtype]

    # read raw data; the first two lines of the file are skipped
    # (presumably the GCT version and dimension header lines - TODO confirm)
    raw_m_rna = pd.read_csv(m_rna, skiprows=2, sep='\t')

    # make meta data dictionary
    meta = {'m_rna': raw_m_rna[['Name','Description']],
            'cell_lines': raw_m_rna.columns.values.tolist()[2:]}

    # normalize: axis=0 scales every column of the matrix by its own max norm
    df_m_rna = raw_m_rna.drop(columns=['Name','Description']).to_numpy()
    df_m_rna = normalize(X=df_m_rna, axis=0, norm="max")
    return {'df': df_m_rna, 'meta': meta}

def simple_autoencoder(df, encoding_dim = 32, batch_size = 256, epochs = 30 , model_id='', user=user,
                       test_fraction=0.2, seed=0):
    """Train a dense autoencoder on the transposed input matrix and save its outputs.

    The rows of ``df.T`` are the training samples. They are split once, at
    random, into a training part and a held-out part used for validation and
    for the saved reconstructions.

    Parameters
    ----------
    df : array-like, 2-D
        Input matrix; it is transposed before training.
    encoding_dim : int
        Width of the bottleneck layer. The surrounding hidden layers have
        ``2 * encoding_dim`` and ``4 * encoding_dim`` units.
    batch_size, epochs : int
        Passed to ``fit``; also embedded in the output file names.
    model_id : str
        Optional suffix appended to the output file name.
    user : str
        Root directory; results go to ``<user>/Project/GI_predictor/Results``.
    test_fraction : float
        Fraction of samples held out for validation (strictly between 0 and 1).
    seed : int
        Seed for the random train/validation split.

    Returns
    -------
    tuple
        ``(autoencoder, encoder, decoder)`` Keras models.

    Raises
    ------
    ValueError
        If there are fewer than two samples or ``test_fraction`` is out of range.
    """
    def create_model(X):
        """Build the autoencoder plus stand-alone encoder and decoder models."""
        # this is our input placeholder
        myinput = Input(shape=(X.shape[1],))
        # "encoded" is the encoded representation of the input
        encoded = Dense(encoding_dim*2**2, activation='relu')(myinput)
        encoded = Dense(encoding_dim*2, activation='relu')(encoded)
        encoded = Dense(encoding_dim, activation='relu', name='encoded')(encoded)
        # "decoded" is the lossy reconstruction of the input; it must be built
        # through the full decoder stack, not directly from the bottleneck
        decoded = Dense(encoding_dim*2, activation='relu')(encoded)
        decoded = Dense(encoding_dim*2**2, activation='relu')(decoded)
        decoded = Dense(X.shape[1], activation='sigmoid')(decoded)
        # this model maps an input to its reconstruction
        autoencoder = Model(myinput, decoded)

        # separate encoder model: maps an input to its encoded representation
        encoder = Model(myinput, encoded)
        # separate decoder model: feed an encoded vector through the three
        # decoder layers, which are the last three layers of the autoencoder
        encoded_input = Input(shape=(encoding_dim,))
        decoder_output = encoded_input
        for layer in autoencoder.layers[-3:]:
            decoder_output = layer(decoder_output)
        decoder = Model(encoded_input, decoder_output)

        autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')
        return autoencoder, encoder, decoder

    results_folder = os.path.join(user, 'Project/GI_predictor/Results')
    os.makedirs(results_folder, exist_ok=True)
    file_name = f"CCLE_autoencoder_bs_{batch_size}_ep_{epochs}{model_id}"

    # make X from df: one row per training sample
    X = np.asarray(df).T
    n_samples = X.shape[0]
    if n_samples < 2:
        raise ValueError("need at least two samples to split into train and validation sets")
    if not 0 < test_fraction < 1:
        raise ValueError("test_fraction must be strictly between 0 and 1")

    # reproducible random split; both parts are guaranteed to be non-empty
    n_test = min(max(1, int(round(n_samples * test_fraction))), n_samples - 1)
    order = np.random.RandomState(seed).permutation(n_samples)
    X_test = X[order[:n_test]]
    X_train = X[order[n_test:]]

    # log files
    csv_logger = CSVLogger(os.path.join(results_folder, file_name + ".log"))
    history = History()

    # make model and fit it; the callbacks must be passed to fit, otherwise
    # neither the CSV log nor history.history is populated
    autoencoder, encoder, decoder = create_model(X)
    autoencoder.fit(X_train, X_train,
            epochs=epochs,
            batch_size=batch_size,
            shuffle=True,
            validation_data=(X_test, X_test),
            callbacks=[csv_logger, history])

    print(history.history.keys())
    print("fitting has just been finished")

    # save the model and encoded-layer output
    autoencoder.save(filepath=os.path.join(results_folder, file_name + ".h5"))
    encoded_output = encoder.predict(X)
    np.savetxt(X=encoded_output, fname=os.path.join(results_folder, "ccle_encoded.csv"), delimiter=",")

    # save the held-out inputs and their reconstructions (all rows)
    data_pred = autoencoder.predict(X_test, batch_size=batch_size, verbose=2)
    np.savetxt(X=X_test, fname=os.path.join(results_folder, "ccle_genes.csv"), delimiter=",", fmt='%1.3f')
    np.savetxt(X=data_pred, fname=os.path.join(results_folder, "ccle_genes_pred.csv"), delimiter=",", fmt='%1.3f')
    print("prediction process has just been finished")

    print("run has just been finished")
    return autoencoder, encoder, decoder

In [50]:
# Load the CCLE expression matrix with read_data's defaults (dtype='counts').
data = read_data()
# Column names of the raw table after 'Name' and 'Description', as collected by read_data.
cell_lines = data['meta']['cell_lines']

In [38]:
# [position in cell_lines, full name] for every entry whose name contains 'K562'
[[idx, name] for idx, name in enumerate(cell_lines) if 'K562' in name]

[[385, 'K562_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE']]

In [43]:
# [position in cell_lines, full name] for every entry whose name contains 'JURKAT'
[[idx, name] for idx, name in enumerate(cell_lines) if 'JURKAT' in name]

[[380, 'JURKAT_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE']]

In [None]:
# Train on the normalized matrix returned by read_data; `df` is a required argument.
simple_autoencoder(data['df'])

In [3]:
import matplotlib.pyplot as plt
from scipy.stats import fisher_exact, spearmanr
import plotly as py
import plotly.tools as tls
from screen import get_SLdataset, get_gene_pairs

## GI Scores
Load data from the paper [Mapping the Genetic Landscape of Human Cells](https://www.sciencedirect.com/science/article/pii/S0092867418307359?via%3Dihub#mmc2).
> [**Table S5.**](https://www.sciencedirect.com/science/article/pii/S0092867418307359?via%3Dihub#mmc5)

In [33]:
# Load the GI/SL gene-pair data with the helper from the local `screen` module.
# The result is indexed by cell-line name further down (e.g. SL_dataset['K562']).
SL_dataset = get_SLdataset()

K562: 99.55%	100128 SLs from 100576 unique gene pairs
Jurkat: 99.49%	75078 SLs from 75466 unique gene pairs 
done in 54.022702s


## Input data: cBioPortal cohorts

## 1st
- ### [Acute Myeloid Leukemia (OHSU, Nature 2018)](https://www.cbioportal.org/study?id=aml_ohsu_2018)

In [None]:
# Path (relative to the notebook's working directory) of the AML OHSU 2018 expression z-score table.
f_path = 'cBioPortal/aml_ohsu_2018/' + 'data_RNA_Seq_expression_cpm_Zscores.txt'
# ls cBioPortal/aml_ohsu_2018

In [16]:
# Read the tab-separated table indexed by gene symbol, drop the Entrez id
# column and cast the remaining columns to float ('NA' is parsed as missing).
data = (
    pd.read_csv(f_path, sep='\t', index_col='Hugo_Symbol', na_values='NA')
    .drop(columns='Entrez_Gene_Id')
    .astype(float)
)

gene pairs for **K562** GI pairs:

In [31]:
cell_line = 'K562'
data_G1, data_G2 = get_gene_pairs(SL_dataset[cell_line], data)
# Stack the transposed first-gene and second-gene tables on top of each other
# (row-wise), so the columns line up between the two halves.
X = np.concatenate(
    [np.array(data_G1.T), np.array(data_G2.T)],
    axis=0,
)
# this is our input:
print (f'num of patients X 2 = {X.shape[0]} \nnum of gene pairs (Obs.) = {X.shape[1]}')

num of patients X 2 = 902 
num of gene pairs (Obs.) = 657


gene pairs for **Jurkat** GI pairs:

## 2nd
- ### [Pediatric Acute Lymphoid Leukemia - Phase II (TARGET, 2018)](https://www.cbioportal.org/study?id=all_phase2_target_2018_pub)

In [44]:
# ls cBioPortal/all_phase2_target_2018_pub

In [43]:
# meta data
filepath = 'cBioPortal/all_phase2_target_2018_pub/meta_study.txt'
with open(filepath) as fp:
    print( fp.read() )

type_of_cancer: bll
cancer_study_identifier: all_phase2_target_2018_pub
name: Pediatric Acute Lymphoid Leukemia - Phase II (TARGET, 2018)
short_name: ALL-Phase II (TARGET, 2018)
description: Comprehensive profiling of ALL Phase 2 samples. <p>TARGET data is intended exclusively for biomedical research using pediatric data (i.e., the research objectives cannot be accomplished using data from adults) that focus on the development of more effective treatments, diagnostic tests, or prognostic markers for childhood cancers. Moreover, TARGET data can be used for research relevant to the biology, causes, treatment and late complications of treatment of pediatric cancers, but is not intended for the sole purposes of methods and/or tool development (please see <a href="https://ocg.cancer.gov/programs/target/using-target-data">Using TARGET Data</a> section of the OCG website). If you are interested in using TARGET data for publication or other research purposes, you must follow the <a href="https

In [51]:
# Expression z-score table of the TARGET ALL phase II cohort
f_path = 'cBioPortal/all_phase2_target_2018_pub/' + 'data_RNA_Seq_mRNA_median_Zscores.txt'
# Read it indexed by gene symbol, parse 'NA' as missing, and cast every column to float.
data = (
    pd.read_csv(f_path, sep='\t', index_col='Hugo_Symbol', na_values='NA', na_filter=True)
    .astype(float)
)

In [None]:
# Count missing values per column; `data.isna` without the call only
# displays the bound method instead of inspecting the data.
data.isna().sum()

gene pairs for **K562** GI pairs:

gene pairs for **Jurkat** GI pairs: