# Process Crow et al. data
This notebook does the following:

1. Select template experiment. This template experiment will be used in the next [notebook](2_identify_generic_genes_pathways.ipynb) to simulate experiments with the same experimental design but testing a different biological process.

2. Load and process the Crow et al. data. The data was downloaded with the `download_Crow_data.R` script, which retrieves expression data using https://github.com/PavlidisLab/gemmaAPI.R

3. Train VAE on processed data.

In [1]:
%load_ext autoreload
%load_ext rpy2.ipython
%autoreload 2
import os
import pandas as pd
import pickle
from ponyo import utils, train_vae_modules
from generic_expression_patterns_modules import process

Using TensorFlow backend.







examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [2]:
# Set seeds to get reproducible VAE trained models
# Kept as the first executed statement after the imports so that it runs
# before the normalization and training cells further down.
# NOTE(review): which random generators get seeded is decided inside
# `process.set_all_seeds` — confirm there if results are not reproducible.
process.set_all_seeds()




### Set parameters for data processing

Most parameters are read from `config_filename`. We manually selected bioproject [GSE11352](https://www.ncbi.nlm.nih.gov/gds/?term=GSE11352[Accession]) as the template experiment, which contains breast cell lines treated with estradiol at 12H, 24H and 48H.

In [3]:
# Parent of the current working directory; the config file, metadata and
# model/log directories below are all resolved relative to it.
# This relies on the kernel being started in this notebook's own directory.
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))

# Read in config variables
config_filename = os.path.abspath(
    os.path.join(base_dir, "configs", "config_human_Crow.tsv")
)

params = utils.read_config(config_filename)

local_dir = params["local_dir"]
dataset_name = params["dataset_name"]

# File that contains gene ranks identified by Crow et al.
DE_prior_filename = params["reference_gene_filename"]

# Template experiment ID
project_id = params["project_id"]

# Column name containing sample ids
# (read as a module-level global inside `get_sample_ids` further down)
metadata_colname = params["metadata_colname"]

# Output file: pickled list of shared genes (generated during gene ID mapping)
shared_genes_filename = params["shared_genes_filename"]

# Output files of template experiment data
# NOTE(review): `processed_template_filename` is not referenced by any other
# cell visible in this notebook — presumably it is consumed elsewhere; confirm.
mapped_template_filename = params["mapped_template_filename"]
processed_template_filename = params["processed_template_filename"]

# Compendium data files: the raw file is an input (read when the compendium is
# loaded); the mapped and normalized files are written by the cells below
raw_compendium_filename = params["raw_compendium_filename"]
mapped_compendium_filename = params["mapped_compendium_filename"]
normalized_compendium_filename = params["normalized_compendium_filename"]

# Output file: pickled scaler (generated during compendium normalization)
scaler_filename = params["scaler_filename"]

### Load compendium data

In [4]:
# Load the raw compendium from a tab-separated file.
# The first row supplies the column labels and the first column becomes the row index.
raw_compendium = pd.read_csv(raw_compendium_filename, sep="\t", header=0, index_col=0)
# Report dimensions, then display the first rows as the cell output
print(raw_compendium.shape)
raw_compendium.head()

(32476, 14487)


Unnamed: 0,HSPA6|HSPA7,UBA7,ESRRA,CYP2A7|CYP2A6,ADAM32,CORO6,CCDC65,LEAP2,CTCFL,C4orf33,...,TRIM15,NBL1|MICOS10-NBL1,GRK2,ATN1,FBXO41,KCTD13,TFEB,DND1,CASKIN2,GIMAP1-GIMAP5|GIMAP5
L_IB38___BioAssayId=72907Name=050705MJA_U133_2.0_IB08.CEL,8.96564,83.32761,48.94783,6.05333,17.74886,177.3555,22.08059,24.29061,,57.44873,...,,,,,,,,,,
L_IB40___BioAssayId=72909Name=050705MJA_U133_2.0_IB10.CEL,769.3045,61.90878,119.33116,17.42643,11.58205,117.40292,46.72446,50.81827,4.21085,29.51632,...,,,,,,,,,,
L_IB41___BioAssayId=72910Name=050705MJA_U133_2.0_IB11.CEL,23.69639,93.37774,80.67838,27.73698,18.56551,175.07208,21.01308,56.90577,4.5237,70.19458,...,,,,,,,,,,
L_IB33___BioAssayId=72902Name=050705MJA_U133_2.0_IB03.CEL,10.37628,56.58948,64.8345,14.86978,8.71849,41.07041,24.34623,62.00211,12.89886,38.07436,...,,,,,,,,,,
L_IB36___BioAssayId=72905Name=050705MJA_U133_2.0_IB06.CEL,48.51214,41.29277,76.53021,15.66673,5.35315,27.70762,4.19627,13.335,0.54027,14.19567,...,,,,,,,,,,


### Process compendium data

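1. Drop probe column (not performed below: no column is dropped from the loaded compendium)
2. Transpose (not performed below: the loaded compendium is used as is, with samples as rows and genes as columns)
3. Get only shared genes from Crow et al.
4. Normalize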

In [5]:
# The compendium is used exactly as loaded: no transpose and no column
# dropping is applied in this cell. The column selection below assumes the
# frame is already sample x gene, with gene symbols as column labels.
processed_compendium = raw_compendium

# TO DO:
# Remove NaN columns?
# NOTE(review): some gene columns appear to contain missing values. Confirm
# that `process.normalize_compendium` and the VAE training tolerate NaNs
# before relying on the outputs.

# Get only gene expression data for genes in Crow et. al.
our_gene_ids_hgnc = list(processed_compendium.columns)

published_generic_genes = process.get_published_generic_genes(DE_prior_filename)

# Iteration order of a set is arbitrary, so the intersection is sorted to
# give the same gene order (and therefore the same column order) on every run.
shared_genes_hgnc = sorted(
    set(our_gene_ids_hgnc).intersection(published_generic_genes)
)

# Pickle `shared_genes_hgnc` and save as `shared_genes_filename`.
# The file is rewritten on every run. The mapped compendium saved further
# down is also rewritten on every run, so keeping an old pickle would let the
# two files disagree whenever the compendium or the reference gene list changes.
with open(shared_genes_filename, "wb") as pkl_fh:
    pickle.dump(shared_genes_hgnc, pkl_fh)

# Keep only the shared genes, in sorted order
mapped_compendium = processed_compendium[shared_genes_hgnc]
print(mapped_compendium.shape)
mapped_compendium.head()

(32476, 12228)


Unnamed: 0,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A4GALT,A4GNT,AA06,AAAS,...,ZSWIM6,ZSWIM7,ZSWIM8,ZSWIM8-AS1,ZW10,ZWINT,ZXDA,ZXDB,ZYG11A,ZZEF1
L_IB38___BioAssayId=72907Name=050705MJA_U133_2.0_IB08.CEL,11.63699,,,676.43811,18.94505,8.59358,,19.44703,24.31123,50.73169,...,470.20654,183.29526,248.36073,,71.50054,59.74241,37.28952,37.95333,,135.73766
L_IB40___BioAssayId=72909Name=050705MJA_U133_2.0_IB10.CEL,20.67487,,,591.69562,23.72454,9.03128,,47.6812,14.11284,49.37021,...,325.37585,105.83591,188.41771,,68.06551,27.87321,32.73832,38.87195,,96.63411
L_IB41___BioAssayId=72910Name=050705MJA_U133_2.0_IB11.CEL,40.67805,,,1069.66284,50.84652,8.31981,,12.64032,14.81776,109.17731,...,855.58441,353.11365,367.59689,,88.50397,60.85935,66.90208,84.84532,,156.67801
L_IB33___BioAssayId=72902Name=050705MJA_U133_2.0_IB03.CEL,13.7947,,,1358.1543,45.66821,8.50909,,17.39715,18.87131,40.65568,...,840.18787,366.77325,222.1256,,90.49266,51.56433,62.14825,80.57075,,102.28099
L_IB36___BioAssayId=72905Name=050705MJA_U133_2.0_IB06.CEL,13.70568,,,1152.48254,46.30679,9.29537,,22.76678,14.42896,0.36647,...,389.9469,74.76771,71.15443,,53.26638,13.28645,42.16255,59.41403,,51.57303


In [None]:
# Save the gene-filtered compendium as a tab-separated file.
# The same filename is passed to the normalization step in the next cell.
mapped_compendium.to_csv(mapped_compendium_filename, sep="\t")

In [None]:
# Normalize data
# Arguments, in order: mapped compendium file, normalized compendium file, scaler file.
# NOTE(review): presumably the first is read and the other two are written
# (the config cell labels the scaler as an output of normalization) — confirm
# against `process.normalize_compendium`.
process.normalize_compendium(
    mapped_compendium_filename, normalized_compendium_filename, scaler_filename
)

### Select and process template data

1. Get gene expression associated with `project_id`, which was manually selected by the user and specified in the config file.

Note: The template data is not normalized so that we can perform DE analysis in the next notebook.

In [None]:
# Note: This is the only notebook using this function, so for now it is included here
# Get sample ids associated with selected project id
def get_sample_ids(experiment_id, mapping_filename, sample_colname=None):
    """
    Return sample ids for a given experiment id

    Arguments
    ----------
    experiment_id: str
        Value to look up in the "Experiment id" column of the metadata file
    mapping_filename: str
        Path to a CSV file with a header row that contains an
        "Experiment id" column and the sample id column
    sample_colname: str, optional
        Name of the column holding the sample ids. When omitted, the
        notebook-level `metadata_colname` (set from the config file) is used,
        which matches the previous behavior.

    Returns
    --------
    List of sample ids, in file order, for the rows matching `experiment_id`.
    An experiment with a single sample yields a one-element list.

    Raises
    ------
    KeyError
        If no row of the metadata file matches `experiment_id`
    """
    if sample_colname is None:
        # Backward-compatible default: fall back to the notebook global
        sample_colname = metadata_colname

    # Read in metadata
    metadata = pd.read_csv(mapping_filename, header=0)

    # Select rows with a boolean mask rather than `.loc[experiment_id]` on the
    # index: a scalar label lookup that matches exactly one row returns a
    # Series, and the column lookup then returns a bare string that `list()`
    # would split into single characters.
    is_selected = metadata["Experiment id"] == experiment_id
    if not is_selected.any():
        # Same exception type a failed label lookup raises
        raise KeyError(experiment_id)

    return metadata.loc[is_selected, sample_colname].tolist()


# metadata file with mapping from experiment to sample
experiment_to_sample_metadata_filename = os.path.join(
    base_dir, dataset_name, "data", "metadata", "experiment_sample_annotations.csv"
)

# Sample ids that belong to the template experiment chosen in the config file
sample_ids = get_sample_ids(project_id, experiment_to_sample_metadata_filename)

# Get expression data
# Rows are selected by label from the in-memory gene-filtered compendium,
# which has not been normalized; the sample ids are expected to be row labels.
template_mapped = mapped_compendium.loc[sample_ids]
print(template_mapped.shape)

# Save the template experiment as a tab-separated file
template_mapped.to_csv(mapped_template_filename, sep="\t")

### Train VAE

In [None]:
# Create VAE directories if needed
output_dirs = [
    os.path.join(base_dir, dataset_name, "models"),
    os.path.join(base_dir, dataset_name, "logs"),
]

NN_architecture = params["NN_architecture"]

# Check if NN architecture directory exist otherwise create
for each_dir in output_dirs:
    sub_dir = os.path.join(each_dir, NN_architecture)
    os.makedirs(sub_dir, exist_ok=True)

In [None]:
# Train VAE on new compendium data
# Inputs: the config file path and the normalized compendium filename that
# was passed to the normalization step above.
train_vae_modules.train_vae(config_filename, normalized_compendium_filename)