### Goal 

Compute the cross-tissue matrix pair $(X, Y)$, where $x_{ijk}$ is the input and $y_{ijl}$ is the target for gene $i$, individual $j$, and tissues $k, l$ with $k \neq l$.

### Conclusions

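The outputs are stored under `/s/project/rep/processed/gtex/input_data/`. The current cells write `X_inputs_pc_onlyblood.h5` and `Y_targets_pc_onlyblood.h5` (`onlyBlood=True`) and `X_inputs_pc.h5` and `Y_targets_pc.h5` (`onlyBlood=False`). The older per-split workflow at the end of this notebook writes `X_train`, `Y_train`, `X_valid`, `Y_valid`, `X_test` and `Y_test`, each as `<name>_pc_bloodonly.h5`.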

In [2]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import h5py
import anndata

from rep import preprocessing_new as p
from rep.linear_regression import Linear_Regression
from rep.linear_regression import Transform
from rep.linear_regression import FeatureReduction
from rep import evaluate as e

import plotly
import plotly.plotly as py
from plotly.graph_objs import graph_objs

# Set Plotly credentials from the environment so that no secret is stored in
# the notebook. Export PLOTLY_USERNAME and PLOTLY_API_KEY before starting the
# kernel; if either one is missing, the credentials file is left untouched.
# NOTE(review): the API key that used to be hard-coded here was exposed in this
# notebook and should be revoked and regenerated.
_plotly_username = os.environ.get("PLOTLY_USERNAME")
_plotly_api_key = os.environ.get("PLOTLY_API_KEY")
if _plotly_username and _plotly_api_key:
    plotly.tools.set_credentials_file(username=_plotly_username, api_key=_plotly_api_key)

import warnings; warnings.simplefilter('ignore')

In [10]:
# Load `recount_gtex_logratios.h5ad` (located through the target of the
# `../../data` symlink), transpose it and wrap the result in a RepAnnData.
file = os.path.join(
    os.readlink(os.path.join("..", "..", "data")),
    "processed", "gtex", "recount", "recount_gtex_logratios.h5ad"
)
gtex = p.load(file).transpose()
gtex = p.RepAnnData(X=gtex.X, samples_obs=gtex.obs, genes_var=gtex.var)

# Load the individuals of every split: one ID per line in
# `<split>_individuals.txt`.
path = os.path.join("..", "..", "data", "processed", "gtex", "recount")

states = ['train', 'valid', 'test']
train, valid, test = [], [], []
dict_states_indiv = {'train': train, 'valid': valid, 'test': test}

for s in states:
    with open(os.path.join(path, s + "_individuals.txt"), 'r') as f:
        # Drop the newline of every line and append to the split's list.
        dict_states_indiv[s].extend(l.replace("\n", "") for l in f)

# Train with all features: every gene of the dataset is selected.
selected_genes = gtex.genes_names
print("Total Genes: ", len(selected_genes))


Total Genes:  19932


In [14]:
%time (X_inputs, Y_targets) =  p.rnaseq_train_valid_test(gtex,dict_states_indiv,selected_genes,onlyBlood=True)
# save inputs and targets
path = os.path.join("..","..","data","processed","gtex","input_data")
X_inputs.save(os.path.join(path,'X_inputs_pc_onlyblood.h5'))
Y_targets.save(os.path.join(path,'Y_targets_pc_onlyblood.h5'))

compute all arrangements
Total pairs: 4403
compute all arrangements
Total pairs: 1630
compute all arrangements
Total pairs: 1558


In [16]:
# Shape of the sample annotation rows whose 'Type' column equals 'train'.
train_rows = X_inputs.samples['Type'] == 'train'
X_inputs.samples[train_rows].shape

(4403, 11)

In [None]:
%time (X_inputs, Y_targets) =  p.rnaseq_train_valid_test(gtex,dict_states_indiv,selected_genes,onlyBlood=False)
# save inputs and targets
path = os.path.join("..","..","data","processed","gtex","input_data")
X_inputs.save(os.path.join(path,'X_inputs_pc.h5'))
Y_targets.save(os.path.join(path,'Y_targets_pc.h5'))

compute all arrangements
Total pairs: 105778


OLD - calling

In [2]:
def cross(norm_gtex, onlyBlood=True):
    """Build the cross-tissue (X, Y) matrices for the train, valid and test splits.

    The individual IDs of each split are read from
    ``../../data/processed/gtex/recount/<split>_individuals.txt`` (one ID per
    line; blank lines are ignored). ``p.rnaseq_cross_tissue`` is then called
    once per split with every entry of ``norm_gtex.obs_names`` as gene id.

    Parameters
    ----------
    norm_gtex
        Expression object handed to ``p.rnaseq_cross_tissue``. Its
        ``obs_names`` are used as the gene ids (presumably an AnnData with
        genes on the obs axis -- TODO confirm against the caller).
    onlyBlood : bool, optional
        Forwarded unchanged to ``p.rnaseq_cross_tissue``. Defaults to ``True``,
        the value that used to be hard-coded, so existing calls behave as
        before.

    Returns
    -------
    tuple
        ``(train, valid, test)``; each element is the 5-tuple
        ``(X, Y, samples_description, gene_id, metadata)`` produced by
        ``p.rnaseq_cross_tissue`` for that split.
    """
    # Load the individuals of every split.
    path = os.path.join("..", "..", "data", "processed", "gtex", "recount")
    states = ['train', 'valid', 'test']
    dict_states_indiv = {}

    for s in states:
        with open(os.path.join(path, s + "_individuals.txt"), 'r') as f:
            individuals = [line.replace("\n", "") for line in f]
        # A trailing or stray blank line must not become an empty individual ID.
        dict_states_indiv[s] = [indiv for indiv in individuals if indiv]

    selected_genes = norm_gtex.obs_names  # training with all features
    print("Total Genes: ", len(selected_genes))

    # Compute the cross-tissue matrices split by split, in train/valid/test order.
    results = []
    for s in states:
        (X_norm, Y_norm, samples_description, gene_id, metadata) = p.rnaseq_cross_tissue(
            norm_gtex,
            individuals=dict_states_indiv[s],
            gene_ids=selected_genes,
            onlyBlood=onlyBlood,
        )
        results.append((X_norm, Y_norm, samples_description, gene_id, metadata))

    return tuple(results)

In [4]:
# Load `recount_gtex_logratios.h5ad` through the target of the `../../data`
# symlink; unlike the newer workflow, the object is not transposed here.
file = os.path.join(
    os.readlink(os.path.join("..", "..", "data")),
    "processed", "gtex", "recount", "recount_gtex_logratios.h5ad"
)
gtex = p.load(file)

# cross() returns one 5-tuple per split, in train / valid / test order.
train_result, valid_result, test_result = cross(gtex)
(X_train, Y_train, samples_description_train, gene_id_train, metadata_train) = train_result
(X_valid, Y_valid, samples_description_valid, gene_id_valid, metadata_valid) = valid_result
(X_test, Y_test, samples_description_test, gene_id_test, metadata_test) = test_result

Total Genes:  19932
compute all arrangements
Total pairs: 4403
compute all arrangements
Total pairs: 1630
compute all arrangements
Total pairs: 1558


In [5]:
# Save the matrices and metadata of every split to the input_data directory,
# which is located through the target of the `../../data` symlink.
path = os.path.join(os.readlink(os.path.join("..", "..", "data")), "processed", "gtex", "input_data")

# One HDF5 file per matrix: the dataset name doubles as the file name prefix.
named_matrices = [
    ("X_train", X_train),
    ("Y_train", Y_train),
    ("X_valid", X_valid),
    ("Y_valid", Y_valid),
    ("X_test", X_test),
    ("Y_test", Y_test),
]
for matrix_name, matrix in named_matrices:
    p.writeh5(matrix, matrix_name, os.path.join(path, matrix_name + "_pc_bloodonly.h5"))

# One JSON metadata file per split.
named_metadata = [
    ("train", metadata_train),
    ("valid", metadata_valid),
    ("test", metadata_test),
]
for split_name, split_meta in named_metadata:
    p.writeJSON(split_meta, os.path.join(path, "XY_metadata_" + split_name + "_pc_bloodonly.json"))

In [6]:
# Collect the sample descriptions and gene ids of every split and write them to
# a single JSON file in `path`. The mapping has its own name instead of `dict`,
# so the built-in `dict` type is not rebound for the rest of the kernel session.
split_metadata = {
    'train': {'samples': samples_description_train, 'genes': gene_id_train.tolist()},
    'valid': {'samples': samples_description_valid, 'genes': gene_id_valid.tolist()},
    'test': {'samples': samples_description_test, 'genes': gene_id_test.tolist()},
}
p.writeJSON(split_metadata, os.path.join(path, 'metadata.json'))