### Goal
Build a multi linear regression model using the GTEx data. The model receives all combinations of pairs of tissues (X,Y) per invidividual. 

In [1]:
import os
import sys

import pandas as pd
import numpy as np

from rep import preprocessing as p

In [3]:
file = os.path.join(os.readlink(os.path.join("..","..","data")),"processed","gtex","recount","recount_gtex.h5ad")
gtex = p.load(file)

In [4]:
# get counts
gtex.X[:3,:3]

array([[3.396890e+05, 9.866900e+04, 5.469700e+04],
       [3.000000e+01, 7.640000e+02, 1.290000e+03],
       [2.175520e+05, 1.076085e+06, 1.685770e+05]], dtype=float32)

In [5]:
# get genes description
gtex.obs[:3]

Unnamed: 0_level_0,gene_id,bp_length,symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000000003.14,ENSG00000000003.14,4535,TSPAN6
ENSG00000000005.5,ENSG00000000005.5,1610,TNMD
ENSG00000000419.12,ENSG00000000419.12,1207,DPM1


In [6]:
# get samples description
gtex.var[:3]

Unnamed: 0_level_0,project,sample,experiment,mapped_read_count,avg_read_length,bigwig_file,sampid,smatsscr,Parent_Tissue,Tissue,Gender,Individual
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
SRR660824,SRP012682,SRS389722,SRX222703,170790002,152,SRR660824_SRS389722_SRX222703_male_lung.bw,GTEX-QMR6-1926-SM-32PL9,3.0,Lung,Lung,male,GTEX-QMR6
SRR2166176,SRP012682,SRS1036203,SRX1152700,191059974,500,SRR2166176_SRS1036203_SRX1152700_male_brain.ce...,GTEX-T5JC-0011-R11A-SM-5S2RX,,Brain,Brain - Cerebellar Hemisphere,male,GTEX-T5JC
SRR606939,SRP012682,SRS333474,SRX199032,159714774,136,SRR606939_SRS333474_SRX199032_female_heart.lef...,GTEX-POMQ-0326-SM-2I5FO,1.0,Heart,Heart - Left Ventricle,female,GTEX-POMQ


In [7]:
# compress row counts
# max values for int8, int16, int32, int64
max_values=[127,32767,2147483647,9223372036854775807]
for i in range(gtex.X.shape[1]):
    max_val = np.amax(gtex.X[:,i])
    if max_val < max_values[0]: gtex.X[:,i].astype(np.int8)
    elif max_val < max_values[1]: gtex.X[:,i].astype(np.int16)
    elif max_val < max_values[2]: gtex.X[:,i].astype(np.int32)
    else: gtex.X[:,i].astype(np.int64)

In [8]:
gtex.X[:,1].dtype

dtype('<f4')

In [9]:
# split individuals in 3/5, 1/5, 1/5 using Gender stratification
(train,valid,test) = p.split_by_individuals(gtex,groupby=['Gender'])
train[:3],valid[:3],test[:3],'Train set size: ' + str(len(train)),'Valid set size: ' + str(len(valid)),'Test set size: ' + str(len(test)), 'Total indiv: ' + str(pd.Series(gtex.var['Individual']).unique().shape)

(551, 2)
   Individual  Gender
0   GTEX-QMR6    male
1   GTEX-T5JC    male
2   GTEX-POMQ  female
3   GTEX-T6MN    male
5   GTEX-P4QR    male
6   GTEX-OHPN  female
7   GTEX-TMMY  female
8   GTEX-PVOW    male
9   GTEX-P4PP  female
10  GTEX-TSE9  female
<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x7fa6aea2e780>


(['GTEX-POMQ', 'GTEX-OHPN', 'GTEX-TMMY'],
 ['GTEX-13NZ8', 'GTEX-145ME', 'GTEX-PWN1'],
 ['GTEX-ZE7O', 'GTEX-ZV6S', 'GTEX-T6MO'],
 'Train set size: 331',
 'Valid set size: 111',
 'Test set size: 109',
 'Total indiv: (551,)')

In [None]:
# compute train, valid, test set
selected_genes = gtex.obs_names[:100]
%time (X_train, Y_train) = p.rnaseq_cross_tissue(gtex, individuals=train, gene_ids=selected_genes)
%time (X_valid, Y_valid) = p.rnaseq_cross_tissue(gtex, individuals=valid, gene_ids=selected_genes)
%time (X_test, Y_test) = p.rnaseq_cross_tissue(gtex, individuals=test, gene_ids=selected_genes)


Total pairs: 139898
