# Per-Tissue Analysis and Models
*Ben Imlay*

## Database Setup

In [31]:
import pandas as pd
from pathlib import Path
from os import listdir
import sklearn.model_selection
import sklearn.feature_selection
# Tissue analysed in this notebook; used to select the expression files and to filter the metadata.
TISSUE = "Liver"

In [32]:
# Input locations, relative to the notebook's working directory.
data_dir = Path("data")
tissue_dir = Path("tissue-specific")

# File names of the inputs found under data_dir, keyed by role.
manifest = dict(
    data="All_Tissue_Site_Details.combined.reads.gct",
    sample_meta="GTEx_v7_Annotations_SampleAttributesDS.txt",
    subject_meta="GTEx_v7_Annotations_SubjectPhenotypesDS.txt",
    merged_meta="merged_meta.tsv",
)

# Load the merged metadata table, reading the coded columns as objects
# instead of letting pandas infer a numeric dtype for them.
meta = pd.read_csv(
    data_dir / manifest["merged_meta"],
    sep="\t",
    dtype={"SMUBRID": object, "SEX": object, "DTHHRDY": object},
)

# Keep only the samples that have an age recorded.
meta = meta[meta["AGE"].notna()]
meta.head(3)

Unnamed: 0,SAMPID,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,SMUBRID,SMTSISCH,SMTSPAX,...,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-1117F-0226-SM-5GZZ7,0.0,B1,"2 pieces, ~15% vessel stroma, rep delineated",6.8,Adipose Tissue,Adipose - Subcutaneous,2190,1214.0,1125.0,...,50.094357,0.003102,0.992826,,0.0,50.12628,GTEX-1117F,2.0,60-69,4.0
1,GTEX-111CU-1826-SM-5GZYN,0.0,B1,"2 pieces, small portion of nerve (<10% of one ...",7.5,Adipose Tissue,Adipose - Subcutaneous,2190,138.0,571.0,...,50.08925,0.002126,0.993923,,0.0,50.10142,GTEX-111CU,1.0,50-59,0.0
2,GTEX-111FC-0226-SM-5N9B8,2.0,B1,"2 pieces, larger piece is 30% fibrovascular ti...",7.3,Adipose Tissue,Adipose - Subcutaneous,2190,1040.0,869.0,...,50.113018,0.003756,0.992825,,0.0,50.06186,GTEX-111FC,1.0,60-69,1.0


## Available Tissues

In [33]:
# To do python plots of counts
# List every file in the tissue-specific directory and keep the ones whose
# name contains the TISSUE string.
infiles = listdir(data_dir / tissue_dir)
TISSUE_files = list(filter(lambda fname: TISSUE in fname, infiles))
TISSUE_files

['Liver_lcpm.tsv', 'Liver_cpm.tsv']

## Reading CPM and LCPM files.

In [37]:
# Select each table by its filename suffix instead of by position in TISSUE_files:
# os.listdir returns entries in arbitrary order, so TISSUE_files[0]/[1] is not
# guaranteed to be the LCPM/CPM file on every machine or filesystem.
# ("..._lcpm.tsv" does not end with "_cpm.tsv", so the two suffixes cannot collide.)
cpm_matches = [f for f in TISSUE_files if f.endswith("_cpm.tsv")]
lcpm_matches = [f for f in TISSUE_files if f.endswith("_lcpm.tsv")]
if len(cpm_matches) != 1 or len(lcpm_matches) != 1:
    # Fail loudly rather than silently loading the wrong table.
    raise ValueError(
        "Expected exactly one '*_cpm.tsv' and one '*_lcpm.tsv' file for {!r}, found: {}".format(
            TISSUE, TISSUE_files
        )
    )

# Both tables are tab-separated; the first column becomes the index.
cpm = pd.read_csv(data_dir / tissue_dir / cpm_matches[0], sep="\t", index_col=0)
lcpm = pd.read_csv(data_dir / tissue_dir / lcpm_matches[0], sep="\t", index_col=0)

In [36]:
# Preview the first 5 rows and first 20 columns of the CPM table.
cpm.iloc[:5, :20]

Unnamed: 0,ENSG00000223972,ENSG00000227232,ENSG00000243485,ENSG00000237613,ENSG00000268020,ENSG00000240361,ENSG00000186092,ENSG00000238009,ENSG00000233750,ENSG00000237683,ENSG00000268903,ENSG00000239906,ENSG00000241860,ENSG00000222623,ENSG00000241599,ENSG00000228463,ENSG00000237094,ENSG00000250575,ENSG00000233653,ENSG00000224813
GTEX-11DXY-0526-SM-5EGGQ,0.142941,9.691384,0.0,0.085764,0.0,0.114353,0.0,0.171529,0.714704,8.2048,0.0,0.0,0.857645,0.0,0.028588,2.1727,0.343058,0.0,0.200117,0
GTEX-11DXZ-0126-SM-5EGGY,0.175082,7.63358,0.140066,0.0,0.070033,0.140066,0.070033,0.49023,0.665312,1.785838,0.0,0.0,0.420197,0.0,0.035016,0.770361,0.420197,0.035016,0.385181,0
GTEX-11EQ9-0526-SM-5A5JZ,0.0,3.867529,0.021606,0.064819,0.021606,0.0,0.021606,0.043213,0.237669,3.370584,0.043213,0.021606,0.453732,0.0,0.0,0.345701,0.216063,0.021606,0.064819,0
GTEX-11GSP-0626-SM-5986T,0.13369,14.50537,0.033423,0.100268,0.0,0.13369,0.100268,0.13369,0.40107,5.046799,0.033423,0.167113,0.66845,0.0,0.100268,1.102943,0.969253,0.033423,0.26738,0
GTEX-11NUK-1226-SM-5P9GM,0.174075,9.008397,0.152316,0.021759,0.043519,0.043519,0.108797,0.130556,0.652782,4.373642,0.021759,0.065278,0.652782,0.0,0.021759,1.784272,0.674542,0.043519,0.36991,0


In [35]:
# Preview the first 5 rows and first 20 columns of the LCPM table.
lcpm.iloc[:5, :20]

Unnamed: 0,ENSG00000223972,ENSG00000227232,ENSG00000243485,ENSG00000237613,ENSG00000268020,ENSG00000240361,ENSG00000186092,ENSG00000238009,ENSG00000233750,ENSG00000237683,ENSG00000268903,ENSG00000239906,ENSG00000241860,ENSG00000222623,ENSG00000241599,ENSG00000228463,ENSG00000237094,ENSG00000250575,ENSG00000233653,ENSG00000224813
GTEX-11DXY-0526-SM-5EGGQ,-2.513117,3.281493,-4.955184,-3.083136,-4.955184,-2.770155,-4.955184,-2.295023,-0.420936,3.042125,-4.955184,-4.955184,-0.168316,-4.955184,-4.039212,1.140736,-1.413908,-4.955184,-2.10561,-4.955184
GTEX-11DXZ-0126-SM-5EGGY,-2.270082,2.938439,-2.536991,-4.955184,-3.289562,-2.536991,-3.289562,-0.936591,-0.519636,0.86241,-4.955184,-4.955184,-1.144224,-4.955184,-3.89427,-0.317252,-1.144224,-3.89427,-1.26044,-4.955184
GTEX-11EQ9-0526-SM-5A5JZ,-4.955184,1.963387,-4.215116,-3.365055,-4.215116,-4.955184,-4.215116,-3.728362,-1.889474,1.766731,-3.728362,-4.215116,-1.041065,-4.955184,-4.955184,-1.403783,-2.009849,-4.215116,-3.365055,-4.955184
GTEX-11GSP-0626-SM-5986T,-2.591388,3.861718,-3.928874,-2.915897,-4.955184,-2.591388,-2.915897,-2.591388,-1.206541,2.344554,-3.928874,-2.326635,-0.51316,-4.955184,-2.915897,0.18292,0.002146,-3.928874,-1.738813,-4.955184
GTEX-11NUK-1226-SM-5P9GM,-2.277105,3.176424,-2.437902,-4.211019,-3.722519,-3.722519,-2.825895,-2.618894,-0.545786,2.139429,-4.211019,-3.358243,-0.545786,-4.955184,-4.211019,0.861167,-0.500672,-3.722519,-1.314209,-4.955184


### Merging metadata

In [28]:
# Keep only the metadata rows whose SMTSD column equals TISSUE, then preview them.
tissue_meta = meta.loc[meta["SMTSD"].eq(TISSUE)]
tissue_meta.head()

Unnamed: 0,SAMPID,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,SMUBRID,SMTSISCH,SMTSPAX,...,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS,SUBJID,SEX,AGE,DTHHRDY
6536,GTEX-11DXY-0526-SM-5EGGQ,1.0,B1,"2 pieces, congestion",6.8,Liver,Liver,1114,891.0,1149.0,...,50.825165,0.015545,0.993035,,0.0,49.517372,GTEX-11DXY,1.0,60-69,2.0
6537,GTEX-11DXZ-0126-SM-5EGGY,1.0,B1,"2 pieces, foci of hepatocyte necrosis and ball...",7.9,Liver,Liver,1114,250.0,951.0,...,50.576138,0.005714,0.993494,,0.0,49.746162,GTEX-11DXZ,1.0,50-59,0.0
6538,GTEX-11EQ9-0526-SM-5A5JZ,1.0,B1,"2 pieces, diffuse macro and microvesucular ste...",8.1,Liver,Liver,1114,82.0,617.0,...,50.2212,0.007967,0.995574,,0.0,49.962532,GTEX-11EQ9,1.0,30-39,2.0
6539,GTEX-11GSP-0626-SM-5986T,1.0,B1,2 pieces; central vascular congestion,6.2,Liver,Liver,1114,771.0,565.0,...,50.910202,0.025063,0.99258,,0.0,49.414272,GTEX-11GSP,2.0,60-69,2.0
6540,GTEX-11NUK-1226-SM-5P9GM,1.0,B1,"2 pieces, includes capsule (target is 1 cm bel...",6.1,Liver,Liver,1114,956.0,829.0,...,49.88246,0.015518,0.989904,,0.0,50.5536,GTEX-11NUK,1.0,50-59,2.0


## Test and Train Set Creation

In [48]:
# Look the AGE labels up by sample ID so every expression row is paired with its
# own label. cpm is indexed by sample ID while tissue_meta keeps an integer index,
# so passing tissue_meta['AGE'] directly would pair rows purely by position and
# silently mislabel samples if the two tables were ever ordered differently.
# Selecting with .loc on cpm.index fails (KeyError in current pandas) when a
# sample in cpm has no metadata row.
# NOTE: y_train / y_test are consequently indexed by sample ID, matching cpm_train / cpm_test.
age_by_sample = tissue_meta.set_index('SAMPID').loc[cpm.index, 'AGE']
assert len(age_by_sample) == len(cpm), "tissue_meta contains duplicate SAMPID rows"

# random_state guarantees that the same split is made for a given tissue.
cpm_train, cpm_test, y_train, y_test = sklearn.model_selection.train_test_split(
    cpm, age_by_sample, test_size=.3, random_state=1234
)
print(cpm_train.shape)
print(cpm_test.shape)
print(y_train.shape)
print(y_test.shape)
sum(cpm_train.iloc[:,0]) # Confirms that the split is the same each time

(122, 56202)
(53, 56202)
(122,)
(53,)


16.175560121040135

## Filter by Expression
Adapted from the edgeR package function [filterByExpr](https://rdrr.io/bioc/edgeR/src/R/filterByExpr.R).

In [40]:
def filter_by_expr(min):
    """Placeholder for an expression filter; not implemented yet.

    The function currently performs no work and returns ``None``.

    Parameters
    ----------
    min
        Unused for now. Presumably a minimum-expression threshold -- TODO confirm
        once the filter is written. Note that this name shadows the builtin
        ``min`` inside the function body.
    """
    # TODO: implement the filtering logic.
    return None

(122, 56202)
(53, 56202)
(122,)
(53,)
