# TODO
* Why are h2 values so high?

In [1]:
# Set up
# NOTE: %pylab both enables inline figures and bulk-imports numpy and
# matplotlib names into the interactive namespace (see the "Populating the
# interactive namespace" message this cell prints), so bare names used in
# other cells may come from here rather than from an explicit import.
%pylab inline

# Allow us to edit fonts in Illustrator
# The three rcParams below change how text is emitted in saved figures:
#   ps.useafm          - font handling for PostScript output
#   pdf.use14corefonts - font handling for PDF output
#   text.usetex        - hand text rendering to LaTeX; this needs a working
#                        LaTeX installation on the machine running the notebook
import matplotlib
matplotlib.rcParams['ps.useafm'] = True
matplotlib.rcParams['pdf.use14corefonts'] = True
matplotlib.rcParams['text.usetex'] = True

# Libraries
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scipy.stats

# Params
# Rows with qvalue <= QVAL are the ones kept as significant in the cells below.
QVAL = 0.1
# Significant rows with anova_pval < ANOVATHRESH are counted as passing ANOVA.
ANOVATHRESH = 0.05

# Tissue directory name -> short label shown in the summary table.
SHORTEN = dict([
    ("WholeBlood", "Blood"),
    ("Cells-Transformedfibroblasts", "Fibr."),
    ("Muscle-Skeletal", "Muscle"),
    ("Artery-Tibial", "Artery"),
    ("Adipose-Subcutaneous", "Adipose"),
    ("Lung", "Lung"),
    ("Esophagus-Mucosa", "Esophagus"),
])

# Tissue directory name -> sample count reported in the summary table.
NSAMPLES = dict([
    ("WholeBlood", 144),
    ("Cells-Transformedfibroblasts", 128),
    ("Muscle-Skeletal", 125),
    ("Artery-Tibial", 108),
    ("Adipose-Subcutaneous", 102),
    ("Esophagus-Mucosa", 102),
    ("Lung", 110),
])

# Path to data
# Each tissue has its own sub-directory under RESDIR holding a tab-separated
# "Master.table" file.
RESDIR = "/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/"
# Tissues to analyse: exactly the keys of SHORTEN, in that dict's order.
TISSUES = list(SHORTEN)

# Tissue name -> DataFrame loaded from that tissue's Master.table.
data = {}
for t in TISSUES:
    data[t] = pd.read_csv(os.path.join(RESDIR, t, "Master.table"), sep="\t")

Populating the interactive namespace from numpy and matplotlib


In [6]:
# List the columns of a loaded Master.table. This reads from `data` (built in
# the setup cell) instead of `xsig`, which is only assigned by cells further
# down and is therefore undefined here on a fresh Restart & Run All.
# Row-filtering a frame does not change its columns, so the listing is the same
# as for the filtered frame.
# NOTE(review): assumes all tissues share WholeBlood's column layout - confirm.
data["WholeBlood"].columns

Index(['gene', 'chrom', 'gene.name', 'str.id', 'best.str.start',
       'best.str.score', 'causality.score', 'top.variant', 'top.variant.score',
       'significant', 'anova_pval', 'delta_bic', 'delta_aic', 'p.wald', 'beta',
       'qvalue', 'NTEST', 'beta.se', 'estr_fdr', 'esnp_fdr', 'cis_str_h2',
       'num_snps', 'cis_str_h2_pval', 'cis_str_h2_se', 'nsamp', 'cis_snp_h2',
       'cis_snp_h2_se', 'logL'],
      dtype='object')

In [11]:
# Inspect WholeBlood rows that are significant (qvalue <= QVAL) but whose
# anova_pval is above 0.1, showing only the h2, effect-size and score columns.
t = "WholeBlood"
xall = data[t]
xsig = xall[xall["qvalue"] <= QVAL]

inspect_cols = [
    "gene", "chrom", "best.str.start",
    "cis_str_h2", "cis_str_h2_se",
    "cis_snp_h2", "cis_snp_h2_se",
    "beta", "anova_pval",
    "top.variant", "top.variant.score", "best.str.score",
]
high_anova_pval = xsig["anova_pval"] > 0.1
xsig.loc[high_anova_pval, inspect_cols]


Unnamed: 0,gene,chrom,best.str.start,cis_str_h2,cis_str_h2_se,cis_snp_h2,cis_snp_h2_se,beta,anova_pval,top.variant,top.variant.score,best.str.score
9,ENSG00000001460.13,chr1,24772559.0,0.103815,0.084777,0.000001,0.042943,0.325618,0.185195,STR_24772559,0.136631,1.366310e-01
49,ENSG00000004534.10,chr3,50032103.0,0.257713,0.074460,0.000001,0.038278,-0.502621,0.140003,SNP_49990497,0.080956,3.035200e-02
65,ENSG00000005020.8,chr7,26863718.0,0.080310,0.130869,0.079372,0.095324,-0.313824,0.344983,SNP_27068491,0.075877,6.926560e-05
225,ENSG00000010292.8,chr12,6574631.0,0.092829,0.114154,0.046348,0.059322,0.353823,0.126973,SNP_6631169,0.077467,2.845490e-04
243,ENSG00000011007.8,chr1,24149837.0,0.106959,0.081485,0.000001,0.059587,0.325696,0.354431,SNP_24051614,0.134850,2.569200e-03
258,ENSG00000011295.11,chr17,16030413.0,0.096232,0.081517,0.000001,0.049114,0.310742,0.740628,SNP_15870608,0.052175,3.772410e-03
293,ENSG00000013573.12,chr12,31261468.0,0.091758,0.130416,0.115369,0.076609,-0.559443,0.444043,SNP_31235371,0.712846,1.904720e-12
301,ENSG00000014164.6,chr8,144630218.0,0.214701,0.074772,0.000001,0.039580,-0.465111,0.767726,SNP_144584780,0.218228,1.888630e-02
329,ENSG00000017797.7,chr18,9506470.0,0.067118,0.094354,0.103058,0.090048,0.400090,0.248120,SNP_9488704,0.145444,2.146400e-05
353,ENSG00000020922.8,chr11,94246111.0,0.053634,0.104189,0.036385,0.060180,0.336109,0.267670,SNP_94128955,0.228651,6.556290e-05


In [2]:
def _mean_str_h2_fraction(table):
    """Return the mean of cis_str_h2 / (cis_str_h2 + cis_snp_h2) over ``table``.

    A row where either value is missing yields a NaN ratio, and ``Series.mean``
    skips NaN by default, so no explicit NaN filtering is needed.
    """
    str_h2 = table["cis_str_h2"]
    return (str_h2 / (str_h2 + table["cis_snp_h2"])).mean()


# Per-tissue summary statistics; every list gets one entry per tissue, in
# TISSUES order, and becomes one column of `mtable` below.
num_genes = []         # rows in the full table
num_estrs = []         # rows with qvalue <= QVAL
anova_pass = []        # significant rows with anova_pval < ANOVATHRESH
num_caviar_best = []   # significant rows where best.str.score >= top.variant.score
gcta_perc_estr = []    # mean cis_str_h2 / (cis_str_h2 + cis_snp_h2), significant rows
gcta_perc_all = []     # same ratio over all rows
gcta_r2_estr = []      # mean cis_str_h2, significant rows
gcta_r2_all = []       # mean cis_str_h2, all rows
num_samples = []       # sample count from NSAMPLES

for t in TISSUES:
    xall = data[t]
    xsig = xall[xall["qvalue"] <= QVAL]
    num_genes.append(xall.shape[0])
    num_estrs.append(xsig.shape[0])
    # Summing a boolean mask counts the rows that satisfy the condition.
    anova_pass.append(int((xsig["anova_pval"] < ANOVATHRESH).sum()))
    num_caviar_best.append(int((xsig["best.str.score"] >= xsig["top.variant.score"]).sum()))
    # Vectorized column arithmetic replaces the row-wise apply(); NaN entries
    # are skipped by mean(), matching the previous explicit NaN filter.
    gcta_perc_estr.append(_mean_str_h2_fraction(xsig))
    gcta_perc_all.append(_mean_str_h2_fraction(xall))
    gcta_r2_estr.append(xsig["cis_str_h2"].mean())
    gcta_r2_all.append(xall["cis_str_h2"].mean())
    num_samples.append(NSAMPLES[t])

# The numeric prefixes on the column names keep the columns in the intended
# order even when the dict keys end up sorted alphabetically.
mtable = pd.DataFrame({
    "00_tissue": [SHORTEN[t] for t in TISSUES],
    "0_samples": num_samples,
    "1_numgenes": num_genes,
    "2_num.estrs": num_estrs,
    "3_gcta.estr.perch2": gcta_perc_estr,
    "4_gcta.all.perch2": gcta_perc_all,
    "5_gcta.estr.r2": gcta_r2_estr,
    "6_gcta.all.r2": gcta_r2_all,
    "7_anova.pass": anova_pass,
    "8_num_caviar_best": num_caviar_best,
})

# DataFrame.sort() was removed from pandas; sort_values() is the replacement.
mtable.sort_values("2_num.estrs", ascending=False)



Unnamed: 0,00_tissue,0_samples,1_numgenes,2_num.estrs,3_gcta.estr.perch2,4_gcta.all.perch2,5_gcta.estr.r2,6_gcta.all.r2,7_anova.pass,8_num_caviar_best
4,Fibr.,128,15362,955,0.669205,0.795474,0.125969,0.044704,235,151
2,Blood,144,14845,630,0.679351,0.81467,0.114938,0.038565,131,90
3,Artery,108,15584,524,0.698473,0.820082,0.154555,0.05098,129,72
0,Esophagus,102,16111,475,0.719099,0.82826,0.163506,0.05239,107,87
5,Lung,110,15711,378,0.715036,0.833435,0.154623,0.047877,81,69
1,Adipose,102,15918,376,0.723583,0.830789,0.175574,0.051878,78,61
6,Muscle,125,15275,304,0.727204,0.837351,0.143855,0.041752,54,38
