In [1]:

import numpy as np
import pandas as pd
import gzip
import scipy.stats as stats
import argparse
import os
from gtex_normalization import normalize_expression


def get_donors(path):
    donor_ids = list()
    with open(path, 'r') as instream:
        for line in instream:
            donor_ids.append(line.strip().split()[0])
    return donor_ids

def read_gct(gct_file, donor_ids):
    """
    Load GCT as DataFrame
    """    
    df = pd.read_csv(gct_file, sep='\t', skiprows=2, index_col=0)
    df.drop('Description', axis=1, inplace=True)
    df.index.name = 'gene_id'
    df = df[[i for i in df.columns if '-'.join(i.split('-')[:2]) in donor_ids]]
    return df

In [2]:
path = "/home/franco/cluster2/datasets/gtex/expression"
expression_gct = os.path.join(path, "GTEx_Data_20150112_RNAseq_RNASeQCv1.1.8_gene_rpkm.gct.gz") # "rpkm file"
counts_gct = os.path.join(path, "GTEx_Data_20150112_RNAseq_RNASeQCv1.1.8_gene_reads.gct.gz")    # file with read counts
donors_path = "/media/disk1/gtex/donor_ids.fam"
expression_threshold=0.1    # 'Selects genes with > expression_threshold expression in at least min_samples')
count_threshold=5,          # 'Selects genes with > count_threshold reads in at least min_samples')
min_samples=10              # 'Minimum number of samples that must satisfy thresholds')


In [3]:
donor_ids = get_donors(donors_path)


In [4]:
expression_df = read_gct(expression_gct, donor_ids)

In [5]:
expression_df.shape

(56318, 8116)

In [6]:
counts_df = read_gct(counts_gct, donor_ids)

In [7]:
counts_df.shape

(56318, 8116)

In [12]:
import sys
base_path = "/home/franco/cluster2"
sys.path.insert(0, os.path.join(base_path, "gxpred"))

gtfpath = os.path.join(base_path, "datasets/gtex/gencode.v19.annotation.gtf.gz")

from iotools import readgtf

gene_infos = list()
for chrom in range(1,23):
    gene_info = readgtf.gencode_v12(gtfpath, include_chrom = chrom, trim=False)
    gene_infos = gene_infos+gene_info


In [15]:
allensembl_ids = [ i.ensembl_id for i in gene_infos]

In [17]:
common_ids = [i for i in allensembl_ids if i in expression_df.index]
genes_expression_df =expression_df.loc[common_ids]

In [18]:
import re
attfile = "/home/franco/gtex_expr_normalization/phs000424.v6.pht002743.v6.p1.c1.GTEx_Sample_Attributes.GRU.txt"
fulldonor_ids_rna = list()
fulldonor_ids_wgenotype = list()
with open(attfile, 'r') as instream:
    counter = 0
    for line in instream:
        counter += 1
        if re.search("^#", line): 
            continue
        arr = line.split("\t")
        if len(arr)>1:
            cond1 = re.search("Whole", arr[14], re.IGNORECASE)
            cond2 = re.search("flagged", arr[27], re.IGNORECASE)
            has_gt = re.search("dna", arr[26], re.IGNORECASE)
            is_rna = re.search("rna", arr[26], re.IGNORECASE)
            truseq = re.search("TrueSeq", arr[25])
            if not cond2:
                if has_gt:
                    fulldonor_ids_wgenotype.append(arr[1])
                    # if re.search("GTEX-N7MS", arr[1]):
                        #print(arr)
                if cond1 and is_rna and truseq:
                    fulldonor_ids_rna.append(arr[1]) 
                    # if re.search("GTEX-N7MS", arr[1]):
                        # print(arr)

# fulldonor_ids_w_genotype do not match our samples with genotype! 
# some GTEx filtering was done that we don't know about

In [19]:
shortids = [ "-".join(i.split("-")[:2]) for i in fulldonor_ids_rna]
print(len(shortids))
uu = set(shortids)
if len(uu) == len(shortids):
    print("No repeated rna samples!")

393
No repeated rna samples!


In [26]:
rna_w_gt = [i for i in fulldonor_ids_rna if ("-".join(i.split("-")[:2]) in donor_ids)]
print(len(rna_w_gt))

338


In [23]:
genes_expression_df.iloc[:2,:5]

Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-111CU-1826-SM-5GZYN,GTEX-111FC-0226-SM-5N9B8,GTEX-111FC-1426-SM-5N9C7,GTEX-111VG-2326-SM-5N9BK
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000186092.4,0.0,0.0,0.0,0.0,0.0
ENSG00000237683.5,3.559832,4.01445,1.590646,2.623339,14.087896


In [28]:
# index RNA expre matrix with only those gene expressions with genotype for whole blood
# newshortids = [ "-".join(i.split("-")[:2]) for i in rna_w_gt2]
wblood_expression_df = genes_expression_df.loc[:,rna_w_gt]
print(wblood_expression_df.shape)

(18744, 338)


In [30]:
wblood_counts_df = counts_df.loc[common_ids,rna_w_gt]
print(wblood_counts_df.shape)

(18744, 338)


In [31]:

print('Normalizing using all genes within %i samples ...' % expression_df.shape[1])
quant_std_df, quant_df = normalize_expression(wblood_expression_df, wblood_counts_df,
    expression_threshold=expression_threshold, count_threshold=count_threshold, min_samples=min_samples)


Normalizing using all genes within 8116 samples ...


In [32]:
quant_std_df.shape

(14810, 338)

In [34]:
quant_std_df.iloc[:2,:5]

# change long for short donor ids
# newdonor_ids = ['-'.join(i.split('-')[:2]) for i in quant_std_df.columns]
# quant_std_df.columns = newdonor_ids

Unnamed: 0_level_0,GTEX-111YS,GTEX-1122O,GTEX-1128S,GTEX-113IC,GTEX-113JC
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000237683.5,0.438855,0.370628,1.866676,-2.371853,-1.047248
ENSG00000188976.6,-0.107422,-1.200131,-0.048081,0.092559,-0.307961


In [33]:
# write normalized expression file
quant_std_df.to_csv('gtex_afterfiltering.normalized.expression.txt', sep='\t')
