## We only perform our analysis on chr1 ~ chr 22, as sex chromosome expression varies between male and female, would lead to complex pipelines

In [1]:
import argparse
import pandas as pd
import numpy as np
import math




# Compute Percent Spliced In (PSI)
# PSI_df: the output matrix of psi
# exon_gene_df: exon read count matrix
# junction_gene_df: Exon-exon junction read count matrix
# annot_gene_df: annotation file for exons
def compute_gene_PSI(PSI_df,exon_gene_df,junction_gene_df,annot_gene_df,read_len):
     
    exons = list(exon_gene_df.index)
    min_reads = 10
    
    # loop through each exon
    for exon in exons:

        exon_chrom = annot_gene_df.loc[exon]['probe.chr']
        exon_start = annot_gene_df.loc[exon]['probe.start']
        exon_stop = annot_gene_df.loc[exon]['probe.stop']
        exon_len = math.fabs(exon_start - exon_stop) + 1
        exon_strand = annot_gene_df.loc[exon]['probe.strand']
        if exon_strand == '-':
            orig_start = exon_start
            exon_start = exon_stop
            exon_stop = orig_start
        
        
        #Get the A,B,C reads for PSI calculation
        B_intervals = []
        C_intervals = []
        for interval in junction_gene_df.index:
            chrom,start,stop = interval.split('_')
            chrom = chrom.split("chr")[1]
            if str(chrom) != str(exon_chrom):
                print(chrom)
                print(exon_chrom)
                print('Chromosomes aren\'t matching, something is wrong!!')

            if int(start) < exon_start and int(stop) > exon_stop:#C
                C_intervals.append(interval)
            if int(start) < exon_start and (int(stop) >= exon_start and int(stop) <= exon_stop):#B1
                B_intervals.append(interval)
            if (int(start) >= exon_start and int(start) <= exon_stop) and int(stop) > exon_stop:#B2
                B_intervals.append(interval)

        A_reads = exon_gene_df.loc[exon]
        B_reads = junction_gene_df.loc[B_intervals]
        B_reads = B_reads.iloc[:,2:].sum()

        C_reads = junction_gene_df.loc[C_intervals]
        C_reads = C_reads.iloc[:,2:].sum()

        total_reads = A_reads + B_reads + C_reads
        if len(total_reads[total_reads < min_reads])/len(total_reads) > 0.2:
            continue
        else:
            A_B_norm = (A_reads + B_reads)/(read_len + exon_len-1)
            C_norm = C_reads/(read_len-1)

            mask = (A_B_norm == 0) & (C_norm == 0)
            PSI_norm = 100*((A_B_norm)/(A_B_norm + C_norm)) if (A_B_norm.sum() + C_norm.sum()) > 0.00 else 0
            PSI_norm = PSI_norm.where(~mask,0)
            PSI_df[exon] = PSI_norm

    return PSI_df


# find out the maximum coverage sample for each individual
def max_coverage_samples(exon_df):
    'Given exon expression dataframe'
    'Take first two elements of sample name'
    'If there is duplicate sample'
    'Return list of max coverage non-dup samples'
    
    med_exon_df = pd.DataFrame(exon_df.median(axis=0))
    med_exon_df['samples'] = ["-".join(i.split('-')[:2]) for i in med_exon_df.index]
    med_exon_df['original_samples'] = med_exon_df.index
    highest_cov_samples = list(med_exon_df.groupby('samples').idxmax()[0]) 
    
    return highest_cov_samples

def filter_exons(PSI_df):
    'Given PSI dataframe'
    'Filter out exons with little variation:'
    'PSI = 100% or PSI = 0% in >90% patients'
    'Return filtered PSI'
    
    PSI_filtered_df = PSI_df.dropna(axis=1)
    exons = PSI_filtered_df.columns
    exons_to_remove = []
    num_samples = len(PSI_df.index)
    for exon in exons:
        exon_PSI = PSI_filtered_df[exon]
        num_zero = len(exon_PSI[exon_PSI == 0])
        num_hundred = len(exon_PSI[exon_PSI == 100])
        if num_zero/num_samples > 0.9 or num_hundred/num_samples > 0.9:
            exons_to_remove.append(exon)
    PSI_filtered_df = PSI_filtered_df.drop(exons_to_remove,axis=1).dropna()
    new_index = ["-".join(i.split('-')[:2]) for i in PSI_filtered_df.index]
    PSI_filtered_df.index = new_index

    return PSI_filtered_df

if __name__ == "__main__":
#     parser = argparse.ArgumentParser(description="Compute Percent Spliced In (PSI) for any exon")
#     parser.add_argument("--exon_expr", help="File path to exon expression dataset", type=str, required=True)
#     parser.add_argument("--junction_expr", help="File path to junction expression dataset", type=str, required=True)
#     parser.add_argument("--exprannot", help="Exon expression annotation file", type=str, required=True)
#     parser.add_argument("--chrom", help="Restrict analysis to this chromosome", type=str, required=False)
#     parser.add_argument("--min_samples", help="Require data for this many samples", type=int, default=0)
#     parser.add_argument("--read_len", help="Paired end length, default is GTEx read length: 75bp ", type=int, default=75)
#     parser.add_argument("--out", help="Write data files to this file", type=str, required=True)


#     args = parser.parse_args()
    EXONFILE = '/storage/ydong/data/gtexRNA/lung/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_exon_reads_Parsed.csv'#args.exon_expr
    JUNCTIONFILE = '/storage/ydong/data/gtexRNA/lung/GTEx_Analysis_2017-06-05_v8_STARv2.5.3a_junctions_Parsed.csv'#args.junction_expr
    ANNOTFILE = '/storage/ydong/data/gtexRNA/gencode_gene_annotations_GRCh38.csv'#args.exprannot
    
    
    # read in files
    exon_df_t = pd.read_csv(EXONFILE,index_col = "Name")
    annot_df_t = pd.read_csv(ANNOTFILE,index_col = 0,dtype = {"probe.chr":str})
    junction_df_t = pd.read_csv(JUNCTIONFILE,index_col = "Name",dtype = {"Chr":str})
    
    chrList = list(range(2,23))
    
    for CHROM in chrList:
#         CHROM = 1 #args.chrom
        MINSAMPLES = 0 #args.min_samples
        READLEN = 2*75 #args.read_len
        OUTFILE = '/storage/ydong/data/chrom/lung/chr' + str(CHROM) + '_PSI.csv'#args.out

        # format CHROM
        CHROM = str(CHROM)
        annot_df = annot_df_t[annot_df_t['probe.chr'] ==  str(CHROM)]
        junction_df =junction_df_t[junction_df_t['Description'].isin(annot_df['gene.id'])]
        junction_df = junction_df[junction_df["Chr"] == str(CHROM)]
        exon_df = exon_df_t.loc[list(annot_df['probe.id'])]

        #removing duplicated samples by taking highest coverage (in exon expression) sample

        highest_cov_samples = max_coverage_samples(exon_df) 
        exon_df = exon_df[highest_cov_samples]
        highest_cov_samples[:0] = ['Description','Chr']
        junction_df = junction_df[highest_cov_samples]
        PSI_df = pd.DataFrame(index=exon_df.columns, columns=exon_df.index)
        genes = set(annot_df['gene.id'])
        print('Chromosome: {}'.format(CHROM))
        print('Exons total: {}'.format(len(exon_df.index)))

        for gene in genes:
            exon_gene_df = exon_df.loc[exon_df.index.str.startswith(gene)]
            junction_gene_df = junction_df[junction_df['Description'] == gene]
            annot_gene_df = annot_df[annot_df['gene.id'] == gene]
            annot_gene_df.index = annot_gene_df['probe.id']
            PSI_df = compute_gene_PSI(PSI_df,
                                      exon_gene_df,
                                      junction_gene_df,
                                      annot_gene_df,
                                      READLEN)
        PSI_filtered_df = filter_exons(PSI_df.dropna(axis=1))
        print('Exons with PSI: {}\n'.format(len(PSI_filtered_df.columns)))
        PSI_filtered_df.to_csv(OUTFILE)
        print("done")

KeyboardInterrupt: 

In [4]:
JUNCTIONFILE = '/storage/ydong/data/gtexRNA/lung/GTEx_Analysis_2017-06-05_v8_STARv2.5.3a_junctions_Parsed.csv'#args.junction_expr


In [5]:
junction_df_t = pd.read_csv(JUNCTIONFILE,index_col = "Name",dtype = {"Chr":str})

In [6]:
junction_df_t

Unnamed: 0_level_0,Unnamed: 0,Chr,Description,GTEX-111CU-0326-SM-5GZXO,GTEX-111FC-1126-SM-5GZWU,GTEX-111VG-0726-SM-5GIDC,GTEX-111YS-0626-SM-5GZXV,GTEX-1122O-0126-SM-5GICA,GTEX-1128S-0726-SM-5N9D6,GTEX-117YW-0526-SM-5H11C,...,GTEX-ZVZQ-1526-SM-5N9G6,GTEX-ZXG5-0826-SM-5GID6,GTEX-ZY6K-0326-SM-5SIBB,GTEX-ZYFG-0226-SM-5GIDT,GTEX-ZYT6-0526-SM-5GIEA,GTEX-ZYVF-1726-SM-5E443,GTEX-ZYW4-1526-SM-5SIBA,GTEX-ZYY3-0926-SM-5E454,GTEX-ZZPT-1326-SM-5E43H,GTEX-ZZPU-0526-SM-5E44U
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr1_12058_12178,0,1,ENSG00000223972.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chr1_12228_12612,1,1,ENSG00000223972.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chr1_12698_12974,2,1,ENSG00000223972.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chr1_12722_13220,3,1,ENSG00000223972.5,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chr1_13053_13220,4,1,ENSG00000223972.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrY_57211621_57211760,357741,Y,ENSG00000182484.15_PAR_Y,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chrY_57213126_57213203,357742,Y,ENSG00000227159.8_PAR_Y,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chrY_57213358_57213525,357743,Y,ENSG00000227159.8_PAR_Y,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chrY_57213603_57213879,357744,Y,ENSG00000227159.8_PAR_Y,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
EXONFILE = '/storage/ydong/data/gtexRNA/lung/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_exon_reads_Parsed.csv'#args.exon_expr
exon_df_t = pd.read_csv(EXONFILE,index_col = "Name")


In [2]:
list(range(2,23))

[1]

In [6]:
for gene in genes:
    exon_gene_df = exon_df.loc[exon_df.index.str.startswith(gene)]
    junction_gene_df = junction_df[junction_df['Description'] == gene]
    annot_gene_df = annot_df[annot_df['gene.id'] == gene]
    annot_gene_df.index = annot_gene_df['probe.id']
    PSI_df = compute_gene_PSI(PSI_df,
                              exon_gene_df,
                              junction_gene_df,
                              annot_gene_df,
                              READLEN)
PSI_filtered_df = filter_exons(PSI_df.dropna(axis=1))
print('Exons with PSI: {}\n'.format(len(PSI_filtered_df.columns)))
PSI_filtered_df.to_csv(OUTFILE)
print("done")

Exons with PSI: 3686

done
