## Load variants and PSIs

In [1]:
import pandas as pd
# Base data directory for the input PSI table and the CSVs written by this notebook.
# Keep the trailing slash: file paths are built by plain string concatenation.
projectDIR = "../../data/gtex/"

In [2]:
# Read the A3SS PSI table using the first CSV column as the index, then
# immediately move that index back into an ordinary column.
df = pd.read_csv(projectDIR + 'A3SS_PSI.csv', index_col=0).reset_index()

In [3]:
# Variant key of the form CHROM_POSITION_REF_ALT; CHROM and the position are
# converted to strings first so they can be concatenated with REF and ALT.
df['VARIANT'] = (
    df['CHROM'].map(str)
    + '_' + df['variant_position'].map(str)
    + '_' + df['REF']
    + '_' + df['ALT']
)

In [4]:
# Row identifier: the event name immediately followed by the variant key
# (the two strings are joined directly, with no separator in between).
df['ID'] = df['event_name'] + df['VARIANT']

## Split into variant-exon pair format for the two alternative exons

In [5]:
# Preview the first rows of the table, including the derived VARIANT and ID columns.
df.head()

Unnamed: 0,index,event_name,CHROM,variant_position,REF,ALT,HETERO_MEAN,WT_MEAN,HOMO_MEAN,VARIANT,ID
0,0,chr21:45173464:45173600:+@chr21:45175358|45175...,21,45175571,C,A,0.0175,0.017569,,21_45175571_C_A,chr21:45173464:45173600:+@chr21:45175358|45175...
1,1,chr16:11935764:11935857:-@chr16:11935674|11935...,16,11935684,CAAAGA,C,0.908182,0.915887,,16_11935684_CAAAGA_C,chr16:11935764:11935857:-@chr16:11935674|11935...
2,2,chr16:50821697:50821763:+@chr16:50825469|50825...,16,50825459,G,A,0.772308,0.765786,,16_50825459_G_A,chr16:50821697:50821763:+@chr16:50825469|50825...
3,3,chr16:89619387:89619543:+@chr16:89620147|89620...,16,89620148,A,G,0.029091,0.024246,,16_89620148_A_G,chr16:89619387:89619543:+@chr16:89620147|89620...
4,4,chr19:19030484:19030665:+@chr19:19031389|19031...,19,19031444,C,T,0.781667,0.850264,,19_19031444_C_T,chr19:19030484:19030665:+@chr19:19031389|19031...


In [6]:
def get_alt_exon(event_name, SA1=True):
    '''Return the coordinates of the SA1 or SA2 alternative exon of an event.

    The event name is split on ":". The first field is used as the
    chromosome, the last character of the name as the strand, the
    second-to-last field as a single coordinate, and the third-to-last
    field as a "|"-separated pair of coordinates.

    Args:
        event_name: Event identifier string laid out as described above.
        SA1: If truthy, pick the SA1 exon; otherwise pick the SA2 exon.

    Returns:
        Tuple ``(chrom, start, end, strand)``. ``start`` and ``end`` are
        returned as strings exactly as they appear in ``event_name``.
    '''
    strand = event_name[-1]
    fields = event_name.split(":")
    chrom = fields[0]
    shared_coord = fields[-2]
    site_pair = fields[-3].split('|')

    if strand == "+":
        # The single coordinate is the exon end; the paired field supplies
        # the start (first entry for SA1, second entry for SA2).
        start = site_pair[0] if SA1 else site_pair[1]
        end = shared_coord
    else:
        # Any strand other than "+" lands here: the single coordinate is the
        # exon start and the pair is read in the opposite order for the end.
        start = shared_coord
        end = site_pair[1] if SA1 else site_pair[0]

    return chrom, start, end, strand

In [7]:
from copy import deepcopy

# Two independent copies of the variant table, one per alternative exon
# (SA1 / SA2), so columns added to one do not affect `df` or the other copy.
df_SA1, df_SA2 = deepcopy(df), deepcopy(df)

In [8]:
# Compute the SA1 exon coordinates for every event and attach the four
# components (chromosome, start, end, strand) as new columns.
(
    df_SA1['seqnames'],
    df_SA1['exon_start'],
    df_SA1['exon_end'],
    df_SA1['strand'],
) = zip(*df['event_name'].apply(get_alt_exon))

In [9]:
# Keep only the columns of the variant-exon pair table, in output order.
df_SA1 = df_SA1[[
    'event_name', 'VARIANT',
    'seqnames', 'exon_start', 'exon_end', 'strand',
    'variant_position', 'REF', 'ALT',
    'ID',
]]

In [10]:
# Save the SA1 variant-exon table as a CSV in the same data directory the input was read from.
df_SA1.to_csv(projectDIR+'A3SS_SA1_variant.csv')

In [11]:
# Same as for SA1, but request the SA2 exon: the SA1=False keyword is
# forwarded by Series.apply to get_alt_exon for every event name.
(
    df_SA2['seqnames'],
    df_SA2['exon_start'],
    df_SA2['exon_end'],
    df_SA2['strand'],
) = zip(*df['event_name'].apply(get_alt_exon, SA1=False))

In [12]:
# Keep only the columns of the variant-exon pair table, in output order.
df_SA2 = df_SA2[[
    'event_name', 'VARIANT',
    'seqnames', 'exon_start', 'exon_end', 'strand',
    'variant_position', 'REF', 'ALT',
    'ID',
]]

In [13]:
# Save the SA2 variant-exon table as a CSV in the same data directory the input was read from.
df_SA2.to_csv(projectDIR+'A3SS_SA2_variant.csv')