In [None]:
''' Get the "ground truth" set of ENSP ids that funco/cereb should produce
    for a given variant, where the variant is picked at random from a raw vcf file. '''

In [1]:
import vcfpy
import pandas as pd
pd.set_option('display.max_rows', 999)

In [55]:
def trim_funco_vcf(read_path, write_path):
    ''' Copy a funco-annotated vcf, dropping the records cerebra doesn't pick up.

    Only the first FUNCOTATION entry of each record is inspected. A record is
    dropped when that entry contains any of the unwanted variant-class names
    (plain substring match against the whole annotation string). Records
    that carry no FUNCOTATION value cannot be classified and are dropped too.

    read_path  -- path of the annotated vcf to read
    write_path -- path of the trimmed vcf to write; the input header is reused
    '''
    unwanted_class = ('COULD_NOT_DETERMINE', 'INTRON', 'FIVE_PRIME_UTR',
                      'THREE_PRIME_UTR', 'IGR', 'FIVE_PRIME_FLANK',
                      'THREE_PRIME_FLANK', 'LINCRNA')

    reader = vcfpy.Reader.from_path(read_path)
    try:
        writer = vcfpy.Writer.from_path(write_path, reader.header)
        try:
            for record in reader:
                funco_entries = record.INFO.get('FUNCOTATION')
                if not funco_entries:
                    # no annotation on this record -> nothing to classify
                    continue

                funco = funco_entries[0]
                if not any(elm in funco for elm in unwanted_class):
                    writer.write_record(record)
        finally:
            # close explicitly so the trimmed file is flushed to disk before
            # later cells (or shell commands) read it
            writer.close()
    finally:
        reader.close()

In [2]:
def get_indicies(pos, sorted_gtf):
    ''' return the index labels of every gtf row whose [start, end] span contains pos

    pos        -- position to look up (both span endpoints are inclusive)
    sorted_gtf -- gtf dataframe with 'start' and 'end' columns
    '''
    spans = zip(sorted_gtf.index, sorted_gtf['start'], sorted_gtf['end'])
    return [label for label, lo, hi in spans if lo <= pos and hi >= pos]

In [3]:
def get_ensp_ids(indicies, sorted_gtf):
    ''' get the ENSP ids from the overlapping indicies in the gtf file

    indicies   -- row positions of the overlapping gtf rows; they are applied
                  with .iloc, so they only line up with index labels when the
                  frame carries a default 0..n-1 index
    sorted_gtf -- gtf dataframe with an 'attribute' column
    returns    -- list with the protein_id value of each selected row that has
                  one, in row order (duplicates are kept)
    '''
    import re

    # Restrict to the requested rows only. The previous version rebuilt the
    # trimmed frame from the full gtf on the next line, which silently
    # returned every protein_id in the file.
    sorted_gtf_trim = sorted_gtf.iloc[indicies]

    # matches: protein_id "ENSP00000123456.1"
    pid_pattern = re.compile(r'protein_id\s*"([^"]*)"')

    pids = []
    for attr in sorted_gtf_trim['attribute']:
        if not isinstance(attr, str):
            # missing attribute field (e.g. NaN) -> no protein id to extract
            continue
        match = pid_pattern.search(attr)
        if match:
            pids.append(match.group(1))

    return pids

In [57]:
''' MAIN STARTS HERE '''
# create trimmed funco vcf file
# (both paths are relative, so they resolve against the kernel's working directory)
trim_funco_vcf('G10_1001000340_benchmark.vcf', 'G10_1001000340_funco_trimmed.vcf')

# pull out a random line
# NOTE(review): the pick is unseeded, so re-running this cell returns a
# different line each time; the whole file is sampled, so a header line may
# come back as well. gshuf is presumably GNU shuf under its g-prefixed
# name -- confirm it is installed on the machine running this.
! gshuf -n 1 G10_1001000340_funco_trimmed.vcf

chr16	89985030	.	G	A	294	.	AC=2;AF=1.0;AN=2;DP=8;ExcessHet=3.0103;FS=0.0;FUNCOTATION=[AFG3L1P|hg38|chr16|89985030|89985030|RNA||SNP|G|G|A|g.chr16%3A89985030G>A|ENST00000418696.5|+|||c.e5+431G>A|||0.486284289276808|ATAATTAGCCGGACATGGTGG|AFG3L1P_ENST00000557444.5_RNA|||||||||||||||||||||||||||||false||];MLEAC=2;MLEAF=1.0;MQ=nan;QD=32.02;SOR=2.833	GT:AD:DP:GQ:PL	1/1:0,8:8:24:308,24,0


In [58]:
# make sure it's in the raw vcf
# (grep matches the position as a plain substring, so any line containing
# these digits is printed, not only records at exactly this position)
! grep '89985030' G10_1001000340.vcf

chr16	89985030	.	G	A	294	.	AC=2;AF=1.00;AN=2;DP=8;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=NaN;QD=32.02;SOR=2.833	GT:AD:DP:GQ:PL	1/1:0,8:8:24:308,24,0


In [None]:
# subset the gtf file

In [None]:
# coordinates of the variant being checked
chrom = 16
pos = 89985030

# column names are supplied explicitly, so the first line of the file is
# read as data rather than as a header
sub_gtf = pd.read_csv(
    'gpr157_sub.gtf',
    sep='\t',
    names=[
        'chr', 'source', 'feature', 'start', 'end',
        'score', 'strand', 'frame', 'attribute',
    ],
)

# order by start coordinate and renumber rows 0..n-1
sub_gtf_sorted = sub_gtf.sort_values('start').reset_index(drop=True)

# rows overlapping the variant position -> unique protein ids on those rows
index_l = get_indicies(pos, sub_gtf_sorted)
ensp_l = set(get_ensp_ids(index_l, sub_gtf_sorted))

print(ensp_l)