In [None]:
""" write artifical vcfs with pyvcf """

In [92]:
import vcf
import pandas as pd
import os

In [106]:
def get_random_intervals():
    """ get set of 1000 random intronic / intergenic intervals, with bedtools """
    
    bed_rand = 'bedtools_random_out.csv'
    ! bedtools random -l 1 -n 1000 -g human.hg38.genome > $bed_rand

    bed_inter = 'bedtools_intersect_out.csv'
    ! bedtools intersect -a $bed_rand -b /Users/lincoln.harris/code/potLuck/hg38-plus.gtf > $bed_inter

    bed_unique = 'bed_unique_loci.csv'
    ! grep -F -xvf $bed_inter $bed_rand > $bed_unique
    
    unique = pd.read_csv('bed_unique_loci.csv', sep='\t', names=['chrom', 'start_pos', 'end_pos', 
                                                        'num', 'score', 'strand'])
    pos_strings = []
    
    for idx, row in unique.iterrows():
        chrom_ = row.chrom
        if '_' not in chrom_:
            pos = row.start_pos
            tup = [chrom_, pos]
            pos_strings.append(tup)
        
    os.remove('bedtools_random_out.csv')
    os.remove('bedtools_intersect_out.csv')
    os.remove('bed_unique_loci.csv')
    
    return(pos_strings)

In [107]:
def write_garbage_vcf_records():
    """ write a bunch of lines that dont correspond to any gene / amino acid sequence

    Fetches positions from get_random_intervals() and writes one record per
    position through the module-level `vcf_writer`. Every record is an A>C
    substitution with fixed QUAL / INFO values; only CHROM and POS vary.

    Relies on the globals `sample_record` (template record) and `vcf_writer`
    (open writer); both are set by write_vcf(). The template is shallow-copied
    so the shared `sample_record` itself is left untouched.
    """
    import copy

    # copy once: the fields below are overwritten for every position anyway
    garbage_rec = copy.copy(sample_record)

    for chrom_, pos_ in get_random_intervals():

        garbage_rec.CHROM = chrom_
        garbage_rec.POS = pos_
        garbage_rec.ID = None
        garbage_rec.REF = 'A'
        garbage_rec.ALT = ['C']
        garbage_rec.QUAL = 10.2
        garbage_rec.FILTER = None
        # fresh dict per record so no two written records share an INFO object
        garbage_rec.INFO = {'AC': [2], 'AF': [1], 'AN': 2, 'DB': True, 'DP': 94,
                            'ExcessHet': 3.0103, 'FS': 0.0, 'MLEAC': [2], 'MLEAF': [1.0],
                            'MQ': 3.0, 'QD': 28.37, 'SOR': 2.303}

        vcf_writer.write_record(garbage_rec)

In [149]:
def write_artifical_vcf_record(chrom_, pos_, ref_, alt_, v_count_, wt_count_,
                               template=None, writer=None):
    """ writing the {expected to find} record here

    Builds one record at chrom_:pos_ with REF ref_ and a single ALT alt_, then
    writes it.

    Parameters
    ----------
    chrom_, pos_ : chromosome name and position of the record.
    ref_, alt_   : reference and alternate allele; ALT is written as [alt_].
    v_count_     : currently unused.
                   NOTE(review): only wt_count_ reaches INFO['AC'] -- confirm
                   whether v_count_ was meant to be written as well.
    wt_count_    : value stored as INFO['AC'] = [wt_count_].
    template     : record to copy the remaining fields from; defaults to the
                   global `sample_record` set by write_vcf().
    writer       : object with a write_record(record) method; defaults to the
                   global `vcf_writer` set by write_vcf().

    The template is shallow-copied, so the caller's record is not modified.
    """
    import copy

    if template is None:
        template = sample_record
    if writer is None:
        writer = vcf_writer

    target_rec = copy.copy(template)

    target_rec.CHROM = chrom_
    target_rec.POS = pos_
    target_rec.ID = None
    target_rec.REF = ref_
    target_rec.ALT = [alt_]
    target_rec.QUAL = 10.2
    target_rec.FILTER = None
    target_rec.INFO = {'AC': [wt_count_], 'AF': [1], 'AN': 2, 'DB': True, 'DP': 94,
                       'ExcessHet': 3.0103, 'FS': 0.0, 'MLEAC': [2], 'MLEAF': [1.0],
                       'MQ': 3.0, 'QD': 28.37, 'SOR': 2.303}

    writer.write_record(target_rec)

In [150]:
def write_vcf(file_name, chrom, pos, ref, alt, v_count, wt_count,
              template_vcf='A10_1001000407.vcf'):
    """ driver func for writing artifical vcf test set

    Opens `template_vcf`, takes its first record as the template record and
    its header as the template for the output, then writes to `file_name`:
    a batch of garbage records, the single record described by the
    arguments, and another batch of garbage records.

    Parameters
    ----------
    file_name : path of the VCF to create (overwritten if it exists).
    chrom, pos, ref, alt, v_count, wt_count :
        forwarded unchanged to write_artifical_vcf_record().
    template_vcf : existing VCF supplying the header and the template record.

    Side effects
    ------------
    Rebinds the globals `sample_record` and `vcf_writer`, which the two
    record-writing helpers read. The output file is closed before returning,
    so `vcf_writer` must not be used after this function returns.
    """
    global sample_record, vcf_writer

    vcf_reader = vcf.Reader(filename=template_vcf)
    sample_record = next(vcf_reader)

    # the with-block guarantees the output is flushed and closed, even if a
    # helper raises part-way through
    with open(file_name, 'w') as out_handle:
        vcf_writer = vcf.Writer(out_handle, vcf_reader)

        write_garbage_vcf_records()     # sandwiching the relevant one in among garbage
        write_artifical_vcf_record(chrom, pos, ref, alt, v_count, wt_count)
        write_garbage_vcf_records()

In [151]:
# write test set -- one write_vcf call per output file; only the uncommented
# call below runs when this cell is executed.
# NOTE(review): test0 and test1 carry identical arguments (0, 10), while every
# other group follows the pattern (2, 2), (0, 10), (5, 10) -- confirm whether
# test0 was meant to use (2, 2).
write_vcf(file_name='test0_kras_G13C.vcf', chrom='chr12', pos=25245348,
          ref='C', alt='A', v_count=0, wt_count=10)
# write_vcf('test1_kras_G13C.vcf', 'chr12', 25245348, 'C', 'A', 0, 10)
# write_vcf('test2_kras_G13C.vcf', 'chr12', 25245348, 'C', 'A', 5, 10)

# write_vcf('test3_egfr_L858R.vcf', 'chr7', 55191822, 'T', 'G', 2, 2)
# write_vcf('test4_egfr_L858R.vcf', 'chr7', 55191822, 'T', 'G', 0, 10)
# write_vcf('test5_egfr_L858R.vcf', 'chr7', 55191822, 'T', 'G', 5, 10)

# write_vcf('test6_braf_V600E.vcf', 'chr7', 140753336, 'A', 'T', 2, 2)
# write_vcf('test7_braf_V600E.vcf', 'chr7', 140753336, 'A', 'T', 0, 10)
# write_vcf('test8_braf_V600E.vcf', 'chr7', 140753336, 'A', 'T', 5, 10)

# write_vcf('test9_kras_G12C.vcf', 'chr12', 25245351, 'C', 'A', 2, 2)
# write_vcf('test10_kras_G12C.vcf', 'chr12', 25245351, 'C', 'A', 0, 10)
# write_vcf('test11_kras_G12C.vcf', 'chr12', 25245351, 'C', 'A', 5, 10)

# write_vcf('test12_braf_W450L.vcf', 'chr7', 140781659, 'C', 'A', 2, 2)
# write_vcf('test13_braf_W450L.vcf', 'chr7', 140781659, 'C', 'A', 0, 10)
# write_vcf('test14_braf_W450L.vcf', 'chr7', 140781659, 'C', 'A', 5, 10)

# write_vcf('test15_braf_Q257H.vcf', 'chr7', 140801501, 'C', 'A', 2, 2)
# write_vcf('test16_braf_Q257H.vcf', 'chr7', 140801501, 'C', 'A', 0, 10)
# write_vcf('test17_braf_Q257H.vcf', 'chr7', 140801501, 'C', 'A', 5, 10)

# write_vcf('test18_tp53_S241F.vcf', 'chr17', 7674241, 'G', 'A', 2, 2)
# write_vcf('test19_tp53_S241F.vcf', 'chr17', 7674241, 'G', 'A', 0, 10)
# write_vcf('test20_tp53_S241F.vcf', 'chr17', 7674241, 'G', 'A', 5, 10)

# write_vcf('test21_tp53_P72R.vcf', 'chr17', 7676154, 'G', 'C', 2, 2)
# write_vcf('test22_tp53_P72R.vcf', 'chr17', 7676154, 'G', 'C', 0, 10)
# write_vcf('test23_tp53_P72R.vcf', 'chr17', 7676154, 'G', 'C', 5, 10)

# write_vcf('test24_garbage.vcf', 'chr12', 25284777, 'G', 'C', 2, 2) # none of these should return anything
# write_vcf('test25_garbage.vcf', 'chr7', 54880599, 'G', 'C', 0, 10)
# write_vcf('test26_garbage.vcf', 'chr2', 41732258, 'G', 'C', 5, 10)

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";



In [140]:
# Read the generated test file back and pull out the planted chr12:25245348 record.
vcf_reader = vcf.Reader(filename='test0_kras_G13C.vcf')

# reset first so a found_record left over from an earlier run can never be
# mistaken for a hit in this file
found_record = None

for record in vcf_reader:
    if record.CHROM == 'chr12' and record.POS == 25245348:
        found_record = record
        print(record)

# fail here, with a clear message, rather than in a later cell
assert found_record is not None, 'chr12:25245348 not found in test0_kras_G13C.vcf'

Record(CHROM=chr12, POS=25245348, REF=C, ALT=[A])


In [141]:
# Display the INFO field of the record matched in the previous cell.
# NOTE(review): the recorded output shows 'AC': [2, 2], but
# write_artifical_vcf_record sets AC to the single-element list [wt_count_];
# the saved output looks stale -- re-run after regenerating the test file.
found_record.INFO

{'AC': [2, 2],
 'AF': [1.0],
 'AN': 2,
 'DB': True,
 'DP': 94,
 'ExcessHet': 3.0103,
 'FS': 0.0,
 'MLEAC': [2],
 'MLEAF': [1.0],
 'MQ': 3.0,
 'QD': 28.37,
 'SOR': 2.303}