In [None]:
""" write artifical vcfs with pyvcf """

In [80]:
import vcf
import pandas as pd
import os

In [81]:
def get_random_intervals():
    """ get set of 1000 random intronic / intergenic intervals, with bedtools """
    
    bed_rand = 'bedtools_random_out.csv'
    ! bedtools random -l 1 -n 1000 -g human.hg38.genome > $bed_rand

    bed_inter = 'bedtools_intersect_out.csv'
    ! bedtools intersect -a $bed_rand -b /Users/lincoln.harris/code/potLuck/hg38-plus.gtf > $bed_inter

    bed_unique = 'bed_unique_loci.csv'
    ! grep -F -xvf $bed_inter $bed_rand > $bed_unique
    
    unique = pd.read_csv('bed_unique_loci.csv', sep='\t', names=['chrom', 'start_pos', 'end_pos', 
                                                        'num', 'score', 'strand'])
    pos_strings = []
    
    for idx, row in unique.iterrows():
        chrom_ = row.chrom
        if '_' not in chrom_:
            pos = row.start_pos
            tup = [chrom_, pos]
            pos_strings.append(tup)
        
    os.remove('bedtools_random_out.csv')
    os.remove('bedtools_intersect_out.csv')
    os.remove('bed_unique_loci.csv')
    
    return(pos_strings)

In [82]:
def write_garbage_vcf_records():
    """ write a bunch of lines that don't correspond to any gene / amino acid sequence

        One record is written per position returned by get_random_intervals().
        Every record carries the same placeholder REF / ALT / QUAL / INFO;
        only CHROM and POS change.

        Relies on the module-level `sample_record` (template record) and
        `vcf_writer` (open writer); both are assigned by write_vcf().
    """
    # NOTE: this aliases the shared template rather than copying it, so the
    # global sample_record ends up holding the last record written here.
    currRec = sample_record

    # fields that are identical for every garbage record are set once
    currRec.ID = None
    currRec.REF = 'A'
    currRec.ALT = ['C']
    currRec.QUAL = 10.2
    currRec.FILTER = None
    currRec.INFO = {'AC': [2], 'AF': [1], 'AN': 2, 'DB': True, 'DP': 94,
                    'ExcessHet': 3.0103, 'FS': 0.0, 'MLEAC': [2], 'MLEAF': [1.0],
                    'MQ': 3.0, 'QD': 28.37, 'SOR': 2.303}

    for chrom_, pos_ in get_random_intervals():
        currRec.CHROM = chrom_
        currRec.POS = pos_
        vcf_writer.write_record(currRec)

In [83]:
def write_artifical_vcf_record(chrom_, pos_, ref_, alt_):
    """ writing the {expected to find} record here

        chrom_ -- chromosome name for the record
        pos_   -- position for the record
        ref_   -- reference allele
        alt_   -- single alternate allele (wrapped in a list for the record)

        Relies on the module-level `sample_record` (template record) and
        `vcf_writer` (open writer); both are assigned by write_vcf().
    """
    # aliases the shared template; its fields are overwritten in place
    currRec = sample_record

    currRec.CHROM = chrom_
    currRec.POS = pos_
    currRec.ID = None
    currRec.REF = ref_
    currRec.ALT = [alt_]
    currRec.QUAL = 10.2
    currRec.FILTER = None
    # just hard code this in, we'll change it later
    # AC is a one-element list, matching write_garbage_vcf_records and the
    # list form this field has when the written file is read back
    currRec.INFO = {'AC': [2], 'AF': [1], 'AN': 2, 'DB': True, 'DP': 94,
                    'ExcessHet': 3.0103, 'FS': 0.0, 'MLEAC': [2], 'MLEAF': [1.0],
                    'MQ': 3.0, 'QD': 28.37, 'SOR': 2.303}

    vcf_writer.write_record(currRec)

In [84]:
def write_vcf(file_name, chrom, pos, ref, alt, template_vcf='A10_1001000407.vcf'):
    """ driver func for writing artificial vcf test set

        file_name    -- path of the vcf to create (overwritten if it exists)
        chrom, pos, ref, alt -- the one {expected to find} record to inject
        template_vcf -- existing vcf whose header and first record are used
                        as the template for everything that gets written

        Sets the module-level `sample_record` and `vcf_writer`, which the
        record-writing helpers read. After this function returns, `vcf_writer`
        points at a closed file and must not be written to.
    """
    global sample_record, vcf_writer

    vcf_reader = vcf.Reader(filename=template_vcf)
    sample_record = next(vcf_reader)

    # the context manager guarantees the output is flushed and closed, even
    # if one of the writers below raises
    with open(file_name, 'w') as out_handle:
        vcf_writer = vcf.Writer(out_handle, vcf_reader)

        write_garbage_vcf_records()     # sandwiching the relevant one in among garbage
        write_artifical_vcf_record(chrom, pos, ref, alt)
        write_garbage_vcf_records()

In [92]:
# write test set
# Each target variant is written to three consecutively numbered files; the
# three garbage files each get their own locus. File numbering runs 0..26.
target_variants = [
    ('kras_G13C', 'chr12', 25245348, 'C', 'A'),
    ('egfr_L858R', 'chr7', 55191822, 'T', 'G'),
    ('braf_V600E', 'chr7', 140753336, 'A', 'T'),
    ('kras_G12C', 'chr12', 25245351, 'C', 'A'),
    ('braf_W450L', 'chr7', 140781659, 'C', 'A'),
    ('braf_Q257H', 'chr7', 140801501, 'C', 'A'),
    ('tp53_S241F', 'chr17', 7674241, 'G', 'A'),
    ('tp53_P72R', 'chr17', 7676154, 'G', 'C'),
]
garbage_variants = [    # none of these should return anything
    ('garbage', 'chr12', 25284777, 'G', 'C'),
    ('garbage', 'chr7', 54880599, 'G', 'C'),
    ('garbage', 'chr2', 41732258, 'G', 'C'),
]

test_cases = [variant for variant in target_variants for _ in range(3)] + garbage_variants

for test_idx, (case_label, case_chrom, case_pos, case_ref, case_alt) in enumerate(test_cases):
    write_vcf('test%d_%s.vcf' % (test_idx, case_label),
              case_chrom, case_pos, case_ref, case_alt)

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002

In [74]:
# sanity check: the injected chr12:25245348 record must be present in test0
vcf_reader = vcf.Reader(filename='test0_kras_G13C.vcf')

# start from None so a miss is detected instead of leaving a stale / undefined name
found_record = None
for record in vcf_reader:
    if record.CHROM == 'chr12' and record.POS == 25245348:
        found_record = record
        print(record)

assert found_record is not None, 'chr12:25245348 not found in test0_kras_G13C.vcf'

Record(CHROM=chr12, POS=25245348, REF=C, ALT=[A])


In [75]:
# display the INFO dict of the record captured by the sanity-check loop
found_record.INFO

{'AC': [2],
 'AF': [1.0],
 'AN': 2,
 'DB': True,
 'DP': 94,
 'ExcessHet': 3.0103,
 'FS': 0.0,
 'MLEAC': [2],
 'MLEAF': [1.0],
 'MQ': 3.0,
 'QD': 28.37,
 'SOR': 2.303}

In [20]:
#///////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////
#//////////   what if i changed the 20 field manually?    //////////
#///////////////////////////////////////////////////////////////////

In [93]:
def change_20_field_manual(file_in, field_new, field_old='1/1:0,2:2:6:80,6,0'):
    """ change up the '20' field, with python
        so that we can do coverage analysis

        Rewrites `file_in` in place, replacing every occurrence of `field_old`
        with `field_new`. This is a plain substring replacement applied to
        every line, header lines included.

        file_in   -- path of the vcf to rewrite
        field_new -- replacement text
        field_old -- text to search for; the default is the value everybody
                     should have

        Returns None. Lines that do not contain `field_old` are copied as-is.
    """
    # stage the rewrite next to the target: the final swap then stays on one
    # filesystem, and calls on different files cannot clobber a shared scratch
    tmp = str(file_in) + '.tmp'

    try:
        with open(file_in, 'rt') as f_in, open(tmp, 'wt') as f_out:
            for line in f_in:
                f_out.write(line.replace(field_old, field_new))
        # os.replace overwrites an existing destination on every platform
        os.replace(tmp, file_in)
    finally:
        # only still present if something above failed before the swap
        if os.path.exists(tmp):
            os.remove(tmp)

In [94]:
# rewrite '20' field, for everybody
# Each variant has three consecutively numbered files; they receive the three
# coverage scenarios below, in this order.
# NOTE(review): the VCF AD sub-field is ordered ref,alt, so e.g. "0,10" reads
# as 0 wt / 10 variant reads -- confirm the "variant/wt" labels are intended.
coverage_fields = [
    '1/1:2,2:4:6:80,6,0',       # 2/2 variant/wt ratio
    '1/1:0,10:10:6:80,6,0',     # 0/10 variant/wt ratio
    '1/1:5,10:15:6:80,6,0',     # 5/10 variant/wt ratio
]
variant_labels = ['kras_G13C', 'egfr_L858R', 'braf_V600E', 'kras_G12C',
                  'braf_W450L', 'braf_Q257H', 'tp53_S241F', 'tp53_P72R',
                  'garbage']

file_idx = 0
for variant_label in variant_labels:
    for coverage_field in coverage_fields:
        change_20_field_manual('test%d_%s.vcf' % (file_idx, variant_label), coverage_field)
        file_idx += 1

In [90]:
# sanity check 
! grep -i 25245348 test2_kras_G13C.vcf > l.csv
l = pd.read_csv('l.csv', sep='\t', names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 
                                      'FILTER', 'INFO', 'FORMAT', '20'])
l

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,20
0,chr12,25245348,.,C,A,10.2,.,AC=2;AF=1;AN=2;DB;DP=94;ExcessHet=3.0103;FS=0....,GT:AD:DP:GQ:PL,"1/1:5,10:15:6:80,6,0"
