In [4]:
""" Module for writing artificial VCFs for testing purposes """

' Module for writing artificial VCFs for testing purposes '

In [1]:
import pandas as pd
import os
import VCF
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [47]:
def add_garbage_lines(df_, random_state=None):
    """Append ten decoy variant rows to a VCF-style dataframe and shuffle it.

    Per the original author's note, the decoy rows are meant to be lines
    that don't correspond to any gene / amino acid sequence. Every decoy
    shares the same REF ('A'), ALT ('C'), QUAL (60), INFO, FORMAT and
    sample ('20') values; only the chromosome and position differ.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame to extend. It is not modified; a new frame is returned.
    random_state : int, numpy RandomState/Generator, or None, optional
        Forwarded to ``DataFrame.sample``. The default (None) keeps the
        original behavior of a different row order on every call; pass a
        seed to get a reproducible shuffle.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df_`` plus the ten decoy rows, in shuffled order.
        The index holds the pre-shuffle positions (0..n-1) and is not reset.
    """
    g_info = 'AC=2;AF=1.00;AN=2;DP=7;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.500;MQ=3.00;QD=30.14;SOR=2.303'
    g_format = 'GT:AD:DP:GQ:PL'
    g_20 = '1/1:2,2:13:38:434,38,0'

    # (chromosome number, position) of each decoy line
    garbage_positions = [[12, 25284777],
                         [12, 24788289],
                         [7, 54880599],
                         [7, 55271013],
                         [2, 29954866],
                         [2, 29081787],
                         [7, 140944330],
                         [7, 140238466],
                         [17, 7636991],
                         [2, 41732258]]

    # Build every decoy record up front and concatenate once.
    # DataFrame.append was removed in pandas 2.0, and appending inside a
    # loop copies the whole frame on every iteration.
    garbage_records = [
        {'#CHROM': 'chr' + str(chrom), 'POS': pos, 'ID': '.', 'REF': 'A', 'ALT': 'C',
         'QUAL': 60, 'FILTER': '.', 'INFO': g_info, 'FORMAT': g_format,
         '20': g_20}
        for chrom, pos in garbage_positions
    ]
    combined = pd.concat([df_, pd.DataFrame(garbage_records)], ignore_index=True)

    # random shuffle rows
    return combined.sample(frac=1, random_state=random_state)

In [48]:
def write_vcf(outStr_, chrom, pos, ref_, alt_, v_count, wt_count, header_path=None):
    """Write a small artificial VCF file containing one variant of interest.

    The file consists of a canned VCF header read from disk, followed by a
    tab-separated table holding the requested variant row plus the decoy
    rows added (and shuffled in) by ``add_garbage_lines``.

    Parameters
    ----------
    outStr_ : str
        Path of the VCF file to create; an existing file is overwritten.
    chrom : int or str
        Chromosome identifier without the 'chr' prefix (the prefix is added).
    pos : int
        Position of the variant.
    ref_, alt_ : str
        Reference and alternate alleles.
    v_count, wt_count : int
        Counts written into the AD slot of the sample column, in that order.
        NOTE(review): the VCF spec lists AD as ref,alt; here ``v_count`` is
        written first -- confirm this is the order the consumer expects.
    header_path : str, optional
        Location of the header text file. Defaults to
        ``<cwd>/../cerebra/vcfheader.txt``, the path originally hard-coded.
    """
    if header_path is None:
        header_path = os.path.join(os.getcwd(), '..', 'cerebra', 'vcfheader.txt')

    # read in artificial header; the handle is closed as soon as it is read
    with open(header_path, 'r') as f:
        header = f.read()

    columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', '20']

    record = {
        '#CHROM': 'chr' + str(chrom),
        'POS': pos,
        'ID': '.',
        'REF': ref_,
        'ALT': alt_,
        'QUAL': 60.28,
        'FILTER': '.',
        'INFO': 'AC=2;AF=1.00;AN=2;DP=7;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.500;MQ=3.00;QD=30.14;SOR=2.303',
        'FORMAT': 'GT:AD:DP:GQ:PL',
        '20': '1/1:' + str(v_count) + ',' + str(wt_count) + ':13:38:434,38,0',
    }

    # Construct the single-row frame directly: DataFrame.append was removed
    # in pandas 2.0.
    df = pd.DataFrame([record], columns=columns)
    df = add_garbage_lines(df)

    # Header first, then the table appended to the same file.
    with open(outStr_, 'w') as vcf:
        vcf.write(header)

    df.to_csv(outStr_, sep="\t", mode='a', index=False)

In [51]:
# write test set
#
# Each variant gets one VCF per (v_count, wt_count) pair. File numbers are
# generated by a running counter so that every file name is unique: the
# previous hand-numbered calls wrote the three tp53_P72R cases to the same
# 'test21_tp53_P72R.vcf', so only the last one survived on disk.
default_counts = [(2, 2), (0, 10), (5, 10)]

test_variants = [
    # (label, chrom, pos, ref, alt, [(v_count, wt_count), ...])
    ('kras_G13C', 12, 25245348, 'C', 'A', default_counts),
    ('egfr_L858R', 7, 55191822, 'T', 'G', default_counts),
    ('braf_V600E', 7, 140753336, 'A', 'T', default_counts),
    ('kras_G12C', 12, 25245351, 'C', 'A', default_counts),
    ('braf_W450L', 7, 140781659, 'C', 'A', default_counts),
    ('braf_Q257H', 7, 140801501, 'C', 'A', default_counts),
    ('tp53_S241F', 17, 7674241, 'G', 'A', default_counts),
    # NOTE(review): the last pair here was (2, 10) in the original calls,
    # not (5, 10) like every other variant -- kept as-is; confirm intent.
    ('tp53_P72R', 17, 7676154, 'G', 'C', [(2, 2), (0, 10), (2, 10)]),
]

test_number = 0
for label, chrom, pos, ref, alt, count_pairs in test_variants:
    for v_count, wt_count in count_pairs:
        out_name = 'test' + str(test_number) + '_' + label + '.vcf'
        write_vcf(out_name, chrom, pos, ref, alt, v_count, wt_count)
        test_number += 1

In [52]:
# validate: load the first generated file back through the VCF module and
# display the resulting dataframe as the cell output
validation_path = 'test0_kras_G13C.vcf'
VCF.dataframe(validation_path)

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO
0,chr12,24788289,.,A,C,60.0,.,AC=2;AF=1.00;AN=2;DP=7;ExcessHet=3.0103;FS=0.0...
1,chr12,25284777,.,A,C,60.0,.,AC=2;AF=1.00;AN=2;DP=7;ExcessHet=3.0103;FS=0.0...
2,chr2,29081787,.,A,C,60.0,.,AC=2;AF=1.00;AN=2;DP=7;ExcessHet=3.0103;FS=0.0...
3,chr7,55271013,.,A,C,60.0,.,AC=2;AF=1.00;AN=2;DP=7;ExcessHet=3.0103;FS=0.0...
4,chr12,25245348,.,C,A,60.28,.,AC=2;AF=1.00;AN=2;DP=7;ExcessHet=3.0103;FS=0.0...
5,chr17,7636991,.,A,C,60.0,.,AC=2;AF=1.00;AN=2;DP=7;ExcessHet=3.0103;FS=0.0...
6,chr7,54880599,.,A,C,60.0,.,AC=2;AF=1.00;AN=2;DP=7;ExcessHet=3.0103;FS=0.0...
7,chr7,140238466,.,A,C,60.0,.,AC=2;AF=1.00;AN=2;DP=7;ExcessHet=3.0103;FS=0.0...
8,chr7,140944330,.,A,C,60.0,.,AC=2;AF=1.00;AN=2;DP=7;ExcessHet=3.0103;FS=0.0...
9,chr2,29954866,.,A,C,60.0,.,AC=2;AF=1.00;AN=2;DP=7;ExcessHet=3.0103;FS=0.0...
