# VEP conversion of common mutations table: germline sample

In [3]:
# Needed basic packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib_venn import venn3
import time

# Notebook-flavoured tqdm progress bars.
# One import and one `.pandas()` registration are enough: the deprecated top-level
# `from tqdm import tqdm_notebook` was immediately shadowed by the `tqdm.notebook`
# import, and `tqdm` / `tqdm_notebook` from `tqdm.notebook` refer to the same class,
# so the repeated registrations were redundant. Both names stay available.
from tqdm.notebook import tqdm, tqdm_notebook
tqdm.pandas()  # adds `.progress_apply` and friends to pandas objects

In [4]:
# Load the germline mutation table (already filtered by sequencing quality).
# The file is a gzipped, tab-separated table whose first row holds the column names.
mut_df = pd.read_csv(
    '/workspace/projects/sjd_melos/MAFs_tables/Germline_VAF.tsv.gz',
    sep="\t",
    header=0,
)

In [5]:
# Preview the first rows to check which columns were loaded.
mut_df.head(n=5)

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,patient1_AB9766,GT,AD,DP,GQ,PL,AD_ref,AD_alt,VAF
0,chr1,13273,.,G,C,1444.64,PASS,AC=1;AF=0.500;AN=2;BaseQRankSum=0.096;CNN_1D=-...,GT:AD:DP:GQ:PL,"0/1:120,63:183:99:1452,0,3326",0/1,12063,183.0,99,145203326,120,63.0,0.344262
1,chr1,13813,.,T,G,114.64,PASS,AC=1;AF=0.500;AN=2;BaseQRankSum=-5.261;CNN_1D=...,GT:AD:DP:GQ:PL,"0/1:90,16:106:99:122,0,3424",0/1,9016,106.0,99,12203424,90,16.0,0.150943
2,chr1,13838,rs200683566,C,T,503.64,PASS,AC=1;AF=0.500;AN=2;BaseQRankSum=5.055;CNN_1D=-...,GT:AD:DP:GQ:PL,"0/1:110,23:133:99:511,0,2959",0/1,11023,133.0,99,51102959,110,23.0,0.172932
3,chr1,14599,.,T,A,146.64,PASS,AC=1;AF=0.500;AN=2;BaseQRankSum=-3.384;CNN_1D=...,GT:AD:DP:GQ:PL,"0/1:158,15:173:99:154,0,6545",0/1,15815,173.0,99,15406545,158,15.0,0.086705
4,chr1,14604,.,A,G,167.64,PASS,AC=1;AF=0.500;AN=2;BaseQRankSum=-1.322;CNN_1D=...,GT:AD:DP:GQ:PL,"0/1:165,16:181:99:175,0,6793",0/1,16516,181.0,99,17506793,165,16.0,0.088398


In [6]:
# The chromosome column must keep its leading '#' ('#CHROM') so that VEP
# recognizes the exported table as a VCF file.
mut_df = mut_df.rename({'CHROM': '#CHROM'}, axis='columns')

In [7]:
# Sanity check of the renamed '#CHROM' column: display the rows located on chr2.
mut_df.loc[mut_df['#CHROM'].eq('chr2')]

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,patient1_AB9766,GT,AD,DP,GQ,PL,AD_ref,AD_alt,VAF
411824,chr2,10286,.,T,TACCCTA,504.02,PASS,AC=2;AF=1.00;AN=2;CNN_1D=1.506;DP=26;ExcessHet...,GT:AD:DP:GQ:PL,"1/1:0,12:12:36:518,36,0",1/1,012,12.0,36,518360,0,12.0,1.000000
411825,chr2,10437,rs71337607,C,T,261.96,PASS,AC=2;AF=1.00;AN=2;CNN_1D=0.876;DB;DP=31;Excess...,GT:AD:DP:GQ:PL,"1/1:0,9:9:27:276,27,0",1/1,09,9.0,27,276270,0,9.0,1.000000
411826,chr2,11320,rs113106463,G,A,2070.64,PASS,AC=1;AF=0.500;AN=2;BaseQRankSum=3.064;CNN_1D=-...,GT:AD:DP:GQ:PL,"0/1:80,79:159:99:2078,0,2257",0/1,8079,159.0,99,207802257,80,79.0,0.496855
411827,chr2,13949,rs112508762,G,GT,834.60,PASS,AC=1;AF=0.500;AN=2;BaseQRankSum=0.895;CNN_1D=-...,GT:AD:DP:GQ:PL,"0/1:52,44:96:99:842,0,1041",0/1,5244,96.0,99,84201041,52,44.0,0.458333
411828,chr2,23368,rs12714397,C,A,1818.64,PASS,AC=1;AF=0.500;AN=2;BaseQRankSum=2.833;CNN_1D=-...,GT:AD:DP:GQ:PL,"0/1:61,63:124:99:1826,0,1753",0/1,6163,124.0,99,182601753,61,63.0,0.508065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
824894,chr2,242181956,rs377607041,C,T,5680.64,PASS,AC=1;AF=0.500;AN=2;BaseQRankSum=9.278;CNN_1D=-...,GT:AD:DP:GQ:PL,"0/1:89,215:304:99:5688,0,1704",0/1,89215,304.0,99,568801704,89,215.0,0.707237
824895,chr2,242182922,rs369616242,G,C,87.64,PASS,AC=1;AF=0.500;AN=2;BaseQRankSum=0.433;CNN_1D=-...,GT:AD:DP:GQ:PL,"0/1:32,8:40:95:95,0,851",0/1,328,40.0,95,950851,32,8.0,0.200000
824896,chr2,242183053,.,C,G,396.64,PASS,AC=1;AF=0.500;AN=2;BaseQRankSum=2.493;CNN_1D=-...,GT:AD:DP:GQ:PL,"0/1:17,15:32:99:404,0,457",0/1,1715,32.0,99,4040457,17,15.0,0.468750
824897,chr2,242183088,rs150373510,T,C,251.64,PASS,AC=1;AF=0.500;AN=2;BaseQRankSum=-3.058;CNN_1D=...,GT:AD:DP:GQ:PL,"0/1:16,10:26:99:259,0,450",0/1,1610,26.0,99,2590450,16,10.0,0.384615


In [8]:
# Build the list of chromosome names to export:
# the autosomes chr1..chr22 followed by the sex chromosomes chrX and chrY.
chroms = ['chr' + str(n) for n in range(1, 23)] + ['chrX', 'chrY']
chroms

['chr1',
 'chr2',
 'chr3',
 'chr4',
 'chr5',
 'chr6',
 'chr7',
 'chr8',
 'chr9',
 'chr10',
 'chr11',
 'chr12',
 'chr13',
 'chr14',
 'chr15',
 'chr16',
 'chr17',
 'chr18',
 'chr19',
 'chr20',
 'chr21',
 'chr22',
 'chrX',
 'chrY']

In [10]:
# Split the mutation table by chromosome and write one gzipped, tab-separated
# VEP input file per chromosome (<path>/<chrom>.tsv.gz).

path = '/workspace/projects/sjd_melos/vep/vep_input_files/germline/'

# 'Unnamed: 0' is the row index that was saved into the source TSV (it shows up as the
# first column of the loaded table). It must not be exported: otherwise every file would
# start with that column instead of '#CHROM', and the header line would no longer begin
# with '#', which is what lets VEP recognize the file as VCF.
export_cols = [col for col in mut_df.columns if col != 'Unnamed: 0']

for chrom in chroms:
    file_name = path + chrom + '.tsv.gz'
    # Rows for this chromosome only; a chromosome with no variants still yields
    # a header-only file.
    chrom_df = mut_df.loc[mut_df['#CHROM'] == chrom, export_cols]
    # index=False: do not write the DataFrame index as an extra leading column.
    chrom_df.to_csv(file_name, sep='\t', index=False, compression='gzip')