# VEP conversion of common mutations table: sarcoma sample

In [1]:
# Needed basic packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib_venn import venn3
import time

from tqdm import tqdm_notebook
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()
from tqdm.notebook import tqdm
tqdm.pandas()
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

In [2]:
# Load the sarcoma mutation table (gzip-compressed TSV whose first row is the header).
# NOTE(review): the previous comment described this table as holding only CLONAL
# mutations and mentioned melanoma; the table preview further down shows a
# `clonality` column containing both 'clonal' and 'subclonal' rows, so that
# description looks carried over from another notebook. Confirm the intended input.
clmut_df = pd.read_csv(
    '/workspace/projects/sjd_melos/MAFs_tables/Sar_CCF_Purple_all_mutations.tsv.gz',
    sep='\t',
    header=0,
)

In [3]:
# The chromosome column must keep its leading '#' ('#CHROM') so that VEP
# recognizes the exported files as VCF.
clmut_df = clmut_df.rename({'CHROM': '#CHROM'}, axis='columns')

In [4]:
# Sanity check: display the rows located on chromosome 2.
clmut_df.loc[clmut_df['#CHROM'].eq('chr2')]

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NORMAL,...,t_ref_reads,n_ref_reads,VAF,mut_type,Caller_intersec,SAMPLE,mut,CN,CCF,clonality
1956,chr2,129401,.,G,T,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=110,92|26,32;...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:110,0:9.140e-03:110:44,0:44,0:107,0:57,53,0,0",...,92,110,0.386667,SNV,Mutect_Strelka,sarcoma,chr2_129401_G_T,1.8593,1.066369,clonal
1957,chr2,147018,.,G,T,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=109,132|21,32...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:121,1:8.462e-03:122:56,0:59,0:120,1:51,70,0,1",...,120,121,0.302326,SNV,Mutect_Strelka,sarcoma,chr2_147018_G_T,1.8593,0.833769,clonal
1958,chr2,468605,.,G,T,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=123,157|23,36...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:161,0:6.285e-03:161:86,0:68,0:156,0:65,96,0,0",...,119,161,0.331461,SNV,Mutect_Strelka,sarcoma,chr2_468605_G_T,1.8593,0.914119,clonal
1959,chr2,470986,.,C,T,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=164,118|4,5;D...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:129,0:7.741e-03:129:62,0:62,0:126,0:74,55,0,0",...,153,129,0.055556,SNV,Mutect_Strelka,sarcoma,chr2_470986_C_T,1.8593,0.153214,subclonal
1960,chr2,507019,.,C,A,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=120,100|40,29...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:127,0:7.670e-03:127:58,0:66,0:127,0:74,53,0,0",...,93,127,0.425926,SNV,Mutect_Strelka,sarcoma,chr2_507019_C_A,1.8593,1.174640,clonal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3712,chr2,240458681,.,G,A,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=133,125|3,2;D...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:154,0:6.434e-03:154:84,0:67,0:152,0:84,70,0,0",...,104,154,0.045872,SNV,Mutect_Strelka,sarcoma,chr2_240458681_G_A,0.7483,0.075544,subclonal
3713,chr2,240623721,.,A,G,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=78,116|14,26;...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:142,0:7.022e-03:142:64,0:71,0:139,0:56,86,0,0",...,52,142,0.434783,SNV,Mutect_Strelka,sarcoma,chr2_240623721_A_G,0.7483,0.716022,clonal
3714,chr2,240901777,.,T,A,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=82,79|18,19;D...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:114,0:8.645e-03:114:48,0:63,0:113,0:65,49,0,0",...,47,114,0.440476,SNV,Mutect_Strelka,sarcoma,chr2_240901777_T_A,0.7483,0.725399,clonal
3715,chr2,241417441,.,A,G,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=128,55|22,13;...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:132,0:7.542e-03:132:64,0:56,0:130,0:92,40,0,0",...,51,132,0.406977,SNV,Mutect_Strelka,sarcoma,chr2_241417441_A_G,0.7483,0.670230,clonal


In [5]:
# Build the list of chromosome names used to split the mutation table:
# the autosomes chr1..chr22 followed by the sex chromosomes chrX and chrY.
chroms = [f'chr{n}' for n in range(1, 23)] + ['chrX', 'chrY']
chroms

['chr1',
 'chr2',
 'chr3',
 'chr4',
 'chr5',
 'chr6',
 'chr7',
 'chr8',
 'chr9',
 'chr10',
 'chr11',
 'chr12',
 'chr13',
 'chr14',
 'chr15',
 'chr16',
 'chr17',
 'chr18',
 'chr19',
 'chr20',
 'chr21',
 'chr22',
 'chrX',
 'chrY']

In [6]:
# Split the mutation table by chromosome and write one gzip-compressed,
# tab-separated file per chromosome into the VEP input directory.

path = '/workspace/projects/sjd_melos/vep/vep_input_files/sarcoma/'

for chrom in chroms:
    # `path` already ends with '/', so the file is written to <path><chrom>.tsv.gz.
    file_name = f'{path}{chrom}.tsv.gz'
    chrom_df = clmut_df.loc[clmut_df['#CHROM'] == chrom]
    # index=False is the documented way to leave the row index out of the file
    # (the previous index=None only had that effect because None is falsy).
    chrom_df.to_csv(file_name, sep='\t', index=False, compression='gzip')