## Annotated Variant Filtering: Sarcoma-metastasis sample

In [1]:
# Needed basic packages
import pandas as pd
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()
from tqdm.notebook import tqdm
tqdm.pandas()

### Import VEP output files

In [2]:
import os, sys

# Paths from VEP output directories
path = "/workspace/projects/sjd_melos/vep/vep_output_files/lung/"

# These are the files of the variable path
files = os.listdir(path)
files

['chr3.tsv.gz',
 'chr4.tsv.gz',
 'chr6.tsv.gz',
 'chr5.tsv.gz',
 'chr7.tsv.gz',
 'chr8.tsv.gz',
 'chrX.tsv.gz',
 'chr9.tsv.gz',
 'chr11.tsv.gz',
 'chr12.tsv.gz',
 'chr13.tsv.gz',
 'chr14.tsv.gz',
 'chr15.tsv.gz',
 'chr16.tsv.gz',
 'chr17.tsv.gz',
 'chr18.tsv.gz',
 'chr19.tsv.gz',
 'chr1.tsv.gz',
 'chr21.tsv.gz',
 'chr22.tsv.gz',
 'chr2.tsv.gz',
 'chrY.tsv.gz',
 'chr10.tsv.gz',
 'chr20.tsv.gz']

In [3]:
# Perform a chromosome list
chroms = []
for n in range(1,23):
    chroms.append('chr'+str(n))
chroms.append('chrX')    
chroms.append('chrY')  
chroms

['chr1',
 'chr2',
 'chr3',
 'chr4',
 'chr5',
 'chr6',
 'chr7',
 'chr8',
 'chr9',
 'chr10',
 'chr11',
 'chr12',
 'chr13',
 'chr14',
 'chr15',
 'chr16',
 'chr17',
 'chr18',
 'chr19',
 'chr20',
 'chr21',
 'chr22',
 'chrX',
 'chrY']

In [4]:
# Define a function that reads files based on chrom list and returns a dataframe
vep_out_df = pd.DataFrame()
for c in chroms: # use the chrom list to read each file chromosome by chromosome
    columns = ['Uploaded_variation', 'Location', 'Allele', 'Gene',	'Feature', 'Feature_type', 'Consequence', 'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids','Codons','Existing_variation','IMPACT','DISTANCE','STRAND','FLAGS','SYMBOL','SYMBOL_SOURCE','HGNC_ID','CANONICAL','MANE_SELECT','MANE_PLUS_CLINICAL','ENSP',	'SOURCE', 'AF',	'AFR_AF', 'AMR_AF','EAS_AF', 'EUR_AF','SAS_AF','CLIN_SIG','SOMATIC','PHENO','gnomADg', 'gnomADg_AF', 'gnomADg_NFE']
    chr_df = pd.read_csv(path+c+'.tsv.gz', sep="\t", names = columns, comment='#', header=None)
    vep_out_df = pd.concat([vep_out_df,chr_df],ignore_index=True)
vep_out_df

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,...,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE
0,chr1_104188_C/A,chr1:104188,A,ENSG00000238009,ENST00000466430,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,...,-,-,-,-,-,-,-,-,-,-
1,chr1_104188_C/A,chr1:104188,A,ENSG00000238009,ENST00000477740,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,...,-,-,-,-,-,-,-,-,-,-
2,chr1_843320_G/A,chr1:843320,A,ENSG00000228794,ENST00000415295,Transcript,downstream_gene_variant,-,-,-,...,-,-,-,-,-,-,-,-,-,-
3,chr1_843320_G/A,chr1:843320,A,ENSG00000228794,ENST00000416570,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,...,-,-,-,-,-,-,-,-,-,-
4,chr1_843320_G/A,chr1:843320,A,ENSG00000228794,ENST00000441765,Transcript,downstream_gene_variant,-,-,-,...,-,-,-,-,-,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82691,chrY_26654389_A/G,chrY:26654389,G,-,-,-,intergenic_variant,-,-,-,...,-,-,-,-,-,-,-,-,-,-
82692,chrY_56823266_G/A,chrY:56823266,A,-,-,-,intergenic_variant,-,-,-,...,-,-,-,-,-,-,-,-,-,-
82693,chrY_56842066_CT/AC,chrY:56842066-56842067,AC,-,-,-,intergenic_variant,-,-,-,...,-,-,-,-,-,-,-,-,-,-
82694,chrY_56842087_T/C,chrY:56842087,C,-,-,-,intergenic_variant,-,-,-,...,-,-,-,-,-,-,-,-,-,-


In [5]:
# Reformat Uploaded variation column so all separators are "_" to split this table to CHROM POS REF ALT
vep_out_df['Uploaded_variation'] = vep_out_df['Uploaded_variation'].str.replace('/','_')
vep_out_df = pd.concat([vep_out_df, vep_out_df['Uploaded_variation'].str.split('_', expand=True)], axis=1)
vep_out_df = vep_out_df.rename(columns={0:'#CHROM', 1: 'POS', 2:'REF', 3: 'ALT'})
vep_out_df

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,...,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT
0,chr1_104188_C_A,chr1:104188,A,ENSG00000238009,ENST00000466430,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,...,-,-,-,-,-,-,chr1,104188,C,A
1,chr1_104188_C_A,chr1:104188,A,ENSG00000238009,ENST00000477740,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,...,-,-,-,-,-,-,chr1,104188,C,A
2,chr1_843320_G_A,chr1:843320,A,ENSG00000228794,ENST00000415295,Transcript,downstream_gene_variant,-,-,-,...,-,-,-,-,-,-,chr1,843320,G,A
3,chr1_843320_G_A,chr1:843320,A,ENSG00000228794,ENST00000416570,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,...,-,-,-,-,-,-,chr1,843320,G,A
4,chr1_843320_G_A,chr1:843320,A,ENSG00000228794,ENST00000441765,Transcript,downstream_gene_variant,-,-,-,...,-,-,-,-,-,-,chr1,843320,G,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82691,chrY_26654389_A_G,chrY:26654389,G,-,-,-,intergenic_variant,-,-,-,...,-,-,-,-,-,-,chrY,26654389,A,G
82692,chrY_56823266_G_A,chrY:56823266,A,-,-,-,intergenic_variant,-,-,-,...,-,-,-,-,-,-,chrY,56823266,G,A
82693,chrY_56842066_CT_AC,chrY:56842066-56842067,AC,-,-,-,intergenic_variant,-,-,-,...,-,-,-,-,-,-,chrY,56842066,CT,AC
82694,chrY_56842087_T_C,chrY:56842087,C,-,-,-,intergenic_variant,-,-,-,...,-,-,-,-,-,-,chrY,56842087,T,C


In [6]:
# This is the consequences list from BBG cluster with VEP/Gnomad older versions ordered by worse to good effect
CONSEQUENCES_LIST = [
    'transcript_ablation',
    'splice_acceptor_variant',
    'splice_donor_variant',
    'stop_gained',
    'frameshift_variant',
    'stop_lost',
    'start_lost',
    'transcript_amplification',
    'inframe_insertion',
    'inframe_deletion',
    'missense_variant',
    'protein_altering_variant',
    'splice_region_variant',
    'splice_donor_5th_base_variant',
    'splice_donor_region_variant',
    'splice_polypyrimidine_tract_variant',
    'incomplete_terminal_codon_variant',
    'start_retained_variant',
    'stop_retained_variant',
    'synonymous_variant',
    'coding_sequence_variant',
    'mature_miRNA_variant',
    '5_prime_UTR_variant',
    '3_prime_UTR_variant',
    'non_coding_transcript_exon_variant',
    'intron_variant',
    'NMD_transcript_variant',
    'non_coding_transcript_variant',
    'upstream_gene_variant',
    'downstream_gene_variant',
    'TFBS_ablation',
    'TFBS_amplification',
    'TF_binding_site_variant',
    'regulatory_region_ablation',
    'regulatory_region_amplification',
    'feature_elongation',
    'regulatory_region_variant',
    'feature_truncation',
    'intergenic_variant'
]

### 1/ Transcript selection based on MANE-Select information

In [7]:
# Create a final list of variants with the worse consequence whith unique transcripts per variant by this criteria:
# 1/ choose transcript based on MANE transcript
# 2/ when no MANE transcript choose transcript for the CANONICAL transcript
# 3/ when no MANE and not CANONICAL choose transcript based on the transcript containing the first ocurring worse consequence based on CONSEQUENCES_LIST

In [8]:
display(vep_out_df.columns)

Index(['Uploaded_variation', 'Location', 'Allele', 'Gene', 'Feature',
       'Feature_type', 'Consequence', 'cDNA_position', 'CDS_position',
       'Protein_position', 'Amino_acids', 'Codons', 'Existing_variation',
       'IMPACT', 'DISTANCE', 'STRAND', 'FLAGS', 'SYMBOL', 'SYMBOL_SOURCE',
       'HGNC_ID', 'CANONICAL', 'MANE_SELECT', 'MANE_PLUS_CLINICAL', 'ENSP',
       'SOURCE', 'AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF',
       'CLIN_SIG', 'SOMATIC', 'PHENO', 'gnomADg', 'gnomADg_AF', 'gnomADg_NFE',
       '#CHROM', 'POS', 'REF', 'ALT'],
      dtype='object')

In [9]:
# Check in a new dataframe the number of transcripts per position and sort it by its location  in genome
grp_df = vep_out_df.groupby(['Uploaded_variation','Gene'], as_index=False).count().sort_values('Location',ascending=False)
pd.set_option('display.max_columns', None) # Show information of all columns in pandas
grp_df

Unnamed: 0,Uploaded_variation,Gene,Location,Allele,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT
10994,chr21_16491222_G_-,ENSG00000215386,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203
17006,chr4_86347899_T_A,ENSG00000109339,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164
17004,chr4_86220427_C_T,ENSG00000109339,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162
10993,chr21_16320705_G_C,ENSG00000215386,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148
10997,chr21_16525153_G_T,ENSG00000215386,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10571,chr20_31720680_C_T,ENSG00000281376,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
10570,chr20_31720680_C_T,ENSG00000236559,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
10568,chr20_31532666_A_T,ENSG00000274364,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
10566,chr20_31419743_CCGTGGAAGATGTGCCTTGCTT_-,ENSG00000290996,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [10]:
# Case-examples of different transcripts associated to variants to consider in the code:
display(vep_out_df[vep_out_df['Uploaded_variation']=='chr20_31720680_C_T']) # look that this variant has a MANE_Select transcript that distinguishes it from the other transcripts
display(vep_out_df[vep_out_df['Uploaded_variation']=='chr20_31419743_CCGTGGAAGATGTGCCTTGCTT_-']) # only Gene information no MANE in any transcript, but CANONICAL info distinguish one of all
display(vep_out_df[vep_out_df['Uploaded_variation']== 'chr10_10002148_C_T']) # no MANE and no Gene for intergenic variants, here just one transcript no need to distinguis from others

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT
76023,chr20_31720680_C_T,chr20:31720680,T,ENSG00000171552,ENST00000307677,Transcript,intron_variant,-,-,-,-,-,rs2061609002,MODIFIER,-,-1,-,BCL2L1,HGNC,HGNC:992,YES,NM_138578.3,-,ENSP00000302564,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,31720680,C,T
76024,chr20_31720680_C_T,chr20:31720680,T,ENSG00000171552,ENST00000376055,Transcript,intron_variant,-,-,-,-,-,rs2061609002,MODIFIER,-,-1,-,BCL2L1,HGNC,HGNC:992,-,-,-,ENSP00000365223,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,31720680,C,T
76025,chr20_31720680_C_T,chr20:31720680,T,ENSG00000171552,ENST00000376062,Transcript,intron_variant,-,-,-,-,-,rs2061609002,MODIFIER,-,-1,-,BCL2L1,HGNC,HGNC:992,-,-,-,ENSP00000365230,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,31720680,C,T
76026,chr20_31720680_C_T,chr20:31720680,T,ENSG00000236559,ENST00000412972,Transcript,downstream_gene_variant,-,-,-,-,-,rs2061609002,MODIFIER,3855,1,-,BCL2L1-AS1,HGNC,HGNC:40095,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,31720680,C,T
76027,chr20_31720680_C_T,chr20:31720680,T,ENSG00000171552,ENST00000420488,Transcript,intron_variant,-,-,-,-,-,rs2061609002,MODIFIER,-,-1,-,BCL2L1,HGNC,HGNC:992,-,-,-,ENSP00000390760,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,31720680,C,T
76028,chr20_31720680_C_T,chr20:31720680,T,ENSG00000171552,ENST00000422920,Transcript,intron_variant,-,-,-,-,-,rs2061609002,MODIFIER,-,-1,-,BCL2L1,HGNC,HGNC:992,-,-,-,ENSP00000411252,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,31720680,C,T
76029,chr20_31720680_C_T,chr20:31720680,T,ENSG00000171552,ENST00000434194,Transcript,intron_variant,-,-,-,-,-,rs2061609002,MODIFIER,-,-1,-,BCL2L1,HGNC,HGNC:992,-,-,-,ENSP00000401173,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,31720680,C,T
76030,chr20_31720680_C_T,chr20:31720680,T,ENSG00000171552,ENST00000439267,Transcript,intron_variant,-,-,-,-,-,rs2061609002,MODIFIER,-,-1,-,BCL2L1,HGNC,HGNC:992,-,-,-,ENSP00000389688,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,31720680,C,T
76031,chr20_31720680_C_T,chr20:31720680,T,ENSG00000171552,ENST00000450273,Transcript,intron_variant,-,-,-,-,-,rs2061609002,MODIFIER,-,-1,-,BCL2L1,HGNC,HGNC:992,-,-,-,ENSP00000406203,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,31720680,C,T
76032,chr20_31720680_C_T,chr20:31720680,T,ENSG00000171552,ENST00000456404,Transcript,intron_variant,-,-,-,-,-,rs2061609002,MODIFIER,-,-1,-,BCL2L1,HGNC,HGNC:992,-,-,-,ENSP00000395545,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,31720680,C,T


Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT
76009,chr20_31419743_CCGTGGAAGATGTGCCTTGCTT_-,chr20:31419743-31419764,-,ENSG00000290996,ENST00000433453,Transcript,downstream_gene_variant,-,-,-,-,-,-,MODIFIER,1672,-1,-,DEFB122,HGNC,HGNC:18102,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,31419743,CCGTGGAAGATGTGCCTTGCTT,-
76010,chr20_31419743_CCGTGGAAGATGTGCCTTGCTT_-,chr20:31419743-31419764,-,ENSG00000204547,ENST00000569013,Transcript,downstream_gene_variant,-,-,-,-,-,-,MODIFIER,1766,-1,-,DEFB122,HGNC,HGNC:18102,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,31419743,CCGTGGAAGATGTGCCTTGCTT,-


Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT
47958,chr10_10002148_C_T,chr10:10002148,T,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr10,10002148,C,T


In [11]:
# Make a list from these two columns that results in unique values
groups_list = grp_df[['Uploaded_variation','Gene']].values.tolist()
groups_list

[['chr21_16491222_G_-', 'ENSG00000215386'],
 ['chr4_86347899_T_A', 'ENSG00000109339'],
 ['chr4_86220427_C_T', 'ENSG00000109339'],
 ['chr21_16320705_G_C', 'ENSG00000215386'],
 ['chr21_16525153_G_T', 'ENSG00000215386'],
 ['chr21_16496154_A_G', 'ENSG00000215386'],
 ['chr21_16518290_G_C', 'ENSG00000215386'],
 ['chr21_16251706_A_T', 'ENSG00000215386'],
 ['chr7_100327826_C_-', 'ENSG00000291178'],
 ['chr8_129528144_T_G', 'ENSG00000229140'],
 ['chr20_38429340_C_A', 'ENSG00000196756'],
 ['chr3_181970754_G_T', 'ENSG00000242512'],
 ['chr3_181974732_G_A', 'ENSG00000242512'],
 ['chr21_16183377_G_T', 'ENSG00000215386'],
 ['chr4_153270462_G_T', 'ENSG00000109654'],
 ['chr6_85561340_C_A', 'ENSG00000135317'],
 ['chr6_85563285_C_G', 'ENSG00000135317'],
 ['chr3_129490110_G_A', 'ENSG00000163913'],
 ['chr6_8469076_A_C', 'ENSG00000285219'],
 ['chr6_8468021_A_G', 'ENSG00000285219'],
 ['chr3_129482505_C_A', 'ENSG00000163913'],
 ['chr3_181994423_A_T', 'ENSG00000242512'],
 ['chr21_20771230_G_T', 'ENSG00000224924

In [12]:
# CODE TO FILTER TRANSCRIPTS OF THE SAME VARIANT:
final_df = pd.DataFrame()
for g in tqdm(groups_list):
    mut = g[0] # first element of list
    gene = g[1] # second element of list
    transcripts_df = vep_out_df[(vep_out_df['Uploaded_variation'] == mut) & (vep_out_df['Gene'] == gene)] # for each variant in dataframe that is equal to the variant in the list (unique value)
    if len(transcripts_df) > 1: # this selects those variants that have more than 1 transcript
        mane_transcripts_df = transcripts_df[transcripts_df['MANE_SELECT'] != '-']
        if len(mane_transcripts_df) == 1:
            df2 = mane_transcripts_df # select mane trancript
        elif len(mane_transcripts_df) == 0:
            can_df = transcripts_df[transcripts_df['CANONICAL'] == 'YES']
            if len(can_df) == 1:
                df2 = can_df # select canonical transcript
            elif len(can_df) == 0:
                df2 = transcripts_df # take all trancripts when no information of MANE or CANONICAL
            else:
                print('Another option?') # show this message and display it to show that something is missing in the code
                display(can_df)
        else:
            print('Another option?') # show this message and display it to show that something is missing in the code
            display(transcripts_df)
    elif len(transcripts_df) == 1: # this means there is only 1 transcript for this variant so no need to filter
        df2 = transcripts_df 
    else:
        print('Another option?')

    final_df = pd.concat([final_df, df2], ignore_index=True)
final_df

  0%|          | 0/25338 [00:00<?, ?it/s]

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT
0,chr21_16491222_G_-,chr21:16491222,-,ENSG00000215386,ENST00000653129,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,-,MODIFIER,-,1,-,MIR99AHG,HGNC,HGNC:1274,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr21,16491222,G,-
1,chr4_86347899_T_A,chr4:86347899,A,ENSG00000109339,ENST00000641462,Transcript,intron_variant,-,-,-,-,-,-,MODIFIER,-,-1,-,MAPK10,HGNC,HGNC:6872,YES,NM_138982.4,-,ENSP00000493435,-,-,-,-,-,-,-,-,-,-,-,-,-,chr4,86347899,T,A
2,chr4_86220427_C_T,chr4:86220427,T,ENSG00000109339,ENST00000641462,Transcript,intron_variant,-,-,-,-,-,-,MODIFIER,-,-1,-,MAPK10,HGNC,HGNC:6872,YES,NM_138982.4,-,ENSP00000493435,-,-,-,-,-,-,-,-,-,-,-,-,-,chr4,86220427,C,T
3,chr21_16320705_G_C,chr21:16320705,C,ENSG00000215386,ENST00000653129,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,-,MODIFIER,-,1,-,MIR99AHG,HGNC,HGNC:1274,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr21,16320705,G,C
4,chr21_16525153_G_T,chr21:16525153,T,ENSG00000215386,ENST00000653129,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,-,MODIFIER,-,1,-,MIR99AHG,HGNC,HGNC:1274,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr21,16525153,G,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28079,chr20_31720680_C_T,chr20:31720680,T,ENSG00000281376,ENST00000629058,Transcript,upstream_gene_variant,-,-,-,-,-,rs2061609002,MODIFIER,827,1,-,ABALON,HGNC,HGNC:49667,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,31720680,C,T
28080,chr20_31720680_C_T,chr20:31720680,T,ENSG00000236559,ENST00000412972,Transcript,downstream_gene_variant,-,-,-,-,-,rs2061609002,MODIFIER,3855,1,-,BCL2L1-AS1,HGNC,HGNC:40095,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,31720680,C,T
28081,chr20_31532666_A_T,chr20:31532666,T,ENSG00000274364,ENST00000614396,Transcript,upstream_gene_variant,-,-,-,-,-,-,MODIFIER,2597,1,-,-,-,-,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,31532666,A,T
28082,chr20_31419743_CCGTGGAAGATGTGCCTTGCTT_-,chr20:31419743-31419764,-,ENSG00000290996,ENST00000433453,Transcript,downstream_gene_variant,-,-,-,-,-,-,MODIFIER,1672,-1,-,DEFB122,HGNC,HGNC:18102,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,31419743,CCGTGGAAGATGTGCCTTGCTT,-


### 2/ Add the WORSE consequence for each variant based on CONSEQUENCE_LIST order

In [13]:
# Consequences list was defined previously
CONSEQUENCES_LIST

['transcript_ablation',
 'splice_acceptor_variant',
 'splice_donor_variant',
 'stop_gained',
 'frameshift_variant',
 'stop_lost',
 'start_lost',
 'transcript_amplification',
 'inframe_insertion',
 'inframe_deletion',
 'missense_variant',
 'protein_altering_variant',
 'splice_region_variant',
 'splice_donor_5th_base_variant',
 'splice_donor_region_variant',
 'splice_polypyrimidine_tract_variant',
 'incomplete_terminal_codon_variant',
 'start_retained_variant',
 'stop_retained_variant',
 'synonymous_variant',
 'coding_sequence_variant',
 'mature_miRNA_variant',
 '5_prime_UTR_variant',
 '3_prime_UTR_variant',
 'non_coding_transcript_exon_variant',
 'intron_variant',
 'NMD_transcript_variant',
 'non_coding_transcript_variant',
 'upstream_gene_variant',
 'downstream_gene_variant',
 'TFBS_ablation',
 'TFBS_amplification',
 'TF_binding_site_variant',
 'regulatory_region_ablation',
 'regulatory_region_amplification',
 'feature_elongation',
 'regulatory_region_variant',
 'feature_truncation',
 

In [14]:
def consequence_worse(row):
    conseq = row['Consequence'].split(',')  # split comma separated values of Consequence column
    print(len(conseq))
    num_elements = len(conseq)
    if num_elements == 1: # when there is only one Consequence for the variant
        return conseq[0] # return first value (unique one in this case) to avoid returning a list
    elif num_elements > 1: # when there is more than one Consequence for the variant
        for element in CONSEQUENCES_LIST: # for each element of the ordered list of consequences
            for cons in conseq: # for each element of the column Consequences
                if cons == element: # if the consequence is equal to the element
                    return element
                    break
                else:
                    print('This conseq type is not equal to this element in BBGWiki list')
                    print(cons, element)
    else:
        print('Different number of consequence types in data (different than 1 or more))')
        print(num_elements)

In [15]:
# Now apply it to all sample data
final_df['WORSE_conseq'] = final_df.progress_apply(consequence_worse, axis=1)

  0%|          | 0/28084 [00:00<?, ?it/s]

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub data rate exceeded.
The Jupyter

In [16]:
final_df[['Consequence', 'WORSE_conseq']]

Unnamed: 0,Consequence,WORSE_conseq
0,"intron_variant,non_coding_transcript_variant",intron_variant
1,intron_variant,intron_variant
2,intron_variant,intron_variant
3,"intron_variant,non_coding_transcript_variant",intron_variant
4,"intron_variant,non_coding_transcript_variant",intron_variant
...,...,...
28079,upstream_gene_variant,upstream_gene_variant
28080,downstream_gene_variant,downstream_gene_variant
28081,upstream_gene_variant,upstream_gene_variant
28082,downstream_gene_variant,downstream_gene_variant


### 3/ Filter multiple transcripts by WORSE_conseq

In [17]:
# See some variants are repeated 
final_df.groupby(['Uploaded_variation', 'Gene'], as_index=False).count().sort_values('Location',ascending=False)

Unnamed: 0,Uploaded_variation,Gene,Location,Allele,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT,WORSE_conseq
5347,chr15_25105866_G_T,ENSG00000224078,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78
17007,chr4_86415802_T_A,ENSG00000109339,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60
15025,chr4_112965986_G_T,ENSG00000145362,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57
15022,chr4_112835968_G_A,ENSG00000145362,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53
10842,chr20_60105059_T_A,ENSG00000228340,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8585,chr1_110802719_T_C,-,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
8584,chr1_110776794_G_C,-,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
8583,chr1_110547605_T_A,ENSG00000177301,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
8581,chr1_110296620_A_C,ENSG00000227963,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [18]:
# Example of variant with different transcripts because no CANONICAL and no MANE transcript to filter 
final_df[final_df['Uploaded_variation'] == 'chr4_86415802_T_A']

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT,WORSE_conseq
126,chr4_86415802_T_A,chr4:86415802,A,ENSG00000109339,ENST00000310816,Transcript,"intron_variant,NMD_transcript_variant",-,-,-,-,-,-,MODIFIER,-,-1,-,MAPK10,HGNC,HGNC:6872,-,-,-,ENSP00000309857,-,-,-,-,-,-,-,-,-,-,-,-,-,chr4,86415802,T,A,intron_variant
127,chr4_86415802_T_A,chr4:86415802,A,ENSG00000109339,ENST00000361569,Transcript,intron_variant,-,-,-,-,-,-,MODIFIER,-,-1,-,MAPK10,HGNC,HGNC:6872,-,-,-,ENSP00000355297,-,-,-,-,-,-,-,-,-,-,-,-,-,chr4,86415802,T,A,intron_variant
128,chr4_86415802_T_A,chr4:86415802,A,ENSG00000109339,ENST00000395160,Transcript,intron_variant,-,-,-,-,-,-,MODIFIER,-,-1,-,MAPK10,HGNC,HGNC:6872,-,-,-,ENSP00000378589,-,-,-,-,-,-,-,-,-,-,-,-,-,chr4,86415802,T,A,intron_variant
129,chr4_86415802_T_A,chr4:86415802,A,ENSG00000109339,ENST00000502302,Transcript,intron_variant,-,-,-,-,-,-,MODIFIER,-,-1,cds_end_NF,MAPK10,HGNC,HGNC:6872,-,-,-,ENSP00000423918,-,-,-,-,-,-,-,-,-,-,-,-,-,chr4,86415802,T,A,intron_variant
130,chr4_86415802_T_A,chr4:86415802,A,ENSG00000109339,ENST00000503911,Transcript,intron_variant,-,-,-,-,-,-,MODIFIER,-,-1,cds_end_NF,MAPK10,HGNC,HGNC:6872,-,-,-,ENSP00000421409,-,-,-,-,-,-,-,-,-,-,-,-,-,chr4,86415802,T,A,intron_variant
131,chr4_86415802_T_A,chr4:86415802,A,ENSG00000109339,ENST00000504397,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,-,MODIFIER,-,-1,-,MAPK10,HGNC,HGNC:6872,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr4,86415802,T,A,intron_variant
132,chr4_86415802_T_A,chr4:86415802,A,ENSG00000109339,ENST00000506773,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,-,MODIFIER,-,-1,-,MAPK10,HGNC,HGNC:6872,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr4,86415802,T,A,intron_variant
133,chr4_86415802_T_A,chr4:86415802,A,ENSG00000109339,ENST00000509464,Transcript,intron_variant,-,-,-,-,-,-,MODIFIER,-,-1,cds_end_NF,MAPK10,HGNC,HGNC:6872,-,-,-,ENSP00000424128,-,-,-,-,-,-,-,-,-,-,-,-,-,chr4,86415802,T,A,intron_variant
134,chr4_86415802_T_A,chr4:86415802,A,ENSG00000109339,ENST00000511328,Transcript,intron_variant,-,-,-,-,-,-,MODIFIER,-,-1,cds_end_NF,MAPK10,HGNC,HGNC:6872,-,-,-,ENSP00000421762,-,-,-,-,-,-,-,-,-,-,-,-,-,chr4,86415802,T,A,intron_variant
135,chr4_86415802_T_A,chr4:86415802,A,ENSG00000109339,ENST00000512017,Transcript,intron_variant,-,-,-,-,-,-,MODIFIER,-,-1,cds_end_NF,MAPK10,HGNC,HGNC:6872,-,-,-,ENSP00000424755,-,-,-,-,-,-,-,-,-,-,-,-,-,chr4,86415802,T,A,intron_variant


In [19]:
# Apply a function that takes the transcript with WORSE_conseq type (first one appearing) when no CANONICAL and no MANE transcript to filter

In [20]:
# Filter duplicated rows based on columns Uploaded variation and Gene based on order in consequence list of column WORSE_conseq
annot_df = final_df.sort_values(by='WORSE_conseq', key=lambda x: x.map(lambda y: CONSEQUENCES_LIST.index(y))).drop_duplicates(subset=['Uploaded_variation', 'Gene'])
annot_df

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT,WORSE_conseq
8509,chr9_104825888_C_A,chr9:104825888,A,ENSG00000165029,ENST00000374736,Transcript,splice_acceptor_variant,-,-,-,-,-,-,HIGH,-,-1,-,ABCA1,HGNC,HGNC:29,YES,NM_005502.4,-,ENSP00000363868,-,-,-,-,-,-,-,-,-,-,-,-,-,chr9,104825888,C,A,splice_acceptor_variant
19123,chr15_22777766_G_T,chr15:22777766,T,ENSG00000274253,ENST00000619611,Transcript,"splice_acceptor_variant,non_coding_transcript_...",-,-,-,-,-,-,HIGH,-,1,-,-,-,-,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr15,22777766,G,T,splice_acceptor_variant
8259,chr3_45035523_G_A,chr3:45035523,A,ENSG00000163815,ENST00000296130,Transcript,splice_acceptor_variant,-,-,-,-,-,rs1697627692,HIGH,-,1,-,CLEC3B,HGNC,HGNC:11891,YES,NM_003278.3,-,ENSP00000296130,-,-,-,-,-,-,-,-,-,-,rs1697627692,6.57013e-06,-,chr3,45035523,G,A,splice_acceptor_variant
20642,chr15_88797906_T_A,chr15:88797906,A,ENSG00000259676,ENST00000561358,Transcript,"splice_acceptor_variant,non_coding_transcript_...",-,-,-,-,-,-,HIGH,-,-1,-,-,-,-,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr15,88797906,T,A,splice_acceptor_variant
25186,chr3_45035523_G_A,chr3:45035523,A,ENSG00000075914,ENST00000481405,Transcript,"splice_acceptor_variant,non_coding_transcript_...",-,-,-,-,-,rs1697627692,HIGH,-,1,-,EXOSC7,HGNC,HGNC:28112,-,-,-,-,-,-,-,-,-,-,-,-,-,-,rs1697627692,6.57013e-06,-,chr3,45035523,G,A,splice_acceptor_variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18730,chr14_83499105_G_T,chr14:83499105,T,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr14,83499105,G,T,intergenic_variant
18731,chr14_83475799_T_A,chr14:83475799,A,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr14,83475799,T,A,intergenic_variant
18732,chr14_83351411_C_A,chr14:83351411,A,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr14,83351411,C,A,intergenic_variant
18721,chr14_84329401_G_T,chr14:84329401,T,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr14,84329401,G,T,intergenic_variant


In [21]:
# Check if there are duplicated values based on variants and genes
annot_df.groupby(['Uploaded_variation', 'Gene'], as_index=False).count().sort_values('Location',ascending=False)

Unnamed: 0,Uploaded_variation,Gene,Location,Allele,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT,WORSE_conseq
0,chr10_10002148_C_T,-,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
16899,chr4_75984690_T_A,ENSG00000198301,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
16897,chr4_75513754_C_A,ENSG00000163743,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
16896,chr4_75390271_G_T,ENSG00000248646,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
16895,chr4_75357949_A_T,ENSG00000251454,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8444,chr19_8947567_G_A,ENSG00000181143,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
8443,chr19_8928660_G_T,ENSG00000181143,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
8442,chr19_832543_C_T,ENSG00000172232,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
8441,chr19_8322485_AATCCTGGCCGTGCGG_-,ENSG00000267855,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [22]:
# Check previous example of variant with no CANONICAL and no MANE transcript information available to filter data 
annot_df[annot_df['Uploaded_variation'] == 'chr4_86415802_T_A']

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT,WORSE_conseq
157,chr4_86415802_T_A,chr4:86415802,A,ENSG00000109339,ENST00000641203,Transcript,"intron_variant,NMD_transcript_variant",-,-,-,-,-,-,MODIFIER,-,-1,-,MAPK10,HGNC,HGNC:6872,-,-,-,ENSP00000493409,-,-,-,-,-,-,-,-,-,-,-,-,-,chr4,86415802,T,A,intron_variant


### 4/ Merge this table with the mutations dataframe (VEP input)

#### First transform mutations dataframe to VEP format

In [23]:
# Check that VEP output table is in VEP format
annot_df[annot_df['Uploaded_variation'] == 'chr20_31419743_CCGTGGAAGATGTGCCTTGCTT_-']

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT,WORSE_conseq
28082,chr20_31419743_CCGTGGAAGATGTGCCTTGCTT_-,chr20:31419743-31419764,-,ENSG00000290996,ENST00000433453,Transcript,downstream_gene_variant,-,-,-,-,-,-,MODIFIER,1672,-1,-,DEFB122,HGNC,HGNC:18102,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,31419743,CCGTGGAAGATGTGCCTTGCTT,-,downstream_gene_variant
28076,chr20_31419743_CCGTGGAAGATGTGCCTTGCTT_-,chr20:31419743-31419764,-,ENSG00000204547,ENST00000569013,Transcript,downstream_gene_variant,-,-,-,-,-,-,MODIFIER,1766,-1,-,DEFB122,HGNC,HGNC:18102,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,31419743,CCGTGGAAGATGTGCCTTGCTT,-,downstream_gene_variant


In [24]:
# To transform input table to VEP format first read the mutations table including both clonal and subclonal
mut_df = pd.read_csv('/workspace/projects/sjd_melos/MAFs_tables/Lung_CCF_Purple_all_mutations.tsv.gz', sep="\t", header=0)
mut_df[mut_df['mut_type'] == 'INDEL'].head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NORMAL,TUMOR,t_AF,n_AF,t_DP,n_DP,t_alt_reads,n_alt_reads,t_ref_reads,n_ref_reads,VAF,mut_type,Caller_intersec,SAMPLE,mut,CN,CCF,clonality
13,chr1,4069973,.,T,TA,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=65,66|25,29;D...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:96,0:9.600e-03:96:44,0:40,0:94,0:50,46,0,0","0/1:35,54:0.616:89:14,19:14,19:33,53:15,20,25,29",0.616,0.0096,89,96,54,0,35,96,0.606742,INDEL,Mutect_Strelka,sarcoma_lung,chr1_4069973_T_TA,1.1331,0.99087,clonal
23,chr1,4608758,.,CA,C,600,PASS,MH=A;RC=TTCAAAAAAAAAAAAAAAAAAGA;RC_IDX=2;RC_LF...,GT:ABQ:AD:AF:DP:RABQ:RAD:RC_CNT:RC_IPC:RC_JIT:...,"./.:0:95,0:0:96:3126,0:117,0:0,0,0,0,0,95,96:0...","./.:31:29,46:0.597:77:1286,1343:47,46:35,8,1,0...",0.597,0.0,77,96,46,0,29,95,0.597,INDEL,Mutect_SAGE,sarcoma-lung,chr1_4608758_CA_C,1.1331,0.974961,clonal
27,chr1,4788541,.,T,TG,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=70,88|40,34;D...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:132,0:7.211e-03:132:54,0:68,0:132,0:61,71,0,0","0/1:26,74:0.732:100:10,31:9,35:25,70:9,17,40,34",0.732,0.007211,100,132,74,0,26,132,0.74,INDEL,Mutect_Strelka,sarcoma_lung,chr1_4788541_T_TG,1.1331,1.208494,clonal
83,chr1,20130828,.,CT,C,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=77,69|29,30;D...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:118,1:0.015:119:50,1:45,0:118,1:64,54,1,0","0/1:28,58:0.663:86:10,24:11,24:28,56:13,15,28,30",0.663,0.015,86,119,58,1,28,118,0.674419,INDEL,Mutect_Strelka,sarcoma_lung,chr1_20130828_CT_C,1.1897,1.139565,clonal
109,chr1,25654655,.,CTT,C,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=153,122|12,14...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:115,1:0.016:116:43,0:57,0:112,1:60,55,0,1","0/1:160,25:0.145:185:73,10:64,10:152,25:93,67,...",0.145,0.016,185,116,25,1,160,115,0.135135,INDEL,Mutect_Strelka,sarcoma_lung,chr1_25654655_CTT_C,2.7412,0.438,clonal


In [25]:
# Change format of these INDEL mutations
def change_format (row):

    ref = row['REF']
    alt = row['ALT']
    pos = row['POS']
    mut_type = row['mut_type']
    position = row['POS']

    if mut_type == 'INDEL':

        if len(ref) == 1: # this is an insertion
            ref = '-' # change for "-"
            alt = alt[1:] #change for same without first character
            pos = pos + 1 # add 1 to position
        else: # this is a deletion
            ref = ref[1:] # change for same without first character
            alt = '-' # change for "-"
            pos = pos + 1 # add 1 to position
        row['REF'] = ref
        row['ALT'] = alt
        row['POS'] = pos
    return row

vep_df = mut_df.progress_apply(lambda row: change_format (row),axis=1)

  0%|          | 0/22125 [00:00<?, ?it/s]

In [26]:
# Check the transformation of INDELs
vep_df[vep_df['mut_type'] == 'INDEL'].head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NORMAL,TUMOR,t_AF,n_AF,t_DP,n_DP,t_alt_reads,n_alt_reads,t_ref_reads,n_ref_reads,VAF,mut_type,Caller_intersec,SAMPLE,mut,CN,CCF,clonality
13,chr1,4069974,.,-,A,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=65,66|25,29;D...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:96,0:9.600e-03:96:44,0:40,0:94,0:50,46,0,0","0/1:35,54:0.616:89:14,19:14,19:33,53:15,20,25,29",0.616,0.0096,89,96,54,0,35,96,0.606742,INDEL,Mutect_Strelka,sarcoma_lung,chr1_4069973_T_TA,1.1331,0.99087,clonal
23,chr1,4608759,.,A,-,600,PASS,MH=A;RC=TTCAAAAAAAAAAAAAAAAAAGA;RC_IDX=2;RC_LF...,GT:ABQ:AD:AF:DP:RABQ:RAD:RC_CNT:RC_IPC:RC_JIT:...,"./.:0:95,0:0:96:3126,0:117,0:0,0,0,0,0,95,96:0...","./.:31:29,46:0.597:77:1286,1343:47,46:35,8,1,0...",0.597,0.0,77,96,46,0,29,95,0.597,INDEL,Mutect_SAGE,sarcoma-lung,chr1_4608758_CA_C,1.1331,0.974961,clonal
27,chr1,4788542,.,-,G,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=70,88|40,34;D...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:132,0:7.211e-03:132:54,0:68,0:132,0:61,71,0,0","0/1:26,74:0.732:100:10,31:9,35:25,70:9,17,40,34",0.732,0.007211,100,132,74,0,26,132,0.74,INDEL,Mutect_Strelka,sarcoma_lung,chr1_4788541_T_TG,1.1331,1.208494,clonal
83,chr1,20130829,.,T,-,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=77,69|29,30;D...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:118,1:0.015:119:50,1:45,0:118,1:64,54,1,0","0/1:28,58:0.663:86:10,24:11,24:28,56:13,15,28,30",0.663,0.015,86,119,58,1,28,118,0.674419,INDEL,Mutect_Strelka,sarcoma_lung,chr1_20130828_CT_C,1.1897,1.139565,clonal
109,chr1,25654656,.,TT,-,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=153,122|12,14...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:115,1:0.016:116:43,0:57,0:112,1:60,55,0,1","0/1:160,25:0.145:185:73,10:64,10:152,25:93,67,...",0.145,0.016,185,116,25,1,160,115,0.135135,INDEL,Mutect_Strelka,sarcoma_lung,chr1_25654655_CTT_C,2.7412,0.438,clonal


In [27]:
# Rename column CHROM
vep_df.rename(columns={'CHROM':'#CHROM'}, inplace=True)

# Check the change in column name has been applied
display(vep_df.columns)

Index(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
       'NORMAL', 'TUMOR', 't_AF', 'n_AF', 't_DP', 'n_DP', 't_alt_reads',
       'n_alt_reads', 't_ref_reads', 'n_ref_reads', 'VAF', 'mut_type',
       'Caller_intersec', 'SAMPLE', 'mut', 'CN', 'CCF', 'clonality'],
      dtype='object')

#### Then merge mutations dataframe with annot_df by CROM, POS, REF, ALT

In [28]:
# To merge all reference columns have to be the same type
print(vep_df['POS'].dtypes)
print(annot_df['POS'].dtypes)

int64
object


In [29]:
# Change column POS of annot_df to integer type to be able to merge (both columns in the dataframes have to be the same data type)
annot_df['POS']=annot_df['POS'].astype(int)

In [30]:
# Merge dataframes based on CHROM, POS, REF, ALT
merged_df = annot_df.merge(vep_df, on=['#CHROM','POS','REF','ALT'], how = 'left') # this retains all rows from the left DataFrame while matching rows from the right DataFrame
merged_df

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT,WORSE_conseq,ID,QUAL,FILTER,INFO,FORMAT,NORMAL,TUMOR,t_AF,n_AF,t_DP,n_DP,t_alt_reads,n_alt_reads,t_ref_reads,n_ref_reads,VAF,mut_type,Caller_intersec,SAMPLE,mut,CN,CCF,clonality
0,chr9_104825888_C_A,chr9:104825888,A,ENSG00000165029,ENST00000374736,Transcript,splice_acceptor_variant,-,-,-,-,-,-,HIGH,-,-1,-,ABCA1,HGNC,HGNC:29,YES,NM_005502.4,-,ENSP00000363868,-,-,-,-,-,-,-,-,-,-,-,-,-,chr9,104825888,C,A,splice_acceptor_variant,.,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=78,115|25,26;...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:121,0:8.178e-03:121:57,0:60,0:120,0:50,71,0,0","0/1:72,51:0.421:123:32,29:35,20:70,51:28,44,25,26",0.421,0.008178,123,121,51,0,72,121,0.414634,SNV,Mutect_Strelka,sarcoma_lung,chr9_104825888_C_A,1.5043,0.831051,clonal
1,chr15_22777766_G_T,chr15:22777766,T,ENSG00000274253,ENST00000619611,Transcript,"splice_acceptor_variant,non_coding_transcript_...",-,-,-,-,-,-,HIGH,-,1,-,-,-,-,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr15,22777766,G,T,splice_acceptor_variant,.,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=117,92|37,25;...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:116,0:8.627e-03:116:54,0:53,0:113,0:70,46,0,0","0/1:93,62:0.400:155:37,33:46,24:89,59:47,46,37,25",0.400,0.008627,155,116,62,0,93,116,0.400000,SNV,Mutect_Strelka,sarcoma_lung,chr15_22777766_G_T,1.9024,0.960960,clonal
2,chr3_45035523_G_A,chr3:45035523,A,ENSG00000163815,ENST00000296130,Transcript,splice_acceptor_variant,-,-,-,-,-,rs1697627692,HIGH,-,1,-,CLEC3B,HGNC,HGNC:11891,YES,NM_003278.3,-,ENSP00000296130,-,-,-,-,-,-,-,-,-,-,rs1697627692,6.57013e-06,-,chr3,45035523,G,A,splice_acceptor_variant,.,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=138,126|35,44...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:156,0:6.507e-03:156:66,0:82,0:151,0:78,78,0,0","0/1:108,79:0.416:187:49,39:55,33:106,76:60,48,...",0.416,0.006507,187,156,79,0,108,156,0.422460,SNV,Mutect_Strelka,sarcoma_lung,chr3_45035523_G_A,1.9820,1.048545,clonal
3,chr15_88797906_T_A,chr15:88797906,A,ENSG00000259676,ENST00000561358,Transcript,"splice_acceptor_variant,non_coding_transcript_...",-,-,-,-,-,-,HIGH,-,-1,-,-,-,-,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr15,88797906,T,A,splice_acceptor_variant,.,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=88,98|20,27;D...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:98,0:9.714e-03:98:54,0:35,0:97,0:53,45,0,0","0/1:88,47:0.351:135:30,22:47,20:84,45:35,53,20,27",0.351,0.009714,135,98,47,0,88,98,0.348148,SNV,Mutect_Strelka,sarcoma_lung,chr15_88797906_T_A,1.9233,0.843667,clonal
4,chr3_45035523_G_A,chr3:45035523,A,ENSG00000075914,ENST00000481405,Transcript,"splice_acceptor_variant,non_coding_transcript_...",-,-,-,-,-,rs1697627692,HIGH,-,1,-,EXOSC7,HGNC,HGNC:28112,-,-,-,-,-,-,-,-,-,-,-,-,-,-,rs1697627692,6.57013e-06,-,chr3,45035523,G,A,splice_acceptor_variant,.,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=138,126|35,44...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:156,0:6.507e-03:156:66,0:82,0:151,0:78,78,0,0","0/1:108,79:0.416:187:49,39:55,33:106,76:60,48,...",0.416,0.006507,187,156,79,0,108,156,0.422460,SNV,Mutect_Strelka,sarcoma_lung,chr3_45035523_G_A,1.9820,1.048545,clonal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25333,chr14_83499105_G_T,chr14:83499105,T,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr14,83499105,G,T,intergenic_variant,.,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=92,129|29,41;...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:124,1:8.181e-03:125:60,0:56,0:124,1:52,72,0,1","0/1:97,69:0.419:166:39,34:46,29:88,63:40,57,29,40",0.419,0.008181,166,125,69,1,97,124,0.415663,SNV,Mutect_Strelka,sarcoma_lung,chr14_83499105_G_T,2.1155,1.087166,clonal
25334,chr14_83475799_T_A,chr14:83475799,A,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr14,83475799,T,A,intergenic_variant,.,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=104,87|47,28;...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:113,0:8.636e-03:113:49,0:61,0:111,0:60,53,0,0","0/1:78,75:0.479:153:40,32:35,37:77,71:44,34,47,28",0.479,0.008636,153,113,75,0,78,113,0.490196,SNV,Mutect_Strelka,sarcoma_lung,chr14_83475799_T_A,2.1155,1.282108,clonal
25335,chr14_83351411_C_A,chr14:83351411,A,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr14,83351411,C,A,intergenic_variant,.,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=114,115|41,25...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:117,0:8.357e-03:117:48,0:67,0:117,0:61,56,0,0","0/1:112,66:0.389:178:46,28:50,32:99,63:53,59,4...",0.389,0.008357,178,117,66,0,112,117,0.370787,SNV,Mutect_Strelka,sarcoma_lung,chr14_83351411_C_A,2.1155,0.969792,clonal
25336,chr14_84329401_G_T,chr14:84329401,T,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr14,84329401,G,T,intergenic_variant,.,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=108,106|34,35...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:119,0:8.392e-03:119:60,0:55,0:116,0:63,56,0,0","0/1:95,69:0.425:164:44,28:44,37:90,66:45,50,34,35",0.425,0.008392,164,119,69,0,95,119,0.420732,SNV,Mutect_Strelka,sarcoma_lung,chr14_84329401_G_T,2.1155,1.100424,clonal


In [31]:
# Note that when grouped by variant coordinate and type of change variants are unique but some contain more than one transcript. 
# This is due to intergenic variants that VEP annotates them in the surrounding transcripts, so they are annotated more than one time.
merged_df.groupby(['#CHROM', 'POS','REF','ALT'], as_index=False).count().sort_values('Location',ascending=False)

Unnamed: 0,#CHROM,POS,REF,ALT,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,WORSE_conseq,ID,QUAL,FILTER,INFO,FORMAT,NORMAL,TUMOR,t_AF,n_AF,t_DP,n_DP,t_alt_reads,n_alt_reads,t_ref_reads,n_ref_reads,VAF,mut_type,Caller_intersec,SAMPLE,mut,CN,CCF,clonality
16289,chr5,141463684,G,T,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
399,chr1,63550594,G,A,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
6172,chr14,101048065,G,T,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
16880,chr6,26201964,C,A,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
8758,chr19,53701878,T,C,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7961,chr18,33556743,A,C,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
7960,chr18,33525479,C,A,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
7959,chr18,33374400,T,C,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
7958,chr18,33301688,C,A,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [32]:
# Export tsv table
# merged_df.to_csv('/workspace/projects/sjd_melos/MAFs_tables/Lung_common_mutations_callers_VEP_annotated.tsv.gz', sep ='\t', index=None, compression = "gzip")