## Annotated Variant Filtering: Sarcoma-primary sample

In [1]:
# Needed basic packages
import pandas as pd
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()
from tqdm.notebook import tqdm
tqdm.pandas()

### 1) Read VEP output files

In [2]:
import os, sys

# Paths from VEP output directories
path = "/workspace/projects/sjd_melos/vep/vep_output_files/sarcoma/"

# These are the files of the variable path
files = os.listdir(path)
files

['chr3.tsv.gz',
 'chrX.tsv.gz',
 'chr8.tsv.gz',
 'chr9.tsv.gz',
 'chr12.tsv.gz',
 'chr11.tsv.gz',
 'chr5.tsv.gz',
 'chr4.tsv.gz',
 'chr7.tsv.gz',
 'chr6.tsv.gz',
 'chr13.tsv.gz',
 'chr14.tsv.gz',
 'chr15.tsv.gz',
 'chr16.tsv.gz',
 'chr17.tsv.gz',
 'chr18.tsv.gz',
 'chr19.tsv.gz',
 'chr1.tsv.gz',
 'chr21.tsv.gz',
 'chr22.tsv.gz',
 'chr2.tsv.gz',
 'chrY.tsv.gz',
 'chr10.tsv.gz',
 'chr20.tsv.gz']

In [3]:
# Perform a chromosome list
chroms = []
for n in range(1,23):
    chroms.append('chr'+str(n))
chroms.append('chrX')    
chroms.append('chrY')  
chroms

['chr1',
 'chr2',
 'chr3',
 'chr4',
 'chr5',
 'chr6',
 'chr7',
 'chr8',
 'chr9',
 'chr10',
 'chr11',
 'chr12',
 'chr13',
 'chr14',
 'chr15',
 'chr16',
 'chr17',
 'chr18',
 'chr19',
 'chr20',
 'chr21',
 'chr22',
 'chrX',
 'chrY']

In [4]:
# Define function that reads files based on chrom list and returns a dataframe
vep_out_df = pd.DataFrame()
for c in chroms: # use the chrom list to read each file chromosome by chromosome
    columns = ['Uploaded_variation', 'Location', 'Allele', 'Gene',	'Feature', 'Feature_type', 'Consequence', 'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids','Codons','Existing_variation','IMPACT','DISTANCE','STRAND','FLAGS','SYMBOL','SYMBOL_SOURCE','HGNC_ID','CANONICAL','MANE_SELECT','MANE_PLUS_CLINICAL','ENSP',	'SOURCE', 'AF',	'AFR_AF', 'AMR_AF','EAS_AF', 'EUR_AF','SAS_AF','CLIN_SIG','SOMATIC','PHENO','gnomADg', 'gnomADg_AF', 'gnomADg_NFE']
    chr_df = pd.read_csv(path+c+'.tsv.gz', sep="\t", names = columns, comment='#', header=None)
    vep_out_df = pd.concat([vep_out_df,chr_df],ignore_index=True)
vep_out_df

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,...,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE
0,chr1_104188_C/A,chr1:104188,A,ENSG00000238009,ENST00000466430,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,...,-,-,-,-,-,-,-,-,-,-
1,chr1_104188_C/A,chr1:104188,A,ENSG00000238009,ENST00000477740,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,...,-,-,-,-,-,-,-,-,-,-
2,chr1_843320_G/A,chr1:843320,A,ENSG00000228794,ENST00000415295,Transcript,downstream_gene_variant,-,-,-,...,-,-,-,-,-,-,-,-,-,-
3,chr1_843320_G/A,chr1:843320,A,ENSG00000228794,ENST00000416570,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,...,-,-,-,-,-,-,-,-,-,-
4,chr1_843320_G/A,chr1:843320,A,ENSG00000228794,ENST00000441765,Transcript,downstream_gene_variant,-,-,-,...,-,-,-,-,-,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92883,chrY_26654389_A/G,chrY:26654389,G,-,-,-,intergenic_variant,-,-,-,...,-,-,-,-,-,-,-,-,-,-
92884,chrY_56823266_G/A,chrY:56823266,A,-,-,-,intergenic_variant,-,-,-,...,-,-,-,-,-,-,-,-,-,-
92885,chrY_56842066_CT/AC,chrY:56842066-56842067,AC,-,-,-,intergenic_variant,-,-,-,...,-,-,-,-,-,-,-,-,-,-
92886,chrY_56842087_T/C,chrY:56842087,C,-,-,-,intergenic_variant,-,-,-,...,-,-,-,-,-,-,-,-,-,-


In [5]:
# Reformat Uploaded variation column so all separators are "_" to split this table to CHROM POS REF ALT
vep_out_df['Uploaded_variation'] = vep_out_df['Uploaded_variation'].str.replace('/','_')
vep_out_df = pd.concat([vep_out_df, vep_out_df['Uploaded_variation'].str.split('_', expand=True)], axis=1)
vep_out_df = vep_out_df.rename(columns={0:'#CHROM', 1: 'POS', 2:'REF', 3: 'ALT'})
vep_out_df

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,...,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT
0,chr1_104188_C_A,chr1:104188,A,ENSG00000238009,ENST00000466430,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,...,-,-,-,-,-,-,chr1,104188,C,A
1,chr1_104188_C_A,chr1:104188,A,ENSG00000238009,ENST00000477740,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,...,-,-,-,-,-,-,chr1,104188,C,A
2,chr1_843320_G_A,chr1:843320,A,ENSG00000228794,ENST00000415295,Transcript,downstream_gene_variant,-,-,-,...,-,-,-,-,-,-,chr1,843320,G,A
3,chr1_843320_G_A,chr1:843320,A,ENSG00000228794,ENST00000416570,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,...,-,-,-,-,-,-,chr1,843320,G,A
4,chr1_843320_G_A,chr1:843320,A,ENSG00000228794,ENST00000441765,Transcript,downstream_gene_variant,-,-,-,...,-,-,-,-,-,-,chr1,843320,G,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92883,chrY_26654389_A_G,chrY:26654389,G,-,-,-,intergenic_variant,-,-,-,...,-,-,-,-,-,-,chrY,26654389,A,G
92884,chrY_56823266_G_A,chrY:56823266,A,-,-,-,intergenic_variant,-,-,-,...,-,-,-,-,-,-,chrY,56823266,G,A
92885,chrY_56842066_CT_AC,chrY:56842066-56842067,AC,-,-,-,intergenic_variant,-,-,-,...,-,-,-,-,-,-,chrY,56842066,CT,AC
92886,chrY_56842087_T_C,chrY:56842087,C,-,-,-,intergenic_variant,-,-,-,...,-,-,-,-,-,-,chrY,56842087,T,C


In [6]:
# This is the consequences list Ensembl variations calculated variant consequences (https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html) arranged from the most severe to the least severe effect
CONSEQUENCES_LIST = [
    'transcript_ablation',
    'splice_acceptor_variant',
    'splice_donor_variant',
    'stop_gained',
    'frameshift_variant',
    'stop_lost',
    'start_lost',
    'transcript_amplification',
    'inframe_insertion',
    'inframe_deletion',
    'missense_variant',
    'protein_altering_variant',
    'splice_region_variant',
    'splice_donor_5th_base_variant',
    'splice_donor_region_variant',
    'splice_polypyrimidine_tract_variant',
    'incomplete_terminal_codon_variant',
    'start_retained_variant',
    'stop_retained_variant',
    'synonymous_variant',
    'coding_sequence_variant',
    'mature_miRNA_variant',
    '5_prime_UTR_variant',
    '3_prime_UTR_variant',
    'non_coding_transcript_exon_variant',
    'intron_variant',
    'NMD_transcript_variant',
    'non_coding_transcript_variant',
    'upstream_gene_variant',
    'downstream_gene_variant',
    'TFBS_ablation',
    'TFBS_amplification',
    'TF_binding_site_variant',
    'regulatory_region_ablation',
    'regulatory_region_amplification',
    'feature_elongation',
    'regulatory_region_variant',
    'feature_truncation',
    'intergenic_variant'
]

### 1/ Transcript selection based on MANE-Select information

In [7]:
# Create a final list of variants with the worse consequence whith unique transcripts per variant by this criteria:
# 1/ choose transcript based on MANE transcript
# 2/ when no MANE transcript choose transcript for the CANONICAL transcript
# 3/ when no MANE and not CANONICAL choose transcript based on the transcript containing the first ocurring worse consequence based on CONSEQUENCES_LIST

In [8]:
display(vep_out_df.columns)

Index(['Uploaded_variation', 'Location', 'Allele', 'Gene', 'Feature',
       'Feature_type', 'Consequence', 'cDNA_position', 'CDS_position',
       'Protein_position', 'Amino_acids', 'Codons', 'Existing_variation',
       'IMPACT', 'DISTANCE', 'STRAND', 'FLAGS', 'SYMBOL', 'SYMBOL_SOURCE',
       'HGNC_ID', 'CANONICAL', 'MANE_SELECT', 'MANE_PLUS_CLINICAL', 'ENSP',
       'SOURCE', 'AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF',
       'CLIN_SIG', 'SOMATIC', 'PHENO', 'gnomADg', 'gnomADg_AF', 'gnomADg_NFE',
       '#CHROM', 'POS', 'REF', 'ALT'],
      dtype='object')

In [9]:
# Check in a new dataframe the number of transcripts per position and sort it by its location  in genome
grp_df = vep_out_df.groupby(['Uploaded_variation','Gene'], as_index=False).count().sort_values('Location',ascending=False)
pd.set_option('display.max_columns', None) # Show information of all columns in pandas
grp_df

Unnamed: 0,Uploaded_variation,Gene,Location,Allele,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT
12286,chr21_16491222_G_-,ENSG00000215386,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203,203
18812,chr4_86347899_T_A,ENSG00000109339,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164,164
18810,chr4_86220427_C_T,ENSG00000109339,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162
12285,chr21_16320705_G_C,ENSG00000215386,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148,148
12282,chr21_16281875_A_T,ENSG00000215386,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145,145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11717,chr20_2062216_C_A,-,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
11712,chr20_20377902_C_A,-,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
11711,chr20_20214891_G_A,ENSG00000225417,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
11702,chr20_19728602_C_A,-,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [10]:
# Case-examples of different transcripts associated to variants to consider in the code:
display(vep_out_df[vep_out_df['Uploaded_variation']=='chr1_240837724_G_T']) # this variant has a MANE_Select transcript that distinguishes it from the other transcripts
display(vep_out_df[vep_out_df['Uploaded_variation']=='chr1_221507472_A_T']) # only Gene information no MANE in any transcript, but CANONICAL info distinguish one of all
display(vep_out_df[vep_out_df['Uploaded_variation']== 'chr1_240837724_G_T']) # no MANE and no Gene for intergenic variants, here just one transcript no need to distinguis from others

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT
7038,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000348120,Transcript,intron_variant,-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,-,-,-,ENSP00000341242,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T
7039,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000366563,Transcript,intron_variant,-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,-,-,-,ENSP00000355521,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T
7040,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000366564,Transcript,intron_variant,-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,-,-,-,ENSP00000355522,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T
7041,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000366565,Transcript,intron_variant,-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,-,-,-,ENSP00000355523,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T
7042,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000440928,Transcript,intron_variant,-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,YES,NM_001364886.1,-,ENSP00000404399,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T
7043,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000685354,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T
7044,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000685936,Transcript,intron_variant,-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,-,-,-,ENSP00000510372,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T
7045,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000686049,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T
7046,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000686241,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T
7047,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000686277,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T


Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT
6461,chr1_221507472_A_T,chr1:221507472,T,ENSG00000232436,ENST00000440104,Transcript,upstream_gene_variant,-,-,-,-,-,-,MODIFIER,1087,1,-,-,-,-,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,221507472,A,T


Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT
7038,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000348120,Transcript,intron_variant,-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,-,-,-,ENSP00000341242,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T
7039,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000366563,Transcript,intron_variant,-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,-,-,-,ENSP00000355521,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T
7040,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000366564,Transcript,intron_variant,-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,-,-,-,ENSP00000355522,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T
7041,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000366565,Transcript,intron_variant,-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,-,-,-,ENSP00000355523,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T
7042,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000440928,Transcript,intron_variant,-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,YES,NM_001364886.1,-,ENSP00000404399,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T
7043,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000685354,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T
7044,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000685936,Transcript,intron_variant,-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,-,-,-,ENSP00000510372,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T
7045,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000686049,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T
7046,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000686241,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T
7047,chr1_240837724_G_T,chr1:240837724,T,ENSG00000182901,ENST00000686277,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,rs1227976331,MODIFIER,-,-1,-,RGS7,HGNC,HGNC:10003,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr1,240837724,G,T


In [11]:
# Make a list from these two columns that results in unique values
groups_list = grp_df[['Uploaded_variation','Gene']].values.tolist()
groups_list

[['chr21_16491222_G_-', 'ENSG00000215386'],
 ['chr4_86347899_T_A', 'ENSG00000109339'],
 ['chr4_86220427_C_T', 'ENSG00000109339'],
 ['chr21_16320705_G_C', 'ENSG00000215386'],
 ['chr21_16281875_A_T', 'ENSG00000215386'],
 ['chr21_16283348_-_ACTC', 'ENSG00000215386'],
 ['chr21_16518290_G_C', 'ENSG00000215386'],
 ['chr21_16525153_G_T', 'ENSG00000215386'],
 ['chr21_16496154_A_G', 'ENSG00000215386'],
 ['chr21_16251706_A_T', 'ENSG00000215386'],
 ['chr7_100327826_C_-', 'ENSG00000291178'],
 ['chr8_129528144_T_G', 'ENSG00000229140'],
 ['chr8_129423485_G_A', 'ENSG00000229140'],
 ['chr8_129360150_G_A', 'ENSG00000229140'],
 ['chr20_38429340_C_A', 'ENSG00000196756'],
 ['chr3_181970754_G_T', 'ENSG00000242512'],
 ['chr3_181974732_G_A', 'ENSG00000242512'],
 ['chr21_16183377_G_T', 'ENSG00000215386'],
 ['chr4_153270462_G_T', 'ENSG00000109654'],
 ['chr6_85570927_C_T', 'ENSG00000135317'],
 ['chr6_85563285_C_G', 'ENSG00000135317'],
 ['chr6_85561340_C_A', 'ENSG00000135317'],
 ['chr3_129490110_G_A', 'ENSG00000

In [12]:
# CODE TO FILTER TRANSCRIPTS OF THE SAME VARIANT:
final_df = pd.DataFrame()
for g in tqdm(groups_list):
    mut = g[0] # first element of list
    gene = g[1] # second element of list
    transcripts_df = vep_out_df[(vep_out_df['Uploaded_variation'] == mut) & (vep_out_df['Gene'] == gene)] # for each variant in dataframe that is equal to the variant in the list (unique value)
    if len(transcripts_df) > 1: # this selects those variants that have more than 1 transcript
        mane_transcripts_df = transcripts_df[transcripts_df['MANE_SELECT'] != '-']
        if len(mane_transcripts_df) == 1:
            df2 = mane_transcripts_df # select mane trancript
        elif len(mane_transcripts_df) == 0:
            can_df = transcripts_df[transcripts_df['CANONICAL'] == 'YES']
            if len(can_df) == 1:
                df2 = can_df # select canonical transcript
            elif len(can_df) == 0:
                df2 = transcripts_df # take all trancripts when no information of MANE or CANONICAL
            else:
                print('Another option?') # show this message and display it to show that something is missing in the code
                display(can_df)
        else:
            print('Another option?') # show this message and display it to show that something is missing in the code
            display(transcripts_df)
    elif len(transcripts_df) == 1: # this means there is only 1 transcript for this variant so no need to filter
        df2 = transcripts_df 
    else:
        print('Another option?')

    final_df = pd.concat([final_df, df2], ignore_index=True)
final_df

  0%|          | 0/27874 [00:00<?, ?it/s]

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT
0,chr21_16491222_G_-,chr21:16491222,-,ENSG00000215386,ENST00000653129,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,-,MODIFIER,-,1,-,MIR99AHG,HGNC,HGNC:1274,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr21,16491222,G,-
1,chr4_86347899_T_A,chr4:86347899,A,ENSG00000109339,ENST00000641462,Transcript,intron_variant,-,-,-,-,-,-,MODIFIER,-,-1,-,MAPK10,HGNC,HGNC:6872,YES,NM_138982.4,-,ENSP00000493435,-,-,-,-,-,-,-,-,-,-,-,-,-,chr4,86347899,T,A
2,chr4_86220427_C_T,chr4:86220427,T,ENSG00000109339,ENST00000641462,Transcript,intron_variant,-,-,-,-,-,-,MODIFIER,-,-1,-,MAPK10,HGNC,HGNC:6872,YES,NM_138982.4,-,ENSP00000493435,-,-,-,-,-,-,-,-,-,-,-,-,-,chr4,86220427,C,T
3,chr21_16320705_G_C,chr21:16320705,C,ENSG00000215386,ENST00000653129,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,-,MODIFIER,-,1,-,MIR99AHG,HGNC,HGNC:1274,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr21,16320705,G,C
4,chr21_16281875_A_T,chr21:16281875,T,ENSG00000215386,ENST00000653129,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,-,MODIFIER,-,1,-,MIR99AHG,HGNC,HGNC:1274,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr21,16281875,A,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31030,chr20_2062216_C_A,chr20:2062216,A,-,-,-,intergenic_variant,-,-,-,-,-,rs1440775833,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,rs1440775833,1.97148e-05,-,chr20,2062216,C,A
31031,chr20_20377902_C_A,chr20:20377902,A,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,20377902,C,A
31032,chr20_20214891_G_A,chr20:20214891,A,ENSG00000225417,ENST00000443692,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,rs6081923,MODIFIER,-,-1,-,-,-,-,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr20,20214891,G,A
31033,chr20_19728602_C_A,chr20:19728602,A,-,-,-,intergenic_variant,-,-,-,-,-,rs1568713602,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,rs1568713602,6.57903e-06,-,chr20,19728602,C,A


### 2/ Add the WORSE consequence for each variant based on CONSEQUENCE_LIST order

In [13]:
# Consequences list was defined previously
CONSEQUENCES_LIST

['transcript_ablation',
 'splice_acceptor_variant',
 'splice_donor_variant',
 'stop_gained',
 'frameshift_variant',
 'stop_lost',
 'start_lost',
 'transcript_amplification',
 'inframe_insertion',
 'inframe_deletion',
 'missense_variant',
 'protein_altering_variant',
 'splice_region_variant',
 'splice_donor_5th_base_variant',
 'splice_donor_region_variant',
 'splice_polypyrimidine_tract_variant',
 'incomplete_terminal_codon_variant',
 'start_retained_variant',
 'stop_retained_variant',
 'synonymous_variant',
 'coding_sequence_variant',
 'mature_miRNA_variant',
 '5_prime_UTR_variant',
 '3_prime_UTR_variant',
 'non_coding_transcript_exon_variant',
 'intron_variant',
 'NMD_transcript_variant',
 'non_coding_transcript_variant',
 'upstream_gene_variant',
 'downstream_gene_variant',
 'TFBS_ablation',
 'TFBS_amplification',
 'TF_binding_site_variant',
 'regulatory_region_ablation',
 'regulatory_region_amplification',
 'feature_elongation',
 'regulatory_region_variant',
 'feature_truncation',
 

In [14]:
def consequence_worse(row):
    conseq = row['Consequence'].split(',')  # split comma separated values of Consequence column
    print(len(conseq))
    num_elements = len(conseq)
    if num_elements == 1: # when there is only one Consequence for the variant
        return conseq[0] # return first value (unique one in this case) to avoid returning a list
    elif num_elements > 1: # when there is more than one Consequence for the variant
        for element in CONSEQUENCES_LIST: # for each element of the ordered list of consequences
            for cons in conseq: # for each element of the column Consequences
                if cons == element: # if the consequence is equal to the element
                    return element
                    break
                else:
                    print('This conseq type is not equal to this element in BBGWiki list')
                    print(cons, element)
    else:
        print('Different number of consequence types in data (different than 1 or more))')
        print(num_elements)

In [15]:
# Now apply it to all sample data
final_df['WORSE_conseq'] = final_df.progress_apply(consequence_worse, axis=1)

  0%|          | 0/31035 [00:00<?, ?it/s]

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub data rate exceeded.
The Jupyter

In [16]:
final_df[['Consequence', 'WORSE_conseq']] # see information has been added into new column according to CONSEQUENCES_LIST

Unnamed: 0,Consequence,WORSE_conseq
0,"intron_variant,non_coding_transcript_variant",intron_variant
1,intron_variant,intron_variant
2,intron_variant,intron_variant
3,"intron_variant,non_coding_transcript_variant",intron_variant
4,"intron_variant,non_coding_transcript_variant",intron_variant
...,...,...
31030,intergenic_variant,intergenic_variant
31031,intergenic_variant,intergenic_variant
31032,"intron_variant,non_coding_transcript_variant",intron_variant
31033,intergenic_variant,intergenic_variant


### 3/ Filter multiple transcripts by WORSE_conseq

In [17]:
# See some variants are repeated 
final_df.groupby(['Uploaded_variation', 'Gene'], as_index=False).count().sort_values('Location',ascending=False)

Unnamed: 0,Uploaded_variation,Gene,Location,Allele,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT,WORSE_conseq
5677,chr15_25105866_G_T,ENSG00000224078,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78
24240,chr8_127203884_G_T,ENSG00000254166,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65,65
18813,chr4_86415802_T_A,ENSG00000109339,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60
16769,chr4_112965986_G_T,ENSG00000145362,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57,57
16766,chr4_112835968_G_A,ENSG00000145362,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9451,chr1_105203454_T_A,-,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
9450,chr1_105123996_A_T,-,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
9449,chr1_105109519_C_A,-,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
9448,chr1_105068562_A_G,-,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [18]:
# Example of variant with different transcripts because no CANONICAL and no MANE transcript to filter 
final_df[(final_df['Uploaded_variation'] == 'chr15_25105866_G_T') & (final_df['Gene'] == 'ENSG00000224078')]

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT,WORSE_conseq
30,chr15_25105866_G_T,chr15:25105866,T,ENSG00000224078,ENST00000546682,Transcript,non_coding_transcript_exon_variant,3483,-,-,-,-,-,MODIFIER,-,1,-,SNHG14,HGNC,HGNC:37462,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr15,25105866,G,T,non_coding_transcript_exon_variant
31,chr15_25105866_G_T,chr15:25105866,T,ENSG00000224078,ENST00000549804,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,-,MODIFIER,-,1,-,SNHG14,HGNC,HGNC:37462,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr15,25105866,G,T,intron_variant
32,chr15_25105866_G_T,chr15:25105866,T,ENSG00000224078,ENST00000553108,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,-,MODIFIER,-,1,-,SNHG14,HGNC,HGNC:37462,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr15,25105866,G,T,intron_variant
33,chr15_25105866_G_T,chr15:25105866,T,ENSG00000224078,ENST00000626200,Transcript,upstream_gene_variant,-,-,-,-,-,-,MODIFIER,3816,1,-,SNHG14,HGNC,HGNC:37462,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr15,25105866,G,T,upstream_gene_variant
34,chr15_25105866_G_T,chr15:25105866,T,ENSG00000224078,ENST00000640631,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,-,MODIFIER,-,1,-,SNHG14,HGNC,HGNC:37462,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr15,25105866,G,T,intron_variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,chr15_25105866_G_T,chr15:25105866,T,ENSG00000224078,ENST00000670697,Transcript,upstream_gene_variant,-,-,-,-,-,-,MODIFIER,425,1,-,SNHG14,HGNC,HGNC:37462,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr15,25105866,G,T,upstream_gene_variant
104,chr15_25105866_G_T,chr15:25105866,T,ENSG00000224078,ENST00000671121,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,-,MODIFIER,-,1,-,SNHG14,HGNC,HGNC:37462,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr15,25105866,G,T,intron_variant
105,chr15_25105866_G_T,chr15:25105866,T,ENSG00000224078,ENST00000671136,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,-,MODIFIER,-,1,-,SNHG14,HGNC,HGNC:37462,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr15,25105866,G,T,intron_variant
106,chr15_25105866_G_T,chr15:25105866,T,ENSG00000224078,ENST00000671374,Transcript,"intron_variant,non_coding_transcript_variant",-,-,-,-,-,-,MODIFIER,-,1,-,SNHG14,HGNC,HGNC:37462,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr15,25105866,G,T,intron_variant


In [19]:
# Apply a function that takes the transcript with WORSE_conseq type (first one appearing) when no CANONICAL and no MANE transcript to filter

In [20]:
# Filter duplicated rows based on columns Uploaded variation and Gene based on order in consequence list of column WORSE_conseq
annot_df = final_df.sort_values(by='WORSE_conseq', key=lambda x: x.map(lambda y: CONSEQUENCES_LIST.index(y))).drop_duplicates(subset=['Uploaded_variation', 'Gene'])
annot_df

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT,WORSE_conseq
28321,chr3_45035523_G_A,chr3:45035523,A,ENSG00000075914,ENST00000481405,Transcript,"splice_acceptor_variant,non_coding_transcript_...",-,-,-,-,-,rs1697627692,HIGH,-,1,-,EXOSC7,HGNC,HGNC:28112,-,-,-,-,-,-,-,-,-,-,-,-,-,-,rs1697627692,6.57013e-06,-,chr3,45035523,G,A,splice_acceptor_variant
10728,chr9_104825888_C_A,chr9:104825888,A,ENSG00000165029,ENST00000374736,Transcript,splice_acceptor_variant,-,-,-,-,-,-,HIGH,-,-1,-,ABCA1,HGNC,HGNC:29,YES,NM_005502.4,-,ENSP00000363868,-,-,-,-,-,-,-,-,-,-,-,-,-,chr9,104825888,C,A,splice_acceptor_variant
8386,chr3_130855879_A_T,chr3:130855879,T,ENSG00000017260,ENST00000509150,Transcript,"splice_acceptor_variant,non_coding_transcript_...",-,-,-,-,-,-,HIGH,-,1,-,ATP2C1,HGNC,HGNC:13211,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr3,130855879,A,T,splice_acceptor_variant
21112,chr15_88797906_T_A,chr15:88797906,A,ENSG00000259676,ENST00000561358,Transcript,"splice_acceptor_variant,non_coding_transcript_...",-,-,-,-,-,-,HIGH,-,-1,-,-,-,-,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr15,88797906,T,A,splice_acceptor_variant
9455,chr3_45035523_G_A,chr3:45035523,A,ENSG00000163815,ENST00000296130,Transcript,splice_acceptor_variant,-,-,-,-,-,rs1697627692,HIGH,-,1,-,CLEC3B,HGNC,HGNC:11891,YES,NM_003278.3,-,ENSP00000296130,-,-,-,-,-,-,-,-,-,-,rs1697627692,6.57013e-06,-,chr3,45035523,G,A,splice_acceptor_variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20720,chr14_95609258_T_G,chr14:95609258,G,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr14,95609258,T,G,intergenic_variant
20722,chr14_94880476_T_G,chr14:94880476,G,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr14,94880476,T,G,intergenic_variant
20723,chr14_94826038_G_T,chr14:94826038,T,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr14,94826038,G,T,intergenic_variant
20705,chr14_88119210_A_T,chr14:88119210,T,-,-,-,intergenic_variant,-,-,-,-,-,rs1193531685,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,rs1193531685,6.57825e-06,-,chr14,88119210,A,T,intergenic_variant


In [21]:
# Check if there are duplicated values based on variants and genes
annot_df.groupby(['Uploaded_variation', 'Gene'], as_index=False).count().sort_values('Location',ascending=False)

Unnamed: 0,Uploaded_variation,Gene,Location,Allele,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT,WORSE_conseq
0,chr10_100037315_T_A,ENSG00000120054,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
18578,chr4_68722193_A_T,-,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
18589,chr4_69183698_A_G,ENSG00000250696,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
18588,chr4_69183698_A_G,ENSG00000196472,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
18587,chr4_6918220_G_T,ENSG00000132405,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9288,chr19_56540280_A_T,ENSG00000267421,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
9287,chr19_56540280_A_T,ENSG00000267224,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
9286,chr19_56540280_A_T,ENSG00000196867,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
9285,chr19_56128613_T_A,ENSG00000279959,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [22]:
# Check previous example of variant with different transcripts selected because no CANONICAL and no MANE transcript information available to filter data 
annot_df[(annot_df['Uploaded_variation'] == 'chr15_25105866_G_T') & (final_df['Gene'] == 'ENSG00000224078')]

  annot_df[(annot_df['Uploaded_variation'] == 'chr15_25105866_G_T') & (final_df['Gene'] == 'ENSG00000224078')]


Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT,WORSE_conseq
30,chr15_25105866_G_T,chr15:25105866,T,ENSG00000224078,ENST00000546682,Transcript,non_coding_transcript_exon_variant,3483,-,-,-,-,-,MODIFIER,-,1,-,SNHG14,HGNC,HGNC:37462,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr15,25105866,G,T,non_coding_transcript_exon_variant


### 4/ Merge this table with the mutations dataframe (VEP input)

#### First transform mutations dataframe to VEP format

In [23]:
# Check that VEP output table is in VEP format
display(annot_df[annot_df['REF'].str.len() > 1])

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT,WORSE_conseq
2583,chr11_113204426_GACGCCGGCATTTACAAGTGTGTGGTTACAG_-,chr11:113204426-113204456,-,ENSG00000149294,ENST00000316851,Transcript,frameshift_variant,461-491,268-298,90-100,DAGIYKCVVTG/X,GACGCCGGCATTTACAAGTGTGTGGTTACAGgc/gc,-,HIGH,-,1,-,NCAM1,HGNC,HGNC:7656,YES,NM_181351.5,-,ENSP00000318472,-,-,-,-,-,-,-,-,-,-,-,-,-,chr11,113204426,GACGCCGGCATTTACAAGTGTGTGGTTACAG,-,frameshift_variant
25255,chr12_116428308_CA_-,chr12:116428308-116428309,-,ENSG00000264037,ENST00000582069,Transcript,mature_miRNA_variant,10-11,-,-,-,-,rs202127912,MODIFIER,-,-1,-,MIR4472-2,HGNC,HGNC:41752,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,rs202127912,0.0267378,-,chr12,116428308,CA,-,mature_miRNA_variant
7705,chr19_8322485_AATCCTGGCCGTGCGG_-,chr19:8322485-8322500,-,ENSG00000233927,ENST00000600659,Transcript,3_prime_UTR_variant,475-490,-,-,-,-,-,MODIFIER,-,1,-,RPS28,HGNC,HGNC:10418,YES,NM_001031.5,-,ENSP00000472469,-,-,-,-,-,-,-,-,-,-,-,-,-,chr19,8322485,AATCCTGGCCGTGCGG,-,3_prime_UTR_variant
12829,chr21_37525627_ATTCATTG_-,chr21:37525627-37525634,-,ENSG00000157540,ENST00000647188,Transcript,3_prime_UTR_variant,16293-16300,-,-,-,-,-,MODIFIER,-,1,-,DYRK1A,HGNC,HGNC:3091,YES,NM_001347721.2,-,ENSP00000494572,-,-,-,-,-,-,-,-,-,-,-,-,-,chr21,37525627,ATTCATTG,-,3_prime_UTR_variant
4442,chr7_102447660_GC_TT,chr7:102447660-102447661,TT,ENSG00000160991,ENST00000495936,Transcript,3_prime_UTR_variant,1582-1583,-,-,-,-,-,MODIFIER,-,1,-,ORAI2,HGNC,HGNC:21667,YES,NM_001126340.3,-,ENSP00000420178,-,-,-,-,-,-,-,-,-,-,-,-,-,chr7,102447660,GC,TT,3_prime_UTR_variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21713,chr14_47877652_AC_GT,chr14:47877652-47877653,GT,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr14,47877652,AC,GT,intergenic_variant
21641,chr14_38434213_GG_TT,chr14:38434213-38434214,TT,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr14,38434213,GG,TT,intergenic_variant
20248,chr5_86939138_TAGAAGTGTT_-,chr5:86939138-86939147,-,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr5,86939138,TAGAAGTGTT,-,intergenic_variant
20647,chr4_165571492_AAC_-,chr4:165571492-165571494,-,-,-,-,intergenic_variant,-,-,-,-,-,rs767482822,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,rs767482822,0.0649383,-,chr4,165571492,AAC,-,intergenic_variant


In [24]:
# To transform input table to VEP format first read the mutations table including both clonal and subclonal
mut_df = pd.read_csv('/workspace/projects/sjd_melos/MAFs_tables/Sar_CCF_Purple_all_mutations.tsv.gz', sep="\t", header=0)
mut_df[mut_df['mut_type'] == 'INDEL'].head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NORMAL,TUMOR,t_AF,n_AF,t_DP,n_DP,t_alt_reads,n_alt_reads,t_ref_reads,n_ref_reads,VAF,mut_type,Caller_intersec,SAMPLE,mut,CN,CCF,clonality
17,chr1,4069973,.,T,TA,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=67,63|31,33;D...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:97,1:0.010:98:44,1:41,0:95,1:51,46,1,0","0/1:33,63:0.657:96:12,30:16,20:32,62:16,17,30,33",0.657,0.01,96,98,63,1,33,97,0.65625,INDEL,Mutect_Strelka,sarcoma,chr1_4069973_T_TA,1.3481,1.474365,clonal
21,chr1,4231919,.,CA,C,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=129,124|3,3;D...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:128,0:7.337e-03:128:57,0:57,0:126,0:61,67,0,0","0/1:125,6:0.052:131:53,5:62,0:125,6:68,57,3,3",0.052,0.007337,131,128,6,0,125,128,0.045802,INDEL,Mutect_Strelka,sarcoma,chr1_4231919_CA_C,1.3481,0.1029,subclonal
29,chr1,4608758,.,CA,C,572,PASS,MH=A;RC=TTCAAAAAAAAAAAAAAAAAAGA;RC_IDX=2;RC_LF...,GT:ABQ:AD:AF:DP:RABQ:RAD:RC_CNT:RC_IPC:RC_JIT:...,"./.:0:95,0:0:96:3126,0:117,0:0,0,0,0,0,95,96:0...","./.:31:28,44:0.579:76:1276,1260:51,43:31,10,0,...",0.579,0.0,76,96,44,0,28,95,0.579,INDEL,Mutect_SAGE,sarcoma,chr1_4608758_CA_C,1.3481,1.300811,clonal
34,chr1,4788541,.,T,TG,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=84,101|28,39;...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:132,0:7.211e-03:132:54,0:68,0:132,0:61,71,0,0","0/1:53,67:0.554:120:19,28:24,27:52,65:23,30,28,39",0.554,0.007211,120,132,67,0,53,132,0.558333,INDEL,Mutect_Strelka,sarcoma,chr1_4788541_T_TG,1.3481,1.25438,clonal
101,chr1,20130828,.,CT,C,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=88,75|38,30;D...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:118,1:0.015:119:50,1:45,0:118,1:64,54,1,0","0/1:45,67:0.600:112:21,34:16,26:44,66:24,21,37,30",0.6,0.015,112,119,67,1,45,118,0.598214,INDEL,Mutect_Strelka,sarcoma,chr1_20130828_CT_C,1.3694,1.356721,clonal


In [25]:
# Change format of these INDEL mutations
def change_format (row):

    ref = row['REF']
    alt = row['ALT']
    pos = row['POS']
    mut_type = row['mut_type']
    position = row['POS']

    if mut_type == 'INDEL':

        if len(ref) == 1: # this is an insertion
            ref = '-' # change for "-"
            alt = alt[1:] #change for same without first character
            pos = pos + 1 # add 1 to position
        else: # this is a deletion
            ref = ref[1:] # change for same without first character
            alt = '-' # change for "-"
            pos = pos + 1 # add 1 to position
        row['REF'] = ref
        row['ALT'] = alt
        row['POS'] = pos
    return row

vep_df = mut_df.progress_apply(lambda row: change_format (row),axis=1)

  0%|          | 0/24295 [00:00<?, ?it/s]

In [26]:
vep_df

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NORMAL,TUMOR,t_AF,n_AF,t_DP,n_DP,t_alt_reads,n_alt_reads,t_ref_reads,n_ref_reads,VAF,mut_type,Caller_intersec,SAMPLE,mut,CN,CCF,clonality
0,chr1,104188,.,C,A,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=36,157|24,22;...",GT:AD:AF:DP:F1R2:F2R1:FAD:PGT:PID:PS:SB,"0|0:124,0:7.128e-03:124:33,0:29,0:125,0:0|1:10...","0|1:69,46:0.403:115:9,16:11,19:67,46:0|1:10418...",0.403,0.007128,115,124,46,0,69,124,0.400000,SNV,Mutect_Strelka,sarcoma,chr1_104188_C_A,1.4411,0.935860,clonal
1,chr1,843320,.,G,A,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=105,87|38,31;...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:126,0:7.857e-03:126:58,0:63,0:125,0:67,59,0,0","0/1:66,69:0.515:135:34,33:31,36:65,69:38,28,38,31",0.515,0.007857,135,126,69,0,66,126,0.511111,SNV,Mutect_Strelka,sarcoma,chr1_843320_G_A,1.4411,1.195821,clonal
2,chr1,1014876,.,G,T,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=162,109|4,2;D...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:141,0:7.025e-03:141:64,0:69,0:139,0:85,56,0,0","0/1:130,6:0.052:136:62,3:57,2:126,6:77,53,4,2",0.052,0.007025,136,141,6,0,130,141,0.044118,SNV,Mutect_Strelka,sarcoma,chr1_1014876_G_T,1.4411,0.103220,subclonal
3,chr1,1118520,.,C,T,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=120,130|19,26...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:153,0:6.470e-03:153:62,0:83,0:152,0:78,75,0,0","0/1:97,45:0.329:142:46,25:43,20:93,45:42,55,19,26",0.329,0.006470,142,153,45,0,97,153,0.316901,SNV,Mutect_Strelka,sarcoma,chr1_1118520_C_T,1.4411,0.741439,clonal
4,chr1,1136548,.,G,T,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=86,92|45,45;D...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:123,0:8.107e-03:123:60,0:59,0:121,0:57,66,0,0","0/1:55,90:0.615:145:25,41:30,44:55,89:29,26,45,45",0.615,0.008107,145,123,90,0,55,123,0.620690,SNV,Mutect_Strelka,sarcoma,chr1_1136548_G_T,1.4411,1.452197,clonal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24290,chrY,26654389,.,A,G,864,PASS,RC=GGAGAGGAAT;RC_IDX=6;RC_LF=GTAATGGAGT;RC_NM=...,GT:ABQ:AD:AF:DP:RABQ:RAD:RC_CNT:RC_IPC:RC_JIT:...,"./.:0:57,0:0:58:2084,0:63,0:0,0,0,0,0,57,58:0:...","./.:31:19,41:0.683:60:616,1300:20,41:30,5,1,0,...",0.683,0.000000,60,58,41,0,19,57,0.683000,SNV,Strelka_SAGE,sarcoma,chrY_26654389_A_G,0.5867,1.014426,clonal
24291,chrY,56823266,.,G,A,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=487,411|29,27...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:456,0:2.167e-03:456:195,0:201,0:448,0:238,...","0/1:442,56:0.115:498:190,29:200,24:437,56:249,...",0.115,0.002167,498,456,56,0,442,456,0.112450,SNV,Mutect_Strelka,sarcoma,chrY_56823266_G_A,0.5867,0.167016,subclonal
24292,chrY,56842066,.,CT,AC,1323,PASS,LPS=6133;LPS_RC=42;RC=ACACACAG;RC_IDX=2;RC_LF=...,GT:ABQ:AD:AF:DP:RABQ:RAD:RC_CNT:RC_IPC:RC_JIT:...,"./.:0:290,0:0:290:9437,0:306,0:0,0,0,0,0,290,2...","./.:30:169,75:0.305:246:5423,2339:174,76:59,15...",0.305,0.000000,246,290,75,0,169,290,0.305000,MNV,Mutect_SAGE,sarcoma,chrY_56842066_CT_AC,0.5867,0.453001,clonal
24293,chrY,56842087,.,T,C,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=241,235|34,34...",GT:AD:AF:DP:F1R2:F2R1:FAD:PGT:PID:PS:SB,"0|0:295,0:3.303e-03:295:143,0:125,0:295,0:1|0:...","1|0:181,68:0.269:249:80,28:81,31:177,66:1|0:56...",0.269,0.003303,249,295,68,0,181,295,0.273092,SNV,Mutect_Strelka,sarcoma,chrY_56842087_T_C,0.5867,0.405611,clonal


In [27]:
# Check the transformation of INDELs
vep_df[vep_df['mut_type'] == 'INDEL'].head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NORMAL,TUMOR,t_AF,n_AF,t_DP,n_DP,t_alt_reads,n_alt_reads,t_ref_reads,n_ref_reads,VAF,mut_type,Caller_intersec,SAMPLE,mut,CN,CCF,clonality
17,chr1,4069974,.,-,A,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=67,63|31,33;D...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:97,1:0.010:98:44,1:41,0:95,1:51,46,1,0","0/1:33,63:0.657:96:12,30:16,20:32,62:16,17,30,33",0.657,0.01,96,98,63,1,33,97,0.65625,INDEL,Mutect_Strelka,sarcoma,chr1_4069973_T_TA,1.3481,1.474365,clonal
21,chr1,4231920,.,A,-,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=129,124|3,3;D...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:128,0:7.337e-03:128:57,0:57,0:126,0:61,67,0,0","0/1:125,6:0.052:131:53,5:62,0:125,6:68,57,3,3",0.052,0.007337,131,128,6,0,125,128,0.045802,INDEL,Mutect_Strelka,sarcoma,chr1_4231919_CA_C,1.3481,0.1029,subclonal
29,chr1,4608759,.,A,-,572,PASS,MH=A;RC=TTCAAAAAAAAAAAAAAAAAAGA;RC_IDX=2;RC_LF...,GT:ABQ:AD:AF:DP:RABQ:RAD:RC_CNT:RC_IPC:RC_JIT:...,"./.:0:95,0:0:96:3126,0:117,0:0,0,0,0,0,95,96:0...","./.:31:28,44:0.579:76:1276,1260:51,43:31,10,0,...",0.579,0.0,76,96,44,0,28,95,0.579,INDEL,Mutect_SAGE,sarcoma,chr1_4608758_CA_C,1.3481,1.300811,clonal
34,chr1,4788542,.,-,G,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=84,101|28,39;...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:132,0:7.211e-03:132:54,0:68,0:132,0:61,71,0,0","0/1:53,67:0.554:120:19,28:24,27:52,65:23,30,28,39",0.554,0.007211,120,132,67,0,53,132,0.558333,INDEL,Mutect_Strelka,sarcoma,chr1_4788541_T_TG,1.3481,1.25438,clonal
101,chr1,20130829,.,T,-,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=88,75|38,30;D...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:118,1:0.015:119:50,1:45,0:118,1:64,54,1,0","0/1:45,67:0.600:112:21,34:16,26:44,66:24,21,37,30",0.6,0.015,112,119,67,1,45,118,0.598214,INDEL,Mutect_Strelka,sarcoma,chr1_20130828_CT_C,1.3694,1.356721,clonal


In [28]:
# Rename column CHROM
vep_df.rename(columns={'CHROM':'#CHROM'}, inplace=True)

# Check the change in column name has been applied
display(vep_df.columns)

Index(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
       'NORMAL', 'TUMOR', 't_AF', 'n_AF', 't_DP', 'n_DP', 't_alt_reads',
       'n_alt_reads', 't_ref_reads', 'n_ref_reads', 'VAF', 'mut_type',
       'Caller_intersec', 'SAMPLE', 'mut', 'CN', 'CCF', 'clonality'],
      dtype='object')

#### Then merge mutations dataframe with annot_df by CROM, POS, REF, ALT

In [29]:
# To merge all reference columns have to be the same type
print(vep_df['POS'].dtypes)
print(annot_df['POS'].dtypes)

int64
object


In [30]:
# Change column POS of annot_df to integer type to be able to merge (both columns in the dataframes have to be the same data type)
annot_df['POS']=annot_df['POS'].astype(int)

In [31]:
# Merge dataframes based on CHROM, POS, REF, ALT
merged_df = annot_df.merge(vep_df, on=['#CHROM','POS','REF','ALT'], how = 'left') # this retains all rows from the left DataFrame while matching rows from the right DataFrame
merged_df

Unnamed: 0,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,#CHROM,POS,REF,ALT,WORSE_conseq,ID,QUAL,FILTER,INFO,FORMAT,NORMAL,TUMOR,t_AF,n_AF,t_DP,n_DP,t_alt_reads,n_alt_reads,t_ref_reads,n_ref_reads,VAF,mut_type,Caller_intersec,SAMPLE,mut,CN,CCF,clonality
0,chr3_45035523_G_A,chr3:45035523,A,ENSG00000075914,ENST00000481405,Transcript,"splice_acceptor_variant,non_coding_transcript_...",-,-,-,-,-,rs1697627692,HIGH,-,1,-,EXOSC7,HGNC,HGNC:28112,-,-,-,-,-,-,-,-,-,-,-,-,-,-,rs1697627692,6.57013e-06,-,chr3,45035523,G,A,splice_acceptor_variant,.,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=151,155|40,33...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:156,0:6.507e-03:156:66,0:82,0:151,0:78,78,0,0","0/1:150,73:0.326:223:69,32:76,38:146,70:73,77,...",0.326,0.006507,223,156,73,0,150,156,0.327354,SNV,Mutect_Strelka,sarcoma,chr3_45035523_G_A,2.4582,1.098847,clonal
1,chr9_104825888_C_A,chr9:104825888,A,ENSG00000165029,ENST00000374736,Transcript,splice_acceptor_variant,-,-,-,-,-,-,HIGH,-,-1,-,ABCA1,HGNC,HGNC:29,YES,NM_005502.4,-,ENSP00000363868,-,-,-,-,-,-,-,-,-,-,-,-,-,chr9,104825888,C,A,splice_acceptor_variant,.,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=72,105|35,46;...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:121,0:8.178e-03:121:57,0:60,0:120,0:50,71,0,0","0/1:56,81:0.590:137:23,37:31,39:55,80:22,34,35,46",0.590,0.008178,137,121,81,0,56,121,0.591241,SNV,Mutect_Strelka,sarcoma,chr9_104825888_C_A,1.5944,1.473934,clonal
2,chr3_130855879_A_T,chr3:130855879,T,ENSG00000017260,ENST00000509150,Transcript,"splice_acceptor_variant,non_coding_transcript_...",-,-,-,-,-,-,HIGH,-,1,-,ATP2C1,HGNC,HGNC:13211,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr3,130855879,A,T,splice_acceptor_variant,.,235,PASS,RC=GCTGG;RC_IDX=2;RC_LF=ATGTGTCCCT;RC_NM=1;RC_...,GT:ABQ:AD:AF:DP:RABQ:RAD:RC_CNT:RC_IPC:RC_JIT:...,"./.:0:117,0:0:117:3966,0:124,0:0,0,0,0,0,117,1...","./.:32:121,8:0.062:129:3935,258:124,8:7,1,0,0,...",0.062,0.000000,129,117,8,0,121,117,0.062000,SNV,Strelka_SAGE,sarcoma,chr3_130855879_A_T,1.0352,0.119893,subclonal
3,chr15_88797906_T_A,chr15:88797906,A,ENSG00000259676,ENST00000561358,Transcript,"splice_acceptor_variant,non_coding_transcript_...",-,-,-,-,-,-,HIGH,-,-1,-,-,-,-,YES,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr15,88797906,T,A,splice_acceptor_variant,.,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=100,85|36,38;...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:98,0:9.714e-03:98:54,0:35,0:97,0:53,45,0,0","0/1:87,74:0.466:161:41,27:36,40:84,73:47,40,36,38",0.466,0.009714,161,98,74,0,87,98,0.459627,SNV,Mutect_Strelka,sarcoma,chr15_88797906_T_A,2.4145,1.522769,clonal
4,chr3_45035523_G_A,chr3:45035523,A,ENSG00000163815,ENST00000296130,Transcript,splice_acceptor_variant,-,-,-,-,-,rs1697627692,HIGH,-,1,-,CLEC3B,HGNC,HGNC:11891,YES,NM_003278.3,-,ENSP00000296130,-,-,-,-,-,-,-,-,-,-,rs1697627692,6.57013e-06,-,chr3,45035523,G,A,splice_acceptor_variant,.,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=151,155|40,33...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:156,0:6.507e-03:156:66,0:82,0:151,0:78,78,0,0","0/1:150,73:0.326:223:69,32:76,38:146,70:73,77,...",0.326,0.006507,223,156,73,0,150,156,0.327354,SNV,Mutect_Strelka,sarcoma,chr3_45035523_G_A,2.4582,1.098847,clonal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27869,chr14_95609258_T_G,chr14:95609258,G,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr14,95609258,T,G,intergenic_variant,.,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=102,129|27,35...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:119,0:8.190e-03:119:61,0:56,0:119,0:54,65,0,0","0/1:112,62:0.352:174:64,26:45,32:110,60:48,64,...",0.352,0.008190,174,119,62,0,112,119,0.356322,SNV,Mutect_Strelka,sarcoma,chr14_95609258_T_G,2.4528,1.194159,clonal
27870,chr14_94880476_T_G,chr14:94880476,G,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr14,94880476,T,G,intergenic_variant,.,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=99,118|36,29;...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:114,0:8.792e-03:114:57,0:52,0:111,0:47,67,0,0","0/1:103,65:0.393:168:50,38:50,27:101,65:52,51,...",0.393,0.008792,168,114,65,0,103,114,0.386905,SNV,Mutect_Strelka,sarcoma,chr14_94880476_T_G,2.4528,1.296654,clonal
27871,chr14_94826038_G_T,chr14:94826038,T,-,-,-,intergenic_variant,-,-,-,-,-,-,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,chr14,94826038,G,T,intergenic_variant,.,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=146,123|41,36...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:120,0:8.036e-03:120:51,0:59,0:120,0:69,51,0,0","0/1:149,77:0.344:226:66,39:62,32:146,76:77,72,...",0.344,0.008036,226,120,77,0,149,120,0.340708,SNV,Mutect_Strelka,sarcoma,chr14_94826038_G_T,2.4528,1.141832,clonal
27872,chr14_88119210_A_T,chr14:88119210,T,-,-,-,intergenic_variant,-,-,-,-,-,rs1193531685,MODIFIER,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,rs1193531685,6.57825e-06,-,chr14,88119210,A,T,intergenic_variant,.,.,PASS,"AS_FilterStatus=SITE;AS_SB_TABLE=123,152|12,16...",GT:AD:AF:DP:F1R2:F2R1:FAD:SB,"0/0:134,0:7.292e-03:134:60,0:60,0:130,0:59,75,0,0","0/1:141,28:0.173:169:54,14:70,10:136,28:64,77,...",0.173,0.007292,169,134,28,0,141,134,0.165680,SNV,Mutect_Strelka,sarcoma,chr14_88119210_A_T,2.4528,0.555253,clonal


In [34]:
# Note that when grouped by variant coordinate and type of change variants are unique but some contain more than one transcript. 
# This is due to intergenic variants that VEP annotates them in the surrounding transcripts, so they are annotated more than one time.
merged_df.groupby(['#CHROM', 'POS','REF','ALT'], as_index=False).count().sort_values('Location',ascending=False)

Unnamed: 0,#CHROM,POS,REF,ALT,Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,IMPACT,DISTANCE,STRAND,FLAGS,SYMBOL,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,ENSP,SOURCE,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,CLIN_SIG,SOMATIC,PHENO,gnomADg,gnomADg_AF,gnomADg_NFE,WORSE_conseq,ID,QUAL,FILTER,INFO,FORMAT,NORMAL,TUMOR,t_AF,n_AF,t_DP,n_DP,t_alt_reads,n_alt_reads,t_ref_reads,n_ref_reads,VAF,mut_type,Caller_intersec,SAMPLE,mut,CN,CCF,clonality
17866,chr5,141463684,G,T,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
425,chr1,63550594,G,A,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
6755,chr14,101048065,G,T,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
11075,chr2,143099870,G,C,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
6754,chr14,100883777,G,T,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8766,chr18,27286872,C,G,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
8765,chr18,27278021,T,A,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
8764,chr18,27275462,G,A,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
8763,chr18,27203045,C,T,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [35]:
# Export tsv table
# merged_df.to_csv('/workspace/projects/sjd_melos/MAFs_tables/Sar_common_mutations_callers_VEP_annotated.tsv.gz', sep ='\t', index=None, compression = "gzip")