In [1]:
import pandas as pd
import re

In [2]:
# Load the table stored in 'clinvar.acmg.tsv' (tab-separated, first line is
# the header) and preview its first rows.
clin_annot = pd.read_csv(
    'clinvar.acmg.tsv',
    sep='\t',
    header=0,
)
clin_annot.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,ALLELEID,CLNREVSTAT,CLNSIG,CLNSIGCONF,CLNVC,CLNVI,GENEINFO,MC
0,1,17018889,1468479,A,C,1358839,"criteria_provided,_single_submitter",Uncertain_significance,.,single_nucleotide_variant,.,SDHB:6390,SO:0001583|missense_variant
1,1,17018889,657450,A,T,627077,"criteria_provided,_single_submitter",Uncertain_significance,.,single_nucleotide_variant,.,SDHB:6390,SO:0001583|missense_variant
2,1,17018904,459174,T,C,447440,"criteria_provided,_multiple_submitters,_no_con...",Uncertain_significance,.,single_nucleotide_variant,.,SDHB:6390,SO:0001583|missense_variant
3,1,17018906,486424,T,A,472271,"criteria_provided,_multiple_submitters,_no_con...",Uncertain_significance,.,single_nucleotide_variant,.,SDHB:6390,SO:0001583|missense_variant
4,1,17018909,1445276,G,A,1397602,"criteria_provided,_single_submitter",Uncertain_significance,.,single_nucleotide_variant,.,SDHB:6390,SO:0001583|missense_variant


In [3]:
# Load the table stored in 'variant_summary.hgvs.tsv' (tab-separated, first
# line is the header), then replace the header with four fixed column names
# so that the allele id column is called ALLELEID.
clin_hgvs = pd.read_csv(
    'variant_summary.hgvs.tsv',
    sep='\t',
    header=0,
)
clin_hgvs.columns = [
    'ALLELEID',
    'HGVS_NAME',
    'GENE_SYMBOL',
    'CLINICAL_SIGNIFICANCE',
]
clin_hgvs.head()

Unnamed: 0,ALLELEID,HGVS_NAME,GENE_SYMBOL,CLINICAL_SIGNIFICANCE
0,15048,NM_000410.4(HFE):c.845G>A (p.Cys282Tyr),HFE,Conflicting interpretations of pathogenicity; ...
1,15049,NM_000410.4(HFE):c.187C>G (p.His63Asp),HFE,Conflicting interpretations of pathogenicity; ...
2,15050,NM_000410.4(HFE):c.193A>T (p.Ser65Cys),HFE,Conflicting interpretations of pathogenicity
3,15051,NM_000410.4(HFE):c.314T>C (p.Ile105Thr),HFE,Uncertain significance
4,15052,NM_000410.4(HFE):c.277G>C (p.Gly93Arg),HFE,Pathogenic


In [4]:
def resolve_conflict(annotation_list):
    """Collapse a conflicting ``CLNSIGCONF`` string into a single group.

    The string is expected to look like
    ``'Pathogenic(1)|_Uncertain_significance(2)'``: entries separated by
    ``'|_'`` (a bare ``'|'`` is accepted as well), each entry being a clinical
    term followed by a count in parentheses. Counts are summed per group and
    the group with the largest total wins.

    Parameters
    ----------
    annotation_list : str
        Raw ``CLNSIGCONF`` value. Non-string values (e.g. the float ``NaN``
        pandas uses for missing cells) and entries that do not have the
        ``term(count)`` shape are ignored instead of raising.

    Returns
    -------
    str
        ``'P/LP'``, ``'B/LB'`` or ``'VUS'``. ``'VUS'`` is also returned when
        the highest total is shared by more than one group, which includes
        the case where nothing could be counted.
    """
    # Terms that are not listed here (e.g. 'other') do not contribute a vote.
    term_to_group = {
        'Pathogenic': 'P/LP',
        'Likely_pathogenic': 'P/LP',
        'Benign': 'B/LB',
        'Likely_benign': 'B/LB',
        'Uncertain_significance': 'VUS',
    }
    annot_counter = {'P/LP': 0, 'B/LB': 0, 'VUS': 0}

    # A missing value is not a string; treat it as "no entries".
    if not isinstance(annotation_list, str):
        annotation_list = ''

    # Split on '|' with an optional trailing '_' so both separator styles work.
    for annot in re.split(r'\|_?', annotation_list):
        # The term part excludes parentheses, so one entry can never swallow
        # the count that belongs to a neighbouring entry.
        match = re.match(r'([^()\s]+)\((\d+)\)', annot.strip())
        if match is None:
            continue  # not of the form term(count), e.g. '.' or ''
        clin_term, count = match.groups()
        group = term_to_group.get(clin_term)
        if group is not None:
            annot_counter[group] += int(count)

    top_group = max(annot_counter, key=annot_counter.get)
    top_total = annot_counter[top_group]

    # No single winner (a tie, or all totals zero): fall back to 'VUS'.
    if list(annot_counter.values()).count(top_total) > 1:
        return 'VUS'

    return top_group

In [5]:
# Lookup table from raw CLNSIG terms to the coarse groups used in this
# notebook. Entries are arranged by target group.
clinical_annotations = {
    # Pathogenic / likely pathogenic
    'Pathogenic': 'P/LP',
    'Likely_pathogenic': 'P/LP',
    'Pathogenic/Likely_pathogenic': 'P/LP',
    'Pathogenic|_drug_response': 'P/LP',
    'Likely_pathogenic|_drug_response': 'P/LP',
    'Pathogenic/Likely_pathogenic|_other': 'P/LP',

    # Benign / likely benign
    'Benign': 'B/LB',
    'Likely_benign': 'B/LB',
    'Benign/Likely_benign': 'B/LB',
    'Benign|_other': 'B/LB',
    'Benign/Likely_benign|_other': 'B/LB',

    # Uncertain significance
    'Uncertain_significance': 'VUS',
    'Uncertain_significance|_drug_response': 'VUS',

    # Conflicting interpretations
    'Conflicting_interpretations_of_pathogenicity': 'Conflicting',
    'Conflicting_interpretations_of_pathogenicity|_other': 'Conflicting',
    'Conflicting_interpretations_of_pathogenicity|_other|_risk_factor': 'Conflicting',

    # Everything else
    'not_provided': 'Other',
    '.': 'Other',
    'risk_factor': 'Other',
    'other': 'Other',
}

In [6]:
# Translate every raw CLNSIG term into its group via `clinical_annotations`.
# Series.map leaves NaN for any term that is not a key of the dictionary.
clin_annot['GROUP'] = clin_annot['CLNSIG'].map(clinical_annotations)

In [7]:
# Rows grouped as 'Conflicting' get their final group from resolve_conflict,
# applied to the per-term counts stored in CLNSIGCONF.
subset_mask = clin_annot['GROUP'].eq('Conflicting')
clin_annot.loc[subset_mask, 'GROUP'] = (
    clin_annot.loc[subset_mask, 'CLNSIGCONF'].apply(resolve_conflict)
)

In [8]:
# Inner join on ALLELEID: only variants present in both tables are kept.
merged = clin_annot.merge(clin_hgvs, how='inner', on='ALLELEID')

In [9]:
# (rows, columns) of the merged table before any filtering.
merged.shape

(25935, 17)

In [10]:
# Only Cys282Tyr substitution (allele id: 15048) is considered for HFE:
# every other row annotated with GENEINFO 'HFE:3077' is dropped.
hfe_other_alleles = (merged['GENEINFO'] == 'HFE:3077') & (merged['ALLELEID'] != 15048)
merged = merged.drop(index=merged[hfe_other_alleles].index)

In [11]:
# Pull the protein-level change out of the HGVS name, e.g.
# 'NM_000410.4(HFE):c.845G>A (p.Cys282Tyr)' -> 'Cys282Tyr'.
# The pattern is a raw string because '\(' and '\S' are invalid escape
# sequences in a normal string literal; the dot after 'p' is escaped so it
# matches a literal '.' only. expand=False returns a Series, not a
# one-column DataFrame.
merged['AA_CHANGE'] = merged['HGVS_NAME'].str.extract(r'\(p\.(\S+)\)', expand=False)

# Remove complex substitutions: drop every row whose AA_CHANGE contains '='.
# NOTE(review): in HGVS protein notation '=' marks an unchanged amino acid
# (e.g. 'p.Arg123='), so these look like silent changes - confirm the wording.
# regex=False matches '=' literally. na=False keeps rows whose HGVS name has
# no '(p....)' part (AA_CHANGE is NaN) instead of putting NaN into the mask,
# which boolean indexing rejects.
has_equals_sign = merged['AA_CHANGE'].str.contains('=', regex=False, na=False)
merged = merged.drop(merged[has_equals_sign].index)

In [12]:
# (rows, columns) after the HFE and AA_CHANGE filters; AA_CHANGE adds one column.
merged.shape

(25847, 18)

In [13]:
# Write the complete filtered table as a tab-separated file, with a header
# row and without the DataFrame index.
merged.to_csv(
    '../data/final_mutant_list.tsv',
    sep='\t',
    index=False,
    header=True,
)

In [14]:
# Write a reduced table holding only the gene, the amino-acid change and the
# group, in the same tab-separated layout (header row, no index).
merged.loc[:, ['GENEINFO', 'AA_CHANGE', 'GROUP']].to_csv(
    '../data/final_mutant_list.slim.tsv',
    sep='\t',
    index=False,
    header=True,
)