In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy
import seaborn as sns

# Asking the question, are clinvar variants that are in amino acids that are near glycosites more likely to be pathogenic than ones that are not?

In [2]:
df = pd.read_csv('../data/clinvar_glycoproteins.txt',
                 sep='\t',
                 index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df['aa_change'] = df['aaref'] + '=>' + df['aaalt']

In [4]:
counts = df.groupby(by=['clinvar_pathogenic','glycosite_proximal'],as_index=False).count()[['clinvar_pathogenic','glycosite_proximal','#chr']]
counts

Unnamed: 0,clinvar_pathogenic,glycosite_proximal,#chr
0,benign,False,409
1,benign,True,80
2,likely benign,False,322
3,likely benign,True,69
4,likely pathogenic,False,564
5,likely pathogenic,True,103
6,pathogenic,False,3488
7,pathogenic,True,704


In [5]:
counts_pivot = counts.pivot(index='clinvar_pathogenic',columns='glycosite_proximal',values='#chr')
counts_pivot.loc['likely benign + benign'] = counts_pivot.loc['likely benign'] + counts_pivot.loc['benign'] 
counts_pivot.loc['likely pathogenic + pathogenic'] = counts_pivot.loc['likely pathogenic'] + counts_pivot.loc['pathogenic'] 

In [6]:
counts_pivot

glycosite_proximal,False,True
clinvar_pathogenic,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,409,80
likely benign,322,69
likely pathogenic,564,103
pathogenic,3488,704
likely benign + benign,731,149
likely pathogenic + pathogenic,4052,807


## Testing all reasonable combinations:) - with all distances from glycosite

In [7]:
comparisons = [('likely pathogenic + pathogenic','benign'),
                ('pathogenic','benign'),
                ('pathogenic','likely benign + benign'),
                ('likely pathogenic + pathogenic','likely benign + benign')]

In [8]:
for comp in comparisons:
    i,j = comp
    
    contingency_table = counts_pivot.loc[[i,j]]
    oddratio,p = scipy.stats.fisher_exact(contingency_table)
    if p < 0.1:
        print(contingency_table)
        print(oddratio,p)
        print('\n')

## Nothing close to significant. 

Note, not shown here but nothing close to significant if I filter for JUST residues within 6Angstrom of a glycosite.

## What if I segregate by amino acid substitution type?

In [9]:
by_aas = df.groupby(by=['glycosite_proximal','clinvar_pathogenic','aa_change',],as_index=False).count()[['glycosite_proximal','clinvar_pathogenic','aa_change','#chr']]

In [10]:
by_aas = by_aas.sort_values(by='aa_change')
by_aas

Unnamed: 0,glycosite_proximal,clinvar_pathogenic,aa_change,#chr
303,False,pathogenic,A=>D,16
182,False,likely pathogenic,A=>D,3
543,True,likely pathogenic,A=>D,1
606,True,pathogenic,A=>D,2
183,False,likely pathogenic,A=>E,3
304,False,pathogenic,A=>E,19
96,False,likely benign,A=>E,1
607,True,pathogenic,A=>E,1
0,False,benign,A=>E,1
305,False,pathogenic,A=>G,7


In [11]:
counts_pivot = counts.pivot(index='clinvar_pathogenic',columns='glycosite_proximal',values='#chr')
counts_pivot.loc['likely benign + benign'] = counts_pivot.loc['likely benign'] + counts_pivot.loc['benign'] 
counts_pivot.loc['likely pathogenic + pathogenic'] = counts_pivot.loc['likely pathogenic'] + counts_pivot.loc['pathogenic'] 

In [12]:
for aa in by_aas['aa_change'].unique():
    
    one_aa_change = by_aas[by_aas['aa_change'] == aa]
    counts_pivot = one_aa_change.pivot(index='clinvar_pathogenic',columns='glycosite_proximal',values='#chr')
    
    # Sometime AAs will not have all the mutation categories, making sure they have them here:
    expected_rows = ['benign','likely benign','likely pathogenic','pathogenic']
    for r in expected_rows:
        if r not in counts_pivot.index:
            counts_pivot.loc[r] = 0
    
    counts_pivot.loc['likely benign + benign'] = counts_pivot.loc['likely benign'] + counts_pivot.loc['benign'] 
    counts_pivot.loc['likely pathogenic + pathogenic'] = counts_pivot.loc['likely pathogenic'] + counts_pivot.loc['pathogenic'] 
    counts_pivot = counts_pivot.fillna(0)
    
    for comp in comparisons:
        i,j = comp

        contingency_table = counts_pivot.loc[[i,j]]

        if (contingency_table.shape[0] != 2) or (contingency_table.shape[1] != 2):
            continue

        oddratio,p = scipy.stats.fisher_exact(contingency_table)
        if p < 0.05:
            print('{}\tp={}'.format(aa,p))
            print(contingency_table)
            print('\n')

A=>S	p=0.011904761904761908
glycosite_proximal              False  True 
clinvar_pathogenic                          
likely pathogenic + pathogenic    0.0    3.0
likely benign + benign            6.0    0.0


A=>V	p=0.019328953463174738
glycosite_proximal      False  True 
clinvar_pathogenic                  
pathogenic               62.0   13.0
likely benign + benign   26.0    0.0


D=>N	p=0.003465495716561198
glycosite_proximal              False  True 
clinvar_pathogenic                          
likely pathogenic + pathogenic   67.0    0.0
likely benign + benign           19.0    4.0


G=>D	p=0.008860759493670869
glycosite_proximal              False  True 
clinvar_pathogenic                          
likely pathogenic + pathogenic   72.0    0.0
likely benign + benign            6.0    2.0


G=>S	p=0.0195961071409892
glycosite_proximal      False  True 
clinvar_pathogenic                  
pathogenic               67.0   15.0
likely benign + benign   26.0    0.0


G=>S	p=0.0121603