In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy
import seaborn as sns
import numpy as np

In [2]:
# Load the tab-separated variant table into `db`.
# NOTE(review): the stored output of this cell looks like the tail of a
# warning emitted during the read; if it is a mixed-dtype warning, consider
# passing explicit `dtype=` for the affected columns -- TODO confirm.
db = pd.read_csv('../data/dbNSFP3.2.clinvar_clean.txt', delimiter='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Encode each substitution as "<ref>=><alt>", e.g. "Q=>R".
db['aa_change'] = db['aaref'].add('=>').add(db['aaalt'])

## Create a simple table containing only the proteins that have glycosylation sites, annotated with distances

In [4]:
# Load the per-residue distance table. The file has no header row and its
# first column is used as the row index.
p_d = pd.read_csv(
    '../data/protein_aa_distance.txt',
    delimiter='\t',
    index_col=0,
    header=None,
)

  mask |= (ar1 == a)


In [5]:
# The file was read with header=None, so name the three data columns explicitly.
p_d.columns = ['protein','aa','distance_to_feature']

In [6]:
# Keep only the columns needed downstream and drop variants whose UniProt
# accession is the '.' placeholder, in a single selection.
db = db.loc[
    db['Uniprot_acc_Polyphen2'] != '.',
    ['aapos', 'Uniprot_acc_Polyphen2', 'clinvar_pathogenic', 'aa_change'],
]
db.head()

Unnamed: 0,aapos,Uniprot_acc_Polyphen2,clinvar_pathogenic,aa_change
3,215;353,O00468,likely benign,Q=>R
4,590;728,O00468,benign,E=>V
5,950;1088,O00468,likely benign,L=>F
6,997;1135,O00468,likely benign,Q=>R
7,1151;1289,O00468,likely benign,P=>L


In [7]:
# Flatten the variant table: `aapos` and `Uniprot_acc_Polyphen2` can each hold
# several ';'-separated values, so emit one record for every
# (accession, position) combination of every variant. The four lists are
# filled in lock-step and turned into a DataFrame in the next cell.
aas, proteins, clinvars, aa_changes = [], [], [], []

for positions, accessions, significance, change in zip(
        db['aapos'],
        db['Uniprot_acc_Polyphen2'],
        db['clinvar_pathogenic'],
        db['aa_change']):
    # Accessions vary in the outer loop, positions in the inner loop.
    for accession in accessions.split(';'):
        for position in positions.split(';'):
            aas.append(position)
            proteins.append(accession)
            clinvars.append(significance)
            aa_changes.append(change)

In [8]:
# Assemble the flattened records column-wise.
#
# Building the frame from a list of Series and then transposing it (the
# previous approach) first creates a 4 x N frame and, because the rows have
# mixed types, leaves every column with `object` dtype after the transpose --
# so the integer dtype requested for `aa` was silently discarded. Passing a
# mapping of column name -> Series keeps each column's own dtype, which lets
# `aa` match the integer `aa` key of `p_d` in the later merge.
#
# `columns=` pins the column order explicitly rather than relying on dict
# ordering.
flat_db = pd.DataFrame(
    {
        'aa': pd.Series(aas, dtype=int),
        'protein': pd.Series(proteins),
        'clinvar_path': pd.Series(clinvars),
        'aa_change': pd.Series(aa_changes),
    },
    columns=['aa', 'protein', 'clinvar_path', 'aa_change'],
)

  return bool(asarray(a1 == a2).all())


In [9]:
# Report the size before and after keeping only the first row seen for each
# (aa, protein) pair.
print(flat_db.shape)
print('Dropping duplicates')
flat_db = flat_db.loc[~flat_db.duplicated(subset=['aa', 'protein'])]
print(flat_db.shape)

(295521, 4)
Dropping duplicates
(156796, 4)


In [10]:
# Preview the first rows of the distance table.
p_d.head()

Unnamed: 0_level_0,protein,aa,distance_to_feature
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,P33527,205,83.58
1,P33527,206,79.96
2,P33527,207,79.12
3,P33527,208,72.96
4,P33527,209,72.11


In [11]:
# Preview the first rows of the flattened variant table.
flat_db.head()

Unnamed: 0,aa,protein,clinvar_path,aa_change
0,215,O00468,likely benign,Q=>R
1,353,O00468,likely benign,Q=>R
2,590,O00468,benign,E=>V
3,728,O00468,benign,E=>V
4,950,O00468,likely benign,L=>F


In [12]:
# Join variants to distances on (aa, protein). An outer join keeps rows that
# are present on only one side; those are removed by the dropna steps in the
# following cell.
merged = flat_db.merge(
    right=p_d,
    how='outer',
    on=['aa', 'protein'],
)

In [13]:
# Remove the one-sided rows left by the outer merge, reporting the size after
# each step: first rows without a distance, then rows without a ClinVar label.
print(merged.shape)
print('dropping no distance proteins')
merged = merged[merged['distance_to_feature'].notnull()]
print(merged.shape)
print('dropping no clinvar proteins')
merged = merged[merged['clinvar_path'].notnull()]
print(merged.shape)

(3363389, 5)
dropping no distance proteins
(3212126, 5)
dropping no clinvar proteins
(19081, 5)


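## Looking at duplicated (amino acid, protein) pairs... I don't fully understand these yet! Keeping the minimum distance for each duplicated pair

The checks below show that the duplicates come from `p_d`, which can list several distances for the same protein position, and not from `flat_db`, which has none.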

In [14]:
# Show the repeated (aa, protein) rows in the merged table; the first
# occurrence of each pair is not flagged by `duplicated` and so is not listed.
merged.loc[merged.duplicated(subset=['aa', 'protein'])]

Unnamed: 0,aa,protein,clinvar_path,aa_change,distance_to_feature
23,65.0,P43489,pathogenic,R=>C,45.67
332,359.0,O95479,pathogenic,G=>D,37.99
334,370.0,O95479,pathogenic,G=>D,34.95
790,73.0,Q99895,pathogenic,A=>T,29.27
792,254.0,Q99895,pathogenic,R=>W,21.14
995,738.0,P42892,pathogenic,R=>C,40.79
996,738.0,P42892,pathogenic,R=>C,46.70
997,738.0,P42892,pathogenic,R=>C,21.76
998,738.0,P42892,pathogenic,R=>C,31.96
999,738.0,P42892,pathogenic,R=>C,44.09


In [15]:
# Check whether the variant table itself still contains repeated
# (aa, protein) pairs.
flat_db.loc[flat_db.duplicated(subset=['aa', 'protein'])]

Unnamed: 0,aa,protein,clinvar_path,aa_change


In [16]:
# Inspect one example of repeated (aa, protein) rows in the distance table:
# protein P32004, position 174. `duplicated` ends up holding all repeated
# rows for that protein; the displayed value narrows it to the one position.
duplicated = p_d.loc[
    p_d.duplicated(subset=['aa', 'protein']) & (p_d['protein'] == 'P32004')
]
duplicated.loc[duplicated['aa'] == 174]

Unnamed: 0_level_0,protein,aa,distance_to_feature
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2489975,P32004,174,10.79
2490554,P32004,174,49.34
2491133,P32004,174,48.53
2491712,P32004,174,133.47
2492291,P32004,174,127.97
2492870,P32004,174,145.84
2493449,P32004,174,117.73
2494028,P32004,174,183.32


In [17]:
# Keep a single row per (protein, aa): sorting by distance in ascending order
# first means the retained "first" row is the one with the smallest distance.
merged = (
    merged
    .sort_values(by='distance_to_feature')
    .drop_duplicates(subset=['protein', 'aa'], keep='first')
)

In [18]:
# Size of the table after keeping one row per (protein, aa) pair.
print(merged.shape)

(5533, 5)


## Filtering out sites where the distance to feature == 0, which is presumably the glycosylation site itself

In [22]:
# Keep only rows with a strictly positive distance, then show the new size.
merged = merged.loc[merged['distance_to_feature'].gt(0)]
merged.shape

(5499, 5)

## Hmm, this is not that many variants. Maybe the ones that do not have annotations need to be given a pseudo-distance of "far"?

So for the variants that are not given a distance, just add a pseudo-distance.

In [24]:
# Write the final table as tab-separated text; the row index is written as
# the first column (made explicit here, it is also the default).
merged.to_csv(
    '../data/clinvar_glycoproteins_distances.txt',
    sep='\t',
    index=True,
)