In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy
import seaborn as sns
import numpy as np

In [2]:
# Load the ClinVar-annotated dbNSFP table (tab-separated).
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. Chunked inference is what produces the mixed-type
# DtypeWarning (which this read appears to have emitted) and can leave a column
# such as 'aapos' holding a mix of ints and strings, which would break the
# later `.split(';')` calls.
db = pd.read_csv('../data/dbNSFP3.2.clinvar_clean.txt',
                 sep='\t',
                 low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Encode each substitution as "<reference residue>=><alternate residue>".
ref_residue = db['aaref']
alt_residue = db['aaalt']
db['aa_change'] = ref_residue + '=>' + alt_residue

## Create simple table for just proteins with glycosylation sites with distance annotation

In [4]:
# Distance table: tab-separated, no header row, first field used as the row index.
distance_file = '../data/protein_aa_distance.txt'
p_d = pd.read_csv(distance_file, index_col=0, header=None, sep='\t')

In [5]:
# The file was read without a header, so name the three data columns explicitly.
distance_columns = ['protein', 'aa', 'distance_to_feature']
p_d.columns = distance_columns

In [6]:
# Keep only the columns used downstream, and only rows whose UniProt
# accession is something other than the '.' placeholder.
kept_columns = ['aapos', 'Uniprot_acc_Polyphen2', 'clinvar_pathogenic', 'aa_change']
has_accession = db['Uniprot_acc_Polyphen2'] != '.'
db = db.loc[has_accession, kept_columns]
db.head()

Unnamed: 0,aapos,Uniprot_acc_Polyphen2,clinvar_pathogenic,aa_change
3,215;353,O00468,likely benign,Q=>R
4,590;728,O00468,benign,E=>V
5,950;1088,O00468,likely benign,L=>F
6,997;1135,O00468,likely benign,Q=>R
7,1151;1289,O00468,likely benign,P=>L


In [7]:
# Flatten the ';'-separated fields: each variant row is expanded into one
# record per (protein accession, amino-acid position) combination, i.e. every
# accession listed on the row is paired with every position listed on the row.
# The ClinVar label and the residue change are repeated on each record.
aas, proteins, clinvars, aa_changes = [], [], [], []

variant_fields = zip(db['aapos'],
                     db['Uniprot_acc_Polyphen2'],
                     db['clinvar_pathogenic'],
                     db['aa_change'])

for positions, accessions, significance, change in variant_fields:
    position_list = positions.split(';')
    for accession in accessions.split(';'):
        for position in position_list:
            aas.append(position)
            proteins.append(accession)
            clinvars.append(significance)
            aa_changes.append(change)

In [8]:
# Assemble the flattened records column-wise.
# Building from a dict keeps a real dtype per column: 'aa' is parsed from the
# position strings into integers, so it matches the integer 'aa' key of the
# distance table it is merged with later. Stacking the Series as rows and
# transposing instead collapses every column to object dtype and first
# materialises a 4 x N object frame.
# `columns=` pins the column order regardless of dict ordering.
flat_db = pd.DataFrame(
    {
        'aa': pd.Series(aas, dtype=int),
        'protein': proteins,
        'clinvar_path': clinvars,
        'aa_change': aa_changes,
    },
    columns=['aa', 'protein', 'clinvar_path', 'aa_change'],
)

  return bool(asarray(a1 == a2).all())


In [9]:
# Keep only the first record for each (position, protein) pair and report the
# table size before and after.
print(flat_db.shape)
print('Dropping duplicates')
is_repeat = flat_db.duplicated(subset=['aa','protein'])
flat_db = flat_db[~is_repeat]
print(flat_db.shape)

(295521, 4)
Dropping duplicates
(156796, 4)


In [10]:
# Preview the distance table (one of the two inputs to the merge below).
p_d.head()

Unnamed: 0_level_0,protein,aa,distance_to_feature
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
402407,P05107,28,0.0
820774,O14656,143,0.0
1068490,Q92820,139,0.0
196505,Q9NY72,97,0.0
1159219,O15118,916,0.0


In [11]:
# Preview the flattened variant table (the other input to the merge below).
flat_db.head()

Unnamed: 0,aa,protein,clinvar_path,aa_change
0,215,O00468,likely benign,Q=>R
1,353,O00468,likely benign,Q=>R
2,590,O00468,benign,E=>V
3,728,O00468,benign,E=>V
4,950,O00468,likely benign,L=>F


In [12]:
# Outer-join variants with the distance table on (position, protein); rows
# present in only one of the two tables are kept with NaN in the other's columns.
join_keys = ['aa','protein']
merged = pd.merge(flat_db, p_d, how='outer', on=join_keys)

In [13]:
# Discard rows that have no ClinVar label (this includes rows contributed
# only by the distance table) and report the size before and after.
print(merged.shape)
print('dropping no clinvar proteins')
has_clinvar_label = merged['clinvar_path'].notnull()
merged = merged[has_clinvar_label]
print(merged.shape)

(543599, 5)
dropping no clinvar proteins
(156796, 5)


# Dropping any clinvar variants that are not in glycosylated proteins

In [22]:
# Distinct protein accessions that appear in the distance table.
g_proteins = list(pd.unique(p_d['protein']))

In [25]:
# Restrict to variants whose protein appears in the distance table.
in_distance_table = merged['protein'].isin(g_proteins)
merged = merged.loc[in_distance_table]
print(merged.shape)

(5037, 5)


# Any variants that do not have feature distances are given a pseudo-distance

In [27]:
# Placeholder distance substituted for missing 'distance_to_feature' values
# in the fillna step that follows.
# NOTE(review): presumably chosen to exceed any real distance in the table — confirm.
high_distance = 150

In [28]:
# Replace missing distances with the placeholder value; other columns are untouched.
merged = merged.fillna(value={'distance_to_feature': high_distance})

## Filtering out sites where distance to feature == 0, which is presumably the glycosylation site itself

In [29]:
# Keep only rows with a strictly positive distance, then show the resulting size.
is_off_site = merged['distance_to_feature'].gt(0)
merged = merged.loc[is_off_site]
merged.shape

(5018, 5)

In [30]:
# Write the final table as tab-separated text (the row index is written too).
output_file = '../data/clinvar_glycoproteins_distances.txt'
merged.to_csv(output_file, sep='\t')