# `dms-view` dataset for [Findlay _et al._, 2018](https://www.nature.com/articles/s41586-018-0461-z)

## notebook setup

In [1]:
import pandas as pd

## read in dataframe

In [2]:
# Supplementary-table column name -> name used in the rest of the notebook.
# The key order also fixes the column order of the selected frame.
column_names = {
    'aa_pos': 'site',
    'aa_ref': 'wildtype',
    'aa_alt': 'mutation',
    'function.score.mean': 'SGE',
    'CADD.score': 'CADD',
}

# The first two lines of the file are skipped before the header row is read.
raw_table = pd.read_csv('Findlay2018_suppTable1.csv', skiprows=2)

# Keep only the columns of interest, drop any row with a missing value in
# one of them, and store the site number as an integer.
df = (
    raw_table[list(column_names)]
    .rename(columns=column_names)
    .dropna()
    .astype({'site': 'int'})
)

# Long format: the SGE and CADD columns become rows distinguished by
# `condition`, with the score itself in `mut_score`.
df = df.melt(
    id_vars=['site', 'wildtype', 'mutation'],
    var_name='condition',
    value_name='mut_score',
)
df.head()

Unnamed: 0,site,wildtype,mutation,condition,mut_score
0,1,M,L,SGE,-2.516529
1,1,M,V,SGE,-2.025645
2,1,M,L,SGE,-1.965629
3,1,M,K,SGE,-1.6367
4,1,M,T,SGE,-1.656569


In [3]:
# The same (site, wildtype, mutation, condition) key can appear on several
# rows (synonymous mutations at a site); collapse each such group to the
# mean of its scores so every key occurs exactly once.
mutation_key = ['site', 'wildtype', 'mutation', 'condition']
df = df.groupby(mutation_key, as_index=False)['mut_score'].mean()
df.head()

Unnamed: 0,site,wildtype,mutation,condition,mut_score
0,1,M,I,CADD,27.2
1,1,M,I,SGE,-2.208512
2,1,M,K,CADD,26.8
3,1,M,K,SGE,-1.6367
4,1,M,L,CADD,26.35


## calculate site-level metrics

In [4]:
# Summarise the mutation scores of every (site, condition) pair, naming the
# resulting columns 'site_mean score', 'site_median score', 'site_max score'
# and 'site_min score'.
site = (
    df.groupby(['site', 'condition'])['mut_score']
    .agg(['mean', 'median', 'max', 'min'])
    .rename(columns=lambda stat: f'site_{stat} score')
    .reset_index()
)

# Attach the per-site summaries to every mutation-level row.
df = df.merge(site, on=['site', 'condition'])
df.head()

Unnamed: 0,site,wildtype,mutation,condition,mut_score,site_mean score,site_median score,site_max score,site_min score
0,1,M,I,CADD,27.2,25.858333,25.975,27.2,24.2
1,1,M,K,CADD,26.8,25.858333,25.975,27.2,24.2
2,1,M,L,CADD,26.35,25.858333,25.975,27.2,24.2
3,1,M,R,CADD,25.6,25.858333,25.975,27.2,24.2
4,1,M,T,CADD,25.0,25.858333,25.975,27.2,24.2


In [5]:
# Every row is placed on chain 'A'; the protein-structure site and the
# displayed site label are both plain copies of `site`.
df = df.assign(
    protein_chain='A',
    protein_site=df['site'],
    label_site=df['site'],
)

# Descending order on `condition` puts the SGE rows ahead of the CADD rows.
# NOTE(review): the default sort algorithm is not stable, so the row order
# inside each condition is not the original site order.
df = df.sort_values(by='condition', ascending=False)
df.head()

Unnamed: 0,site,wildtype,mutation,condition,mut_score,site_mean score,site_median score,site_max score,site_min score,protein_chain,protein_site,label_site
4501,1855,I,V,SGE,0.046278,-0.20513,0.007632,0.10793,-0.863844,A,1855,1855
3536,1785,Q,H,SGE,-0.017713,-0.366623,0.023543,0.269408,-1.622907,A,1785,1785
1848,1662,F,L,SGE,-0.189815,0.012068,-0.0577,0.748164,-0.474514,A,1662,1662
1849,1662,F,S,SGE,-0.254858,0.012068,-0.0577,0.748164,-0.474514,A,1662,1662
1850,1662,F,V,SGE,-0.0577,0.012068,-0.0577,0.748164,-0.474514,A,1662,1662


## RING domain (sites 1–103)

In [6]:
# Write the rows for sites 1 through 103 (both ends inclusive) to their own file.
df[df['site'].between(1, 103)].to_csv('Findlay2018_ringdomain.csv', index=False)

## BRCT domain (sites 1649–1859)

In [7]:
# Write the rows for sites 1649 through 1859 (both ends inclusive) to their own file.
df[df['site'].between(1649, 1859)].to_csv('Findlay2018_brctdomain.csv', index=False)