# Building `dms-view` datasets for [Sourisseau *et al.*, 2019](https://research.fhcrc.org/content/dam/stripe/bloom/labfiles/publications/Sourisseau2019.pdf)

This jupyter notebook builds a `dms-view` datafile for the Deep Mutational Scanning (DMS; `Sourisseau2019_DMS.csv`) and the Mutational Antigenic Profiling (MAP; `Sourisseau2019_MAP.csv`) of Zika Envelope protein.
The data is scraped from the [paper repo](https://github.com/jbloomlab/ZIKV_DMS_with_EvansLab). 

## notebook setup

In [6]:
import pandas as pd
from scipy.stats import entropy

## Deep Mutational Scanning data

### data files

In [7]:
muteffects_fname = 'https://raw.githubusercontent.com/jbloomlab/ZIKV_DMS_with_EvansLab/master/results/muteffects/unscaled_muteffects.csv'
prefs_fname = 'https://raw.githubusercontent.com/jbloomlab/ZIKV_DMS_with_EvansLab/master/results/prefs/summary_avgprefs.csv'
rescaledprefs_fname = 'https://raw.githubusercontent.com/jbloomlab/ZIKV_DMS_with_EvansLab/master/results/prefs/rescaled_prefs.csv'
sitesummary_fname= 'https://raw.githubusercontent.com/jbloomlab/ZIKV_DMS_with_EvansLab/master/results/struct_props/struct_props_mut_tol.csv'

### site data

In [120]:
RSA = pd.read_csv(sitesummary_fname).query("pdb == '5ire'")[['site', 'RSA', 'mutational_tolerance_measure', 'mutational_tolerance']].rename(columns={'RSA': 'site_RSA'})
RSA = (pd.pivot_table(RSA, index=['site', 'site_RSA'], columns='mutational_tolerance_measure', values='mutational_tolerance')
            .reset_index()
            .rename(columns={'entropy': 'site_entropy', 'neffective': 'site_n effective'}))
mut_effects = pd.read_csv(muteffects_fname).drop(columns=['mutation', 'effect']).rename(columns={'mutant': 'mutation', 'log2effect': 'mut_value'})
mut_effects = pd.merge(mut_effects, RSA, on='site').assign(condition='mutational effects')
RSA = mut_effects.copy().drop(columns=['mut_value', 'site_entropy', 'site_n effective', 'condition'])
mut_effects.head()

Unnamed: 0,site,wildtype,mutation,mut_value,site_RSA,site_entropy,site_n effective,condition
0,1,I,A,-5.04603,0.030457,1.809948,6.110127,mutational effects
1,1,I,C,-4.308696,0.030457,1.809948,6.110127,mutational effects
2,1,I,D,-3.88453,0.030457,1.809948,6.110127,mutational effects
3,1,I,E,-4.290569,0.030457,1.809948,6.110127,mutational effects
4,1,I,F,-4.370049,0.030457,1.809948,6.110127,mutational effects


In [122]:
amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L','M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
prefs = (pd.concat([(pd.read_csv(fname)[['site'] + amino_acids])
                    .assign(condition=condition)
                    for condition, fname in [('raw preferences', prefs_fname),
                                             ('rescaled preferences', rescaledprefs_fname)]
                   ]))
prefs['site_entropy'] = prefs[amino_acids].apply(lambda x: entropy(x), axis=1)
prefs['site_n effective'] = prefs['site_entropy'].apply(lambda x: 2**x)
prefs = pd.melt(prefs, 
                id_vars=['site', 'condition', 'site_entropy', 'site_n effective'], 
                var_name='mutation', 
                value_name='mut_value')
prefs = pd.merge(prefs, RSA, on=['site', 'mutation'])
prefs.head()

Unnamed: 0,site,condition,site_entropy,site_n effective,mutation,mut_value,wildtype,site_RSA
0,1,raw preferences,1.809948,3.506295,A,0.005438,I,0.030457
1,1,rescaled preferences,1.317192,2.491807,A,0.000505,I,0.030457
2,2,raw preferences,1.851241,3.608105,A,0.007809,R,0.244526
3,2,rescaled preferences,0.543811,1.457819,A,0.000743,R,0.244526
4,3,raw preferences,1.375274,2.594171,A,0.006306,C,0.023952


In [131]:
df = pd.concat([mut_effects, prefs], sort=False)
df['label_site'] = df[['wildtype', 'site']].apply(lambda x: f"{x[0]} {x[1]}", axis=1)
df.head()

Unnamed: 0,site,wildtype,mutation,mut_value,site_RSA,site_entropy,site_n effective,condition,label_site
0,1,I,A,-5.04603,0.030457,1.809948,6.110127,mutational effects,I 1
1,1,I,C,-4.308696,0.030457,1.809948,6.110127,mutational effects,I 1
2,1,I,D,-3.88453,0.030457,1.809948,6.110127,mutational effects,I 1
3,1,I,E,-4.290569,0.030457,1.809948,6.110127,mutational effects,I 1
4,1,I,F,-4.370049,0.030457,1.809948,6.110127,mutational effects,I 1


In [None]:
df.to_csv('test.csv', index=False)