Build data for `dms-view` from raw data in supplementary files.

Import Python modules:

In [1]:
import pandas as pd

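Read the mapping from sequential (WSN) site numbers to H3 numbering, then derive the HA domain (HA1 or HA2), the protein chains, and the protein site for each site.
Protein sites are set up to match PDB 1rvx; HA2 sites are offset by 500 (for example, `218(HA2)` becomes protein site 718):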

In [2]:
def _protein_site_from_h3(h3_site):
    """Return the protein site for a single H3-numbered site label.

    Labels containing 'HA2' (e.g. '218(HA2)') have their last five
    characters stripped, are parsed as an integer and shifted by 500
    (so '218(HA2)' becomes 718). Any other label is returned unchanged.
    """
    if 'HA2' in h3_site:
        return int(h3_site[:-5]) + 500
    return h3_site


# Protein chains assigned to each HA domain.
_CHAINS_BY_DOMAIN = {'HA1': 'A C E', 'HA2': 'B D F'}

# The numbering file is a space-delimited, two-column table; '#' starts a
# comment.  Columns are named here because none are taken from the file.
sequential_to_h3 = (
    pd.read_csv(
        'Supplemental_File_6_WSN_to_H3_numbering_conversion.txt',
        sep=' ',
        comment='#',
        names=['site', 'H3_site'],
    )
    .assign(
        # A site belongs to HA2 when its H3 label mentions 'HA2', else HA1.
        domain=lambda df: df['H3_site'].map(
            lambda h3_site: 'HA2' if 'HA2' in h3_site else 'HA1'),
        protein_site=lambda df: df['H3_site'].map(_protein_site_from_h3),
        # Uses the `domain` column created earlier in this same assign call.
        protein_chain=lambda df: df['domain'].map(_CHAINS_BY_DOMAIN),
    )
)

sequential_to_h3

Unnamed: 0,site,H3_site,domain,protein_site,protein_chain
0,2,-8,HA1,-8,A C E
1,3,-7,HA1,-7,A C E
2,4,-6,HA1,-6,A C E
3,5,-5,HA1,-5,A C E
4,6,-4,HA1,-4,A C E
...,...,...,...,...,...
559,561,218(HA2),HA2,718,B D F
560,562,219(HA2),HA2,719,B D F
561,563,220(HA2),HA2,720,B D F
562,564,221(HA2),HA2,721,B D F


Now read the raw and re-scaled preferences (each file also gives the site entropy), calculate the effective number of amino acids at each site as $2^{\text{entropy}}$, and then merge with the `sequential_to_h3` data frame above:

In [3]:
# Column layout applied to each preferences file: site number, wildtype
# residue, site entropy, then one preference column per amino acid.
_PREFS_COLUMNS = ['site', 'wildtype', 'entropy',
                  'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
                  'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

# (condition label, file name) pairs, in the order they are concatenated.
_PREFS_SOURCES = [
    ('raw_measurements', 'Supplemental_File_2_HApreferences.txt'),
    ('rescaled_measurements', 'Supplemental_File_3_HApreferences_rescaled.txt'),
]

# Columns kept in the final tidy frame, in output order.
_OUTPUT_COLUMNS = ['site', 'label_site', 'wildtype', 'mutation', 'condition',
                   'protein_chain', 'protein_site',
                   'site_entropy_bits', 'site_n_effective', 'mut_preference']


def _read_prefs(fname, condition):
    """Read one space-delimited preferences file and tag every row with `condition`."""
    return (
        pd.read_csv(fname, sep=' ', comment='#', names=_PREFS_COLUMNS)
        .assign(condition=condition)
    )


prefs = (
    pd.concat([_read_prefs(fname, condition)
               for condition, fname in _PREFS_SOURCES])
    # Wide -> long: each amino-acid column becomes rows of
    # (mutation, mut_preference) for its site and condition.
    .melt(id_vars=['site', 'wildtype', 'condition', 'entropy'],
          var_name='mutation',
          value_name='mut_preference')
    .rename(columns={'entropy': 'site_entropy_bits'})
    # Effective number of amino acids at a site is 2 ** (site entropy).
    .assign(site_n_effective=lambda df: df['site_entropy_bits'].map(lambda bits: 2**bits))
    # Inner join (the pandas default, spelled out here): rows whose site has
    # no entry in `sequential_to_h3` are dropped.  `validate` raises if a
    # site appears more than once in `sequential_to_h3`.
    .merge(sequential_to_h3, on='site', how='inner', validate='many_to_one')
    .assign(label_site=lambda df: df['wildtype'] + ' ' + df['H3_site'])
    .loc[:, _OUTPUT_COLUMNS]
)

prefs

Unnamed: 0,site,label_site,wildtype,mutation,condition,protein_chain,protein_site,site_entropy_bits,site_n_effective,mut_preference
0,2,K -8,K,A,raw_measurements,A C E,-8,4.13229,17.536513,0.032454
1,2,K -8,K,A,rescaled_measurements,A C E,-8,3.84576,14.377690,0.019206
2,2,K -8,K,C,raw_measurements,A C E,-8,4.13229,17.536513,0.073960
3,2,K -8,K,C,rescaled_measurements,A C E,-8,3.84576,14.377690,0.085922
4,2,K -8,K,D,raw_measurements,A C E,-8,4.13229,17.536513,0.009216
...,...,...,...,...,...,...,...,...,...,...
22555,565,I 222(HA2),I,V,rescaled_measurements,B D F,722,2.03899,4.109577,0.034771
22556,565,I 222(HA2),I,W,raw_measurements,B D F,722,3.38570,10.451948,0.036676
22557,565,I 222(HA2),I,W,rescaled_measurements,B D F,722,2.03899,4.109577,0.012087
22558,565,I 222(HA2),I,Y,raw_measurements,B D F,722,3.38570,10.451948,0.043277


Write to data file:

In [4]:
# Save the tidy preferences table as CSV: the row index is omitted and
# floating-point values are written with four decimal places.
prefs.to_csv(
    'Doud2016.csv',
    float_format='%.4f',
    index=False,
)