# Data for [Lee *et al.*, 2018](http://www.pnas.org/content/115/35/E8276): Mutational Tolerance of Perth09 H3 HA

## notebook setup

In [1]:
import pandas as pd
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

## H3 prefs

In [2]:
# Read the per-site table and reshape it to long form: `site`, `entropy` and
# `neffective` stay as identifier columns, every other column header becomes
# a value in `mutation`, and its cell value goes to `mut_rescaled prefs`.
df = (
    pd.read_csv('summary_avgprefs_rescaled_entropies.csv')
    .melt(
        id_vars=['site', 'entropy', 'neffective'],
        var_name='mutation',
        value_name='mut_rescaled prefs',
    )
    .rename(columns={'entropy': 'site_entropy', 'neffective': 'site_neffective'})
)
df.head()

Unnamed: 0,site,site_entropy,site_neffective,mutation,mut_rescaled prefs
0,1,2.666732,14.392859,A,0.011568
1,2,2.241582,9.408201,A,0.000721
2,3,2.661328,14.315294,A,0.00844
3,4,2.417455,11.217279,A,0.192305
4,5,1.963548,7.124559,A,0.022888


In [3]:
# Long-form table of natural frequencies: one row per (site, mutation).
nat = pd.read_csv('H3_alignment_frequences.csv').melt(
    id_vars='site',
    var_name='mutation',
    value_name='mut_natural frequencies',
)
# Default (inner) join: only (site, mutation) pairs present in both tables survive.
df = df.merge(nat, on=['site', 'mutation'])
df.head()

Unnamed: 0,site,site_entropy,site_neffective,mutation,mut_rescaled prefs,mut_natural frequencies
0,1,2.666732,14.392859,A,0.011568,0.0
1,2,2.241582,9.408201,A,0.000721,0.0
2,3,2.661328,14.315294,A,0.00844,0.010309
3,4,2.417455,11.217279,A,0.192305,0.0
4,5,1.963548,7.124559,A,0.022888,0.0


In [4]:
# Renumbering table: `original` is renamed to the join key `site`,
# and `new` becomes the `label_site` column attached to every row of `df`.
m = pd.read_csv('H3renumbering_scheme.csv')
m = m.rename(columns={'original': 'site', 'new': 'label_site'})
df = df.merge(m, on=['site'])
df.head()

Unnamed: 0,site,site_entropy,site_neffective,mutation,mut_rescaled prefs,mut_natural frequencies,label_site
0,1,2.666732,14.392859,A,0.011568,0.0,-16
1,1,2.666732,14.392859,C,0.088795,0.0,-16
2,1,2.666732,14.392859,D,0.158648,0.0,-16
3,1,2.666732,14.392859,E,0.033813,0.0,-16
4,1,2.666732,14.392859,F,0.024934,0.0,-16


## shifted preferences

In [5]:
# Lookup used to re-key the distance table: its `new` column matches the
# distance table's `site`, and its `original` column (renamed `H3_seq`)
# supplies the value that replaces it.
shift_map = (
    pd.read_csv('Perth2009_compareprefs_renumber.csv')
    .rename(columns={'original': 'H3_seq', 'new': 'site'})
)

shift = (
    pd.read_csv('Perth_to_WSN_prefs_dist.csv')
    .drop(columns=['RMSDbetween', 'RMSDwithin'])
    # long form: one row per (site, mutation), keeping the per-site corrected RMSD
    .melt(id_vars=['site', 'RMSDcorrected'], var_name='mutation', value_name='mut_RMSD')
    .rename(columns={'RMSDcorrected': 'site_RMSDcorrected'})
    # swap the `site` column for the mapped `H3_seq` value
    .merge(shift_map, on='site')
    .drop(columns='site')
    .rename(columns={'H3_seq': 'site'})
    # attach `label_site` from the renumbering table `m`
    .merge(m, on='site')
)

# Left join so that every row already in `df` is kept, even when no
# distance data exists for it (those rows get NaN in the new columns).
df = df.merge(shift, how='left', on=['site', 'label_site', 'mutation'])
df.head()

Unnamed: 0,site,site_entropy,site_neffective,mutation,mut_rescaled prefs,mut_natural frequencies,label_site,site_RMSDcorrected,mut_RMSD
0,1,2.666732,14.392859,A,0.011568,0.0,-16,,
1,1,2.666732,14.392859,C,0.088795,0.0,-16,,
2,1,2.666732,14.392859,D,0.158648,0.0,-16,,
3,1,2.666732,14.392859,E,0.033813,0.0,-16,,
4,1,2.666732,14.392859,F,0.024934,0.0,-16,,


In [6]:
# Append three columns: a constant chain label, a copy of `label_site`, and a
# constant condition label. One column per `assign` call so the resulting
# column order never depends on how keyword arguments are ordered.
df = (
    df
    .assign(protein_chain='A')
    .assign(protein_site=lambda frame: frame['label_site'])
    .assign(condition='H3 rescaled prefs')
)
df.head()

Unnamed: 0,site,site_entropy,site_neffective,mutation,mut_rescaled prefs,mut_natural frequencies,label_site,site_RMSDcorrected,mut_RMSD,protein_chain,protein_site,condition
0,1,2.666732,14.392859,A,0.011568,0.0,-16,,,A,-16,H3 rescaled prefs
1,1,2.666732,14.392859,C,0.088795,0.0,-16,,,A,-16,H3 rescaled prefs
2,1,2.666732,14.392859,D,0.158648,0.0,-16,,,A,-16,H3 rescaled prefs
3,1,2.666732,14.392859,E,0.033813,0.0,-16,,,A,-16,H3 rescaled prefs
4,1,2.666732,14.392859,F,0.024934,0.0,-16,,,A,-16,H3 rescaled prefs


## determine wildtype

In [7]:
# Read the first record of the reference FASTA. The first line is treated as
# the header; every following line up to the next '>' header (or end of file)
# is sequence. Joining these lines, instead of taking only the second line of
# the file, keeps the result correct when the sequence is wrapped over lines.
with open('Perth09_HA_reference.fa', 'r') as f:
    fasta_lines = [line.strip() for line in f]

record_lines = []
for line in fasta_lines[1:]:
    if line.startswith('>'):
        break
    record_lines.append(line)
coding_dna = ''.join(record_lines)
if not coding_dna:
    raise ValueError("No sequence found in 'Perth09_HA_reference.fa'")

# No alphabet argument is passed: translating a plain Seq yields the same
# protein string, and Biopython >= 1.78 no longer accepts an alphabet.
# NOTE(review): the `Bio.Alphabet` import in the setup cell still requires an
# older Biopython; confirm the pinned version or drop that import.
protein = str(Seq(coding_dna).translate())

# Drop the stop-codon symbol only when it is actually there; unconditionally
# slicing off the last character would discard a real residue otherwise.
if protein.endswith('*'):
    protein = protein[:-1]
seq = protein

# One row per residue, numbered from 1, then attached to every row of `df`
# that shares the same `site` (default inner join).
wt = pd.DataFrame({'wildtype': list(seq), 'site': list(range(1, len(seq) + 1))})
df = pd.merge(df, wt, on=['site'])
df.head()

Unnamed: 0,site,site_entropy,site_neffective,mutation,mut_rescaled prefs,mut_natural frequencies,label_site,site_RMSDcorrected,mut_RMSD,protein_chain,protein_site,condition,wildtype
0,1,2.666732,14.392859,A,0.011568,0.0,-16,,,A,-16,H3 rescaled prefs,M
1,1,2.666732,14.392859,C,0.088795,0.0,-16,,,A,-16,H3 rescaled prefs,M
2,1,2.666732,14.392859,D,0.158648,0.0,-16,,,A,-16,H3 rescaled prefs,M
3,1,2.666732,14.392859,E,0.033813,0.0,-16,,,A,-16,H3 rescaled prefs,M
4,1,2.666732,14.392859,F,0.024934,0.0,-16,,,A,-16,H3 rescaled prefs,M


In [8]:
# Write the assembled table to disk; the row index is not included in the file.
df.to_csv(path_or_buf='Lee2018.csv', index=False)