# Data Wrangling for [Lee *et al.* (2019)](https://elifesciences.org/articles/49324)

Mutational antigenic profiling of H3 influenza escape from human and ferret sera.

## notebook setup

In [1]:
import pandas as pd

## mutational antigenic profiling data

In [2]:
# sera to keep, plus the metric columns that receive a 'site_' / 'mut_' prefix
target_sera = [
    '2010-age-21', '2009-age-53', '2009-age-64', '2009-age-65',
    'ferret-Pitt-1-preinf', 'ferret-Pitt-1-postinf',
    'ferret-Pitt-2-preinf', 'ferret-Pitt-2-postinf',
    'ferret-Pitt-3-preinf', 'ferret-Pitt-3-postinf',
    'ferret-WHO-Perth2009', 'ferret-WHO-Victoria2011',
]
site_metrics = ['abs_diffsel', 'positive_diffsel', 'negative_diffsel', 'max_diffsel', 'min_diffsel']
mut_metrics = ['mutdiffsel']

# read avg_sel_tidy.csv straight from the jbloomlab GitHub repository
df = pd.read_csv(
    'https://raw.githubusercontent.com/jbloomlab/map_flu_serum_Perth2009_H3_HA/master/'
    'results/avgdiffsel/avg_sel_tidy.csv',
    low_memory=False,
)

# keep only the rows whose serum is one of the target sera
df = df.loc[df['serum_name_formatted'].isin(target_sera)]

# discard the columns that are not needed downstream
df = df.drop(columns=['serum', 'serum_group', 'serum_vaccination', 'zoom_site'])

# rename the identifier columns; 'site' -> 'label_site' and 'isite' -> 'site'
# have to happen in the same call so the two names swap roles cleanly
df = df.rename(columns={
    'serum_name_formatted': 'condition',
    'site': 'label_site',
    'isite': 'site',
    'pdb_site': 'protein_site',
    'pdb_chain': 'protein_chain',
})

# prefix the metric columns and turn their underscores into spaces,
# e.g. 'abs_diffsel' -> 'site_abs diffsel', 'mutdiffsel' -> 'mut_mutdiffsel'
df = df.rename(columns={key: 'site_' + key.replace('_', ' ') for key in site_metrics})
df = df.rename(columns={key: 'mut_' + key.replace('_', ' ') for key in mut_metrics})

# the two columns that lead the final table
firstmut = 'mut_positive mutdiffsel'
firstsite = 'site_positive diffsel'

# positive mut_diffsel: negative values are clipped to zero
df[firstmut] = df['mut_mutdiffsel'].clip(lower=0)

# move the two leading columns to the front, keeping the rest in their existing order
cols = df.columns.tolist()
cols.remove(firstmut)
cols.remove(firstsite)
df = df[[firstmut, firstsite, *cols]]

df.head(5)

Unnamed: 0,mut_positive mutdiffsel,site_positive diffsel,condition,label_site,wildtype,mutation,mut_mutdiffsel,site_abs diffsel,site_negative diffsel,site_max diffsel,site_min diffsel,site,protein_chain,protein_site
90560,3.6671,15.837,2010-age-21,193,F,D,3.6671,17.384,-1.5467,3.6671,-0.52377,208,A,193
90561,3.1793,15.837,2010-age-21,193,F,N,3.1793,17.384,-1.5467,3.6671,-0.52377,208,A,193
90562,2.082,15.837,2010-age-21,193,F,Q,2.082,17.384,-1.5467,3.6671,-0.52377,208,A,193
90563,2.0098,15.837,2010-age-21,193,F,E,2.0098,17.384,-1.5467,3.6671,-0.52377,208,A,193
90564,1.096,15.837,2010-age-21,193,F,L,1.096,17.384,-1.5467,3.6671,-0.52377,208,A,193


In [3]:
# save the wrangled table as CSV; the row index is not written
df.to_csv(path_or_buf='Lee2019.csv', index=False)