# Data Wrangling for [Doud, Lee, and Bloom (2018)](https://research.fhcrc.org/content/dam/stripe/bloom/labfiles/publications/Doud2018.pdf)

Mutational Antigenic Profiling of influenza monoclonal antibodies.

## notebook setup

In [1]:
import pandas as pd

## mutational antigenic profiling data

In [2]:
antibodies = ['C179', 'FI6v3', 'H17L7', 'H17L10', 'H17L19', 'S139']

# HA2 labels look like '(HA2)1': the site number follows a 5-character prefix,
# and HA2 numbers are shifted by a fixed offset to form `protein_site`
# (e.g. '(HA2)1' -> 501).
HA2_PREFIX_LEN = 5
HA2_SITE_OFFSET = 500

# Build one merged (site-level + mutation-level) table per antibody, then stack them.
frames = []
for condition in antibodies:
    mut = pd.read_csv(f'antibody_{condition}_median.csv')
    site = pd.read_csv(f'antibody_{condition}_median_avgsite.csv')

    # Site-level columns: keep the 'site' key and relabel every other column using
    # the first three characters of its name (yielding e.g. 'site_avg ...', 'site_max ...').
    site.columns = [x if x == 'site' else f"site_{x[:3]} excess frac survive" for x in site.columns.values]

    # Mutation-level column: give it a name parallel to the site-level ones.
    mut = mut.rename(columns={'mutfracsurvive': 'mut_excess frac survive'})

    # Attach the site-level summaries to every mutation row and tag the antibody.
    frames.append(pd.merge(site, mut, on=['site']).assign(condition=condition))
df = pd.concat(frames).rename(columns={'site': 'label_site'})
n_rows_before_numbering = len(df)

# Numbering map: 'new' holds the labels used in the antibody files, 'original' the
# other numbering, which ends up in the final `site` column.
numbering = pd.read_csv('numbering_map.csv').rename(columns={'original': 'site', 'new': 'label_site'})
# Labels containing 'HA2' get chains 'B D F'; all others get 'A C E'.
numbering['protein_chain'] = numbering['label_site'].apply(lambda x: 'B D F' if 'HA2' in x else 'A C E')
# HA2 labels become integer positions shifted by the offset; other labels pass through unchanged.
numbering['protein_site'] = numbering['label_site'].apply(
    lambda x: int(x[HA2_PREFIX_LEN:]) + HA2_SITE_OFFSET if 'HA2' in x else x
)

# `validate` makes pandas raise if a label occurs more than once in the numbering map,
# which would otherwise silently duplicate measurement rows in this left join.
df = pd.merge(df, numbering, on='label_site', how='left', validate='many_to_one')
assert len(df) == n_rows_before_numbering, (
    f"numbering merge changed the row count: {n_rows_before_numbering} -> {len(df)}"
)

df.head()

Unnamed: 0,label_site,site_avg excess frac survive,site_max excess frac survive,wildtype,mutation,mut_excess frac survive,condition,site,protein_chain,protein_site
0,(HA2)1,6e-06,0.000119,G,R,0.000119053,C179,344,B D F,501
1,(HA2)1,6e-06,0.000119,G,G,9.322773e-07,C179,344,B D F,501
2,(HA2)1,6e-06,0.000119,G,A,0.0,C179,344,B D F,501
3,(HA2)1,6e-06,0.000119,G,M,0.0,C179,344,B D F,501
4,(HA2)1,6e-06,0.000119,G,D,0.0,C179,344,B D F,501


In [3]:
# Write the merged table to disk; index=False leaves the row index out of the file.
output_csv = 'Doud2018.csv'
df.to_csv(output_csv, index=False)