# [Dingens *et al.*, 2019](https://research.fhcrc.org/content/dam/stripe/bloom/labfiles/publications/Dingens2019.pdf) data: Mutational Antigenic Profiling of HIV Env from a panel of mAbs

## notebook setup

In [1]:
import pandas as pd

## MAP data

In [2]:
antibodies = ['101074', 'PGT145', 'PGT151', 'PGT121', 'VRC34', 'VRC01',
              '10E8', '3BNC117', 'PG9']

# Common prefix shared by every per-antibody summary CSV.
base_url = ('https://raw.githubusercontent.com/jbloomlab/'
            'EnvsAntigenicAtlas/master/results/fracsurviveaboveavg/'
            'concavg_wtDNA_ctrl/')

# One merged (site-level + mutation-level) frame per antibody.
frames = []
for condition in antibodies:
    site = pd.read_csv(f'{base_url}summary_{condition}-mediansitefracsurvive.csv')
    mut = pd.read_csv(f'{base_url}summary_{condition}-medianmutfracsurvive.csv')

    # Prefix every site-level column except the join key with "site_",
    # turning any underscores in the original name into spaces.
    site = site.rename(
        columns=lambda c: c if c == 'site' else f"site_{c.replace('_', ' ')}")

    # Give the mutation-level value a "mut_" prefix.
    mut = mut.rename(columns={'mutfracsurvive': 'mut_fracsurvive'})

    # Join the two tables on the site and tag the rows with the antibody.
    merged = site.merge(mut, on=['site'])
    frames.append(merged.assign(condition=condition))

df = pd.concat(frames).rename(columns={'site': 'label_site'})

# Move the site-average column to the front, keeping the others in order.
firstsite = 'site_avgfracsurvive'
cols = df.columns.tolist()
cols.remove(firstsite)
df = df[[firstsite, *cols]]
df.head()

Unnamed: 0,site_avgfracsurvive,label_site,site_maxfracsurvive,wildtype,mutation,mut_fracsurvive,condition
0,0.071198,334,0.128302,S,L,0.128302,101074
1,0.071198,334,0.128302,S,R,0.121295,101074
2,0.071198,334,0.128302,S,F,0.107862,101074
3,0.071198,334,0.128302,S,Q,0.104261,101074
4,0.071198,334,0.128302,S,A,0.100473,101074


## protein data

In [3]:
def protein_map(fname):
    """Build a table of chain and residue identifiers from a PDB file.

    Every line that starts with ``ATOM`` contributes one row, so a residue
    appears once per atom record.

    Fields are taken from the fixed character columns of the PDB format
    rather than by whitespace splitting: adjacent fields are not guaranteed
    to be separated by a space (for example a chain ID directly followed by
    a four-digit residue number), and a blank chain ID would otherwise shift
    every later field.

    Parameters
    ----------
    fname : str
        Path of the PDB file to read.

    Returns
    -------
    pandas.DataFrame
        String columns ``protein_chain``, ``protein_site`` and
        ``label_site`` (a copy of ``protein_site``).
    """
    records = []
    with open(fname, "r") as f:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in f:
            if line.startswith("ATOM"):
                # PDB columns (1-based): 22 = chain ID, 23-26 = residue
                # sequence number, 27 = insertion code.
                chain = line[21:22].strip()
                site = line[22:27].strip()
                records.append([chain, site])
    df = pd.DataFrame(records, columns=["protein_chain", "protein_site"])
    df["label_site"] = df["protein_site"]
    return df

In [4]:
# Read chain/residue identifiers from the PDB file named below and left-join
# them onto the data, so rows without a match in the structure are retained.
protein_fname = '5fyl_trimer_renumber.pdb'
pdb_map = protein_map(protein_fname)
df = df.merge(pdb_map, how='left', on='label_site')

## fix sequential numbers

In [5]:
HXB2_fname = ('https://raw.githubusercontent.com/jbloomlab/'
              'EnvsAntigenicAtlas/master/results/HXB2_numbering/'
              'BG505_to_HXB2.csv')

# Numbering table: its "new" column is matched against label_site, and its
# "original" column is what the sequential site index is derived from.
HXB2_map = pd.read_csv(HXB2_fname)
HXB2_map = HXB2_map.rename(columns={"new": "label_site"})
HXB2_map = HXB2_map[["original", "label_site"]].sort_values(by='original')

df = df.merge(HXB2_map, how='left', on=['label_site'])

# Replace each "original" value by its zero-based rank among the distinct
# values present in the data, then discard the helper column.
sites = df['original'].unique()
sites.sort()
site_map = dict(zip(sites, range(len(sites))))
df['site'] = df['original'].map(site_map)
df = df.drop(columns='original')
df.head()

Unnamed: 0,site_avgfracsurvive,label_site,site_maxfracsurvive,wildtype,mutation,mut_fracsurvive,condition,protein_chain,protein_site,site
0,0.071198,334,0.128302,S,L,0.128302,101074,G,334,302
1,0.071198,334,0.128302,S,L,0.128302,101074,G,334,302
2,0.071198,334,0.128302,S,L,0.128302,101074,G,334,302
3,0.071198,334,0.128302,S,L,0.128302,101074,G,334,302
4,0.071198,334,0.128302,S,L,0.128302,101074,G,334,302


## clean up dataframe

In [6]:
# Discard rows whose mut_fracsurvive is exactly zero, then collapse rows
# that are identical in every column.
zero_rows = df.loc[df['mut_fracsurvive'] == 0].index
df = df.drop(index=zero_rows).drop_duplicates()
df.head()

Unnamed: 0,site_avgfracsurvive,label_site,site_maxfracsurvive,wildtype,mutation,mut_fracsurvive,condition,protein_chain,protein_site,site
0,0.071198,334,0.128302,S,L,0.128302,101074,G,334,302
18,0.071198,334,0.128302,S,R,0.121295,101074,G,334,302
36,0.071198,334,0.128302,S,F,0.107862,101074,G,334,302
54,0.071198,334,0.128302,S,Q,0.104261,101074,G,334,302
72,0.071198,334,0.128302,S,A,0.100473,101074,G,334,302


## output

In [7]:
# Write the final table to disk; the row index is not written.
output_csv = 'Dingens2019.csv'
df.to_csv(output_csv, index=False)