# Processing data for HIV rabbit serum mapping experiments

## notebook setup

In [1]:
import pandas as pd

## create numbering maps 

In order to get the data into the dms-view format, we need a map from BG505 numbering to HxB2 numbering and from HxB2 numbering to protein chain/site in the protein structure file.

In [2]:
# create the HxB2 map
HXB2_fname = ('https://raw.githubusercontent.com/jbloomlab/'
              'EnvsAntigenicAtlas/master/results/HXB2_numbering/'
              'BG505_to_HXB2.csv')
HXB2_map = (pd.read_csv(HXB2_fname).rename(columns={"new": "label_site"})
            [["original", "label_site"]]).sort_values(by='original')
HXB2_map['site'] = [x+1 for x in range(len(HXB2_map))]

# create the protein map 
protein_fname = '5FYL_AbsRemoved.pdb'
pdb_map = []
with open(protein_fname, "r") as f:
    for line in f.readlines():
        if line.startswith("ATOM"):
            line = line.split()
            pdb_map.append(line[4:6])
pdb_map = pd.DataFrame(pdb_map, columns=["protein_chain", "protein_site"])
pdb_map["label_site"] = pdb_map["protein_site"]

## process the _site_ data

The code below processes the site differential selection summary metrics from the different conditions into a tidy data format with the following columns: 'label_site', 'metric', 'value', 'condition', 'site', 'protein_chain', 'protein_site'

In [3]:
'Create site metric long form data file'
site_metrics = {'mediansitediffsel': 'median'}
conditions = ['2124-Wk22', '2423-Wk18', '5724-Wk26', '2214-Wk43', '2425-Wk18', '5727-Wk26']
# process the site metrics
site = []
for condition in conditions:
    for metric in site_metrics.keys():
        fname = (f'summary_{condition}-{metric}.csv')
        print(f'reading ... {fname}')
        sitemetricdf = pd.read_csv(fname)
        sitemetricdf = pd.melt(sitemetricdf, id_vars='site',
                               var_name='metric')
        sitemetricdf['metric'] = (sitemetricdf['metric']
                                  .apply(lambda x:
                                         f"{' '.join(x.split('_'))} "
                                         f'({site_metrics[metric]} of reps)'))
        sitemetricdf['condition'] = condition
        site.append(sitemetricdf)
site = pd.concat(site).rename(columns={'site': 'label_site'})

# process the site numbering
site = pd.merge(site, HXB2_map, on=["label_site"]).astype({'site': 'int32'}).sort_values("site", ascending=True)

# process the protein numbering
site = pd.merge(site, pdb_map, on="label_site").drop(columns='original')

site.head()

reading ... summary_2124-Wk22-mediansitediffsel.csv
reading ... summary_2423-Wk18-mediansitediffsel.csv
reading ... summary_5724-Wk26-mediansitediffsel.csv
reading ... summary_2214-Wk43-mediansitediffsel.csv
reading ... summary_2425-Wk18-mediansitediffsel.csv
reading ... summary_5727-Wk26-mediansitediffsel.csv


Unnamed: 0,label_site,metric,value,condition,site,protein_chain,protein_site
0,31,min diffsel (median of reps),-1.802861,5727-Wk26,1,G,31
1,31,min diffsel (median of reps),-1.802861,5727-Wk26,1,G,31
2,31,min diffsel (median of reps),-1.802861,5727-Wk26,1,G,31
3,31,min diffsel (median of reps),-1.802861,5727-Wk26,1,G,31
4,31,min diffsel (median of reps),-1.802861,5727-Wk26,1,G,31


## process the _mutation_ data

The code below processes the mutation differential selection, preferences, and natural frequencies into a tidy data format with the following columns: 'label_site', 'metric', 'value', 'condition', 'site', 'protein_chain', 'protein_site', 'wildtype', 'mutatation' 


### mutation differential selection

In [4]:
mut_metrics = {'medianmutdiffsel':
               'mut diffsel (median of reps)'}
conditions = ['2124-Wk22', '2423-Wk18', '5724-Wk26', '2214-Wk43',
              '2425-Wk18', '5727-Wk26']
# process the mut metrics
mut = []
for condition in conditions:
    for mut_metric in mut_metrics.keys():
        fname = (f'summary_{condition}-{mut_metric}.csv')
        print(f'reading ... {fname}')
        mutmetricdf = pd.read_csv(fname)
        mutmetricdf = pd.melt(mutmetricdf,
                              id_vars=['site', 'wildtype', 'mutation'],
                              var_name='metric')
        mutmetricdf['metric'] = mut_metrics[mut_metric]
        mutmetricdf['value'] = mutmetricdf['value'].round(3)
        mutmetricdf['condition'] = condition
        mut.append(mutmetricdf)
mut = pd.concat(mut).rename(columns={'site': 'label_site'})

# process the site numbering
mut = pd.merge(mut, HXB2_map, on=["label_site"]).astype({'site': 'int32'}).sort_values("site", ascending=True)

# process the protein numbering
mut = pd.merge(mut, pdb_map, on="label_site")

mut.head()

reading ... summary_2124-Wk22-medianmutdiffsel.csv
reading ... summary_2423-Wk18-medianmutdiffsel.csv
reading ... summary_5724-Wk26-medianmutdiffsel.csv
reading ... summary_2214-Wk43-medianmutdiffsel.csv
reading ... summary_2425-Wk18-medianmutdiffsel.csv
reading ... summary_5727-Wk26-medianmutdiffsel.csv


Unnamed: 0,label_site,wildtype,mutation,metric,value,condition,original,site,protein_chain,protein_site
0,31,A,E,mut diffsel (median of reps),-0.044,2425-Wk18,30,1,G,31
1,31,A,E,mut diffsel (median of reps),-0.044,2425-Wk18,30,1,G,31
2,31,A,E,mut diffsel (median of reps),-0.044,2425-Wk18,30,1,G,31
3,31,A,E,mut diffsel (median of reps),-0.044,2425-Wk18,30,1,G,31
4,31,A,E,mut diffsel (median of reps),-0.044,2425-Wk18,30,1,G,31


### preferences

In [5]:
template = mut.drop(columns=['metric', 'value', 'mutation']).drop_duplicates()
template.head()

Unnamed: 0,label_site,wildtype,condition,original,site,protein_chain,protein_site
0,31,A,2425-Wk18,30,1,G,31
40,31,A,2214-Wk43,30,1,G,31
155,31,A,5727-Wk26,30,1,G,31
295,31,A,5724-Wk26,30,1,G,31
310,31,A,2423-Wk18,30,1,G,31


In [6]:
fname = 'BG505-avg-rescaled-prefs_ADrealigned.csv'
prefs = pd.read_csv(fname).rename(columns={'site': 'label_site'})
prefs = pd.melt(prefs, id_vars='label_site', var_name='mutation')
prefs['metric'] = 'DMS preferences'
prefs = pd.merge(prefs, template, on=['label_site'])
prefs.head()

Unnamed: 0,label_site,mutation,value,metric,wildtype,condition,original,site,protein_chain,protein_site
0,100,A,0.009841,DMS preferences,M,5727-Wk26,99,70,G,100
1,100,A,0.009841,DMS preferences,M,2423-Wk18,99,70,G,100
2,100,A,0.009841,DMS preferences,M,5724-Wk26,99,70,G,100
3,100,A,0.009841,DMS preferences,M,2124-Wk22,99,70,G,100
4,100,A,0.009841,DMS preferences,M,2425-Wk18,99,70,G,100


### natural frequencies

In [7]:
fname = 'LANL_NatFreq_dropIndelsRelBG505.csv'
freqs = pd.read_csv(fname).rename(columns={'site': 'label_site'})
freqs = pd.melt(freqs, id_vars='label_site', var_name='mutation')
freqs['metric'] = 'Natural Frequencies'
freqs = pd.merge(freqs, template, on=['label_site'])
freqs.head()

Unnamed: 0,label_site,mutation,value,metric,wildtype,condition,original,site,protein_chain,protein_site
0,31,A,0.270254,Natural Frequencies,A,2425-Wk18,30,1,G,31
1,31,A,0.270254,Natural Frequencies,A,2214-Wk43,30,1,G,31
2,31,A,0.270254,Natural Frequencies,A,5727-Wk26,30,1,G,31
3,31,A,0.270254,Natural Frequencies,A,5724-Wk26,30,1,G,31
4,31,A,0.270254,Natural Frequencies,A,2423-Wk18,30,1,G,31


### all together

In [8]:
mut = pd.concat([mut, prefs, freqs], sort=False)
mut.head()

Unnamed: 0,label_site,wildtype,mutation,metric,value,condition,original,site,protein_chain,protein_site
0,31,A,E,mut diffsel (median of reps),-0.044,2425-Wk18,30,1,G,31
1,31,A,E,mut diffsel (median of reps),-0.044,2425-Wk18,30,1,G,31
2,31,A,E,mut diffsel (median of reps),-0.044,2425-Wk18,30,1,G,31
3,31,A,E,mut diffsel (median of reps),-0.044,2425-Wk18,30,1,G,31
4,31,A,E,mut diffsel (median of reps),-0.044,2425-Wk18,30,1,G,31


## final dataframe

In [9]:
# process the mut data
mut['metric'] = mut['metric'].apply(lambda x: f'mut_{x}')
mut = pd.pivot_table(mut, index=['site', 'label_site', 'condition',
                                 'protein_site', 'protein_chain',
                                 'wildtype', 'mutation'],
                     columns='metric', values='value').reset_index()

# process the site data
site['metric'] = site['metric'].apply(lambda x: f'site_{x}')
site = pd.pivot_table(site, index=['site', 'label_site', 'condition',
                                   'protein_site', 'protein_chain'],
                      columns='metric', values='value').reset_index()

# combine the two dataframes
df = pd.merge(mut, site, on=['site', 'label_site', 'condition',
                             'protein_site', 'protein_chain'])
assert len(mut) == len(df)
df.head()

metric,site,label_site,condition,protein_site,protein_chain,wildtype,mutation,mut_DMS preferences,mut_Natural Frequencies,mut_mut diffsel (median of reps),site_abs diffsel (median of reps),site_max diffsel (median of reps),site_min diffsel (median of reps),site_negative diffsel (median of reps),site_positive diffsel (median of reps)
0,1,31,2124-Wk22,31,G,A,A,0.051555,0.270254,,10.235501,0.071443,-2.20906,-10.13826,0.097242
1,1,31,2124-Wk22,31,G,A,C,0.020093,0.0,0.0,10.235501,0.071443,-2.20906,-10.13826,0.097242
2,1,31,2124-Wk22,31,G,A,D,0.060383,0.009202,-1.293,10.235501,0.071443,-2.20906,-10.13826,0.097242
3,1,31,2124-Wk22,31,G,A,E,0.072537,0.076215,-0.1,10.235501,0.071443,-2.20906,-10.13826,0.097242
4,1,31,2124-Wk22,31,G,A,F,0.128547,0.0,-0.257,10.235501,0.071443,-2.20906,-10.13826,0.097242


In [10]:
df.to_csv('Dingens2020.csv', index=False)