# Processing data for HIV rabbit serum mapping experiments

## notebook setup

In [1]:
import pandas as pd

## create numbering maps 

In order to get the data into the dms-view format, we need a map from BG505 numbering to HxB2 numbering and from HxB2 numbering to protein chain/site in the protein structure file.

In [2]:
# create the HxB2 map
HXB2_fname = ('https://raw.githubusercontent.com/jbloomlab/'
              'EnvsAntigenicAtlas/master/results/HXB2_numbering/'
              'BG505_to_HXB2.csv')
HXB2_map = (pd.read_csv(HXB2_fname).rename(columns={"new": "label_site"})
            [["original", "label_site"]]).sort_values(by='original')
HXB2_map = HXB2_map.drop(columns='original')
HXB2_map['site'] = [x+1 for x in range(len(HXB2_map))]

# create the protein map 
protein_fname = '5fyl_trimer_renumber.pdb'
pdb_map = []
with open(protein_fname, "r") as f:
    for line in f.readlines():
        if line.startswith("ATOM"):
            line = line.split()
            pdb_map.append(line[4:6])
pdb_map = pd.DataFrame(pdb_map, columns=["protein_chain", "protein_site"])
pdb_map["label_site"] = pdb_map["protein_site"]

## process the _site_ data

The code below processes the site differential selection summary metrics from the different conditions into a tidy data format with the following columns: 'label_site', 'metric', 'value', 'condition', 'site', 'protein_chain', 'protein_site'

In [3]:
'Create site metric long form data file'
conditions = ['2124-Wk22', '2423-Wk18', '5724-Wk26', '2214-Wk43', '2425-Wk18', '5727-Wk26']
# process the site metrics
site = []
for condition in conditions:
    fname = (f'summary_{condition}-mediansitediffsel.csv')
    print(f'reading ... {fname}')
    sitemetricdf = pd.read_csv(fname)
    sitemetricdf.columns = [f"site_{' '.join(x.split('_'))}" if x != 'site' 
                            else x for x in sitemetricdf.columns.values]
    sitemetricdf['condition'] = condition
    site.append(sitemetricdf)
site = pd.concat(site).rename(columns={'site': 'label_site'}).round(3)
# process the site numbering
site = pd.merge(site, HXB2_map, on=["label_site"], how='left').astype({'site': 'int32'}).sort_values("site", ascending=True)

# process the protein numbering
site = pd.merge(site, pdb_map, on="label_site", how='left')

site.head()

reading ... summary_2124-Wk22-mediansitediffsel.csv
reading ... summary_2423-Wk18-mediansitediffsel.csv
reading ... summary_5724-Wk26-mediansitediffsel.csv
reading ... summary_2214-Wk43-mediansitediffsel.csv
reading ... summary_2425-Wk18-mediansitediffsel.csv
reading ... summary_5727-Wk26-mediansitediffsel.csv


Unnamed: 0,label_site,site_abs diffsel,site_positive diffsel,site_negative diffsel,site_max diffsel,site_min diffsel,condition,site,protein_chain,protein_site
0,31,4.761,2.784,-1.977,0.64,-0.687,2423-Wk18,1,G,31
1,31,4.761,2.784,-1.977,0.64,-0.687,2423-Wk18,1,G,31
2,31,4.761,2.784,-1.977,0.64,-0.687,2423-Wk18,1,G,31
3,31,4.761,2.784,-1.977,0.64,-0.687,2423-Wk18,1,G,31
4,31,4.761,2.784,-1.977,0.64,-0.687,2423-Wk18,1,G,31


## process the _mutation_ data

The code below processes the mutation differential selection, preferences, and natural frequencies into a tidy data format with the following columns: 'label_site', 'metric', 'value', 'condition', 'site', 'protein_chain', 'protein_site', 'wildtype', 'mutatation' 


### mutation differential selection

In [4]:
conditions = ['2124-Wk22', '2423-Wk18', '5724-Wk26', '2214-Wk43',
              '2425-Wk18', '5727-Wk26']
# process the mut metrics
mut = []
for condition in conditions:
    fname = (f'summary_{condition}-medianmutdiffsel.csv')
    print(f'reading ... {fname}')
    mutmetricdf = pd.read_csv(fname)
    mutmetricdf.columns = [f"mut_{' '.join(x.split('_'))}" if x not in ['site', 'wildtype', 'mutation']
                        else x for x in mutmetricdf.columns.values]
    mutmetricdf = mutmetricdf.round(3)
    mutmetricdf['condition'] = condition
    mut.append(mutmetricdf)
mut = pd.concat(mut).rename(columns={'site': 'label_site'})

# process the site numbering
mut = pd.merge(mut, HXB2_map, on=["label_site"], how='left').astype({'site': 'int32'}).sort_values("site", ascending=True)

# process the protein numbering
mut = pd.merge(mut, pdb_map, on="label_site", how='left')

mut.head()

reading ... summary_2124-Wk22-medianmutdiffsel.csv
reading ... summary_2423-Wk18-medianmutdiffsel.csv
reading ... summary_5724-Wk26-medianmutdiffsel.csv
reading ... summary_2214-Wk43-medianmutdiffsel.csv
reading ... summary_2425-Wk18-medianmutdiffsel.csv
reading ... summary_5727-Wk26-medianmutdiffsel.csv


Unnamed: 0,label_site,wildtype,mutation,mut_mutdiffsel,condition,site,protein_chain,protein_site
0,31,A,V,-0.183,5727-Wk26,1,G,31
1,31,A,V,-0.183,5727-Wk26,1,G,31
2,31,A,V,-0.183,5727-Wk26,1,G,31
3,31,A,V,-0.183,5727-Wk26,1,G,31
4,31,A,V,-0.183,5727-Wk26,1,G,31


### preferences

In [5]:
fname = 'BG505-avg-rescaled-prefs_ADrealigned.csv'
prefs = pd.read_csv(fname).rename(columns={'site': 'label_site'})
prefs = pd.melt(prefs, id_vars='label_site', var_name='mutation', value_name='mut_DMS preferences')
mut = pd.merge(mut, prefs, on=['label_site', 'mutation'])
mut.head()

Unnamed: 0,label_site,wildtype,mutation,mut_mutdiffsel,condition,site,protein_chain,protein_site,mut_DMS preferences
0,31,A,V,-0.183,5727-Wk26,1,G,31,0.044294
1,31,A,V,-0.183,5727-Wk26,1,G,31,0.044294
2,31,A,V,-0.183,5727-Wk26,1,G,31,0.044294
3,31,A,V,-0.183,5727-Wk26,1,G,31,0.044294
4,31,A,V,-0.183,5727-Wk26,1,G,31,0.044294


### natural frequencies

In [6]:
fname = 'LANL_NatFreq_dropIndelsRelBG505.csv'
freqs = pd.read_csv(fname).rename(columns={'site': 'label_site'})
freqs = pd.melt(freqs, id_vars='label_site', var_name='mutation', value_name='mut_Natural Frequencies')
mut = pd.merge(mut, freqs, on=['label_site', 'mutation'])
mut.head()

Unnamed: 0,label_site,wildtype,mutation,mut_mutdiffsel,condition,site,protein_chain,protein_site,mut_DMS preferences,mut_Natural Frequencies
0,31,A,V,-0.183,5727-Wk26,1,G,31,0.044294,0.118424
1,31,A,V,-0.183,5727-Wk26,1,G,31,0.044294,0.118424
2,31,A,V,-0.183,5727-Wk26,1,G,31,0.044294,0.118424
3,31,A,V,-0.183,5727-Wk26,1,G,31,0.044294,0.118424
4,31,A,V,-0.183,5727-Wk26,1,G,31,0.044294,0.118424


## final dataframe

In [7]:
mut['mut_pos mutdiffsel'] = mut['mut_mutdiffsel'].clip(lower=0)
df = pd.merge(mut, site, on=['site', 'label_site', 'condition', 'protein_site', 'protein_chain'], how='left').drop_duplicates()

# change the order of the columns 
firstmut = 'mut_pos mutdiffsel'
firstsite = 'site_positive diffsel'
cols = list(df.columns.values) 
cols.remove(firstmut) 
cols.remove(firstsite) 
df = df[[firstmut, firstsite] + cols]
df.head()

Unnamed: 0,mut_pos mutdiffsel,site_positive diffsel,label_site,wildtype,mutation,mut_mutdiffsel,condition,site,protein_chain,protein_site,mut_DMS preferences,mut_Natural Frequencies,site_abs diffsel,site_negative diffsel,site_max diffsel,site_min diffsel
0,0.0,2.492,31,A,V,-0.183,5727-Wk26,1,G,31,0.044294,0.118424,10.835,-8.343,0.935,-1.803
225,0.0,2.784,31,A,V,-0.167,2423-Wk18,1,G,31,0.044294,0.118424,4.761,-1.977,0.64,-0.687
450,0.0,3.415,31,A,V,-0.434,2425-Wk18,1,G,31,0.044294,0.118424,7.535,-4.12,0.975,-1.261
675,0.071,0.097,31,A,V,0.071,2124-Wk22,1,G,31,0.044294,0.118424,10.236,-10.138,0.071,-2.209
900,0.484,0.807,31,A,V,0.484,5724-Wk26,1,G,31,0.044294,0.118424,23.524,-22.717,0.484,-2.575


In [8]:
df = df.drop_duplicates()
df.to_csv('Dingens2020.csv', index=False)