In [1]:
import altair as alt

import pandas as pd

import itertools

import numpy

import sklearn.manifold
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

from IPython.utils import io

import glob

In [2]:
import os

# The data-loading cells read relative paths under 'results/antibody_escape/',
# so the working directory must be the folder that contains 'results'
# (presumably the project root, two levels above this notebook - confirm).
# Guarded so that re-running this cell, or starting the kernel already at that
# folder, does not climb two further levels and break every later path.
if not os.path.isdir('results'):
    os.chdir('../../')

In [4]:
# define samples in each age cohort
ped_sera = [2367, 3944, 2462, 2389, 2323, 2388, 2463, 3973, 4299, 4584]
teen_sera = [2343, 2350, 2365, 2380, 2382, 3866, 3856, 3857, 3862, 3895]
adult_sera = ['33C', '34C', '197C', '199C', '215C', '210C', '74C', '68C', '150C', '18C']

# get list of lists for samples divided by age group
serum_lists = [ped_sera, teen_sera, adult_sera]
age_cohorts = ['0-5', '15-18', '40-45']

# adjust this if we want more stringent filtering
min_times_seen = 5

df_list = []

# pair every serum list with its cohort label directly: no manual counter to
# keep in sync, and the loop variable no longer shadows the builtin `list`
for cohort_sera, age_cohort in zip(serum_lists, age_cohorts):
    for serum in cohort_sera:
        # reading in values from averaged libA and libB models, keeping only
        # mutations seen at least `min_times_seen` times
        avg_df = (
            pd.read_csv(f'results/antibody_escape/{serum}_icXX_avg.csv')
            .query(f"`times_seen` >= {min_times_seen}")
            [['site', 'wildtype', 'mutant', 'log2 fold change IC90 mean']]
            .rename(columns={'log2 fold change IC90 mean': 'ic90_mean'})
        )

        # ped / teen sera are defined as ints above; store every serum as str
        avg_df['serum'] = str(serum)
        avg_df['age_cohort'] = age_cohort

        # also get summed and mean site scores to check AA-level vs site-level metrics
        # (transform broadcasts the per-site aggregate back onto every mutation row)
        ic90_by_site = avg_df.groupby('site')['ic90_mean']
        avg_df['sitewise_ic90_sum'] = ic90_by_site.transform('sum')
        avg_df['sitewise_ic90_mean'] = ic90_by_site.transform('mean')

        df_list.append(avg_df)

# concat to final df
escape_df = pd.concat(df_list).reset_index(drop=True)

escape_df.head()

Unnamed: 0,site,wildtype,mutant,ic90_mean,serum,age_cohort,sitewise_ic90_sum,sitewise_ic90_mean
0,-2,D,Y,-0.127,2367,0-5,-0.127,-0.127
1,1,Q,R,-0.1413,2367,0-5,-0.1413,-0.1413
2,2,K,N,0.0437,2367,0-5,0.0437,0.0437
3,3,I,A,0.0551,2367,0-5,-0.7658,-0.042544
4,3,I,D,0.0938,2367,0-5,-0.7658,-0.042544


In [50]:
# Floor every escape metric at zero so that only positive escape values remain;
# `assign` returns a new frame, so `escape_df` itself is left unchanged.
positive_escape = escape_df.assign(
    sitewise_ic90_sum=lambda frame: frame['sitewise_ic90_sum'].clip(lower=0),
    sitewise_ic90_mean=lambda frame: frame['sitewise_ic90_mean'].clip(lower=0),
    ic90_mean=lambda frame: frame['ic90_mean'].clip(lower=0),
)

In [8]:
def escape_similarity(df, site_or_aa='site', p=1, feature_range=(0, 1),
                      site_metric='sitewise_ic90_mean'):
    """Pairwise dot-product similarity between serum escape profiles.

    Each serum's profile is a vector with one entry per site
    (``site_or_aa='site'``) or per (site, mutant) pair (``site_or_aa='aa'``).
    Entries are raised to the power ``p``, entries a serum has no value for are
    filled with 0, and each serum's column is min-max scaled to
    ``feature_range`` independently before the dot products are taken.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with a 'serum' and a 'site' column, plus
        ``site_metric`` (site level) or 'mutant' and 'ic90_mean' (aa level).
    site_or_aa : {'site', 'aa'}
        Granularity of the profile.
    p : number
        Exponent applied to the metric before pivoting.
    feature_range : tuple
        Target (min, max) passed to ``MinMaxScaler``.
    site_metric : str
        Column holding the per-site score used when ``site_or_aa='site'``.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by serum, in order of first
        appearance in ``df['serum']``.

    Raises
    ------
    ValueError
        If ``site_or_aa`` is neither 'site' nor 'aa'.
    AssertionError
        If any similarity is NaN (e.g. a negative metric raised to a
        fractional ``p``).
    """
    sera = df['serum'].unique()

    if site_or_aa == 'site':
        # the site score is repeated on every mutation row of a site, so
        # collapse to one row per serum / site before pivoting
        df = df[['serum', 'site', site_metric]].drop_duplicates()
        profile_index = 'site'
        metric_col = site_metric
    elif site_or_aa == 'aa':
        profile_index = ['site', 'mutant']
        metric_col = 'ic90_mean'
    else:
        raise ValueError("escape metric should be either 'site' or 'aa'")

    # generate df where each column is a serum and each row is a site / mutation
    # (pivot_table averages any remaining duplicate entries)
    pivoted_df = (
        df.assign(metric=lambda x: x[metric_col]**p)
        .pivot_table(index=profile_index,
                     columns='serum',
                     values='metric',
                     fill_value=0)
    )

    # normalize each column (i.e. serum) independently to `feature_range`
    scaler = MinMaxScaler(feature_range=feature_range)
    normalized_df = pd.DataFrame(scaler.fit_transform(pivoted_df.values),
                                 columns=pivoted_df.columns)

    # all pairwise dot products at once: put the columns in `sera` order
    # (pivot_table sorts them) and take the Gram matrix
    profiles = normalized_df[list(sera)].to_numpy()
    similarities = profiles.T @ profiles
    assert not numpy.isnan(similarities).any()

    return pd.DataFrame(similarities, columns=sera, index=sera)

In [9]:
def dissimilarity(similarity, method='one_minus'):
    """Turn a similarity into a dissimilarity.

    ``method='one_minus'`` returns ``1 - similarity`` and
    ``method='minus_log'`` returns ``-log(similarity)``; any other value
    raises ``ValueError``.
    """
    # reject unknown methods up front, then handle the two supported ones
    if method not in ('one_minus', 'minus_log'):
        raise ValueError(f"invalid `method` {method}")
    if method == 'minus_log':
        return -numpy.log(similarity)
    return 1 - similarity

In [10]:
def mds_and_plot(df,
                 site_or_aa='site', 
                 p=1,
                 dissimilarity_method='one_minus',
                 mds_random_state=1,
                 feature_range=(0,1)
                ):
    """Project serum escape profiles into 2D with metric MDS and plot them.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format escape table; needs 'serum' and 'age_cohort' columns plus
        whatever `escape_similarity` reads for the chosen `site_or_aa`.
    site_or_aa, p, feature_range
        Passed through to `escape_similarity`.
    dissimilarity_method : str
        Passed to `dissimilarity` ('one_minus' or 'minus_log').
    mds_random_state : int
        Seed for `sklearn.manifold.MDS`.

    Returns
    -------
    altair.Chart
        One circle per serum, coloured by age cohort.
    """
    # compute similarities and dissimilarities, and get full list of sera
    similarities = escape_similarity(df, site_or_aa, p, feature_range)
    # `dissimilarity` only uses arithmetic / numpy ufuncs, so it can be applied to
    # the whole frame at once (avoids the deprecated element-wise `applymap`)
    dissimilarities = dissimilarity(similarities, method=dissimilarity_method)
    # take the serum order from the matrix itself so coordinates and labels
    # cannot get out of step
    sera = similarities.index
    
    # use MDS to project dissimilarities into 2D space, and get array of serum profile coordinates
    mds = sklearn.manifold.MDS(n_components=2,
                               metric=True,
                               max_iter=3000,
                               eps=1e-6,
                               random_state=mds_random_state, 
                               dissimilarity='precomputed',
                               n_jobs=1)
    locs = mds.fit_transform(dissimilarities)
    
    # convert to pandas df with serum names
    locs_df = pd.DataFrame({'serum': sera, 'x_coord': locs[:, 0], 'y_coord': locs[:, 1]})

    # get one line per serum from full escape df, for age cohort mapping
    # (deterministic first row per serum instead of a random sample)
    age_cohort_df = df[['serum', 'age_cohort']].drop_duplicates(subset='serum')

    # add age cohort column
    locs_df = locs_df.merge(age_cohort_df,
                            how='left',
                            on='serum', 
                           )
    
    # calculate aspect ratio for plotting; fall back to a square plot when
    # either axis has no spread (would otherwise divide by zero)
    x_range = locs_df['x_coord'].max() - locs_df['x_coord'].min()
    y_range = locs_df['y_coord'].max() - locs_df['y_coord'].min()
    aspect_ratio = x_range / y_range if x_range > 0 and y_range > 0 else 1.0
    
    # visualize with altair
    mds_plot = (
        alt.Chart(locs_df)
        .encode(
            x=alt.X("x_coord",
                    scale=alt.Scale(padding=5),
                   ),
            y=alt.Y("y_coord",
                    scale=alt.Scale(padding=5),
                   ),
            tooltip=['serum', 'age_cohort'],
            color=alt.Color('age_cohort:N'
                           ).scale(scheme='set2'),
            detail='serum',
        )
        .mark_circle(size=100, opacity=0.7)
        .properties(width=300, height=300 / aspect_ratio)
        .configure_axis(
            grid=False,
            title=None,
            labelFontSize=12
        )
        .configure_legend(
            titleFontSize=15,
            labelFontSize=13
        )
    )

    return mds_plot

In [48]:
# Site-level MDS (default site_or_aa='site', p=1) of the zero-clipped escape
# table; the seed is forwarded to MDS as its random_state.
mds_and_plot(positive_escape, mds_random_state=7, feature_range=(0, 1))

In [47]:
# Mutation-level MDS: profiles are built from clipped 'ic90_mean' values squared (p=2).
mds_and_plot(positive_escape, site_or_aa='aa', mds_random_state=1, p=2, feature_range=(0, 1))

## testing mean IC90

In [58]:
# Site-level MDS of the clipped table with default p and feature_range.
# NOTE(review): which site score (mean or sum) is used depends on which
# `escape_similarity` definition was last executed in the kernel.
mds_and_plot(positive_escape, mds_random_state=1)

In [60]:
# Same site-level projection, but with the site scores squared (p=2) before scaling.
mds_and_plot(positive_escape, p=2, mds_random_state=1)

In [57]:
# Unclipped table: negative escape values are kept and each serum column is
# min-max scaled to [-1, 1] instead of [0, 1].
mds_and_plot(escape_df, mds_random_state=7, feature_range=(-1,1))

In [11]:
def escape_similarity(df, site_or_aa='site', p=1, feature_range=(0, 1),
                      site_metric='sitewise_ic90_sum'):
    """Pairwise dot-product similarity between serum escape profiles.

    Executing this cell rebinds `escape_similarity`, replacing any definition
    run earlier in the kernel; this one defaults to the summed site score
    ('sitewise_ic90_sum') for site-level profiles.

    Each serum's profile is a vector with one entry per site
    (``site_or_aa='site'``) or per (site, mutant) pair (``site_or_aa='aa'``).
    Entries are raised to the power ``p``, entries a serum has no value for are
    filled with 0, and each serum's column is min-max scaled to
    ``feature_range`` independently before the dot products are taken.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with a 'serum' and a 'site' column, plus
        ``site_metric`` (site level) or 'mutant' and 'ic90_mean' (aa level).
    site_or_aa : {'site', 'aa'}
        Granularity of the profile.
    p : number
        Exponent applied to the metric before pivoting.
    feature_range : tuple
        Target (min, max) passed to ``MinMaxScaler``.
    site_metric : str
        Column holding the per-site score used when ``site_or_aa='site'``.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by serum, in order of first
        appearance in ``df['serum']``.

    Raises
    ------
    ValueError
        If ``site_or_aa`` is neither 'site' nor 'aa'.
    AssertionError
        If any similarity is NaN (e.g. a negative metric raised to a
        fractional ``p``).
    """
    sera = df['serum'].unique()

    if site_or_aa == 'site':
        # the site score is repeated on every mutation row of a site, so
        # collapse to one row per serum / site before pivoting
        df = df[['serum', 'site', site_metric]].drop_duplicates()
        profile_index = 'site'
        metric_col = site_metric
    elif site_or_aa == 'aa':
        profile_index = ['site', 'mutant']
        metric_col = 'ic90_mean'
    else:
        raise ValueError("escape metric should be either 'site' or 'aa'")

    # generate df where each column is a serum and each row is a site / mutation
    # (pivot_table averages any remaining duplicate entries)
    pivoted_df = (
        df.assign(metric=lambda x: x[metric_col]**p)
        .pivot_table(index=profile_index,
                     columns='serum',
                     values='metric',
                     fill_value=0)
    )

    # normalize each column (i.e. serum) independently to `feature_range`;
    # this step was missing from the site branch (a stray `c` stood in its
    # place), which left `normalized_df` undefined
    scaler = MinMaxScaler(feature_range=feature_range)
    normalized_df = pd.DataFrame(scaler.fit_transform(pivoted_df.values),
                                 columns=pivoted_df.columns)

    # all pairwise dot products at once: put the columns in `sera` order
    # (pivot_table sorts them) and take the Gram matrix
    profiles = normalized_df[list(sera)].to_numpy()
    similarities = profiles.T @ profiles
    assert not numpy.isnan(similarities).any()

    return pd.DataFrame(similarities, columns=sera, index=sera)

In [19]:
# NOTE: the recorded run of this cell raised
# KeyError "['sitewise_ic90_sum'] not in index", i.e. `escape_df` had no
# 'sitewise_ic90_sum' column at that point; run it only after a cell that
# provides that column.
mds_and_plot(escape_df, mds_random_state=5)

KeyError: "['sitewise_ic90_sum'] not in index"

In [5]:
# Scratch copy for the manual normalisation experiment in the next cell, so
# that `escape_df` itself is not modified.
df = escape_df.copy()

In [6]:
# one row per serum / site, then a site x serum matrix of summed site scores
# (serum / site pairs with no value are filled with 0)
df = df[['serum', 'site', 'sitewise_ic90_sum']].drop_duplicates()
pivoted_df = (
    df.assign(metric=lambda x: x['sitewise_ic90_sum']**1)
    .pivot_table(index='site',
                 columns='serum',
                 values='metric',
                 fill_value=0)
)

# Scale each serum column by its largest absolute value: results lie in
# [-1, 1], the sign of every score is kept and 0 stays at 0.
# (Alternative tried previously: min-max rescaling of each column to [-1, 1],
# i.e. ((x - min) / (max - min)) * 2 - 1, which does not keep 0 fixed.)
extremum = pivoted_df.abs().max()
# an all-zero column would otherwise be 0 / 0 = NaN; dividing by 1 keeps it at 0
normalized_df = pivoted_df / extremum.where(extremum != 0, 1)

normalized_df

serum,150C,18C,197C,199C,210C,215C,2323,2343,2350,2365,...,3857,3862,3866,3895,3944,3973,4299,4584,68C,74C
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-2,-0.005305,0.006000,0.001150,0.054808,-0.001258,0.007157,0.026141,0.038528,0.000593,0.000664,...,0.000487,0.000224,-0.011577,-0.002191,0.010403,-0.003329,-0.030693,-0.002178,-0.005041,-0.002579
1,0.000423,0.006055,0.000000,0.000000,-0.001973,0.000000,0.007868,0.000000,0.000000,0.000000,...,-0.005555,-0.017009,0.000000,0.002291,0.001317,0.011960,-0.007685,0.012606,-0.010466,-0.010898
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.003128,0.002040,-0.008047,0.004821,...,0.013432,0.010291,-0.004364,-0.003943,0.001965,-0.001054,0.014525,-0.000295,0.000000,0.000000
3,-0.072592,0.026505,-0.032441,0.000255,-0.102307,-0.051330,-0.070235,0.006472,-0.045336,0.058957,...,0.027975,-0.019686,-0.036401,-0.066000,-0.044070,-0.045717,-0.213476,0.014447,-0.002416,0.006209
4,-0.035699,0.046050,-0.032685,-0.050762,-0.046144,-0.053239,-0.025265,-0.056580,-0.016123,-0.056023,...,-0.047657,-0.012661,0.034991,-0.002185,-0.053519,-0.026346,-0.124417,-0.046445,-0.097954,-0.015514
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
526,0.004891,0.003804,-0.000697,0.000930,0.003063,0.000462,0.005345,-0.025261,0.004313,-0.007546,...,-0.000065,0.019332,0.002536,0.008813,0.006836,-0.008682,0.030210,-0.001367,-0.012658,-0.004390
529,0.020410,-0.016098,-0.030920,-0.032999,-0.005779,-0.040048,0.015625,-0.032224,-0.004906,0.001188,...,-0.002000,0.005122,0.002553,0.016467,0.003546,0.005286,0.113638,-0.000440,0.006697,-0.026792
537,-0.008559,-0.006919,0.014054,-0.011918,-0.004912,-0.010555,-0.024904,0.001135,-0.011831,-0.017176,...,0.004387,-0.008919,0.000557,-0.012518,-0.015075,-0.039176,-0.070619,0.003627,-0.019844,-0.025838
538,-0.018672,-0.010573,0.009571,-0.012769,-0.025105,-0.018697,-0.002759,0.000368,0.002048,-0.050026,...,0.011353,0.026581,0.012981,0.002890,-0.016392,0.004215,0.015999,-0.001690,-0.026093,-0.009079


In [18]:
# define samples in each age cohort
ped_sera = [2367, 3944, 2462, 2389, 2323, 2388, 2463, 3973, 4299, 4584]
teen_sera = [2343, 2350, 2365, 2380, 2382, 3866, 3856, 3857, 3862, 3895]
adult_sera = ['33C', '34C', '197C', '199C', '215C', '210C', '74C', '68C', '150C', '18C']

# get list of lists for samples divided by age group
serum_lists = [ped_sera, teen_sera, adult_sera]
age_cohorts = ['0-5', '15-18', '40-45']

# adjust this if we want more stringent filtering
min_times_seen = 5

df_list = []

# pair every serum list with its cohort label directly: no manual counter to
# keep in sync, and the loop variable no longer shadows the builtin `list`
for cohort_sera, age_cohort in zip(serum_lists, age_cohorts):
    for serum in cohort_sera:
        # reading in values from just libB models, keeping only mutations
        # seen at least `min_times_seen` times
        beta_df = (
            pd.read_csv(f'results/antibody_escape/{serum}_rep.csv')
            .query(f"`times_seen` >= {min_times_seen}")
            .query("`library` == 'libB'")
        )
        ic90_df = (
            pd.read_csv(f'results/antibody_escape/{serum}_icXX_rep.csv')
            .query(f"`times_seen` >= {min_times_seen}")
            .query("`library` == 'libB'")
        )

        # get both ic90 and beta in same df
        # NOTE(review): assumes each file has at most one libB row per
        # (site, wildtype, mutant); duplicates would multiply rows - confirm
        full_df = (
            beta_df.merge(ic90_df,
                          how='left',
                          on=['site', 'wildtype', 'mutant'])
            [['site', 'wildtype', 'mutant', 'escape', 'log2 fold change IC90']]
            .rename(columns={'log2 fold change IC90': 'ic90',
                             'escape': 'beta'})
        )

        # ped / teen sera are defined as ints above; store every serum as str
        full_df['serum'] = str(serum)
        full_df['age_cohort'] = age_cohort

        # also get summed site scores to check AA-level vs site-level metrics
        # (transform broadcasts the per-site sum back onto every mutation row)
        by_site = full_df.groupby('site')
        full_df['sitewise_beta'] = by_site['beta'].transform('sum')
        full_df['sitewise_ic90'] = by_site['ic90'].transform('sum')

        df_list.append(full_df)

# concat to final df
escape_df = pd.concat(df_list).reset_index(drop=True)

escape_df.head()

Unnamed: 0,site,wildtype,mutant,beta,ic90,serum,age_cohort,sitewise_beta,sitewise_ic90
0,-2,D,G,-0.0661,-0.0954,2367,0-5,-0.7167,-1.034
1,-2,D,Y,-0.6506,-0.9386,2367,0-5,-0.7167,-1.034
2,1,Q,R,-0.007,-0.0101,2367,0-5,-0.007,-0.0101
3,2,K,N,0.0275,0.0397,2367,0-5,0.0275,0.0397
4,3,I,A,-0.0609,-0.0879,2367,0-5,-0.9673,-1.3951


In [20]:
# Rename the summed site score to 'sitewise_ic90_sum', the column name read by
# the sum-based `escape_similarity` site branch. Re-running is harmless:
# renaming a column that is no longer present is a no-op.
escape_df = escape_df.rename(columns={'sitewise_ic90': 'sitewise_ic90_sum'})
escape_df.head()

Unnamed: 0,site,wildtype,mutant,beta,ic90,serum,age_cohort,sitewise_beta,sitewise_ic90_sum
0,-2,D,G,-0.0661,-0.0954,2367,0-5,-0.7167,-1.034
1,-2,D,Y,-0.6506,-0.9386,2367,0-5,-0.7167,-1.034
2,1,Q,R,-0.007,-0.0101,2367,0-5,-0.007,-0.0101
3,2,K,N,0.0275,0.0397,2367,0-5,0.0275,0.0397
4,3,I,A,-0.0609,-0.0879,2367,0-5,-0.9673,-1.3951


In [24]:
# Site-level MDS (defaults: site_or_aa='site', p=1, feature_range=(0, 1)).
# NOTE(review): presumably meant to run on the libB-only table once
# 'sitewise_ic90_sum' has been added by the rename cell - confirm run order.
mds_and_plot(escape_df, mds_random_state=3)