# Multidimensional scaling of serum escape profiles across age groups

In [1]:
import altair as alt

import pandas as pd

import itertools

import numpy

import sklearn.manifold
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

from IPython.utils import io

import glob

In [2]:
import os

# Move to the project root (two levels above this notebook) so that the relative
# `results/...` paths used below resolve. The root is resolved only once per kernel
# session: re-running this cell must not climb two further directories each time.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.abspath('../../')
os.chdir(PROJECT_ROOT)

### Read beta and IC90 values
For this analysis, I'm using the escape scores from models averaged between libA and libB. We have data on the mean, median, and std deviation for beta and IC90 values between both models. I'm just working with the mean scores for now, and analyzing both beta and IC90 in case these yield different results. We also aggregate sitewise scores as a sum of AA-level escape scores.

In [3]:
# define samples in each age cohort
ped_sera = [2367, 3944, 2462, 2389, 2323, 2388, 2463, 3973, 4299, 4584]
teen_sera = [2343, 2350, 2365, 2380, 2382, 3866, 3856, 3857, 3862, 3895]
adult_sera = ['33C', '34C', '197C', '199C', '215C', '210C', '74C', '68C', '150C', '18C']

# get list of lists for samples divided by age group
serum_lists = [ped_sera, teen_sera, adult_sera]
age_cohorts = ['0-5', '15-18', '40-45']

# every group of sera needs exactly one cohort label
assert len(serum_lists) == len(age_cohorts)

# adjust this if we want more stringent filtering
min_times_seen = 5

# columns identifying a single mutation; used to join the beta and IC90 tables
merge_keys = ['site', 'wildtype', 'mutant']

df_list = []

# pair each cohort label with its sera directly (no manual counter), and avoid naming
# the loop variable `list`, which would shadow the built-in for the rest of the session
for age_cohort, cohort_sera in zip(age_cohorts, serum_lists):
    for serum in cohort_sera:
        # reading in values from averaged libA and libB models
        avg_beta_df = pd.read_csv(f'results/antibody_escape/{serum}_avg.csv'
                                 ).query(f"`times_seen` >= {min_times_seen}")
        avg_ic90_df = pd.read_csv(f'results/antibody_escape/{serum}_icXX_avg.csv'
                                 ).query(f"`times_seen` >= {min_times_seen}")

        # get both ic90 and beta in same df; the left join keeps every mutation that
        # passed the filter in the beta table
        full_df = (
            avg_beta_df
            .merge(avg_ic90_df, how='left', on=merge_keys)
            [merge_keys + ['escape_mean', 'log2 fold change IC90 mean']]
            .rename(columns={'log2 fold change IC90 mean': 'ic90_mean',
                             'escape_mean': 'beta_mean'})
        )

        # ped / teen sera are defined as ints above; store every serum name as a string
        full_df['serum'] = str(serum)
        full_df['age_cohort'] = age_cohort

        # also get summed site scores to check AA-level vs site-level metrics
        full_df['sitewise_beta'] = full_df.groupby('site')['beta_mean'].transform('sum')
        full_df['sitewise_ic90'] = full_df.groupby('site')['ic90_mean'].transform('sum')

        df_list.append(full_df)

# concat to final df
escape_df = pd.concat(df_list).reset_index(drop=True)

escape_df.head()

Unnamed: 0,site,wildtype,mutant,beta_mean,ic90_mean,serum,age_cohort,sitewise_beta,sitewise_ic90
0,-2,D,Y,-0.0881,-0.127,2367,0-5,-0.0881,-0.127
1,1,Q,R,-0.0979,-0.1413,2367,0-5,-0.0979,-0.1413
2,2,K,N,0.0303,0.0437,2367,0-5,0.0303,0.0437
3,3,I,A,0.0382,0.0551,2367,0-5,-0.5305,-0.7658
4,3,I,D,0.065,0.0938,2367,0-5,-0.5305,-0.7658


### Calculate similarities between serum profiles, then convert to dissimilarities
We need to compute the similarity between all pairs of escape profiles in this data frame. Similarity is calculated as the dot product of the escape profiles for each pair of sera, testing both the mutation-level and site-level metrics. 

Previously, each profile was normalized by its Euclidean (L2) norm, such that the dot product of a profile with itself was 1. Here I'm testing normalization with the `MinMaxScaler` from scikit-learn, which simply rescales the escape values of each serum to a given range. This range is set by the `feature_range` argument.

The escape metric can also be raised to the *p* power to emphasize sites with large values. For now, the default value is 1.

In [4]:
def escape_similarity(df, escape_metric='ic90', site_or_aa='site', p=1, feature_range=(-1, 1)):
    """Compute the pairwise similarity between serum escape profiles.

    Each serum's profile (one value per site, or per site/mutant pair) is raised to the
    power `p`, min-max scaled to `feature_range`, and compared with every other profile
    via a dot product.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-form escape data. Must contain `serum` and `site`, plus
        `sitewise_<escape_metric>` when `site_or_aa='site'`, or `mutant` and
        `<escape_metric>_mean` when `site_or_aa='aa'`.
    escape_metric : str
        Prefix/suffix used to pick the metric column (e.g. 'ic90' or 'beta').
    site_or_aa : {'site', 'aa'}
        Whether profiles are built from summed site-level or mutation-level scores.
    p : int or float
        Exponent applied to the metric before scaling.
    feature_range : tuple
        `(min, max)` range passed to `MinMaxScaler`.

    Returns
    -------
    pandas.DataFrame
        Square frame of dot products, with rows and columns labelled by serum in
        order of first appearance in `df`.

    Raises
    ------
    ValueError
        If `site_or_aa` is neither 'site' nor 'aa'.
    """
    if site_or_aa == 'site':
        # one row per (serum, site): the site sum is repeated on every mutation row
        metric_column = 'sitewise_' + escape_metric
        profile_index = 'site'
        df = df[['serum', 'site', metric_column]].drop_duplicates()
    elif site_or_aa == 'aa':
        metric_column = escape_metric + '_mean'
        profile_index = ['site', 'mutant']
    else:
        raise ValueError(f"`site_or_aa` should be either 'site' or 'aa', got {site_or_aa!r}")

    sera = df['serum'].unique()

    # wide frame: one column per serum, one row per site (or mutation);
    # combinations missing for a serum are filled with 0
    pivoted_df = (
        df.assign(metric=lambda x: x[metric_column]**p)
        .pivot_table(index=profile_index,
                     columns='serum',
                     values='metric',
                     fill_value=0)
    )

    # scale each column (i.e. serum) independently to `feature_range`
    # (this function defaults to (-1, 1); MinMaxScaler's own default is (0, 1))
    scaler = MinMaxScaler(feature_range=feature_range)
    normalized_df = pd.DataFrame(scaler.fit_transform(pivoted_df.values),
                                 columns=pivoted_df.columns)

    # dot product of every serum profile against every other, as one matrix product;
    # columns are reordered to `sera` so the labels below line up with the values
    profiles = normalized_df[sera].to_numpy()
    similarities = profiles.T @ profiles
    assert not numpy.isnan(similarities).any()

    return pd.DataFrame(similarities, columns=sera, index=sera)

Define function to compute dissimilarity $d$ from the similarity $s$. Options are:
* **one_minus:** $d = 1-s$
* **minus_log:** $d = -\ln(s)$

In [5]:
def dissimilarity(similarity, method='one_minus'):
    """Convert a similarity `s` into a dissimilarity `d`.

    `method='one_minus'` returns `1 - s`; `method='minus_log'` returns `-ln(s)`.
    Any other `method` raises a `ValueError`.
    """
    # reject unknown methods up front, then handle the two supported conversions
    if method not in ('one_minus', 'minus_log'):
        raise ValueError(f"invalid `method` {method}")

    if method == 'minus_log':
        return -numpy.log(similarity)

    return 1 - similarity

### Run multidimensional scaling and plot results

Set up a function to compute similarities and dissimilarities, then run MDS [as described here](https://scikit-learn.org/stable/auto_examples/manifold/plot_mds.html#sphx-glr-auto-examples-manifold-plot-mds-py). I'm collapsing these computations and the plotting into a single function for now so that it's easier to test different parameters.

In [6]:
def mds_and_plot(df,
                 escape_metric='ic90',
                 site_or_aa='site',
                 p=1,
                 dissimilarity_method='one_minus',
                 mds_random_state=1,
                 feature_range=(-1,1)
                ):
    """Project serum escape profiles into 2D with MDS and plot them by age cohort.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-form escape data with `serum` and `age_cohort` columns, plus the columns
        required by `escape_similarity`.
    escape_metric, site_or_aa, p, feature_range
        Passed through to `escape_similarity`.
    dissimilarity_method : str
        Passed to `dissimilarity` as `method`.
    mds_random_state : int
        Seed for `sklearn.manifold.MDS`.

    Returns
    -------
    altair.Chart
        Scatter plot with one point per serum, colored by age cohort.
    """
    # compute similarities and dissimilarities; `dissimilarity` already operates
    # element-wise on a DataFrame, so no per-cell `applymap` is needed
    similarities = escape_similarity(df, escape_metric, site_or_aa, p, feature_range)
    dissimilarities = dissimilarity(similarities, method=dissimilarity_method)

    # NOTE(review): after min-max scaling, a profile's similarity with itself is
    # generally not 1, so `1 - s` is not zero on the diagonal and can be negative
    # elsewhere. Confirm this is acceptable input for MDS.

    # take the serum labels from the matrix itself so coordinates and names stay aligned
    sera = similarities.index.to_numpy()

    # use MDS to project dissimilarities into 2D space, and get array of serum profile coordinates
    mds = sklearn.manifold.MDS(n_components=2,
                               metric=True,
                               max_iter=3000,
                               eps=1e-6,
                               random_state=mds_random_state,
                               dissimilarity='precomputed',
                               n_jobs=1)
    locs = mds.fit_transform(dissimilarities)

    # convert to pandas df with serum names
    locs_df = pd.DataFrame({'serum': sera, 'x_coord': locs[:, 0], 'y_coord': locs[:, 1]})

    # one row per serum for the age cohort mapping; deterministic (first row per serum)
    # rather than a random sample from each group
    age_cohort_df = df[['serum', 'age_cohort']].drop_duplicates(subset='serum')

    # add age cohort column
    locs_df = locs_df.merge(age_cohort_df,
                            how='left',
                            on='serum',
                           )

    # visualize with altair
    mds_plot = (
        alt.Chart(locs_df)
        .encode(
            x=alt.X("x_coord",
                    scale=alt.Scale(padding=5),
                   ),
            y=alt.Y("y_coord",
                    scale=alt.Scale(padding=5),
                   ),
            tooltip=['serum', 'age_cohort'],
            color=alt.Color('age_cohort:N'
                           ).scale(scheme='set2'),
            detail='serum',
        )
        .mark_circle(size=200, opacity=0.7)
        .configure_axis(
            grid=False,
            title=None,
            labelFontSize=12
        )
        .configure_legend(
            titleFontSize=15,
            labelFontSize=13
        )
    )

    return mds_plot

### Normalizing values to (-1, 1) rather than (0, 1) leads to better age-dependent grouping

In [7]:
mds_and_plot(escape_df, escape_metric='ic90', mds_random_state=7, feature_range=(0, 1))

In [8]:
mds_and_plot(escape_df, escape_metric='ic90', mds_random_state=7, feature_range=(-1, 1))

All the young children are now grouping together, apart from outliers 4299 and 4584. Heterogeneity then seems to increase with age. I'm not sure why changing this range makes such a difference, and I think it's important to follow up on this.

### Age-dependent trends are inverted when we group by AA-level escape scores

In [9]:
mds_and_plot(escape_df, site_or_aa='site', p=1, 
             dissimilarity_method='one_minus', mds_random_state=7)

In [10]:
mds_and_plot(escape_df, site_or_aa='aa', p=1, 
             dissimilarity_method='one_minus', mds_random_state=7)

Children are spread out, while adults cluster more tightly compared to site-level analysis. Not sure how much to interpret this, as AA-level analysis is likely noisier.

## Emphasizing sites with large values helps further separate adults from children
I'm only testing p=3 for now, since an odd exponent preserves the sign of each escape value. With p=3, adults and teenagers start to cluster further away from the young children.

In [11]:
mds_and_plot(escape_df, site_or_aa='site', p=1, 
             dissimilarity_method='one_minus', mds_random_state=7)

In [12]:
mds_and_plot(escape_df, site_or_aa='site', p=3, 
             dissimilarity_method='one_minus', mds_random_state=7)

# Old analyses

## Test with just libB scores
Because selection range was more consistent across samples for libB, try modeling just on libB scores to test the impact of selection range.

In [13]:
# # define samples in each age cohort
# ped_sera = [2367, 3944, 2462, 2389, 2323, 2388, 2463, 3973, 4299, 4584]
# teen_sera = [2343, 2350, 2365, 2380, 2382, 3866, 3856, 3857, 3862, 3895]
# adult_sera = ['33C', '34C', '197C', '199C', '215C', '210C', '74C', '68C', '150C', '18C']

# # get list of lists for samples divided by age group
# serum_lists = [ped_sera, teen_sera, adult_sera]
# age_cohorts = ['0-5', '15-18', '40-45']

# # adjust this if we want more stringent filtering
# min_times_seen = 5

# df_list = []

# i = 0 # for looping across age cohort definitions

# for list in serum_lists:
#     for serum in list:
#         # reading in values from just libB models
#         beta_df = pd.read_csv(f'results/antibody_escape/{serum}_rep.csv'
#                                  ).query(f"`times_seen` >= {min_times_seen}"
#                                         ).query("`library` == 'libB'")
#         ic90_df = pd.read_csv(f'results/antibody_escape/{serum}_icXX_rep.csv'
#                                  ).query(f"`times_seen` >= {min_times_seen}"
#                                         ).query("`library` == 'libB'")

#         # get both ic90 and beta in same df
#         full_df = beta_df.merge(ic90_df,
#                                     how='left',
#                                     on=['site', 'wildtype', 'mutant']
#                                    )[['site', 'wildtype', 'mutant', 'escape', 
#                                       'log2 fold change IC90']]

#         full_df = full_df.rename(columns={'log2 fold change IC90': 'ic90',
#                                           'escape': 'beta'
#                                          })
#         serum = str(serum) # ped / teen sera automatically read as ints
#         full_df['serum'] = serum
#         full_df['age_cohort'] = age_cohorts[i]

#         # also get summed site scores to check AA-level vs site-level metrics
#         full_df['sitewise_beta'] = full_df['beta'].groupby(full_df['site']).transform('sum')
#         full_df['sitewise_ic90'] = full_df['ic90'].groupby(full_df['site']).transform('sum')

#         df_list.append(full_df)

#     i+=1

# # concat to final df
# escape_df = pd.concat(df_list).reset_index(drop=True)

# escape_df.head()

In [14]:
# def escape_similarity(df, escape_metric='ic90', site_or_aa='site', p=1):   
#     sera = df['serum'].unique()
#     similarities = []
    
#     if site_or_aa == 'site':
#         metric_column = 'sitewise_' + escape_metric
#         df = df[['serum', 'site', metric_column]].drop_duplicates()
#         pivoted_df = (
#             df.assign(metric=lambda x: x[metric_column]**p)
#             .pivot_table(index='site',
#                          columns='serum',
#                          values='metric',
#                          fill_value=0)
#             # normalize such that each value is between 0 and 1
#             .transform(lambda x: x / numpy.linalg.norm(x, axis=0))
#         )
    
#     elif site_or_aa == 'aa':
#         pivoted_df = (
#             df.assign(metric=lambda x: x[escape_metric]**p)
#             .pivot_table(index=['site', 'mutant'],
#                          columns='serum',
#                          values='metric',
#                          fill_value=0)
#             # normalize such that each value is between 0 and 1
#             .transform(lambda x: x / numpy.linalg.norm(x, axis=0))
#         )
        
#     else: 
#         raise ValueError("escape metric should be either 'site' or 'aa'")
    
#     # calculate dot product for each serum profile against all other sera
#     for ser1, ser2 in itertools.product(sera, sera):
#         similarity = (
#             pivoted_df
#             .assign(similarity=lambda x: x[ser1] * x[ser2])
#             ['similarity']
#         )
#         assert similarity.notnull().all()
#         similarities.append(similarity.sum())
        
#     return pd.DataFrame(numpy
#                         .array(similarities)
#                         .reshape(len(sera), len(sera)),
#                         columns=sera, index=sera)    

In [15]:
# mds_and_plot(escape_df, site_or_aa='site', p=1, 
#              dissimilarity_method='one_minus', mds_random_state=7)

In [16]:
# mds_and_plot(escape_df, site_or_aa='site', p=2, 
#              dissimilarity_method='one_minus', mds_random_state=7)

In [17]:
# mds_and_plot(escape_df, site_or_aa='site', p=3, 
#              dissimilarity_method='one_minus', mds_random_state=7)

In [18]:
# def get_prob_escape(sera_list):
#     prob_escape_list = []
#     for serum in sera_list: 
#         file_pattern = f'results/prob_escape/libB_*_{serum}_*_prob_escape.csv'
#         file_list = glob.glob(file_pattern)
#         for file_path in file_list:
#             prob_escape = pd.read_csv(
#                 file_path, 
#                 keep_default_na=False,
#                 na_values="nan"
#             ).query(
#                 "`no-antibody_count` >= no_antibody_count_threshold"
#             )
        
#         prob_escape_list.append(prob_escape)
        
#     return prob_escape_list

In [19]:
# def plot_wt_prob_escape(serum_list):
    
#     mean_prob_escape_list = []

#     for prob_escape in serum_list:
#         serum = prob_escape['antibody'][0]
#         max_aa_subs = 4

#         mean_prob_escape = (
#             prob_escape.assign(
#                 n_subs=lambda x: (
#                     x["aa_substitutions_reference"]
#                     .str.split()
#                     .map(len)
#                     .clip(upper=max_aa_subs)
#                     .map(lambda n: str(n) if n < max_aa_subs else f">{max_aa_subs - 1}")
#                 )
#             )
#             .groupby(["antibody_concentration", "n_subs"], as_index=False)
#             .aggregate({"prob_escape": "mean", "prob_escape_uncensored": "mean"})
#             .rename(
#                 columns={
#                     "prob_escape": "censored to [0, 1]",
#                     "prob_escape_uncensored": "not censored",
#                 }
#             )
#             .melt(
#                 id_vars=["antibody_concentration", "n_subs"],
#                 var_name="censored",
#                 value_name="probability escape",
#             )
#             .assign(serum=serum)
#         )

#         mean_prob_escape_wt = mean_prob_escape.loc[(mean_prob_escape['n_subs'] == '0') &
#                                                    (mean_prob_escape['censored'] != 
#                                                     'not censored')
#                                                   ]
#         mean_prob_escape_wt['concentration'] = range(1, 1+len(mean_prob_escape_wt))
#         mean_prob_escape_wt['concentration'] = mean_prob_escape_wt['concentration'].astype(str)
#         mean_prob_escape_wt['serum'] = mean_prob_escape_wt['serum'].astype(str)

#         mean_prob_escape_list.append(mean_prob_escape_wt)

#     mean_prob_escape_full = pd.concat(mean_prob_escape_list, axis=0, ignore_index=True)
    
#     mean_prob_escape_chart = (
#         alt.Chart(mean_prob_escape_full)
#         .encode(
#             x=alt.X("concentration"),
#             y=alt.Y(
#                 "probability escape",
#                 scale=alt.Scale(type="symlog", constant=0.05),
#             ),
#             color=alt.Color("serum", title="serum"),
#             tooltip=[
#                 alt.Tooltip(c, format=".3g") if mean_prob_escape[c].dtype == float else c
#                 for c in mean_prob_escape.columns
#             ],
#         )
#         .mark_line(point=True, size=0.5)
#         .properties(width=200, height=125, title='avg prob escape of WT variants')
#         .configure_axis(grid=False)
#         .configure_title(
#             dy=-5,
#             fontWeight=500
#         )
#     )

#     return mean_prob_escape_chart

In [20]:
# peds_libB = get_prob_escape(ped_sera)

In [21]:
# plot_wt_prob_escape(peds_libB)

In [22]:
# teens_libB = get_prob_escape(teen_sera)
# plot_wt_prob_escape(teens_libB)

In [23]:
# adults_libB = get_prob_escape(adult_sera)
# plot_wt_prob_escape(adults_libB)

# Testing different escape score normalization strategies
In order to do MDS, we first have to normalize all escape scores to values between -1 and 1. This is currently achieved using `lambda x: x / numpy.linalg.norm(x, axis=0)`, which divides each serum's escape profile by its 2-norm (Euclidean length) - [see details here](https://numpy.org/doc/stable/reference/generated/numpy.linalg.norm.html).

However, high magnitude of escape at site 189 may be skewing these results. Try thresholding magnitude of site sums to 10 before normalizing, i.e. anything with a higher magnitude = 10.

In [24]:
# escape_df['sitewise_ic90'] = escape_df['sitewise_ic90'].clip(upper=10)

# sera = escape_df['serum'].unique()
# similarities = []

# metric_column = 'sitewise_ic90'
# df = escape_df[['serum', 'site', metric_column]].drop_duplicates()
# pivoted_df = (
#     df.assign(metric=lambda x: x[metric_column]**1)
#     .pivot_table(index='site',
#                  columns='serum',
#                  values='metric',
#                  fill_value=0)
#     # normalize such that each value is between 0 and 1
#     .transform(lambda x: x / numpy.linalg.norm(x, axis=0))
# )

# for ser1, ser2 in itertools.product(sera, sera):
#     similarity = (
#         pivoted_df
#         .assign(similarity=lambda x: x[ser1] * x[ser2])
#         ['similarity']
#     )
#     assert similarity.notnull().all()
#     similarities.append(similarity.sum())

# similarities = pd.DataFrame(numpy
#                     .array(similarities)
#                     .reshape(len(sera), len(sera)),
#                     columns=sera, index=sera)   

In [25]:
# dissimilarities = similarities.applymap(lambda x: dissimilarity(x, method='one_minus'))

# # use MDS to project dissimilarities into 2D space, and get array of serum profile coordinates
# mds = sklearn.manifold.MDS(n_components=2,
#                            metric=True,
#                            max_iter=3000,
#                            eps=1e-6,
#                            random_state=1, 
#                            dissimilarity='precomputed',
#                            n_jobs=1)
# locs = mds.fit_transform(dissimilarities)

# # convert to pandas df with serum names
# locs_df = pd.DataFrame({'serum': sera, 'x_coord': locs[:, 0], 'y_coord': locs[:, 1]})

# # get one line per serum from full escape df, for age cohort mapping
# age_cohort_df = escape_df.groupby('serum', group_keys=False).apply(lambda df: df.sample(1))

# # add age cohort column
# locs_df = locs_df.merge(age_cohort_df[['serum', 'age_cohort']],
#                         how='left',
#                         on='serum', 
#                        )

# # visualize with altair
# mds_plot = (
#     alt.Chart(locs_df)
#     .encode(
#         x=alt.X("x_coord",
#                 scale=alt.Scale(padding=5),
#                ),
#         y=alt.Y("y_coord",
#                 scale=alt.Scale(padding=5),
#                ),
#         tooltip=['serum', 'age_cohort'],
#         color=alt.Color('age_cohort:N'
#                        ).scale(scheme='set2'),
#         detail='serum',
#     )
#     .mark_circle(size=200, opacity=0.7)
#     .configure_axis(
#         grid=False,
#         title=None,
#         labelFontSize=12
#     )
#     .configure_legend(
#         titleFontSize=15,
#         labelFontSize=13
#     )
# )

# mds_plot

## Scratch code

In [26]:
# def escape_similarity_aa(df, metric_column, p=1):
#     """Compute similarity between all pairs of conditions in `df`."""
#     df = df[['serum', 'site', 'mutant', metric_column]].drop_duplicates()
    
#     sera = df['serum'].unique()
#     similarities = []
#     pivoted_df = (
#         df.assign(metric=lambda x: x[metric_column]**p)
#         .pivot_table(index=['site', 'mutant'],
#                      columns='serum',
#                      values='metric',
#                      fill_value=0)
#         # normalize such that each value is between 0 and 1
#         .transform(lambda x: x / numpy.linalg.norm(x, axis=0))
#     )

#     for ser1, ser2 in itertools.product(sera, sera):
#         similarity = (
#             pivoted_df
#             .assign(similarity=lambda x: x[ser1] * x[ser2])
#             ['similarity']
#         )
#         assert similarity.notnull().all()
#         similarities.append(similarity.sum())
        
#     return pd.DataFrame(numpy
#                         .array(similarities)
#                         .reshape(len(sera), len(sera)),
#                         columns=sera, index=sera)

In [27]:
# def escape_similarity_site(df, metric_column, p=1):
#     """Compute similarity between all pairs of conditions in `df`."""
#     df = df[['serum', 'site', metric_column]].drop_duplicates()
    
#     sera = df['serum'].unique()
#     similarities = []
#     pivoted_df = (
#         df.assign(metric=lambda x: x[metric_column]**p)
#         .pivot_table(index='site',
#                      columns='serum',
#                      values='metric',
#                      fill_value=0)
#         # normalize such that each value is between 0 and 1
#         .transform(lambda x: x / numpy.linalg.norm(x, axis=0))
#     )

#     for ser1, ser2 in itertools.product(sera, sera):
#         similarity = (
#             pivoted_df
#             .assign(similarity=lambda x: x[ser1] * x[ser2])
#             ['similarity']
#         )
#         assert similarity.notnull().all()
#         similarities.append(similarity.sum())
        
#     return pd.DataFrame(numpy
#                         .array(similarities)
#                         .reshape(len(sera), len(sera)),
#                         columns=sera, index=sera)

In [28]:
# dissimilarity_method = 'one_minus'

# similarities = escape_similarity_aa(full_df, 'sitewise_beta')
# dissimilarities = similarities.applymap(lambda x: dissimilarity(x, method=dissimilarity_method))
# sera = full_df['serum'].unique()
# n = len(sera)

# mds = sklearn.manifold.MDS(n_components=2,
#                            metric=True,
#                            max_iter=3000,
#                            eps=1e-6,
#                            random_state=5, 
#                            dissimilarity='precomputed',
#                            n_jobs=1)
# locs = mds.fit_transform(dissimilarities)

In [29]:
# def draw_pie(dist, xpos, ypos, size, ax, colors, alpha, circle_color):
#     """Based on this: https://stackoverflow.com/q/56337732"""
#     # for incremental pie slices
#     cumsum = numpy.cumsum(dist)
#     cumsum = cumsum / cumsum[-1]
#     pie = [0] + cumsum.tolist()

#     assert len(colors) == len(dist)
#     for r1, r2, color in zip(pie[:-1], pie[1:], colors):
#         angles = numpy.linspace(2 * numpy.pi * r1, 2 * numpy.pi * r2)
#         x = [0] + numpy.cos(angles).tolist()
#         y = [0] + numpy.sin(angles).tolist()

#         xy = numpy.column_stack([x, y])

#         ax.scatter([xpos], [ypos], marker=xy, s=size, facecolors=color, alpha=alpha, edgecolors='none')
#         ax.scatter(xpos, ypos, marker='o', s=size, edgecolors=circle_color,
#                    facecolors='none', alpha=alpha)

#     return ax

# color_scheme_df = (full_df
#                    .groupby(['serum', 'age_cohort'], group_keys=False)
#                    .apply(lambda df: df.sample(1))
#                    .sort_index()
#                    [['serum', 'age_cohort']]
#                   )

# color_dict = {
#     '0-5': '#E69F00',
#     '15-18': '#56B4E9',
#     '40-45': '#009E73'
    
# }

# color_scheme_df['serum'] = color_scheme_df['serum'].astype(str)
# color_scheme_df['color'] = color_scheme_df['age_cohort'].map(color_dict)

# dists = [[1] for serum in sera]
# serum_to_color = color_scheme_df[['serum', 'color']].set_index('serum')['color'].to_dict()
# colors = [[serum_to_color[serum]] for serum in sera]

# default_circle_color = 'none'
# default_label_color = 'black'

# circle_colors = []

# for serum in sera:
#     circle_colors.append(default_circle_color)
    
# # plot the multidimensional scaling result
# plot_size = 4
# fig, ax = plt.subplots(figsize=(plot_size, plot_size))
# xs = locs[:, 0]
# ys = locs[:, 1]
# for x, y, dist, color, circle_color in zip(xs, ys, dists, colors, circle_colors):
#     draw_pie(dist, x, y,
#              size=100,
#              ax=ax,
#              colors=color,
#              alpha=0.7,
#              circle_color=circle_color,
#              )
# ax.set_aspect('equal', adjustable='box')  # same distance on both axes
# ax.set_xticks([])  # no x-ticks
# ax.set_yticks([])  # no y-ticks
# ax.margins(0.09)  # increase padding from axes

# plt.show(fig)
# plt.close(fig)

In [30]:
# dissimilarity_method = 'one_minus'
# # dissimilarity_method = 'minus_log'

# # similarities = escape_similarity_site(full_df, 'sitewise_ic90', p=2)
# similarities = escape_similarity_aa(full_df, 'ic90_mean', p=3)
# dissimilarities = similarities.applymap(lambda x: dissimilarity(x, method=dissimilarity_method))
# sera = full_df['serum'].unique()
# n = len(sera)

# mds = sklearn.manifold.MDS(n_components=2,
#                            metric=True,
#                            max_iter=3000,
#                            eps=1e-6,
#                            random_state=12, # this is variable between samples in example, follow up
#                            dissimilarity='precomputed',
#                            n_jobs=1)
# locs = mds.fit_transform(dissimilarities)

In [31]:
# def mds_and_plot(df,
#                  escape_metric='ic90', 
#                  site_or_aa='site', 
#                  p=1,
#                  dissimilarity_method='one_minus',
#                  mds_random_state=1
#                 ):
    
#     # compute similarities and dissimilarities, and get full list of sera
#     similarities = escape_similarity(df, escape_metric, site_or_aa, p)
#     dissimilarities = similarities.applymap(lambda x: dissimilarity(x, method=dissimilarity_method))
#     sera = df['serum'].unique()
    
#     # use MDS to project dissimilarities into 2D space, and get array of serum profile coordinates
#     mds = sklearn.manifold.MDS(n_components=2,
#                                metric=True,
#                                max_iter=3000,
#                                eps=1e-6,
#                                random_state=mds_random_state, 
#                                dissimilarity='precomputed',
#                                n_jobs=1)
#     locs = mds.fit_transform(dissimilarities)
    
    
    
    
    
    
    
#     # following plotting is pulled from RBD MAP notebook, and is overly complex for what I'm doing here
#     # will edit later:
    
#     # assign colors to age groups
#     color_scheme_df = (df
#                        .groupby(['serum', 'age_cohort'], group_keys=False)
#                        .apply(lambda df: df.sample(1))
#                        .sort_index()
#                        [['serum', 'age_cohort']]
#                       )
    
#     color_dict = {
#         '0-5': '#E69F00',
#         '15-18': '#56B4E9',
#         '40-45': '#009E73'

#     }

#     color_scheme_df['serum'] = color_scheme_df['serum'].astype(str)
#     color_scheme_df['color'] = color_scheme_df['age_cohort'].map(color_dict)

#     dists = [[1] for serum in sera]
#     serum_to_color = color_scheme_df[['serum', 'color']].set_index('serum')['color'].to_dict()
#     colors = [[serum_to_color[serum]] for serum in sera]

#     default_circle_color = 'none'
#     default_label_color = 'black'

#     circle_colors = []

#     for serum in sera:
#         circle_colors.append(default_circle_color)

#     # plot the multidimensional scaling result
#     plot_size = 4
#     fig, ax = plt.subplots(figsize=(plot_size, plot_size))
#     xs = locs[:, 0]
#     ys = locs[:, 1]
#     for x, y, dist, color, circle_color in zip(xs, ys, dists, colors, circle_colors):
#         draw_pie(dist, x, y,
#                  size=100,
#                  ax=ax,
#                  colors=color,
#                  alpha=0.7,
#                  circle_color=circle_color,
#                  )
#     ax.set_aspect('equal', adjustable='box')  # same distance on both axes
#     ax.set_xticks([])  # no x-ticks
#     ax.set_yticks([])  # no y-ticks
#     ax.margins(0.09)  # increase padding from axes

    
#     plt.show(fig)
#     plt.close(fig)   

In [32]:
# Scratch check: build the site x serum matrix of summed IC90 escape, L2-normalize it
# with scikit-learn, and look at the largest normalized value in each column.
test = escape_df[['serum', 'site', 'sitewise_ic90']].drop_duplicates()
pivoted_df = (
    test.assign(metric=lambda x: x['sitewise_ic90']**1)
    .pivot_table(index='site',
                 columns='serum',
                 values='metric',
                 fill_value=0)
    # earlier per-serum (column-wise) L2 normalization, kept for reference:
    # .transform(lambda x: x / numpy.linalg.norm(x, axis=0))
)

# NOTE(review): axis=1 normalizes each row (site) across sera, whereas the earlier
# approach above normalized each serum column (axis=0) -- confirm which is intended.
data_normalized = sklearn.preprocessing.normalize(pivoted_df, axis=1)
# pivoted_df_normalized = pd.DataFrame(data_normalized, columns=pivoted_df.columns)
# column-wise maximum of the normalized matrix (columns correspond to sera in pivoted_df)
pd.DataFrame(data_normalized).max()

0     0.610892
1     0.548348
2     0.603180
3     0.707784
4     0.454329
5     0.484798
6     0.426257
7     0.344771
8     0.436919
9     0.616638
10    0.863869
11    0.631699
12    0.621125
13    0.824801
14    0.520215
15    0.831705
16    0.394208
17    0.408042
18    0.564152
19    0.796123
20    0.604010
21    0.422618
22    0.500759
23    0.497525
24    0.624584
25    0.470961
26    0.614906
27    0.852818
28    0.514162
29    0.581065
dtype: float64

In [33]:
# Scratch check: min-max scale each serum column of the site x serum matrix of summed
# IC90 escape to (-1, 1) and inspect the result.
test = escape_df[['serum', 'site', 'sitewise_ic90']].drop_duplicates()
pivoted_df = test.pivot_table(index='site',
                              columns='serum',
                              values='sitewise_ic90',
                              fill_value=0)

# MinMaxScaler is already imported in the imports cell at the top of the notebook
scaler = MinMaxScaler(feature_range=(-1, 1))

# scale each column (i.e. serum) independently, keeping the serum names as column labels
data_normalized = scaler.fit_transform(pivoted_df.values)
pivoted_df_normalized = pd.DataFrame(data_normalized, columns=pivoted_df.columns)
pivoted_df_normalized

serum,150C,18C,197C,199C,210C,215C,2323,2343,2350,2365,...,3857,3862,3866,3895,3944,3973,4299,4584,68C,74C
0,-0.396735,0.394088,0.166275,0.473356,0.838387,0.174453,-0.270607,0.459719,0.011037,-0.192784,...,0.208282,0.469447,-0.494513,-0.490664,-0.353995,0.370442,0.183813,-0.662019,0.472826,0.824505
1,-0.388777,0.394164,0.164936,0.396800,0.837069,0.166107,-0.294448,0.405566,0.010438,-0.193576,...,0.200985,0.444131,-0.477410,-0.483998,-0.366426,0.391464,0.211913,-0.637502,0.464795,0.809287
2,-0.389365,0.385773,0.164936,0.396800,0.840702,0.166107,-0.300633,0.408433,0.002307,-0.187822,...,0.223915,0.484237,-0.483857,-0.493271,-0.365540,0.373570,0.239038,-0.658897,0.480288,0.829222
3,-0.490222,0.422502,0.127144,0.397156,0.652386,0.106251,-0.396350,0.414664,-0.035371,-0.123206,...,0.241479,0.440197,-0.531189,-0.585573,-0.428527,0.312157,-0.039419,-0.634449,0.476711,0.840580
4,-0.438963,0.449588,0.126860,0.325896,0.755764,0.104025,-0.337677,0.326040,-0.005853,-0.260443,...,0.150138,0.450518,-0.425714,-0.490655,-0.441456,0.338792,0.069349,-0.735432,0.335288,0.800843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
409,-0.382570,0.391045,0.164124,0.398100,0.846340,0.166646,-0.297740,0.370061,0.014796,-0.202582,...,0.207616,0.497519,-0.473663,-0.474297,-0.358876,0.363081,0.258194,-0.660674,0.461550,0.821192
410,-0.361009,0.363465,0.128916,0.350708,0.830065,0.119407,-0.284327,0.360273,0.005480,-0.192158,...,0.205279,0.476643,-0.473638,-0.462913,-0.363377,0.382287,0.360085,-0.659136,0.490202,0.780213
411,-0.401257,0.376184,0.181308,0.380153,0.831661,0.153798,-0.337206,0.407161,-0.001517,-0.214077,...,0.212991,0.456015,-0.476587,-0.506024,-0.388854,0.321151,0.135052,-0.652392,0.450914,0.781958
412,-0.415307,0.371121,0.176085,0.378964,0.794491,0.144304,-0.308313,0.406084,0.012507,-0.253285,...,0.221405,0.508169,-0.458232,-0.483107,-0.390657,0.380815,0.240839,-0.661210,0.441663,0.812615
