# Testing the RESTORE algorithm

I ran the RESTORE algorithm with scaling (https://www.biorxiv.org/content/10.1101/2020.09.30.321539v1.full.pdf) on my replicate data to see how well it normalized. 

The conclusion: "scaling" (i.e., setting everything below the threshold to a random value between 0 and 0.02 and linearly scaling everything above the threshold to between 0.02 and 1) normalizes better than dividing by the threshold, but not as well as other methods (e.g., ComBat).

### Adapted From Single-cell analysis hands-on session
Erik Burlingame, Chang Lab, OHSU, 2020-12-2


Note: This notebook requires a special python environment and the RESTORE code

*please see:*
https://gitlab.com/eburling/SCA

In [None]:
import sklearn
import os
import pandas as pd # dataframe operations
import seaborn as sns # clustergram viz
import numpy as np # array operations
import holoviews as hv # viz
import datashader as ds # viz for large data
from sklearn.preprocessing import minmax_scale # data scaling
from holoviews.operation.datashader import datashade # viz for large data
from colorcet import fire, glasbey_hv # perceptually accurate colormaps
hv.extension('bokeh') # specify which library for plotting, e.g. 'bokeh' or 'matplotlib'
import matplotlib.pyplot as plt

In [None]:
# Move into the directory holding the collaborator code before importing it.
# NOTE(review): presumably RESTORE is found via the working directory, so the
# chdir has to run before the import -- confirm.
os.chdir('../Collaborators')
import RESTORE # normalization code, clone Erik's repository https://gitlab.com/eburling/SCA
#also follow his instructions for building the rapids environment

In [None]:
#load data
# Paths are resolved relative to the current working directory, and the
# working directory is then switched to <cwd>/Data/filtered_data so the CSV
# names below (and in later cells) can be given without a directory.
#change to correct directory
#os.chdir('/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cycIF_ValidationStudies/cycIF_Validation')
codedir = os.getcwd()
rootdir = f'{codedir}/Data/'
datadir = f'{codedir}/Data/filtered_data'
os.chdir(datadir)
s_type = "BM-Her2"#'JE-TMA'
if s_type == 'sampled':
    #load sampled data
    s_date = '20201209'
    df = pd.read_csv(f'{s_date}_JE-TMA-41-43-62_SampledMeanIntensity_{s_type}.csv', index_col=0)
    #df=pd.read_csv(f'{s_date}_BM-Her2N75_SampledMeanIntensity_{s_type}.csv',index_col=0) # for Biomax TMA
elif s_type == "JE-TMA":
    #or load full data
    df = pd.read_csv('20201210_JE-TMA-41-43-62_FilteredMeanIntensity.csv', index_col=0)
elif s_type == "BM-Her2":
    df = pd.read_csv('20201229_BM-Her2N75-15-17-18_MeanIntensity.csv', index_col=0)
else:
    # Fail here with a clear message instead of a NameError on `df` below.
    raise ValueError(f"unknown s_type {s_type!r}; expected 'sampled', 'JE-TMA' or 'BM-Her2'")
# the batch label is the part of each row label before the first underscore
df['batch'] = [item.split('_')[0] for item in df.index]

In [None]:
# Marker columns are the float64 columns of the intensity table.
raw_marker_mask = df.dtypes.eq('float64')
raw_marker_cols = df.columns[raw_marker_mask]
# The scene label is everything in the row label before '_cell'.
df['scene'] = [cell_id.partition('_cell')[0] for cell_id in df.index]

## calculate RESTORE thresholds

In [None]:
# find mutually exclusive markers by batch
# For each (query marker, batch) pair, every other marker is scored with
# r = s2 / s1, the ratio of the second to the first singular value of the
# 2 x n_cells matrix [query marker; other marker]. Markers with r > i_low go
# into d_result and those with r > i_thresh into d_result_good, both keyed by
# f'{marker}_{batch}' and holding (marker, r) tuples.
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.decomposition` is loaded.
from sklearn.decomposition import TruncatedSVD

i_thresh = 0.66
i_low = 0.5
d_result = {}
d_result_good = {}
# Slice each batch once; the slice does not depend on the query marker.
d_batch_df = {
    s_batch: df.loc[df.batch == s_batch, raw_marker_cols]
    for s_batch in sorted(set(df.batch))
}
for QUERY_MARKER in raw_marker_cols:
    for QUERY_SCENE, query_df in d_batch_df.items():
        r_vals = []
        good_r_vals = []
        for marker in query_df:
            if marker == QUERY_MARKER:
                continue
            X = query_df[[QUERY_MARKER, marker]].T
            svd = TruncatedSVD(n_components=2)
            svd.fit(X)
            r = svd.singular_values_[1] / svd.singular_values_[0]
            if r > i_low:
                r_vals.append((marker, r))
            if r > i_thresh:
                good_r_vals.append((marker, r))
        d_result_good[f'{QUERY_MARKER}_{QUERY_SCENE}'] = good_r_vals
        d_result[f'{QUERY_MARKER}_{QUERY_SCENE}'] = r_vals

In [None]:
#select top r vals
# For each marker/batch key keep the marker names that passed the stricter
# cutoff; when none did, fall back to the names that passed the looser one.
d_result_r = {}
for QUERY_MARKER in raw_marker_cols:
    for QUERY_SCENE in sorted(set(df.batch)):
        s_key = f'{QUERY_MARKER}_{QUERY_SCENE}'
        ls_chosen = d_result_good[s_key] or d_result[s_key]
        d_result_r[s_key] = [s_name for s_name, _ in ls_chosen]

In [None]:
#find median threshold of r vals above 0.66, or 0.5 if none above 0.66
# Fills df_result (markers x batches) with the median RESTORE normalization
# factor over the background markers selected for each marker/batch pair.
df_result = pd.DataFrame(index=raw_marker_cols,columns=sorted(set(df.batch)))
for key, items in d_result_r.items():
    # Keys are f'{marker}_{batch}'. The batch label is the row-label prefix
    # before the first underscore, so it contains no underscore itself and
    # splitting at the LAST underscore recovers both parts for any dataset.
    QUERY_MARKER, QUERY_SCENE = key.rsplit('_', 1)
    if not items:
        # no candidate background marker: leave the entry as NaN
        continue
    tissue_mask = (df.batch==QUERY_SCENE)
    query_df = df.loc[tissue_mask,raw_marker_cols]
    a_norm = np.array([])
    for BG_MARKER in items:
        X = RESTORE.process_data(query_df[[QUERY_MARKER, BG_MARKER]],
                        QUERY_MARKER, BG_MARKER)
        norm_factor, clusters = RESTORE.get_ssc_thresh(X)
        a_norm = np.append(a_norm,norm_factor)
    df_result.loc[QUERY_MARKER,QUERY_SCENE] = np.median(a_norm)

In [None]:
#save restore normalization factors
# Rows with any missing factor are dropped; an existing file is never overwritten.
d_normfactor_csv = {
    "JE-TMA": '20201229_JE-TMA-41-43-62_restore_normfactor.csv',
    "BM-Her2": '20201229_BM-Her2N75-15-17-18_restore_normfactor.csv',
}
s_normfactor_csv = d_normfactor_csv.get(s_type)
if s_normfactor_csv is not None and not os.path.exists(s_normfactor_csv):
    df_result.dropna().to_csv(s_normfactor_csv)

## apply RESTORE normalization: original

As we infer background signal based on the negative control, we can scale intensity values by the inferred background signal level of the negative control for individual sample, respectively, to align intensity distribution.(https://doi.org/10.1038/s42003-020-0828-1)

In [None]:
#load saved normalization factors
d_saved_normfactor = {
    "JE-TMA": '20201229_JE-TMA-41-43-62_restore_normfactor.csv',
    "BM-Her2": '20201229_BM-Her2N75-15-17-18_restore_normfactor.csv',
}
if s_type in d_saved_normfactor:
    df_result = pd.read_csv(d_saved_normfactor[s_type], index_col=0)
# apply normalization: division
# Per marker and batch: subtract the batch minimum, then divide by
# (threshold - minimum).
df_norm = pd.DataFrame(index=df.index)
ls_batch = sorted(set(df.batch))
for QUERY_MARKER in df_result.dropna().index.tolist():
    for QUERY_SCENE in ls_batch:
        ls_index = df[df.batch==QUERY_SCENE].index
        se_raw = df.loc[ls_index, QUERY_MARKER]
        i_min = se_raw.min()
        i_thresh = df_result.loc[QUERY_MARKER, QUERY_SCENE]
        i_span = i_thresh - i_min
        df_norm.loc[ls_index, QUERY_MARKER] = (se_raw - i_min) / i_span

In [None]:
# batch label = row-label prefix before the first underscore
df_norm['batch'] = [cell_id.partition('_')[0] for cell_id in df_norm.index]

In [None]:
# echo the currently selected dataset key
s_type

In [None]:
# save for Kbet
# Write the division-normalized rows of the sampled cells for kbet analysis.
s_trans = 'raw'
s_date = '20201209'
if s_type == "JE-TMA":
    df_sample = pd.read_csv(f'{s_date}_JE-TMA-41-43-62_SampledMeanIntensity_{s_trans}.csv', index_col=0)
    ls_cells = df_sample.index
    df_norm.loc[ls_cells, :].to_csv(f'{s_date}_JE-TMA-41-43-62_SampledMeanIntensity_restore_div.csv')
elif s_type == "BM-Her2":
    df_sample = pd.read_csv(f'{s_date}_BM-Her2N75_SampledMeanIntensity_{s_trans}.csv', index_col=0) # for Biomax TMA
    # leave out sampled rows whose label contains 'cell0000'
    b_keep = ~df_sample.index.str.contains('cell0000')
    ls_cells = df_sample.index[b_keep]
    df_norm.loc[ls_cells, :].to_csv(f'{s_date}_BM-Her2N75_SampledMeanIntensity_restore_div.csv')

In [None]:
# process with arcsinh
# Work on the float64 columns of the normalized table only.
b_float = df_norm.dtypes == 'float64'
X = df_norm.loc[:, b_float]

# remove outliers: cap each column at its 99.9th percentile
se_cap = X.quantile(q=.999)
X = X.clip(upper=se_cap, axis=1)

# deskew: arcsinh with a cofactor of 5
X = np.arcsinh(X / 5)

# scale: min-max scale each column
X = X.apply(minmax_scale)

In [None]:
# batch label = row-label prefix before the first underscore
X['batch'] = [cell_id.partition('_')[0] for cell_id in X.index]

In [None]:
# save for Kbet
# Write the arcsinh-processed, division-normalized rows of the sampled cells
# for kbet analysis.
s_trans = 'raw'
s_date = '20201209'
if s_type == "JE-TMA":
    df_sample = pd.read_csv(f'{s_date}_JE-TMA-41-43-62_SampledMeanIntensity_{s_trans}.csv', index_col=0)
    ls_cells = df_sample.index
    X.loc[ls_cells, :].to_csv(f'{s_date}_JE-TMA-41-43-62_SampledMeanIntensity_restore_div_arcsinh.csv')
elif s_type == "BM-Her2":
    df_sample = pd.read_csv(f'{s_date}_BM-Her2N75_SampledMeanIntensity_{s_trans}.csv', index_col=0) # for Biomax TMA
    # leave out sampled rows whose label contains 'cell0000'
    b_keep = ~df_sample.index.str.contains('cell0000')
    ls_cells = df_sample.index[b_keep]
    X.loc[ls_cells, :].to_csv(f'{s_date}_BM-Her2N75_SampledMeanIntensity_restore_div_arcsinh.csv')

In [None]:
#save full dataframe, normalized
# An existing file is never overwritten.
d_norm_div_csv = {
    "JE-TMA": '20201229_JE-TMA-41-43-62_restore_norm_div.csv',
    "BM-Her2": '20201229_BM-Her2N75-15-17-18_restore_norm_div.csv',
}
s_norm_div_csv = d_norm_div_csv.get(s_type)
if s_norm_div_csv is not None and not os.path.exists(s_norm_div_csv):
    df_norm.to_csv(s_norm_div_csv)

## apply RESTORE normalization: scaled

All values below the background level were randomly set within a range
between 0 and 0.02, while all values exceeding the background level (corresponding to
signals) were linearly scaled to a range between 0.02 and 1. Thereby, influence of background
variation on the subsequently applied single-cell analysis was eliminated, while foreground
signals were stretched to a larger dynamic range. (https://www.biorxiv.org/content/10.1101/2020.09.30.321539v1.full.pdf)

In [None]:
# apply normalization: set below threshold to random value between 0 - 0.02
# Per marker and batch: cells below the threshold get a uniform random value
# in [0, 0.02); cells at or above it are min-max scaled into [0.02, 1].
# NOTE: the random values are drawn from numpy's global RNG, so results are
# reproducible only if a seed was set beforehand.
df_norm = pd.DataFrame(index=df.index)
for QUERY_MARKER in df_result.dropna().index.tolist():
    for QUERY_SCENE in sorted(set(df.batch)):
        i_thresh = df_result.loc[QUERY_MARKER,QUERY_SCENE]
        # compute the batch mask once and reuse it for both halves
        b_batch = df.batch == QUERY_SCENE
        se_marker = df.loc[:, QUERY_MARKER]
        ls_index_neg = df.index[b_batch & (se_marker < i_thresh)]
        a_rand = np.random.random_sample((len(ls_index_neg),))*0.02
        df_norm.loc[ls_index_neg,QUERY_MARKER] = a_rand
        ls_index = df.index[b_batch & (se_marker >= i_thresh)]
        if len(ls_index) == 0:
            # minmax_scale raises on an empty array; with no cell at or above
            # the threshold there is nothing to scale for this batch
            continue
        df_norm.loc[ls_index,QUERY_MARKER] = sklearn.preprocessing.minmax_scale(df.loc[ls_index,QUERY_MARKER],feature_range=(0.02,1))


In [None]:
# batch label = row-label prefix before the first underscore
df_norm['batch'] = [cell_id.partition('_')[0] for cell_id in df_norm.index]

In [None]:
#save full normalized dataframes
# An existing file is never overwritten. The existence check and the write use
# the same file name, so the guard works for every dataset.
d_norm_csv = {
    "JE-TMA": '20201229_JE-TMA-41-43-62_restore_norm.csv',
    "BM-Her2": '20201229_BM-Her2N75-15-17-18_restore_norm.csv',
}
s_norm_csv = d_norm_csv.get(s_type)
if s_norm_csv is not None and not os.path.exists(s_norm_csv):
    df_norm.to_csv(s_norm_csv)

In [None]:
#save the 3 normalized, sampled dataframes for kbet analysis
# Write the scale-normalized rows of the sampled cells.
# NOTE(review): s_date is '20201208' here but '20201209' in the other kbet
# export cells -- confirm which sampled files are intended.
s_trans = 'raw'
s_date = '20201208'
if s_type == "JE-TMA":
    df_sample = pd.read_csv(f'{s_date}_JE-TMA-41-43-62_SampledMeanIntensity_{s_trans}.csv', index_col=0)
    ls_cells = df_sample.index
    df_norm.loc[ls_cells, :].to_csv(f'{s_date}_JE-TMA-41-43-62_SampledMeanIntensity_restore_scale.csv')
elif s_type == "BM-Her2":
    df_sample = pd.read_csv(f'{s_date}_BM-Her2N75_SampledMeanIntensity_{s_trans}.csv', index_col=0) # for Biomax TMA
    # leave out sampled rows whose label contains 'cell0000'
    b_keep = ~df_sample.index.str.contains('cell0000')
    ls_cells = df_sample.index[b_keep]
    df_norm.loc[ls_cells, :].to_csv(f'{s_date}_BM-Her2N75_SampledMeanIntensity_restore_scale.csv')

# visualize 

In [None]:

#plot histograms
#save
df_norm['batch'] = [item.split('_')[0] for item in df_norm.index]
#plot 
%matplotlib inline
s_trans = 'scale'#'div'
s_date = s_type #"BM-Her2N75"
bins=50
for s_marker in df_result.dropna().index.tolist():
    print(s_marker)
    fig,ax=plt.subplots(2,1,figsize = (5,5))
    for idxs, s_batch in enumerate(sorted(set(df_norm.batch))):
        df_batch = df_norm[(df_norm.batch==s_batch)].loc[:,s_marker] #+ 1 #set minimum to 1
        if len(df_batch.dropna()) == 0:
            continue
        ax[0].hist(df.loc[df.index.str.contains(s_batch),s_marker],bins=bins,alpha=0.4, color=f'C{idxs}',label=s_batch)
        ax[1].hist(df_batch,bins=bins,alpha=0.4, color=f'C{idxs}',label=s_batch)
        ax[0].axvline(df_result.loc[s_marker,s_batch], c=f'C{idxs}',alpha=0.7,ls='--')
        ax[0].set_yscale('log')
        ax[1].set_yscale('log')
        ax[0].set_title(f'{s_marker.split("_")[0]}: Raw Data')
        ax[1].set_title(f'{s_marker.split("_")[0]}: Restore {s_trans}')
        ax[0].legend()
    plt.tight_layout()
    fig.savefig(f'{rootdir}20201228/Different_Scaling_restore_{s_marker}_{s_trans}_{s_date}.png')