# Testing the RESTORE algorithm

I ran the RESTORE algorithm on my replicate data to see how well it normalized batch effects.

The conclusion: the batch effects after RESTORE normalization were the same as before normalization.

### Adapted From Single-cell analysis hands-on session
Erik Burlingame, Chang Lab, OHSU, 2020-12-2


Note: This notebook requires a special Python environment and the RESTORE code.

*please see:*
https://gitlab.com/eburling/SCA

In [None]:
import sklearn
import os
import pandas as pd # dataframe operations
import seaborn as sns # clustergram viz
import numpy as np # array operations
import holoviews as hv # viz
import datashader as ds # viz for large data
from sklearn.preprocessing import minmax_scale # data scaling
from holoviews.operation.datashader import datashade # viz for large data
from colorcet import fire, glasbey_hv # perceptually accurate colormaps
hv.extension('bokeh') # specify which library for plotting, e.g. 'bokeh' or 'matplotlib'
import matplotlib.pyplot as plt

In [None]:
#os.chdir('../Collaborators')
import RESTORE # normalization code, clone Erik's repository https://gitlab.com/eburling/SCA
#also follow his instructions for building the rapids environment

In [None]:
#load data
# Reads the per-cell mean-intensity table selected by s_type from
# <cwd>/Data/filtered_data and tags every row with its batch (the part of the
# index label before the first underscore).
# NOTE(review): codedir is taken from os.getcwd() and the cell then chdir's into
# datadir, so re-running this cell in the same kernel resolves codedir from
# datadir instead of the project root - chdir back (or restart) before re-running.
#os.chdir('/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cycIF_ValidationStudies/cycIF_Validation')
codedir=os.getcwd()
rootdir = f'{codedir}/Data/'
datadir = f'{codedir}/Data/filtered_data'
os.chdir(datadir)
s_type = "BM-Her2"#'JE-TMA' 

if s_type == 'sampled':
    s_date = '20201209'
    s_trans = 'raw'
    df=pd.read_csv(f'{s_date}_JE-TMA-41-43-62_SampledMeanIntensity_{s_trans}.csv',index_col=0)
    #df=pd.read_csv(f'{s_date}_BM-Her2N75_SampledMeanIntensity_{s_trans}.csv',index_col=0) # for Biomax TMA
#or load full data
elif s_type == "JE-TMA":
    df=pd.read_csv('20201210_JE-TMA-41-43-62_FilteredMeanIntensity.csv',index_col=0)
elif s_type == "BM-Her2":
    df=pd.read_csv('20201229_BM-Her2N75-15-17-18_MeanIntensity.csv',index_col=0)
else:
    # Fail loudly: otherwise df is undefined, or silently reused from a
    # previous run of this cell in the same kernel.
    raise ValueError(
        f"unknown s_type {s_type!r}; expected 'sampled', 'JE-TMA' or 'BM-Her2'"
    )
df['batch'] = [item.split('_')[0] for item in df.index]


In [None]:
# Preview the first rows of the loaded intensity table.
df.head()

In [None]:
# Marker columns are the float64 columns present at this point; they are
# selected before the string-valued 'scene' column is added.
raw_marker_mask = (df.dtypes == 'float64')
raw_marker_cols = df.columns[raw_marker_mask]
# Scene label = everything in the index label before the first '_cell'.
df['scene'] = df.index.map(lambda s_label: s_label.split('_cell')[0])

# local thresholds

In [None]:
#scene by scene (i.e. local thresholds)
# For every (query marker, scene) pair, score each other marker as a candidate
# mutually exclusive partner. The score r is the ratio of the second to the
# first singular value of the 2 x n_cells matrix holding the two markers'
# intensities within that scene.
#   d_result      : key '<marker>_<scene>' -> [(partner, r), ...] with r > i_low
#   d_result_good : key '<marker>_<scene>' -> [(partner, r), ...] with r > i_thresh
#   figs          : datashaded scatter plots for pairs with r > i_plot
# Imported explicitly: `import sklearn` alone does not guarantee that the
# sklearn.decomposition submodule is loaded as an attribute of the package.
from sklearn.decomposition import TruncatedSVD

i_thresh = 0.5 #0.66
i_low = 0.5
i_plot = 0.66  # only pairs above this ratio get a scatter plot
d_result = {}
d_result_good = {}
figs = []
ls_scene = sorted(set(df.scene))  # loop-invariant, computed once
for QUERY_MARKER in raw_marker_cols:
    for QUERY_SCENE in ls_scene:
        tissue_mask = (df.scene==QUERY_SCENE)
        query_df = df.loc[tissue_mask,raw_marker_cols]
        r_vals = []
        good_r_vals = []

        for marker in query_df:
            if marker == QUERY_MARKER:
                continue
            # rows = the two markers, columns = the cells of this scene
            X = query_df[[QUERY_MARKER, marker]].T
            svd = TruncatedSVD(n_components=2)
            svd.fit(X)
            r = svd.singular_values_[1] / svd.singular_values_[0]
            if r > i_plot:
                fig = datashade(hv.Scatter(X.T, QUERY_MARKER, [marker]),
                    cmap=fire,
                    normalization='log')
                figs.append(fig)
            if r > i_low:
                r_vals.append((marker, r))
            if r > i_thresh:
                good_r_vals.append((marker, r))
        s_key = f'{QUERY_MARKER}_{QUERY_SCENE}'
        d_result_good[s_key] = good_r_vals
        d_result[s_key] = r_vals

In [None]:
# List the marker/scene keys whose partner list in d_result is empty,
# print each one, and keep their count in `total`.
ls_no_partner = [key for key, item in d_result.items() if not item]
for key in ls_no_partner:
    print(key)
total = len(ls_no_partner)

In [None]:
#manually add mutually exclusive - sacrifice DAPI and CK17 for JE-TMA
# Hand-picked partner lists, grouped by sample set; only the entries for the
# active s_type are written into d_result_good.
d_manual_partners = {
    "JE-TMA": {
        'HER2_cellmem25_JE-TMA-62_scene09': [('Ki67_nuclei', 0.5725681893270853)],
        'aSMA_perinuc5_JE-TMA-43_scene05': [('CD31_perinuc5', 0.8094300105819111)],
        'aSMA_perinuc5_JE-TMA-43_scene12': [('CK7_cytoplasm', 0.6378682711189212)],
        'Ki67_nuclei_JE-TMA-41_scene13': [('PCNA_nuclei', 0.46020656345741257)],
    },
    "BM-Her2": {
        'CK14_Ring_BM-Her2N75-17_scene017': [('CD68_Ring', 0.6527263360741544)],
        'CK14_Ring_BM-Her2N75-17_scene049': [('CD68_Ring', 0.6527263360741544)],
        'CK5_Ring_BM-Her2N75-17_scene017': [('CD68_Ring', 0.6527263360741544)],
    },
}
d_result_good.update(d_manual_partners.get(s_type, {}))

In [None]:
#JE TMA results
# Hard-coded fractions of marker/scene combinations without a partner
# (presumably counts recorded from earlier runs of the cells above).
# at r val cut off of 0.5, 44% of scenes had no mutually exclusive partners
425/950
# at r val cut off of 0.3, 7% had no mutually exclusive partners
68/950
# at r val cut off of 0.2, 1% had no mutually exclusive partners
10/950

In [None]:
# BM-Her2 results
# at r val cut off of 0.3, 1.6% had no mutually exclusive partners
3/180
# at r val cut off of 0.5, 23% of scenes had no mutually exclusive partners
# `total` is the count of keys with an empty partner list computed earlier in this notebook.
total/180

In [None]:
# plot r vals
# Disabled: the plotting code below sits inside a string literal, so it is
# never executed; it is kept only for reference.
'''
r_box = hv.BoxWhisker(r_vals,'neg_marker','r_val').opts(title=QUERY_MARKER,
                                    xrotation=45,
                                    width=800,
                                    toolbar=None).sort('r_val')
r_box
'''

In [None]:
#bi_scatters = hv.Layout(figs).cols(3)
#bi_scatters

In [None]:
#select top r vals
# For every marker/scene key keep only the partner marker names: the ones that
# passed the stricter cut-off (d_result_good) when there are any, otherwise the
# ones that passed the lower cut-off (d_result).
d_result_r = {}
for QUERY_MARKER in raw_marker_cols:
    for QUERY_SCENE in sorted(set(df.scene)):
        s_key = f'{QUERY_MARKER}_{QUERY_SCENE}'
        es_marker = [item[0] for item in d_result_good[s_key]]
        es_marker_low = [item[0] for item in d_result[s_key]]
        d_result_r[s_key] = es_marker if es_marker else es_marker_low

In [None]:
#find median threshold over the selected partner markers
# (partners above i_thresh, or above i_low when none pass i_thresh)
# For every marker/scene key in d_result_r, run RESTORE against each selected
# partner marker and store the median of the returned normalization factors in
# df_result (rows = markers, columns = scenes). Combinations that yield no
# factor are stored as NaN.
ls_scene = sorted(set(df.scene))
df_result = pd.DataFrame(index=raw_marker_cols,columns=ls_scene)
# Recover (marker, scene) from the key by lookup instead of splitting on a
# sample-set specific prefix ('_JE' / '_BM'): the prefix parsing left
# QUERY_MARKER / QUERY_SCENE stale for any other s_type (e.g. 'sampled').
d_key_pair = {
    f'{s_marker}_{s_scene}': (s_marker, s_scene)
    for s_marker in raw_marker_cols
    for s_scene in ls_scene
}
for key, items in d_result_r.items():
    QUERY_MARKER, QUERY_SCENE = d_key_pair[key]
    tissue_mask = (df.scene==QUERY_SCENE)
    query_df = df.loc[tissue_mask,raw_marker_cols]
    a_norm = np.array([])
    for BG_MARKER in items:
        X = RESTORE.process_data(query_df[[QUERY_MARKER, BG_MARKER]], 
                        QUERY_MARKER, BG_MARKER)
        if len(X) > 2:
            norm_factor, clusters = RESTORE.get_ssc_thresh(X)
            a_norm = np.append(a_norm,norm_factor)
    # np.median of an empty array is nan but emits a RuntimeWarning; write the
    # NaN explicitly so missing factors are intentional and silent.
    df_result.loc[QUERY_MARKER,QUERY_SCENE] = np.median(a_norm) if a_norm.size else np.nan
    #break

In [None]:
# Display the marker x scene table of normalization factors.
df_result

In [None]:
# for JE-TMA half (48%) of the markers did not have sufficient mutual exclusive relationship
# for BM-HER2 all markers had partners!
# Fraction of markers (rows of df_result) that have a factor in every scene.
# NOTE(review): with Jupyter's default display settings only the last
# expression of a cell (s_type) is echoed, so this fraction is not shown - confirm.
len(df_result.dropna())/len(df_result)
s_type

In [None]:
#save restore normalization factors
# One output file per sample set; an existing file is never overwritten.
d_normfactor_out = {
    "JE-TMA": '20201229_JE-TMA-41-43-62_restore_local_normfactor.csv',
    "BM-Her2": '20201229_BM-Her2N75-15-17-18_restore_local_normfactor.csv',
}
s_normfactor_out = d_normfactor_out.get(s_type)
if s_normfactor_out is not None and not os.path.exists(s_normfactor_out):
    df_result.to_csv(s_normfactor_out)

In [None]:
#load saved normalization factors
# For the two full sample sets df_result is re-read from the saved CSV; for any
# other s_type the df_result already in memory is used.
d_normfactor_in = {
    "JE-TMA": '20201229_JE-TMA-41-43-62_restore_local_normfactor.csv',
    "BM-Her2": '20201229_BM-Her2N75-15-17-18_restore_local_normfactor.csv',
}
if s_type in d_normfactor_in:
    df_result = pd.read_csv(d_normfactor_in[s_type],index_col=0)
# apply normalization: division
# Per marker and scene: subtract the scene minimum and divide by
# (normalization factor - scene minimum). Only markers with a factor in every
# scene (rows of df_result without NaN) are normalized.
df_norm = pd.DataFrame(index=df.index)
ls_norm_marker = df_result.dropna().index.tolist()
ls_norm_scene = sorted(set(df.scene))
for QUERY_MARKER in ls_norm_marker:
    for QUERY_SCENE in ls_norm_scene:
        ls_index = df.index[(df.scene == QUERY_SCENE).values]
        se_scene = df.loc[ls_index,QUERY_MARKER]
        i_min = se_scene.min()
        i_thresh = df_result.loc[QUERY_MARKER,QUERY_SCENE]
        df_norm.loc[ls_index,QUERY_MARKER] = (se_scene - i_min)/(i_thresh - i_min)

In [None]:
# Batch label = the part of each index label before the first underscore.
df_norm['batch'] = df_norm.index.map(lambda s_label: s_label.split('_')[0])

In [None]:
#save the 3 normalized, sampled dataframes for kbet analysis
# Read the previously sampled table only to obtain its row labels, then write
# the matching rows of df_norm next to it.
s_trans ='raw'
s_date = '20201207'
if s_type == "JE-TMA":
    df_sample=pd.read_csv(f'{s_date}_JE-TMA-41-43-62_SampledMeanIntensity_{s_trans}.csv',index_col=0)
    #save for kbet
    ls_sampled = df_sample.index
    df_norm.loc[ls_sampled,:].to_csv(f'{s_date}_JE-TMA-41-43-62_SampledMeanIntensity_restore_local.csv')
elif s_type == "BM-Her2":
    df_sample=pd.read_csv(f'{s_date}_BM-Her2N75_SampledMeanIntensity_{s_trans}.csv',index_col=0) # for Biomax TMA
    #save for kbet
    # rows whose label contains 'cell0000' are left out of the export
    b_cell0000 = df_sample.index.str.contains('cell0000')
    ls_sampled = df_sample.index[~b_cell0000]
    df_norm.loc[ls_sampled,:].to_csv(f'{s_date}_BM-Her2N75_SampledMeanIntensity_restore_local.csv')

In [None]:

#plot histograms
#save
df_norm['batch'] = [item.split('_')[0] for item in df_norm.index]
#plot 
%matplotlib inline
s_trans = 'local'
s_date = s_type
bins=50
for s_marker in df_result.dropna().index.tolist():
    print(s_marker)
    fig,ax=plt.subplots(2,1,figsize = (5,5))
    for idxs, s_batch in enumerate(sorted(set(df_norm.batch))):
        df_batch = df_norm[(df_norm.batch==s_batch)].loc[:,s_marker] + 1 #set minimum to 1
        if len(df_batch.dropna()) == 0:
            continue
        ax[1].hist(df_batch,bins=bins,alpha=0.4, color=f'C{idxs}',label=s_batch)
        ax[0].hist(df.loc[df.index.str.contains(s_batch),s_marker],bins=bins,alpha=0.4, color=f'C{idxs}',label=s_batch)
        ax[0].set_yscale('log')
        ax[1].set_yscale('log')
        ax[0].set_title(f'{s_marker.split("_")[0]}: Raw Data')
        ax[1].set_title(f'{s_marker.split("_")[0]}: Restore {s_trans}')
        ax[0].legend()
    plt.tight_layout()
    fig.savefig(f'{rootdir}/20201228/Different_Scaling_restore_{s_marker}_{s_trans}_{s_date}.png')
