## Test Batch Normalization on tissues


**Question:** How does the ComBat batch normalization algorithm perform on adjacent tissue sections stained with the same panel when it is trained with unmatched cores?

**Samples:** 
- TMA: Purchased from biomax: https://www.biomax.us/tissue-arrays/Breast/BR1506
- Adjacent Sections: BM-Her2N75-15, BM-Her2N75-17, BM-Her2N75-18 (section 16 skipped.)
- Scenes: (i.e. TMA cores) 
  - 17: ER+/HER2+, immune rich. 
  - 49: ER+/HER2+
  - 59: HER2+ immune rich

**Method**: We performed combat normalization using unlike tissues as training set (different cores in training set and all cores in testing set). We visualized the resulting histograms.

In [None]:
#load libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import copy
import seaborn as sns
import scipy
import scanpy as sc
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale, minmax_scale, StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib as mpl
import util
# max_open_warning = 0 turns off matplotlib's warning about too many open figures.
mpl.rc('figure', max_open_warning = 0)
#os.chdir('/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cmIF')
#from mplex_image import visualize as viz, process, preprocess, normalize
# NOTE(review): the second call overrides the first, so the effective NumPy seed is 1211.
np.random.seed(1202)
np.random.seed(1211)

In [None]:
#change to correct directory
# Project root on the shared drive; the paths below are derived from it.
os.chdir('/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cycIF_ValidationStudies/cycIF_Validation')
codedir=os.getcwd()
rootdir = f'{codedir}/Data/'
datadir = f'{codedir}/Data/filtered_data'
# Work inside the filtered-data folder so later cells can open CSVs by bare filename.
os.chdir(datadir)
%matplotlib inline

In [None]:
# load data
# The first CSV column is used as the row index.
df = pd.read_csv('20201229_BM-Her2N75-15-17-18_MeanIntensity.csv', index_col=0)
# Later cells refer to the slide identifier column as 'batch'.
df = df.rename(columns={'slide': 'batch'})

In [None]:
# Mean of the selected ring-intensity columns per slide_scene.
# The columns are selected before aggregating so that non-numeric columns
# (which make groupby().mean() raise in pandas >= 2.0) are never averaged.
df.groupby('slide_scene')[['CK7_Ring','CK14_Ring','CK5_Ring','CK19_Ring','Ecad_Ring','HER2_Ring']].mean()

In [None]:
#all cores
# Baseline: fit ComBat on the full table and apply it to the same table.
# Only float64 columns are used, and the table is transposed before it is passed to util.
data = df.loc[:,df.dtypes=='float64'].T
batch = df.batch
gamma_star, delta_star, stand_mean, var_pooled = util.combat_fit(data, batch)
#transform
bayesdata = util.combat_transform(data,batch,gamma_star, delta_star, stand_mean, var_pooled)
df_norm=bayesdata.T
s_train='all'
s_tissue = 'all'
# The batch label is the index text before the first underscore.
df_norm['batch'] = [item.split('_')[0] for item in df_norm.index]
# NOTE(review): presumably returns a dict of figures (the next cell iterates d_fig.items()) - confirm in util.
d_fig = util.plot_histograms(df_norm,df,s_train,s_tissue)

In [None]:
# Save only the first figure held in d_fig.
for s_marker in list(d_fig)[:1]:
    fig = d_fig[s_marker]
    fig.savefig(f'{rootdir}/20201228/Different_Scaling_combat_training_{s_train}_{s_marker}_{s_tissue}.png')

## Training combat with controls

How sensitive is the combat algorithm to the inputs that determine the parameters?

- Can different tissues in each batch be used to fit combat?
- Can a limited set of tissues be used to fit combat?
- Or, should the set of tissues used to fit combat be very similar to those it is applied to?

In [None]:
# Three orderings of the three slides; each ordering is a rotation of the first.
lls_batch = [
    [f'BM-Her2N75-{i_section}' for i_section in ls_section]
    for ls_section in ([15, 17, 18], [17, 18, 15], [18, 15, 17])
]
# Distinct scene names present in the data, in sorted order.
ls_scene = sorted(set(df['scene']))

In [None]:
#normalize with different training sets
# Each training set pairs a different scene with each slide: the i-th slide in
# ls_batch contributes only the i-th scene of ls_train_scene.
ls_train_scene = ['scene017', 'scene049', 'scene059']
for idxx, ls_batch in enumerate(lls_batch):
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    data = pd.concat(
        [df.loc[((df.scene==s_scene)&(df.batch==s_batch)),:]
         for s_scene, s_batch in zip(ls_train_scene, ls_batch)]
    )
    #fit training set
    gamma_star, delta_star, stand_mean, var_pooled = util.combat_fit(data.loc[:,data.dtypes=='float64'].T, data.batch)
    #transform full data set
    bayesdata = util.combat_transform(df.loc[:,df.dtypes=='float64'].T,df.batch,gamma_star, delta_star,stand_mean, var_pooled)
    df_norm=bayesdata.T
    # label built from the text after the last '-' of each slide name plus its scene ('scene' shortened to 's')
    s_train = "_".join([(item + ls_scene[idx]).split('-')[-1].replace('scene','s') for idx, item in enumerate(ls_batch)])
    s_tissue = 'diff'
    # only the d_fig of the last iteration is still bound after the loop
    d_fig = util.plot_histograms(df_norm,data,s_train,s_tissue)
    #break
    # fixed random_state keeps the 5400-row sample reproducible
    df_norm.sample(5400,random_state=3).to_csv(f'20210301-{idxx}_BM-Her2N75_SampledMeanIntensity_diff_train.csv')


In [None]:

# Save only the first figure held in d_fig.
for s_marker in list(d_fig)[:1]:
    fig = d_fig[s_marker]
    fig.savefig(f'{rootdir}/20201228/Different_Scaling_combat_training_{s_train}_{s_marker}_{s_tissue}.png')

In [None]:
#normalize with same training sets
for idx, ls_batch in enumerate(lls_batch):
    # every slide contributes the same scene to this training set
    s_scene = ['scene017', 'scene049', 'scene059'][idx]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    data = pd.concat([df.loc[((df.scene==s_scene)&(df.batch==s_batch)),:] for s_batch in ls_batch])
    #fit training set
    gamma_star, delta_star, stand_mean, var_pooled = util.combat_fit(data.loc[:,data.dtypes=='float64'].T, data.batch)
    #transform full data set
    bayesdata = util.combat_transform(df.loc[:,df.dtypes=='float64'].T,df.batch,gamma_star, delta_star, stand_mean, var_pooled)
    df_norm=bayesdata.T
    s_train = "_".join([(item + s_scene).split('-')[-1].replace('scene','s') for item in ls_batch])
    s_tissue = 'same'
    d_fig = util.plot_histograms(df_norm,data,s_train,s_tissue)
    # NOTE(review): this break stops after the first training set and leaves the
    # CSV export below unreachable; remove it to regenerate the *_same_train.csv files.
    break
    df_norm.sample(5400,random_state=3).to_csv(f'20210301-{idx}_BM-Her2N75_SampledMeanIntensity_same_train.csv')


In [None]:
# Save only the first figure held in d_fig.
for s_marker in list(d_fig)[:1]:
    fig = d_fig[s_marker]
    fig.savefig(f'{rootdir}/20201228/Different_Scaling_combat_training_{s_train}_{s_marker}_{s_tissue}.png')

Note: 
20210301-0_BM-Her2N75-15-17-18_kbet_same_train.csv gave the worst kBET results.
This is the ER+/HER2+, immune-rich core.
The markers that failed were CK14, CK5, CK7, and Ecad; HER2 and CK19 failed to a lesser degree.

In [None]:
#normalize with sampled training sets
ls_date = ['20201207','20201208','20201209']
for idx, s_date in enumerate(ls_date):
    # training set: the sampled raw-intensity CSV saved under this date prefix
    data = pd.read_csv(f'{s_date}_BM-Her2N75-15-17-18_SampledMeanIntensity_raw.csv',index_col=0)
    # the batch label is the index text before the first underscore
    data['batch'] = [item.split('_')[0] for item in data.index]
    #fit training set
    gamma_star, delta_star, stand_mean, var_pooled = util.combat_fit(data.loc[:,data.dtypes=='float64'].T, data.batch)
    #transform full data set
    bayesdata = util.combat_transform(df.loc[:,df.dtypes=='float64'].T,df.batch,gamma_star, delta_star, stand_mean, var_pooled)
    df_norm=bayesdata.T
    s_train = s_date
    s_tissue = 'sampled'
    d_fig =util.plot_histograms(df_norm,data,s_train,s_tissue)
    # NOTE(review): this break stops after the first date and leaves the CSV export below unreachable.
    break
    df_norm.sample(5400,random_state=3).to_csv(f'20210301-{idx}_BM-Her2N75_SampledMeanIntensity_sampled_train.csv')


In [None]:
# Save only the first figure held in d_fig.
for s_marker in list(d_fig)[:1]:
    fig = d_fig[s_marker]
    fig.savefig(f'{rootdir}/20201228/Different_Scaling_combat_training_{s_train}_{s_marker}_{s_tissue}.png')

## RESTORE methods: Results

What is the best practice for restore normalization?

- Global or local thresholds? - **Local slightly better but neither very good**
- Raw data or arcsinh transformation? - **No difference**
- Divide by threshold or scale values above threshold? - **Scale gives good normalization, but not as good as combat**

In [None]:
# Empty table indexed by the kBET result files found in the current directory.
df_file = pd.DataFrame(
    index=[s_name for s_name in os.listdir() if 'BM-Her2N75-15-17-18_kbet' in s_name]
)

In [None]:
#os.chdir(filterdir)
df_file = pd.DataFrame(index=os.listdir())
df_file = df_file[df_file.index.str.contains('BM-Her2N75-15-17-18_kbet_')]
#ls_train = ['diff_train','raw', 'same_train', 'sampled_train','raw_combat']
ls_method = ['raw', 'raw_combat', 'raw_regress_out', 'restore_local', 'restore_div','restore_div_arcsinh', 'restore_scale','restore_scale_combat']#,'raw_restore_combat' 'raw_restore','raw_restore_regress_out',

#add mean kbet
for s_file in df_file.index:
    df = pd.read_csv(s_file,index_col=0)
    df_file.loc[s_file,'mean_kbet'] = df.loc['mean','kBET.observed']
df_file['norm'] = [item.split('kbet_')[1].split('.csv')[0] for item in df_file.index]
df_file = df_file[df_file.norm.isin(ls_method)]
ls_index= df_file.groupby('norm').mean_kbet.mean().sort_values().index
df_file.groupby('norm').mean_kbet.mean().sort_values()
df_file['Norm'] = pd.Categorical(
    df_file['norm'], 
    categories=ls_index.tolist(), 
    ordered=True
)
%matplotlib inline
fig, ax = plt.subplots(figsize=(5,3),dpi=200)
df_plot = df_file.sort_values('Norm')
sns.lineplot(data=df_plot,x='norm',y='mean_kbet',ax=ax,err_style='bars')
labels = [item.replace('raw_','').replace('_','\n').replace('div','global') for item in ls_index.tolist()]
ax.set_xticks(range(len(df_file.groupby('norm').mean_kbet)))
ax.set_xticklabels(labels,rotation=0)
ax.set_ylabel('Rejection Rate')
ax.set_xlabel('Normalization')
ax.set_title('kBET Evaluation of Batch Correction II')
fig.set_tight_layout(True)
plt.tight_layout
fig.savefig(f'{rootdir}/20201228/BatchEffectII.png',dpi=200)

In [None]:
# Mean kBET value per method. The numeric column is selected explicitly because
# averaging the string column 'norm' raises in pandas >= 2.0.
df_plot.groupby('Norm')[['mean_kbet']].mean()

In [None]:
# Standard deviation of the kBET value per method. The numeric column is selected
# explicitly because the string column 'norm' makes std() raise in pandas >= 2.0.
df_plot.groupby('Norm')[['mean_kbet']].std()

In [None]:
# Plot the kBET values for four selected methods only.
df_select = df_plot[df_plot.Norm.isin(['raw_combat','restore_scale','raw_regress_out','raw'])]
fig, ax = plt.subplots(figsize=(4.1,3),dpi=300)
df_plot = df_select.sort_values('Norm')
sns.lineplot(data=df_plot,x='norm',y='mean_kbet',ax=ax,err_style='bars')
# NOTE(review): labels are hard-coded; confirm they match the plotted order of the methods.
labels = ['combat','RESTORE','regress\n out','raw']
ax.set_xticks(range(len(df_select.groupby('norm').mean_kbet)))
ax.set_xticklabels(labels,rotation=0,fontsize=13)
ax.set_ylabel('Rejection Rate',fontsize=14)
ax.set_xlabel('Normalization',fontsize=14)
ax.set_ylim(.4,1)
ax.set_title('kBET Evaluation of\n Batch Correction', fontsize=18)
# tight layout is enabled on the figure; the bare `plt.tight_layout` reference was a no-op
fig.set_tight_layout(True)
fig.savefig(f'{codedir}/Figures/BatchEffectII_select.png',dpi=300)

In [None]:
# NOTE(review): presumably the IPython automagic that shows the current working directory.
pwd

In [None]:
sns.set_style("whitegrid")
df_pearson = pd.read_csv('Pearson_correlation_tissue_0.6.csv', index_col=0)
# rows ordered from highest to lowest mean value
ls_order = df_pearson.mean(axis=1).sort_values(ascending=False).index.tolist()
#ls_order = ['raw_combat','log2_combat','restore_scale','raw_regress_out','raw','raw_robust','raw_standard']
fig, ax = plt.subplots(figsize=(3.5, 2.6), dpi=300)
# box plot with the individual values drawn on top, using the ordered rows as groups
df_ordered = df_pearson.loc[ls_order].T
sns.boxplot(data=df_ordered, ax=ax, orient='h', showfliers=False, palette='muted')
sns.stripplot(data=df_ordered, ax=ax, orient='h', palette='dark')
#ax.set_title(f'Cluster Correlation \n (resolution {resolution})',fontsize=16)
ax.set_title('Cluster Correlation', fontsize=16)
ax.set_xlabel('Pearson Correlation', fontsize=14)
ax.yaxis.set_label_position("right")
# NOTE(review): tick labels are hard-coded; confirm they match ls_order.
ax.set_yticklabels(['combat','RESTORE','regress out','raw'], fontsize=14)
ax.yaxis.tick_right()
plt.tight_layout()
fig.savefig(f'{rootdir}filtered_data/figures/PearsonCorrelation_tissue_0.6.png', dpi=300)

## Training combat with controls: Results

How sensitive is the combat algorithm to the inputs that determine the parameters?

- Can different tissues in each batch be used to fit combat? - **No, kbet rejection rate is near 1**
- Can a limited set of tissues be used to fit combat? - **Varies, kbet rejection from 0.85 to 1, depending on training tissue**
- Or, should the set of tissues used to fit combat be very similar to those it is applied to? - **Yes, lowest kbet rejection rate**

In [None]:
#add mean kbet
df_select = df_file[df_file.norm.isin(['raw_combat','raw_standard','restore_scale','raw','raw_regress_out','raw_robust','log2_combat'])] #'raw_restore_combat'
sns.set_style("white")
df_file = pd.DataFrame(index=os.listdir())
df_file = df_file[df_file.index.str.contains('BM-Her2N75-15-17-18_kbet_')]
ls_train = ['diff_train','raw', 'same_train', 'sampled_train','raw_combat']
for s_file in df_file.index:
    df = pd.read_csv(s_file,index_col=0)
    df_file.loc[s_file,'mean_kbet'] = df.loc['mean','kBET.observed']
df_file['norm'] = [item.split('kbet_')[1].split('.csv')[0] for item in df_file.index]
df_file = df_file[df_file.norm.isin(ls_train)]
ls_index= df_file.groupby('norm').mean_kbet.mean().sort_values().index
df_file.groupby('norm').mean_kbet.mean().sort_values()
df_file['Norm'] = pd.Categorical(
    df_file['norm'], 
    categories=ls_index.tolist(), 
    ordered=True
)
%matplotlib inline
fig, ax = plt.subplots(figsize=(3.8,3),dpi=300)
df_plot = df_file.sort_values('Norm')
sns.lineplot(data=df_plot,x='norm',y='mean_kbet',ax=ax,err_style='bars')
labels = [item.replace('raw_','baseline_').replace('_','\n').replace('train','training').replace('diff','different') for item in ls_index.tolist()]
ax.set_xticks(range(len(df_file.groupby('norm').mean_kbet)))
ax.set_xticklabels(labels,rotation=90,fontsize=13)
ax.set_ylabel('Rejection Rate',fontsize=14)
ax.set_xlabel('Normalization',fontsize=14)
ax.set_ylim(.4,1)
ax.set_title('kBET Evaluation of\n Combat Parameterization',fontsize=18,x=.43)
fig.set_tight_layout(True)
plt.tight_layout
fig.savefig(f'{codedir}/Figures/BatchEffect_CombatParam.png',dpi=200)

In [None]:
# Mean kBET value per training set. The numeric column is selected explicitly
# because averaging the string column 'norm' raises in pandas >= 2.0.
df_plot.groupby('Norm')[['mean_kbet']].mean()

In [None]:
# Standard deviation of the kBET value per training set. The numeric column is selected
# explicitly because the string column 'norm' makes std() raise in pandas >= 2.0.
df_plot.groupby('Norm')[['mean_kbet']].std()

In [None]:
# Show the per-file kBET table sorted by the 'Norm' column.
df_file.sort_values('Norm')

In [None]:
# CSV files processed by the clustering loop further down.
# NOTE: this rebinds ls_index, which the kBET cells above use for the ordering of their x-axis labels.
ls_index = [ '20201208_BM-Her2N75-15-17-18_SampledMeanIntensity_raw.csv',
 '20210301-2_BM-Her2N75_SampledMeanIntensity_same_train.csv',
 '20210301-2_BM-Her2N75_SampledMeanIntensity_diff_train.csv',
 '20210301-0_BM-Her2N75_SampledMeanIntensity_sampled_train.csv',
 '20201209_BM-Her2N75-15-17-18_SampledMeanIntensity_raw_combat.csv'
]

In [None]:

# Default marker list.
marker_genes = ['CD44', 'CD45', 'CD4', 'CD68', 'CK14', 'CK19', 'CK5', 'CK7', 'ER',
       'Ecad', 'Ki67', 'PD1', 'Vim', 'aSMA', 'pHH3']
# Scene name -> short tissue label. An earlier mapping for scene01-scene13
# was assigned here and immediately overwritten, so it has been dropped.
d_je_tma = {'scene017': 'ER-HER2-imm',
  'scene049': 'ER-HER2',
  'scene059': 'HER2-imm'}

In [None]:
# normalize 
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
%matplotlib inline
#sc.set_figure_params(scanpy=True, fontsize=14)
df_pearson = pd.DataFrame()
for s_index in ls_index:
    print(s_index)
    df = pd.read_csv(s_index,index_col=0)
    df.columns = [item.split('_')[0] for item in df.columns]
    marker_genes = df.columns[df.dtypes=='float64'].tolist()
    adata = sc.AnnData(df.loc[:,df.dtypes=='float64'])
    adata.obs['batch'] = [item.split('_scene')[0].split('BM-Her2N75-')[1] for item in adata.obs.index]
    adata.obs['scene'] = [item.split('_')[1] for item in adata.obs.index]
    s_norm = ''#s_index.split('SampledMeanIntensity_')[1].split('.')[0]
    adata.raw = adata
    #reduce dimensionality
    sc.tl.pca(adata, svd_solver='auto')
    #save normalized data
    df = pd.DataFrame(data=adata.X,index=adata.obs.index,columns=adata.var.index)
    df['batch'] = [item.split('_scene')[0] for item in df.index]
    #df.to_csv(f'{s_index.replace(".csv",f"_{s_norm}.csv")}')
    # calculate neighbors     
    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=15)
    sc.tl.umap(adata)
    #umap plot
    s_type = s_index.split('SampledMeanIntensity_')[1].split('.')[0]
    figname = f"UmapBatch_param_{s_type}_{s_norm}.png"
    fig,ax = plt.subplots(figsize=(5,5), dpi=200)
    sc.pl.umap(adata, color='batch',title=f"{s_type.replace('_',' ')} {s_norm}",wspace=.25,ax=ax,save=figname)
    fig,ax = plt.subplots(figsize=(5,5), dpi=200)
    figname = f'UmapScene_param_{s_type}_{s_norm}.png'
    fig = sc.pl.umap(adata, color='scene',save=figname,ax=ax,title=f"{s_type.replace('_',' ')} {s_norm}")
    X_pca = adata.obsm['X_pca'] 
    #leiden
    #resolution=0.25
    resolution=0.4
    #resolution=0.6
    sc.tl.leiden(adata,resolution=resolution)
    fig,ax = plt.subplots(figsize=(4.5,5),dpi=200)
    figname=f'leiden_param_{s_type}_{s_norm}{resolution}.png'
    sc.pl.umap(adata, color='leiden',ax=ax,save=figname)
    fig,ax = plt.subplots(figsize=(6,3.7), dpi=200)
    figname=f'Matrixplot_param_leiden_{s_type}_{s_norm}{resolution}.png'
    sc.pl.matrixplot(adata, var_names=marker_genes, groupby=f'leiden',title=s_type.replace('_',' '),
                     dendrogram=True,ax=ax,save=figname,standard_scale='var',colorbar_title='Relative\nintensity')
    df = pd.DataFrame(data=adata.raw.X,index=adata.obs.index,columns=adata.var.index)
    df['leiden'] = adata.obs['leiden']
    #stacked bar
    s_trans = s_index.split('Intensity_')[1].split('.')[0]
    df['slide'] = [item.split('_')[0] for item in df.index]
    df['scene'] = [d_je_tma[item.split('_')[1]] for item in df.index]
    df['slide_scene'] = df.slide + '_' + df.scene
    df_prop = (df.groupby([f'leiden','slide_scene']).CD4.count())/(df.groupby(['slide_scene']).CD4.count())
    df_prop = df_prop.unstack().fillna(value=0).T
    #barplot
    fig,ax=plt.subplots(figsize=(5,3.7), dpi=200)
    df_prop.columns = df_prop.columns.add_categories(['slide','scene'])
    df_prop.index = [item.replace('BM-Her2N75-','') for item in df_prop.index]
    df_prop['slide'] =[item.split('_')[0] for item in df_prop.index]
    df_prop['scene'] =[item.split('_')[1] for item in df_prop.index]
    df_prop.sort_values(['scene','slide']).plot(kind='bar',stacked=True,ax=ax,legend=True,cmap='tab20',width=.8)
    ax.legend(bbox_to_anchor=(1.02, 1.2), ncol=1,fontsize=12)
    ax.set_ylabel('Fraction Positive')
    labels = ax.get_xticklabels()
    ax.set_xticklabels(labels, fontsize=14)
    ax.set_title(f"{s_trans.replace('_',' ')} {s_norm}")
    ax.grid(False)
    plt.tight_layout()
    fig.savefig(f'./figures/StackedBar_param_{s_trans}_{s_norm}{resolution}_Leiden.png')
    #pearson
    ls_core = sorted(set(df_prop.scene))
    se_all = pd.Series(dtype='float64',name=s_trans)
    for s_core in ls_core:
        test = df_prop.loc[df_prop.scene==s_core,df_prop.dtypes=='float64']
        se_test = pd.Series([scipy.stats.pearsonr(test.iloc[0],test.iloc[1])[0],scipy.stats.pearsonr(test.iloc[1],test.iloc[2])[0],scipy.stats.pearsonr(test.iloc[2],test.iloc[0])[0]])
        se_all = se_all.append(se_test)
    print(se_all.mean())
    se_all.name = s_trans
    se_all.index = (range(len(se_all)))
    df_pearson = df_pearson.append(se_all)
    #break

In [None]:
#df_pearson.to_csv(f'20210301_Pearson_correlation_param{resolution}.csv')
# Show the data root path.
rootdir

In [None]:
resolution = 0.6
#df_pearson.to_csv(f'Pearson_correlation_param{resolution}.csv')
df_pearson = pd.read_csv(f'20210301_Pearson_correlation_param{resolution}.csv',index_col=0)
ls_order = df_pearson.mean(axis=1).sort_values(ascending=False).index.tolist()
%matplotlib inline
s_date = '20201228'
fig,ax = plt.subplots(figsize=(4,4),dpi=200)
sns.boxplot(data=df_pearson.loc[ls_order].T,ax=ax,orient='h',showfliers=False,palette='muted')
sns.stripplot(data=df_pearson.loc[ls_order].T,ax=ax,orient='h',palette='dark')
ax.set_title(f'Cluster Correlation \n (resolution {resolution})',fontsize=16)
#ax.set_title(f'Cluster Correlation',fontsize=16)
ax.set_xlabel('Pearson Correlation',fontsize=14)
if resolution == 0.6:
    #pass
    ax.set_yticklabels(['sampled training','baseline combat','same training','raw','different training'],fontsize=14)
else:
    ax.set_yticklabels(['baseline combat','same training','raw','sampled training','different training'],fontsize=14)
ax.yaxis.set_label_position("right")
ax.yaxis.tick_right()
plt.tight_layout()
fig.savefig(f'{codedir}/Figures/PearsonCorrelation_param{resolution}.png',dpi=300)

In [None]:
# Mean across the columns for each row of df_pearson.
df_pearson.mean(axis=1)

In [None]:
# Standard deviation across the columns for each row of df_pearson.
df_pearson.std(axis=1)

In [None]:
resolution = 0.6
sns.set_style("whitegrid")
df_pearson = pd.read_csv(f'20210301_Pearson_correlation_param{resolution}.csv',index_col=0)
ls_order = df_pearson.mean(axis=1).sort_values(ascending=False).index.tolist()
%matplotlib inline
s_date = '20201228'
fig,ax = plt.subplots(figsize=(4.5,3),dpi=300)
sns.boxplot(data=df_pearson.loc[ls_order].T,ax=ax,orient='h',showfliers=False,palette='muted')
sns.stripplot(data=df_pearson.loc[ls_order].T,ax=ax,orient='h',palette='dark')
ax.set_title(f'Cluster Correlation',fontsize=16) #\n (resolution {resolution}
ax.set_xlabel('Pearson Correlation',fontsize=14)
if resolution == 0.6:
    ax.set_yticklabels(['sampled training','baseline combat','same training','raw','different training'],fontsize=14)
else:
    pass
    ax.set_yticklabels(['baseline combat','same training','sampled training','raw','different training'],fontsize=14)
ax.yaxis.set_label_position("right")
ax.yaxis.tick_right()
plt.tight_layout()
fig.savefig(f'{codedir}/Figures/PearsonCorrelation_param.png',dpi=300)

In [None]:
# Mean across the columns for each row of df_pearson.
df_pearson.mean(axis=1)

In [None]:
# Standard deviation across the columns for each row of df_pearson.
df_pearson.std(axis=1)