## Libraries

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import gmean, mannwhitneyu, pearsonr, zscore, chi2
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.utils import resample
from sklearn.feature_selection import VarianceThreshold
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.cluster import KMeans
import plotly.express as px
import plotly.graph_objects as go
from plotly.validators.scatter.marker import SymbolValidator
from clustergrammer2 import net, Network, CGM2
from BatchEffectDataLoader import DataPreprocess, DataTransform, DataReverseTransform
from BatchEffectPlots import plotPCA, plotPCoA, plotOTUBox, plotRLE, plotClusterHeatMap
from BatchEffectCorrection import correctCombat, correctLimma_rBE, correctSVD, correctPLSDA_batch
from BatchEffectMetrics import kBET, iLISI, cLISI, ASW, ARI
from skbio.stats.distance import permanova, DistanceMatrix
from scipy.spatial.distance import pdist, squareform

>> clustergrammer2 backend version 0.18.0


## Initial data preparation

In [2]:
# Location of the input CSV; `path` is reused by the DRE-analysis cell further down.
path = "data/dataset_sponge.csv"
# Output of the project loader DataPreprocess called with its default options.
raw_data = DataPreprocess(path)
# Output of DataTransform applied to raw_data; this is the frame passed to the
# plotting, correction and metric calls below.
data = DataTransform(raw_data)

## Batch Effect Assessment plots

### PCoA plot with Aitchison distances

In [3]:
# Baseline PCoA on raw_data (not the DataTransform output), explicitly requesting the "aitchison" method.
plotPCoA(raw_data, method = "aitchison")

### PCA plot

In [4]:
# PCA plot of `data`, the DataTransform output, before any batch correction.
plotPCA(data)

### OTU Boxplot

In [5]:
# OTU boxplot of `data`, the DataTransform output, before any batch correction.
plotOTUBox(data)

### RLE plot

In [6]:
# RLE plot of `data`, the DataTransform output, before any batch correction.
plotRLE(data)

### Heat Map cluster

In [7]:
# Clustered heat map of `data`; the cell output below is a clustergrammer2 CGM2 widget.
plotClusterHeatMap(data)

CGM2(network='{"row_nodes": [{"name": "OTU1", "ini": 24, "clust": 22, "rank": 10, "rankvar": 1}, {"name": "OTU…

## Batch Effect Correction

### ComBat

In [8]:
# ComBat correction of `data` with the wrapper's default arguments; the result is
# reused by the plotting, metric and PERMANOVA cells below.
data_combat = correctCombat(data)

Found 2 batches.
Adjusting for 1 covariate(s) or covariate level(s).
Standardizing Data across genes.
Fitting L/S model and finding priors.
Finding parametric adjustments.
Adjusting the Data


In [12]:
# PCoA of the ComBat output after passing it through DataReverseTransform together with raw_data.
# NOTE(review): the baseline plot passes method="aitchison" but this call relies on
# plotPCoA's default method — confirm the default matches so the plots are comparable.
plotPCoA(DataReverseTransform(data_combat, raw_data))

### limma - removeBatchEffect

In [13]:
# limma removeBatchEffect wrapper applied to `data`, with "tissue" passed as the covariate label
# (presumably so the tissue signal is preserved during correction — confirm in BatchEffectCorrection).
data_limma = correctLimma_rBE(data, covariates_labels = ["tissue"])

In [14]:
# PCoA of the limma output after passing it through DataReverseTransform together with raw_data.
# NOTE(review): the baseline plot passes method="aitchison" but this call relies on
# plotPCoA's default method — confirm the default matches so the plots are comparable.
plotPCoA(DataReverseTransform(data_limma, raw_data))

### SVD

In [15]:
# SVD-based correction of `data` with the wrapper's default arguments.
# The recorded run emitted a pandas deprecation warning about positional Series.__getitem__
# (see the output below); it presumably originates inside correctSVD — confirm.
data_SVD = correctSVD(data)


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



In [16]:
# PCoA of the SVD output after passing it through DataReverseTransform together with raw_data.
# NOTE(review): the baseline plot passes method="aitchison" but this call relies on
# plotPCoA's default method — confirm the default matches so the plots are comparable.
plotPCoA(DataReverseTransform(data_SVD, raw_data))

### PLSDA-batch

In [17]:
# PLSDA-batch correction of `data` with the wrapper's default arguments.
data_PLSDA = correctPLSDA_batch(data)

In [19]:
# PCoA of the PLSDA-batch output after passing it through DataReverseTransform together with raw_data.
# NOTE(review): the baseline plot passes method="aitchison" but this call relies on
# plotPCoA's default method — confirm the default matches so the plots are comparable.
plotPCoA(DataReverseTransform(data_PLSDA, raw_data))

### ConQuR

## Batch Effect Quantitative Metrics

### kBET

In [None]:
# kBET for the uncorrected data and for each corrected dataset, in a fixed order
kbet_inputs = (
    ("original", data),
    ("ComBat corrected", data_combat),
    ("limma corrected", data_limma),
    ("SVD-based corrected", data_SVD),
    ("PLSDA-batch corrected", data_PLSDA),
)
for label, dataset in kbet_inputs:
    print("kBET with {} data: {}".format(label, kBET(dataset)))

### LISI

In [None]:
# iLISI first, then cLISI, each evaluated on the uncorrected data and on every
# corrected dataset in the same order
lisi_inputs = (
    ("original", data),
    ("ComBat corrected", data_combat),
    ("limma corrected", data_limma),
    ("SVD-based corrected", data_SVD),
    ("PLSDA-batch corrected", data_PLSDA),
)
for metric_name, metric in (("iLISI", iLISI), ("cLISI", cLISI)):
    for label, dataset in lisi_inputs:
        print("{} with {} data: {}".format(metric_name, label, metric(dataset)))

### ASW

In [None]:
# ASW for the uncorrected data and for each corrected dataset, in a fixed order
asw_inputs = (
    ("original", data),
    ("ComBat corrected", data_combat),
    ("limma corrected", data_limma),
    ("SVD-based corrected", data_SVD),
    ("PLSDA-batch corrected", data_PLSDA),
)
for label, dataset in asw_inputs:
    print("ASW with {} data: {}".format(label, ASW(dataset)))

### ARI

In [None]:
# ARI for the uncorrected data and for each corrected dataset, in a fixed order
ari_inputs = (
    ("original", data),
    ("ComBat corrected", data_combat),
    ("limma corrected", data_limma),
    ("SVD-based corrected", data_SVD),
    ("PLSDA-batch corrected", data_PLSDA),
)
for label, dataset in ari_inputs:
    print("ARI with {} data: {}".format(label, ARI(dataset)))

## Case Study: Batch Effect Correction DRE analysis

### PERMANOVA analysis between tissues

In [None]:
# PERMANOVA between tissues on the uncorrected data, using Euclidean distances
# over the numeric columns.

# Sample identifiers in row order; they label the distance matrix
sample_ids = data["sample"].tolist()

# Tissue of each sample, looked up by sample id so it lines up with sample_ids
tissue_by_sample = data.set_index("sample")["tissue"]
tissue_labels = tissue_by_sample.loc[sample_ids].tolist()

# Condensed pairwise distances expanded to a square matrix
numeric_values = data.select_dtypes(include="number")
euclidean_dist = squareform(pdist(numeric_values, metric="euclidean"))
distance_matrix = DistanceMatrix(euclidean_dist, ids=sample_ids)

# Last expression: the PERMANOVA result is shown as the cell output
permanova(distance_matrix, grouping=tissue_labels)

In [None]:
# PERMANOVA between tissues on the ComBat-corrected data, using Euclidean
# distances over the numeric columns.

# Sample identifiers in row order; they label the distance matrix
sample_ids = data_combat["sample"].tolist()

# Tissue of each sample, looked up by sample id so it lines up with sample_ids
tissue_by_sample = data_combat.set_index("sample")["tissue"]
tissue_labels = tissue_by_sample.loc[sample_ids].tolist()

# Condensed pairwise distances expanded to a square matrix
numeric_values = data_combat.select_dtypes(include="number")
euclidean_dist = squareform(pdist(numeric_values, metric="euclidean"))
distance_matrix = DistanceMatrix(euclidean_dist, ids=sample_ids)

# Last expression: the PERMANOVA result is shown as the cell output
permanova(distance_matrix, grouping=tissue_labels)

In [None]:
# PERMANOVA between tissues on the limma-corrected data, using Euclidean
# distances over the numeric columns.

# Sample identifiers in row order; they label the distance matrix
sample_ids = data_limma["sample"].tolist()

# Tissue of each sample, looked up by sample id so it lines up with sample_ids
tissue_by_sample = data_limma.set_index("sample")["tissue"]
tissue_labels = tissue_by_sample.loc[sample_ids].tolist()

# Condensed pairwise distances expanded to a square matrix
numeric_values = data_limma.select_dtypes(include="number")
euclidean_dist = squareform(pdist(numeric_values, metric="euclidean"))
distance_matrix = DistanceMatrix(euclidean_dist, ids=sample_ids)

# Last expression: the PERMANOVA result is shown as the cell output
permanova(distance_matrix, grouping=tissue_labels)

### DRE Analysis

#### Construct data with different transformations

In [None]:
#Loading data with CLR transformations
# Load the dataset twice through DataPreprocess with preprocess=True:
# once with transformation="Sqrt" and once with transformation="CLR".
data_sqrt = DataPreprocess(path, preprocess=True, transformation = "Sqrt")
data_clr = DataPreprocess(path, preprocess=True, transformation = "CLR")

#### Identify significantly different OTUs between tissues

##### Square-root transformed data

In [None]:
# Mann-Whitney U test per OTU on the square-root transformed data (no batch
# correction): Ectosome vs. Choanosome samples, keeping OTUs with p < 0.05.
# NOTE(review): p-values are not adjusted for multiple testing — confirm this is intended.

# The tissue masks are the same for every OTU, so build them once
is_ectosome = data_sqrt["tissue"] == "Ectosome"
is_choanosome = data_sqrt["tissue"] == "Choanosome"

# Store results
otu_results = []

# Perform Mann-Whitney for each OTU (every numeric column)
for otu in data_sqrt.select_dtypes(include="number").columns:
    stat, p_value = mannwhitneyu(
        data_sqrt.loc[is_ectosome, otu],
        data_sqrt.loc[is_choanosome, otu],
        alternative="two-sided",
    )

    # Only store significant results
    if p_value < 0.05:
        otu_results.append({"OTU": otu, "U-statistic": stat, "p-value": p_value})

# Convert to DataFrame. Naming the columns explicitly keeps "p-value" present
# when no OTU is significant, so the sort below cannot raise KeyError.
otu_results_df = pd.DataFrame(otu_results, columns=["OTU", "U-statistic", "p-value"])

# Display results
print(otu_results_df.sort_values(by="p-value"))

In [None]:
# Applying ComBat to the square-root transformed data
data_sqrt_combat = correctCombat(data_sqrt)

# The tissue masks are the same for every OTU, so build them once
is_ectosome = data_sqrt_combat["tissue"] == "Ectosome"
is_choanosome = data_sqrt_combat["tissue"] == "Choanosome"

# Store results
otu_results = []

# Perform Mann-Whitney for each OTU (every numeric column)
for otu in data_sqrt_combat.select_dtypes(include="number").columns:
    stat, p_value = mannwhitneyu(
        data_sqrt_combat.loc[is_ectosome, otu],
        data_sqrt_combat.loc[is_choanosome, otu],
        alternative="two-sided",
    )

    # Only store significant results
    if p_value < 0.05:
        otu_results.append({"OTU": otu, "U-statistic": stat, "p-value": p_value})

# Convert to DataFrame. Naming the columns explicitly keeps "p-value" present
# when no OTU is significant, so the sort below cannot raise KeyError.
otu_results_df = pd.DataFrame(otu_results, columns=["OTU", "U-statistic", "p-value"])

# Display results
print(otu_results_df.sort_values(by="p-value"))

In [None]:
# Applying limma - removeBatchEffect to the square-root transformed data,
# with "tissue" passed as the covariate label
data_sqrt_limma = correctLimma_rBE(data_sqrt, covariates_labels=["tissue"])

# The tissue masks are the same for every OTU, so build them once
is_ectosome = data_sqrt_limma["tissue"] == "Ectosome"
is_choanosome = data_sqrt_limma["tissue"] == "Choanosome"

# Store results
otu_results = []

# Perform Mann-Whitney for each OTU (every numeric column)
for otu in data_sqrt_limma.select_dtypes(include="number").columns:
    stat, p_value = mannwhitneyu(
        data_sqrt_limma.loc[is_ectosome, otu],
        data_sqrt_limma.loc[is_choanosome, otu],
        alternative="two-sided",
    )

    # Only store significant results
    if p_value < 0.05:
        otu_results.append({"OTU": otu, "U-statistic": stat, "p-value": p_value})

# Convert to DataFrame. Naming the columns explicitly keeps "p-value" present
# when no OTU is significant, so the sort below cannot raise KeyError.
otu_results_df = pd.DataFrame(otu_results, columns=["OTU", "U-statistic", "p-value"])

# Display results
print(otu_results_df.sort_values(by="p-value"))

After correcting with ComBat, OTUs 18, 14, 17 and 12 were added as significantly different.

After correcting with limma, OTUs 14 and 17 were added as significantly different.

In [None]:
# Applying SVD-based correction to the square-root transformed data
data_sqrt_svd = correctSVD(data_sqrt)

# The tissue masks are the same for every OTU, so build them once
is_ectosome = data_sqrt_svd["tissue"] == "Ectosome"
is_choanosome = data_sqrt_svd["tissue"] == "Choanosome"

# Store results
otu_results = []

# Perform Mann-Whitney for each OTU (every numeric column)
for otu in data_sqrt_svd.select_dtypes(include="number").columns:
    stat, p_value = mannwhitneyu(
        data_sqrt_svd.loc[is_ectosome, otu],
        data_sqrt_svd.loc[is_choanosome, otu],
        alternative="two-sided",
    )

    # Only store significant results
    if p_value < 0.05:
        otu_results.append({"OTU": otu, "U-statistic": stat, "p-value": p_value})

# Convert to DataFrame. Naming the columns explicitly keeps "p-value" present
# when no OTU is significant, so the sort below cannot raise KeyError.
otu_results_df = pd.DataFrame(otu_results, columns=["OTU", "U-statistic", "p-value"])

# Display results
print(otu_results_df.sort_values(by="p-value"))

In [None]:
# Applying PLSDA-batch correction to the square-root transformed data
data_sqrt_plsda = correctPLSDA_batch(data_sqrt)

# The tissue masks are the same for every OTU, so build them once
is_ectosome = data_sqrt_plsda["tissue"] == "Ectosome"
is_choanosome = data_sqrt_plsda["tissue"] == "Choanosome"

# Store results
otu_results = []

# Perform Mann-Whitney for each OTU (every numeric column)
for otu in data_sqrt_plsda.select_dtypes(include="number").columns:
    stat, p_value = mannwhitneyu(
        data_sqrt_plsda.loc[is_ectosome, otu],
        data_sqrt_plsda.loc[is_choanosome, otu],
        alternative="two-sided",
    )

    # Only store significant results
    if p_value < 0.05:
        otu_results.append({"OTU": otu, "U-statistic": stat, "p-value": p_value})

# Convert to DataFrame. Naming the columns explicitly keeps "p-value" present
# when no OTU is significant, so the sort below cannot raise KeyError.
otu_results_df = pd.DataFrame(otu_results, columns=["OTU", "U-statistic", "p-value"])

# Display results
print(otu_results_df.sort_values(by="p-value"))

##### Center log-ratio transformed data

In [None]:
# Mann-Whitney U test per OTU on the CLR transformed data (no batch correction):
# Ectosome vs. Choanosome samples, keeping OTUs with p < 0.05.
# NOTE(review): p-values are not adjusted for multiple testing — confirm this is intended.

# Build the tissue masks from data_clr itself. The previous version took them from
# data_sqrt, which only selects the right rows while both frames happen to share
# the same index and row order.
is_ectosome = data_clr["tissue"] == "Ectosome"
is_choanosome = data_clr["tissue"] == "Choanosome"

# Store results
otu_results = []

# Perform Mann-Whitney for each OTU (every numeric column)
for otu in data_clr.select_dtypes(include="number").columns:
    stat, p_value = mannwhitneyu(
        data_clr.loc[is_ectosome, otu],
        data_clr.loc[is_choanosome, otu],
        alternative="two-sided",
    )

    # Only store significant results
    if p_value < 0.05:
        otu_results.append({"OTU": otu, "U-statistic": stat, "p-value": p_value})

# Convert to DataFrame
otu_results_df = pd.DataFrame(otu_results)

# Display results
print(otu_results_df)

In [None]:
# ComBat correction of the CLR transformed data
data_clr_combat = correctCombat(data_clr)

# Row selectors for the two tissues, evaluated once up front
ectosome_rows = data_clr_combat["tissue"] == "Ectosome"
choanosome_rows = data_clr_combat["tissue"] == "Choanosome"

# Significant OTUs are collected here as one record per OTU
otu_results = []

# Two-sided Mann-Whitney U test on every numeric (OTU) column
for otu_name in data_clr_combat.select_dtypes(include="number").columns:
    u_stat, p_val = mannwhitneyu(
        data_clr_combat.loc[ectosome_rows, otu_name],
        data_clr_combat.loc[choanosome_rows, otu_name],
        alternative="two-sided",
    )

    # Keep the OTU only when the test is significant at the 0.05 level
    if p_val < 0.05:
        otu_results.append({"OTU": otu_name, "U-statistic": u_stat, "p-value": p_val})

# Tabulate and show the significant OTUs
otu_results_df = pd.DataFrame(otu_results)
print(otu_results_df)

In [None]:
# limma removeBatchEffect correction of the CLR transformed data,
# with "tissue" passed as the covariate label
data_clr_limma = correctLimma_rBE(data_clr, covariates_labels=["tissue"])

# Row selectors for the two tissues, evaluated once up front
ectosome_rows = data_clr_limma["tissue"] == "Ectosome"
choanosome_rows = data_clr_limma["tissue"] == "Choanosome"

# Significant OTUs are collected here as one record per OTU
otu_results = []

# Two-sided Mann-Whitney U test on every numeric (OTU) column
for otu_name in data_clr_limma.select_dtypes(include="number").columns:
    u_stat, p_val = mannwhitneyu(
        data_clr_limma.loc[ectosome_rows, otu_name],
        data_clr_limma.loc[choanosome_rows, otu_name],
        alternative="two-sided",
    )

    # Keep the OTU only when the test is significant at the 0.05 level
    if p_val < 0.05:
        otu_results.append({"OTU": otu_name, "U-statistic": u_stat, "p-value": p_val})

# Tabulate and show the significant OTUs
otu_results_df = pd.DataFrame(otu_results)
print(otu_results_df)

After applying ComBat, OTUs 14, 23 and 24 were added as significantly different, while OTUs 1, 13 and 22 were removed.

After applying limma, OTUs 14 and 24 were added as significantly different, while OTU 22 was removed.

In [None]:
# SVD-based correction of the CLR transformed data
data_clr_svd = correctSVD(data_clr)

# Row selectors for the two tissues, evaluated once up front
ectosome_rows = data_clr_svd["tissue"] == "Ectosome"
choanosome_rows = data_clr_svd["tissue"] == "Choanosome"

# Significant OTUs are collected here as one record per OTU
otu_results = []

# Two-sided Mann-Whitney U test on every numeric (OTU) column
for otu_name in data_clr_svd.select_dtypes(include="number").columns:
    u_stat, p_val = mannwhitneyu(
        data_clr_svd.loc[ectosome_rows, otu_name],
        data_clr_svd.loc[choanosome_rows, otu_name],
        alternative="two-sided",
    )

    # Keep the OTU only when the test is significant at the 0.05 level
    if p_val < 0.05:
        otu_results.append({"OTU": otu_name, "U-statistic": u_stat, "p-value": p_val})

# Tabulate and show the significant OTUs
otu_results_df = pd.DataFrame(otu_results)
print(otu_results_df)

In [None]:
# PLSDA-batch correction of the CLR transformed data
data_clr_plsda = correctPLSDA_batch(data_clr)

# Row selectors for the two tissues, evaluated once up front
ectosome_rows = data_clr_plsda["tissue"] == "Ectosome"
choanosome_rows = data_clr_plsda["tissue"] == "Choanosome"

# Significant OTUs are collected here as one record per OTU
otu_results = []

# Two-sided Mann-Whitney U test on every numeric (OTU) column
for otu_name in data_clr_plsda.select_dtypes(include="number").columns:
    u_stat, p_val = mannwhitneyu(
        data_clr_plsda.loc[ectosome_rows, otu_name],
        data_clr_plsda.loc[choanosome_rows, otu_name],
        alternative="two-sided",
    )

    # Keep the OTU only when the test is significant at the 0.05 level
    if p_val < 0.05:
        otu_results.append({"OTU": otu_name, "U-statistic": u_stat, "p-value": p_val})

# Tabulate and show the significant OTUs
otu_results_df = pd.DataFrame(otu_results)
print(otu_results_df)