## Libraries

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import gmean, mannwhitneyu, pearsonr, zscore, chi2
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.utils import resample
from sklearn.feature_selection import VarianceThreshold
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.cluster import KMeans
import plotly.express as px
import plotly.graph_objects as go
from plotly.validators.scatter.marker import SymbolValidator
from clustergrammer2 import net, Network, CGM2
from BatchEffectPlots import DataPreprocess, plotPCA, plotOTUBox, plotRLE, plotClusterHeatMap
from BatchEffectCorrection import correctCombat, correctLimma_rBE, correctSVD, correctPLSDA_batch
from BatchEffectMetrics import kBET, iLISI, cLISI, ASW, ARI
from skbio.stats.distance import permanova, DistanceMatrix
from scipy.spatial.distance import pdist, squareform

>> clustergrammer2 backend version 0.18.0


## Initial data preparation

In [3]:
# Relative path to the sponge dataset CSV; `path` is reused further down when
# the data is reloaded with explicit transformations.
path = "data/dataset_sponge.csv"
# DataPreprocess is imported from the local BatchEffectPlots module and is
# called here with its default options only.
data = DataPreprocess(path)

## Batch Effect Assessment plots

### PCA plot

In [3]:
# PCA plot of the uncorrected data (plotPCA is a helper from the local BatchEffectPlots module).
plotPCA(data)

### OTU Boxplot

In [3]:
# OTU boxplot of the uncorrected data (helper from the local BatchEffectPlots module).
plotOTUBox(data)

### RLE plot

In [4]:
# RLE plot of the uncorrected data (helper from the local BatchEffectPlots module).
plotRLE(data)

### Heat Map cluster

In [4]:
# Clustered heat map of the uncorrected data; the recorded cell output is a clustergrammer2 CGM2 widget.
plotClusterHeatMap(data)

CGM2(network='{"row_nodes": [{"name": "OTU1", "ini": 24, "clust": 19, "rank": 8, "rankvar": 0}, {"name": "OTU2…

## Batch Effect Correction

### ComBat

In [3]:
# ComBat correction via the local BatchEffectCorrection helper; the result gets
# its own name because `data` is still used by the cells that follow.
data_combat = correctCombat(data)

Found 2 batches.
Adjusting for 1 covariate(s) or covariate level(s).
Standardizing Data across genes.
Fitting L/S model and finding priors.
Finding parametric adjustments.
Adjusting the Data


In [29]:
# PCA plot of the ComBat-corrected data, for comparison with the PCA plot of `data`.
plotPCA(data_combat)

### limma - removeBatchEffect

In [4]:
# limma removeBatchEffect-style correction via the local helper. "tissue" is
# passed as a covariate label - presumably so the tissue effect is kept while
# the batch effect is removed; confirm in BatchEffectCorrection.
data_limma = correctLimma_rBE(data, covariates_labels = ["tissue"])

In [31]:
# PCA plot of the limma-corrected data, for comparison with the PCA plot of `data`.
plotPCA(data_limma)

### SVD

In [5]:
# SVD-based correction via the local BatchEffectCorrection helper.
# NOTE(review): the recorded output below this cell echoes a line from inside
# the helper, which looks like a warning - worth checking in correctSVD.
data_SVD = correctSVD(data)

  corrected_data[:, i] = svd_deflated_matrix[:, i] * otu_sd[i] + otu_mu[i]


In [5]:
# PCA plot of the SVD-corrected data, for comparison with the PCA plot of `data`.
plotPCA(data_SVD)

### PLSDA-batch

In [6]:
# PLSDA-batch correction via the local BatchEffectCorrection helper, called with its defaults.
data_PLSDA = correctPLSDA_batch(data)

In [63]:
# PCA plot of the PLSDA-batch-corrected data, for comparison with the PCA plot of `data`.
plotPCA(data_PLSDA)

### ConQuR

## Batch Effect Quantitative Metrics

### kBET

In [64]:
# Each entry pairs the exact report line with the dataset it describes, so the
# kBET score is computed and printed for one dataset at a time, in order.
kbet_reports = [
    ("kBET with original data: {}", data),
    ("kBET with ComBat corrected data: {}", data_combat),
    ("kBET with limma corrected data: {}", data_limma),
    ("kBET with SVD-based corrected data: {}", data_SVD),
    ("kBET with PLSDA-batch corrected data: {}", data_PLSDA),
]
for report_line, dataset in kbet_reports:
    print(report_line.format(kBET(dataset)))

kBET with original data: 0.25
kBET with ComBat corrected data: 0.9375
kBET with limma corrected data: 0.90625
kBET with SVD-based corrected data: 0.375
kBET with PLSDA-batch corrected data: 0.71875


### LISI

In [None]:
# iLISI is reported for every dataset first, then cLISI, in the same order.
ilisi_reports = [
    ("iLISI with original data: {}", data),
    ("iLISI with ComBat corrected data: {}", data_combat),
    ("iLISI with limma corrected data: {}", data_limma),
    ("iLISI with SVD-based corrected data: {}", data_SVD),
    ("iLISI with PLSDA-batch corrected data: {}", data_PLSDA),
]
for report_line, dataset in ilisi_reports:
    print(report_line.format(iLISI(dataset)))

clisi_reports = [
    ("cLISI with original data: {}", data),
    ("cLISI with ComBat corrected data: {}", data_combat),
    ("cLISI with limma corrected data: {}", data_limma),
    ("cLISI with SVD-based corrected data: {}", data_SVD),
    ("cLISI with PLSDA-batch corrected data: {}", data_PLSDA),
]
for report_line, dataset in clisi_reports:
    print(report_line.format(cLISI(dataset)))

iLISI with original data: 1.1459276018099547
iLISI with ComBat corrected data: 1.6108597285067872
iLISI with limma corrected data: 1.6102941176470587
iLISI with SVD-based corrected data: 1.26131221719457
iLISI with PLSDA-batch corrected data: 1.4372171945701355
cLISI with original data: 1.0
cLISI with ComBat corrected data: 1.0
cLISI with limma corrected data: 1.0
cLISI with SVD-based corrected data: 1.479072398190045
cLISI with PLSDA-batch corrected data: 1.0147058823529411


### ASW

In [7]:
# One (report line, dataset) pair per correction method; the ASW score is
# computed and printed for each in turn.
asw_reports = [
    ("ASW with original data: {}", data),
    ("ASW with ComBat corrected data: {}", data_combat),
    ("ASW with limma corrected data: {}", data_limma),
    ("ASW with SVD-based corrected data: {}", data_SVD),
    ("ASW with PLSDA-batch corrected data: {}", data_PLSDA),
]
for report_line, dataset in asw_reports:
    print(report_line.format(ASW(dataset)))

ASW with original data: 0.6789795004758423
ASW with ComBat corrected data: 0.7370945120333365
ASW with limma corrected data: 0.7093791962568354
ASW with SVD-based corrected data: 0.5421706702188906
ASW with PLSDA-batch corrected data: 0.688571815724345


### ARI

In [9]:
# One (report line, dataset) pair per correction method; the ARI score is
# computed and printed for each in turn.
ari_reports = [
    ("ARI with original data: {}", data),
    ("ARI with ComBat corrected data: {}", data_combat),
    ("ARI with limma corrected data: {}", data_limma),
    ("ARI with SVD-based corrected data: {}", data_SVD),
    ("ARI with PLSDA-batch corrected data: {}", data_PLSDA),
]
for report_line, dataset in ari_reports:
    print(report_line.format(ARI(dataset)))

ARI with original data: 1.0
ARI with ComBat corrected data: 1.0
ARI with limma corrected data: 1.0
ARI with SVD-based corrected data: 0.23140495867768596
ARI with PLSDA-batch corrected data: 1.0


## Case Study: Batch Effect Correction DRE analysis

### PERMANOVA analysis between tissues

In [19]:
# Sample identifiers label the rows/columns of the distance matrix; the tissue
# labels are looked up by sample ID so both lists share the same order.
sample_ids = data["sample"].tolist()
tissue_by_sample = data.set_index("sample")["tissue"]
tissue_labels = tissue_by_sample.loc[sample_ids].tolist()

# Pairwise Euclidean distances between samples over the numeric columns only
numeric_values = data.select_dtypes(include="number")
condensed_dist = pdist(numeric_values, metric="euclidean")
euclidean_dist = squareform(condensed_dist)
distance_matrix = DistanceMatrix(euclidean_dist, ids=sample_ids)

# NOTE(review): PERMANOVA is permutation based and no seed is set in this cell,
# so the p-value may differ slightly between runs.
permanova(distance_matrix, grouping=tissue_labels)

method name               PERMANOVA
test statistic name        pseudo-F
sample size                      32
number of groups                  2
test statistic            21.885265
p-value                       0.001
number of permutations          999
Name: PERMANOVA results, dtype: object

In [20]:
# Same PERMANOVA setup as for the original data, applied to the ComBat result:
# tissue labels are looked up by sample ID so they match the matrix order.
sample_ids = data_combat["sample"].tolist()
tissue_by_sample = data_combat.set_index("sample")["tissue"]
tissue_labels = tissue_by_sample.loc[sample_ids].tolist()

# Pairwise Euclidean distances between samples over the numeric columns only
numeric_values = data_combat.select_dtypes(include="number")
condensed_dist = pdist(numeric_values, metric="euclidean")
euclidean_dist = squareform(condensed_dist)
distance_matrix = DistanceMatrix(euclidean_dist, ids=sample_ids)

# NOTE(review): PERMANOVA is permutation based and no seed is set in this cell,
# so the p-value may differ slightly between runs.
permanova(distance_matrix, grouping=tissue_labels)

method name               PERMANOVA
test statistic name        pseudo-F
sample size                      32
number of groups                  2
test statistic            40.730683
p-value                       0.001
number of permutations          999
Name: PERMANOVA results, dtype: object

In [21]:
# Same PERMANOVA setup as for the original data, applied to the limma result:
# tissue labels are looked up by sample ID so they match the matrix order.
sample_ids = data_limma["sample"].tolist()
tissue_by_sample = data_limma.set_index("sample")["tissue"]
tissue_labels = tissue_by_sample.loc[sample_ids].tolist()

# Pairwise Euclidean distances between samples over the numeric columns only
numeric_values = data_limma.select_dtypes(include="number")
condensed_dist = pdist(numeric_values, metric="euclidean")
euclidean_dist = squareform(condensed_dist)
distance_matrix = DistanceMatrix(euclidean_dist, ids=sample_ids)

# NOTE(review): PERMANOVA is permutation based and no seed is set in this cell,
# so the p-value may differ slightly between runs.
permanova(distance_matrix, grouping=tissue_labels)

method name               PERMANOVA
test statistic name        pseudo-F
sample size                      32
number of groups                  2
test statistic             30.61138
p-value                       0.001
number of permutations          999
Name: PERMANOVA results, dtype: object

### DRE Analysis

#### Construct data with different transformations

In [6]:
# Load the same dataset twice with preprocessing enabled: once requesting the
# "Sqrt" transformation and once requesting the "CLR" transformation (the
# sections below call these square-root and center log-ratio).
data_sqrt = DataPreprocess(path, preprocess=True, transformation = "Sqrt")
data_clr = DataPreprocess(path, preprocess=True, transformation = "CLR")

#### Identify significantly different OTUs between tissues

##### Square-root transformed data

In [7]:
# Row masks for the two tissues, built once rather than once per OTU
ectosome_rows = data_sqrt["tissue"] == "Ectosome"
choanosome_rows = data_sqrt["tissue"] == "Choanosome"

# Two-sided Mann-Whitney U test on every numeric (OTU) column
otu_results = []
for otu_name, abundance in data_sqrt.select_dtypes(include="number").items():
    u_stat, p_val = mannwhitneyu(
        abundance[ectosome_rows], abundance[choanosome_rows], alternative="two-sided"
    )
    # Keep only OTUs below the 0.05 significance level
    if not (p_val < 0.05):
        continue
    otu_results.append({"OTU": otu_name, "U-statistic": u_stat, "p-value": p_val})

# Tabulate the significant OTUs, smallest p-value first
otu_results_df = pd.DataFrame(otu_results)
print(otu_results_df.sort_values(by="p-value"))

     OTU  U-statistic       p-value
1   OTU5        256.0  2.805795e-07
2   OTU6          0.0  2.805795e-07
0   OTU2        240.0  3.570341e-06
3   OTU7         37.0  6.468719e-04
6  OTU24         64.0  1.651213e-03
4  OTU10         48.5  2.664819e-03
5  OTU16         64.0  1.643138e-02


In [8]:
# ComBat correction of the square-root transformed data
data_sqrt_combat = correctCombat(data_sqrt)

# Row masks for the two tissues, built once rather than once per OTU
ectosome_rows = data_sqrt_combat["tissue"] == "Ectosome"
choanosome_rows = data_sqrt_combat["tissue"] == "Choanosome"

# Two-sided Mann-Whitney U test on every numeric (OTU) column
otu_results = []
for otu_name, abundance in data_sqrt_combat.select_dtypes(include="number").items():
    u_stat, p_val = mannwhitneyu(
        abundance[ectosome_rows], abundance[choanosome_rows], alternative="two-sided"
    )
    # Keep only OTUs below the 0.05 significance level
    if not (p_val < 0.05):
        continue
    otu_results.append({"OTU": otu_name, "U-statistic": u_stat, "p-value": p_val})

# Tabulate the significant OTUs, smallest p-value first
otu_results_df = pd.DataFrame(otu_results)
print(otu_results_df.sort_values(by="p-value"))

Found 2 batches.
Adjusting for 1 covariate(s) or covariate level(s).
Standardizing Data across genes.
Fitting L/S model and finding priors.
Finding parametric adjustments.
Adjusting the Data
      OTU  U-statistic       p-value
10  OTU24          0.0  8.640761e-07
0    OTU2        256.0  1.050172e-06
2    OTU6          0.0  1.055101e-06
1    OTU5        256.0  1.055101e-06
3    OTU7         13.0  1.593278e-05
9   OTU18         43.0  1.214217e-03
7   OTU16         44.0  1.642911e-03
6   OTU14         50.0  3.421749e-03
8   OTU17         55.0  6.243658e-03
5   OTU12         58.0  8.471338e-03
4   OTU10         61.0  1.201810e-02


In [9]:
# limma removeBatchEffect correction of the square-root transformed data
data_sqrt_limma = correctLimma_rBE(data_sqrt, covariates_labels=["tissue"])

# Row masks for the two tissues, built once rather than once per OTU
ectosome_rows = data_sqrt_limma["tissue"] == "Ectosome"
choanosome_rows = data_sqrt_limma["tissue"] == "Choanosome"

# Two-sided Mann-Whitney U test on every numeric (OTU) column
otu_results = []
for otu_name, abundance in data_sqrt_limma.select_dtypes(include="number").items():
    u_stat, p_val = mannwhitneyu(
        abundance[ectosome_rows], abundance[choanosome_rows], alternative="two-sided"
    )
    # Keep only OTUs below the 0.05 significance level
    if not (p_val < 0.05):
        continue
    otu_results.append({"OTU": otu_name, "U-statistic": u_stat, "p-value": p_val})

# Tabulate the significant OTUs, smallest p-value first
otu_results_df = pd.DataFrame(otu_results)
print(otu_results_df.sort_values(by="p-value"))

     OTU  U-statistic   p-value
1   OTU5        256.0  0.000001
2   OTU6          0.0  0.000001
0   OTU2        248.0  0.000004
3   OTU7         18.0  0.000037
8  OTU24         32.0  0.000104
6  OTU16         51.0  0.003900
4  OTU10         51.5  0.004038
5  OTU14         67.5  0.022948
7  OTU17         70.5  0.031349


After correcting with ComBat, OTUs 18, 14, 17 and 12 were added as significantly different.

After correcting with limma, OTUs 14 and 17 were added as significantly different.

In [10]:
# SVD-based correction of the square-root transformed data
data_sqrt_svd = correctSVD(data_sqrt)

# Row masks for the two tissues, built once rather than once per OTU
ectosome_rows = data_sqrt_svd["tissue"] == "Ectosome"
choanosome_rows = data_sqrt_svd["tissue"] == "Choanosome"

# Two-sided Mann-Whitney U test on every numeric (OTU) column
otu_results = []
for otu_name, abundance in data_sqrt_svd.select_dtypes(include="number").items():
    u_stat, p_val = mannwhitneyu(
        abundance[ectosome_rows], abundance[choanosome_rows], alternative="two-sided"
    )
    # Keep only OTUs below the 0.05 significance level
    if not (p_val < 0.05):
        continue
    otu_results.append({"OTU": otu_name, "U-statistic": u_stat, "p-value": p_val})

# Tabulate the significant OTUs, smallest p-value first
otu_results_df = pd.DataFrame(otu_results)
print(otu_results_df.sort_values(by="p-value"))

      OTU  U-statistic   p-value
0    OTU1         31.0  0.000276
1    OTU3         31.0  0.000276
6   OTU13        225.0  0.000276
4    OTU6         33.0  0.000369
10  OTU20         35.0  0.000490
3    OTU5        215.0  0.001114
11  OTU22         41.0  0.001114
5    OTU8         44.0  0.001649
7   OTU15         47.0  0.002414
8   OTU18         54.0  0.005603
9   OTU19         58.0  0.008809
2    OTU4        191.0  0.018495



Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



In [11]:
# PLSDA-batch correction of the square-root transformed data
data_sqrt_plsda = correctPLSDA_batch(data_sqrt)

# Row masks for the two tissues, built once rather than once per OTU
ectosome_rows = data_sqrt_plsda["tissue"] == "Ectosome"
choanosome_rows = data_sqrt_plsda["tissue"] == "Choanosome"

# Two-sided Mann-Whitney U test on every numeric (OTU) column
otu_results = []
for otu_name, abundance in data_sqrt_plsda.select_dtypes(include="number").items():
    u_stat, p_val = mannwhitneyu(
        abundance[ectosome_rows], abundance[choanosome_rows], alternative="two-sided"
    )
    # Keep only OTUs below the 0.05 significance level
    if not (p_val < 0.05):
        continue
    otu_results.append({"OTU": otu_name, "U-statistic": u_stat, "p-value": p_val})

# Tabulate the significant OTUs, smallest p-value first
otu_results_df = pd.DataFrame(otu_results)
print(otu_results_df.sort_values(by="p-value"))

      OTU  U-statistic   p-value
3    OTU5        256.0  0.000002
4    OTU6          0.0  0.000002
12  OTU24          3.0  0.000003
1    OTU2        247.0  0.000008
5    OTU7         31.0  0.000276
7   OTU12         40.0  0.000975
9   OTU19         40.0  0.000975
10  OTU20         57.0  0.007882
6   OTU10         60.0  0.010959
8   OTU18         62.0  0.013563
11  OTU23        186.0  0.030226
0    OTU1        184.0  0.036462
2    OTU3         73.0  0.039971


##### Center log-ratio transformed data

In [12]:
# Baseline differential analysis on the CLR-transformed data (no batch
# correction): a two-sided Mann-Whitney U test per OTU between the two tissues.

# Store results
otu_results = []

# Perform Mann-Whitney for each OTU
for otu in data_clr.select_dtypes(include="number").columns:
    # The tissue mask must come from data_clr itself. It was previously built
    # from data_sqrt, which only gives the right rows while both frames happen
    # to share the same index and row order.
    group_E = data_clr.loc[data_clr["tissue"] == "Ectosome", otu]
    group_C = data_clr.loc[data_clr["tissue"] == "Choanosome", otu]

    #Mann-Whitney
    stat, p_value = mannwhitneyu(group_E, group_C, alternative="two-sided")

    #Only store significant results
    if p_value < 0.05:
        #Append results
        otu_results.append({"OTU": otu, "U-statistic": stat, "p-value": p_value})

# Convert to DataFrame
otu_results_df = pd.DataFrame(otu_results)

# Display results
print(otu_results_df)

      OTU  U-statistic   p-value
0    OTU1        205.0  0.003937
1    OTU2        256.0  0.000002
2    OTU3        206.0  0.003490
3    OTU5        256.0  0.000002
4    OTU6          0.0  0.000002
5    OTU7         62.0  0.013563
6    OTU8        212.0  0.001649
7   OTU10         57.0  0.007882
8   OTU13        191.0  0.018495
9   OTU15        206.0  0.003490
10  OTU16         71.0  0.033219
11  OTU22        182.0  0.043763


In [13]:
# ComBat correction of the CLR-transformed data
data_clr_combat = correctCombat(data_clr)

# Row masks for the two tissues, built once rather than once per OTU
ectosome_rows = data_clr_combat["tissue"] == "Ectosome"
choanosome_rows = data_clr_combat["tissue"] == "Choanosome"

# Two-sided Mann-Whitney U test on every numeric (OTU) column
otu_results = []
for otu_name, abundance in data_clr_combat.select_dtypes(include="number").items():
    u_stat, p_val = mannwhitneyu(
        abundance[ectosome_rows], abundance[choanosome_rows], alternative="two-sided"
    )
    # Keep only OTUs below the 0.05 significance level
    if not (p_val < 0.05):
        continue
    otu_results.append({"OTU": otu_name, "U-statistic": u_stat, "p-value": p_val})

# Tabulate the significant OTUs in column order
otu_results_df = pd.DataFrame(otu_results)
print(otu_results_df)

Found 2 batches.
Adjusting for 1 covariate(s) or covariate level(s).
Standardizing Data across genes.
Fitting L/S model and finding priors.
Finding parametric adjustments.
Adjusting the Data
      OTU  U-statistic   p-value
0    OTU2        256.0  0.000002
1    OTU3        192.0  0.016700
2    OTU5        256.0  0.000002
3    OTU6          0.0  0.000002
4    OTU7         46.0  0.002129
5    OTU8        209.0  0.002414
6   OTU10         65.0  0.018495
7   OTU14         68.0  0.024930
8   OTU15        216.0  0.000975
9   OTU16         50.0  0.003490
10  OTU23        181.0  0.047853
11  OTU24          5.0  0.000004


In [14]:
# limma removeBatchEffect correction of the CLR-transformed data
data_clr_limma = correctLimma_rBE(data_clr, covariates_labels=["tissue"])

# Row masks for the two tissues, built once rather than once per OTU
ectosome_rows = data_clr_limma["tissue"] == "Ectosome"
choanosome_rows = data_clr_limma["tissue"] == "Choanosome"

# Two-sided Mann-Whitney U test on every numeric (OTU) column
otu_results = []
for otu_name, abundance in data_clr_limma.select_dtypes(include="number").items():
    u_stat, p_val = mannwhitneyu(
        abundance[ectosome_rows], abundance[choanosome_rows], alternative="two-sided"
    )
    # Keep only OTUs below the 0.05 significance level
    if not (p_val < 0.05):
        continue
    otu_results.append({"OTU": otu_name, "U-statistic": u_stat, "p-value": p_val})

# Tabulate the significant OTUs in column order
otu_results_df = pd.DataFrame(otu_results)
print(otu_results_df)

      OTU  U-statistic   p-value
0    OTU1        194.0  0.013563
1    OTU2        256.0  0.000002
2    OTU3        210.0  0.002129
3    OTU5        256.0  0.000002
4    OTU6          0.0  0.000002
5    OTU7         52.0  0.004434
6    OTU8        191.0  0.018495
7   OTU10         58.0  0.008809
8   OTU13        181.0  0.047853
9   OTU14         71.0  0.033219
10  OTU15        200.0  0.007044
11  OTU16         58.0  0.008809
12  OTU24         41.0  0.001114


After applying ComBat, OTUs 14, 23 and 24 were added as significantly different, while OTUs 1, 13 and 22 were removed.

After applying limma, OTUs 14 and 24 were added as significantly different, while OTU 22 was removed.

In [15]:
# SVD-based correction of the CLR-transformed data
data_clr_svd = correctSVD(data_clr)

# Row masks for the two tissues, built once rather than once per OTU
ectosome_rows = data_clr_svd["tissue"] == "Ectosome"
choanosome_rows = data_clr_svd["tissue"] == "Choanosome"

# Two-sided Mann-Whitney U test on every numeric (OTU) column
otu_results = []
for otu_name, abundance in data_clr_svd.select_dtypes(include="number").items():
    u_stat, p_val = mannwhitneyu(
        abundance[ectosome_rows], abundance[choanosome_rows], alternative="two-sided"
    )
    # Keep only OTUs below the 0.05 significance level
    if not (p_val < 0.05):
        continue
    otu_results.append({"OTU": otu_name, "U-statistic": u_stat, "p-value": p_val})

# Tabulate the significant OTUs in column order
otu_results_df = pd.DataFrame(otu_results)
print(otu_results_df)

     OTU  U-statistic   p-value
0   OTU1         69.0  0.027468
1   OTU4        191.0  0.018495
2   OTU5        229.0  0.000152
3   OTU6         21.0  0.000060
4  OTU18         55.0  0.006287
5  OTU19         59.0  0.009832
6  OTU20         57.0  0.007882



Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



In [16]:
# PLSDA-batch correction of the CLR-transformed data
data_clr_plsda = correctPLSDA_batch(data_clr)

# Row masks for the two tissues, built once rather than once per OTU
ectosome_rows = data_clr_plsda["tissue"] == "Ectosome"
choanosome_rows = data_clr_plsda["tissue"] == "Choanosome"

# Two-sided Mann-Whitney U test on every numeric (OTU) column
otu_results = []
for otu_name, abundance in data_clr_plsda.select_dtypes(include="number").items():
    u_stat, p_val = mannwhitneyu(
        abundance[ectosome_rows], abundance[choanosome_rows], alternative="two-sided"
    )
    # Keep only OTUs below the 0.05 significance level
    if not (p_val < 0.05):
        continue
    otu_results.append({"OTU": otu_name, "U-statistic": u_stat, "p-value": p_val})

# Tabulate the significant OTUs in column order
otu_results_df = pd.DataFrame(otu_results)
print(otu_results_df)

      OTU  U-statistic   p-value
0    OTU1        209.0  0.002414
1    OTU2        247.0  0.000008
2    OTU3        221.0  0.000490
3    OTU5        256.0  0.000002
4    OTU6          0.0  0.000002
5    OTU7         59.0  0.009832
6    OTU8        199.0  0.007882
7   OTU12         55.0  0.006287
8   OTU13        201.0  0.006287
9   OTU15        203.0  0.004988
10  OTU18         71.0  0.033219
11  OTU19         65.0  0.018495
12  OTU23        189.0  0.022597
13  OTU24         10.0  0.000009
