This notebook collects data from several MGnify studies, trains the ABaCo model on that data, and then uses the trained model to correct the batch effect in it.

### Libraries

In [1]:
#Essentials
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from torch.utils.data import DataLoader, Subset, ConcatDataset, TensorDataset
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from umap import UMAP

#User libraries
from BatchEffectDataLoader import DataPreprocess, DataTransform, ABaCoDataLoader
from BatchEffectCorrection import correctCombat
from BatchEffectPlots import plotPCA
from BatchEffectMetrics import kBET, iLISI, cLISI, ARI, ASW
from ABaCo import ABaCo, BatchDiscriminator, TissueClassifier

>> clustergrammer2 backend version 0.18.0


### Data preprocessing (uncomment to run)

In [2]:
# One-off reshaping of the raw MGnify abundance table (kept commented out; uncomment to run).
# Steps: read the raw CSV, transpose it, promote the first transposed row to the
# column names, rename the "OTU" column to "sample", drop that header row, and
# save the result as "dataset_<file>" in the same folder.
# file = "all_biomes_merged_abund_tables_species.csv"

# raw_data = pd.read_csv(f"data/MGnify/{file}", index_col=False)
# t_raw_data = raw_data.transpose().reset_index()
# t_raw_data.columns = t_raw_data.iloc[0]
# t_raw_data.rename(columns={"OTU":"sample"}, inplace=True)
# t_raw_data = t_raw_data.drop(t_raw_data.index[0])
# t_raw_data.to_csv(f"data/MGnify/dataset_{file}", index=False)

### Data processing - including metadata into datasets (uncomment to run)

In [3]:
# One-off merge of sample metadata into the abundance table (kept commented out; uncomment to run).
# Keeps four metadata columns, renames "assembly_run_id" to "sample", right-joins the
# metadata onto the abundance data on "sample" (so every abundance row is kept),
# drops duplicate rows and writes the result as "metadataset_<file>".
# NOTE(review): reset_index() is called without drop=True, so an "index" column is
# written to the CSV — presumably why the loading cell later drops "index"; confirm.
# NOTE(review): this writes a species-level file under data/MGnify/, while the loading
# cell reads a genus-level file under data/MGnify/datasets/ — confirm how that file is produced.
# #data
# file = "dataset_all_biomes_merged_abund_tables_species.csv"
# data = pd.read_csv(f"data/MGnify/{file}")

# #metadata
# meta_data = pd.read_csv("data/MGnify/Mgnify_analyses_wwt_shot_metag_assembly.csv")
# meta_data = meta_data[["assembly_run_id", "experiment_type", "instrument_platform", "biomes"]]
# meta_data.rename(columns={"assembly_run_id":"sample"}, inplace=True)

# #merge data based on sample ID
# data_merged = pd.merge(meta_data, data, on="sample", how="right")
# data_merged = data_merged.drop_duplicates()
# data_merged = data_merged.reset_index()

# #save file
# data_merged.to_csv(f"data/MGnify/metadataset_{file}", index=False)

### Data loading

In [19]:
# Load the merged abundance/metadata table and transform it (CLR, per the original note)
path = "data/MGnify/datasets/metadataset_dataset_all_biomes_merged_abund_tables_genus.csv"
batch_label = "instrument_platform"
exp_label = "biomes"
drop_cols = ["index", "experiment_type"]
factor_cols = ["sample", batch_label, exp_label]

# Step-by-step version of the preprocessing chain; rows with missing values
# (samples without metadata) are discarded before the transform.
# NOTE(review): reset_index() is called without drop=True, so the previous row
# labels come back as a new "index" column right after "index" was dropped —
# confirm this column is meant to reach DataTransform.
data = DataPreprocess(path, factors=list(factor_cols))
data = data.drop(drop_cols, axis=1)
data = data.dropna()
data = data.reset_index()
data = DataTransform(data, factors=list(factor_cols), count=True)

# Strip the shared biome prefix so only the distinguishing part of the label remains
data[exp_label] = data[exp_label].str.replace("root:Engineered:", "", regex=False)

# PCA plot of the transformed data for a first visual check
plotPCA(
    data,
    sample_label="sample",
    batch_label=batch_label,
    experiment_label=exp_label,
)

### Prepare data for ABaCo model

In [3]:
# Use the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using {device}")

# Build the loader and its companion objects from the transformed table.
# The numeric arguments are passed through unchanged; the same 3327 and 10 are
# reused as model sizes in the training cell.
loader_outputs = ABaCoDataLoader(
    data,
    device=device,
    batch_label=batch_label,
    exp_label=exp_label,
    batch_size=777,
    total_size=3327,
    total_batch=10,
)
otu_dataloader, ohe_batch, ohe_biome, otu_data, otu_batch, otu_biome = loader_outputs

Using cuda


### Training ABaCo model

In [8]:
# Number of training epochs
num_epochs = 5000

# Sizes shared by the networks below (same values as passed to ABaCoDataLoader)
d_z = 256
input_size = 3327
n_batch_levels = 10
n_tissue_levels = 9

# Autoencoder plus its three auxiliary networks, created in the same order as before
model = ABaCo(d_z=d_z, input_size=input_size, batch_size=n_batch_levels).to(device)
batch_discriminator = BatchDiscriminator(
    input_size=input_size,
    batch_size=n_batch_levels,
    tissue_size=n_tissue_levels,
).to(device)
latent_classifier = TissueClassifier(input_size=d_z, tissue_size=n_tissue_levels).to(device)
output_classifier = TissueClassifier(input_size=input_size, tissue_size=n_tissue_levels).to(device)

# w_* / lr_* keyword arguments forwarded verbatim to train_model
# (presumably per-loss weights and learning rates — confirm against ABaCo.train_model)
training_options = dict(
    w_recon=1.0,
    lr_recon=1e-3,
    w_adver=5.0,
    lr_adver=1e-3,
    w_disc=5.0,
    lr_disc=1e-3,
    w_latent=1.0,
    lr_latent=1e-3,
    w_output=1.0,
    lr_output=1e-3,
)

# Training; the three returned values are not used in this notebook
_, _, _ = model.train_model(
    batch_model=batch_discriminator,
    latent_class_model=latent_classifier,
    out_class_model=output_classifier,
    train_loader=otu_dataloader,
    ohe_exp_loader=ohe_biome,
    num_epochs=num_epochs,
    device=device,
    model_name="ABC",
    **training_options,
)


reduction: 'mean' divides the total loss by both the batch size and the support size.'batchmean' divides only by the batch size, and aligns with the KL div math definition.'mean' will be changed to behave the same as 'batchmean' in the next major release.



Epoch 1/5000 | Dis. Train Loss: 11.8300 | Adv. Train Loss: 0.0012 | Recon. Train Loss: 0.4562 | Lat. Train Loss: 2.2137 | Out. Train Loss: 0.4562
Epoch 2/5000 | Dis. Train Loss: 11.5966 | Adv. Train Loss: 0.0008 | Recon. Train Loss: 0.4528 | Lat. Train Loss: 2.1194 | Out. Train Loss: 0.4528
Epoch 3/5000 | Dis. Train Loss: 11.6362 | Adv. Train Loss: 0.0006 | Recon. Train Loss: 0.4558 | Lat. Train Loss: 1.9184 | Out. Train Loss: 0.4558
Epoch 4/5000 | Dis. Train Loss: 11.5120 | Adv. Train Loss: 0.0045 | Recon. Train Loss: 0.4464 | Lat. Train Loss: 1.7675 | Out. Train Loss: 0.4464
Epoch 5/5000 | Dis. Train Loss: 11.1641 | Adv. Train Loss: 0.0114 | Recon. Train Loss: 0.4471 | Lat. Train Loss: 1.6165 | Out. Train Loss: 0.4471
Epoch 6/5000 | Dis. Train Loss: 10.7675 | Adv. Train Loss: 0.0350 | Recon. Train Loss: 0.4480 | Lat. Train Loss: 1.4783 | Out. Train Loss: 0.4480
Epoch 7/5000 | Dis. Train Loss: 9.8916 | Adv. Train Loss: 0.1233 | Recon. Train Loss: 0.4635 | Lat. Train Loss: 1.3765 | Out

### Batch corrected data

In [9]:
# Run the trained ABaCo model over the whole dataset and collect its output
# as a DataFrame that also carries the batch, biome and sample columns.

# Re-create the loader with the same settings used for training.
# NOTE(review): the rows of the model output are labelled with otu_data.index below,
# which is only valid if this loader yields samples in otu_data order — confirm
# that ABaCoDataLoader does not shuffle.
one_batch_data, _, _, _, _, _ = ABaCoDataLoader(
    data,
    device=device,
    batch_label=batch_label,
    exp_label=exp_label,
    batch_size=777,
    total_size=3327,
    total_batch=10,
)

# Inference only: switch to evaluation mode and disable autograd so no graph is
# kept alive; the previous train/eval mode is restored afterwards.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        corrected_chunks = [model(x).detach().cpu() for x, _, _ in one_batch_data]
finally:
    model.train(was_training)

# Concatenate along the sample axis (this also works when the last mini-batch is
# smaller than the others) and flatten to a 2-D (rows, features) float64 array.
otu_batch_corrected = torch.cat(corrected_chunks, dim=0)
otu_batch_corrected = (
    otu_batch_corrected
    .reshape(-1, otu_batch_corrected.shape[-1])
    .numpy()
    .astype(np.float64)
)

otu_corrected_pd = pd.concat(
    [
        pd.DataFrame(otu_batch_corrected, index=otu_data.index, columns=otu_data.columns),
        otu_batch,
        otu_biome,
        data["sample"],
    ],
    axis=1,
)

In [10]:
# PCA plot of the ABaCo output, using the same label columns as for the input data
plotPCA(
    otu_corrected_pd,
    sample_label="sample",
    batch_label=batch_label,
    experiment_label=exp_label,
)

### Other batch effect correction algorithms (BECA)

In [11]:
# ComBat correction of the same table, followed by a PCA plot of its output;
# both calls receive identical label arguments.
combat_label_kwargs = dict(
    sample_label="sample",
    batch_label=batch_label,
    experiment_label=exp_label,
)
otu_combat_corrected = correctCombat(data, **combat_label_kwargs)
plotPCA(otu_combat_corrected, **combat_label_kwargs)

Found 10 batches.
Adjusting for 8 covariate(s) or covariate level(s).
Standardizing Data across genes.
Fitting L/S model and finding priors.
Finding parametric adjustments.
Adjusting the Data


### Evaluation metrics

In [13]:
# Compute each metric on the original, ABaCo-corrected and ComBat-corrected
# tables, then print them side by side.

# kBET
kBET_data = kBET(data=data, batch_label=batch_label)
kBET_abaco = kBET(data=otu_corrected_pd, batch_label=batch_label)
kBET_combat = kBET(data=otu_combat_corrected, batch_label=batch_label)

# iLISI
iLISI_data = iLISI(data=data, batch_label=batch_label)
iLISI_abaco = iLISI(data=otu_corrected_pd, batch_label=batch_label)
iLISI_combat = iLISI(data=otu_combat_corrected, batch_label=batch_label)

# cLISI
cLISI_data = cLISI(data=data, cell_label=exp_label)
cLISI_abaco = cLISI(data=otu_corrected_pd, cell_label=exp_label)
cLISI_combat = cLISI(data=otu_combat_corrected, cell_label=exp_label)

# ARI
ARI_data = ARI(data=data, bio_label=exp_label)
ARI_abaco = ARI(data=otu_corrected_pd, bio_label=exp_label)
ARI_combat = ARI(data=otu_combat_corrected, bio_label=exp_label)

# ASW
ASW_data = ASW(data=data, interest_label=exp_label)
ASW_abaco = ASW(data=otu_corrected_pd, interest_label=exp_label)
ASW_combat = ASW(data=otu_combat_corrected, interest_label=exp_label)

# Summary table. Each row lists (metric, original, ComBat, ABaCo), matching the
# header order. Fixed-width format specs keep the columns aligned regardless of
# how many digits a value has (round() dropped trailing zeros and broke alignment).
metric_rows = [
    ("kBET", kBET_data, kBET_combat, kBET_abaco),
    ("iLISI", iLISI_data, iLISI_combat, iLISI_abaco),
    ("cLISI", cLISI_data, cLISI_combat, cLISI_abaco),
    ("ARI", ARI_data, ARI_combat, ARI_abaco),
    ("ASW", ASW_data, ASW_combat, ASW_abaco),
]
column_headers = ("ORIGINAL DATA", "COMBAT CORRECTED DATA", "ABACO CORRECTED DATA")
label_width = 12
column_width = max(len(header) for header in column_headers)

print(" " * label_width + " | ".join(f"{header:^{column_width}}" for header in column_headers))
for metric_name, original_value, combat_value, abaco_value in metric_rows:
    row_label = f"{metric_name}:"
    row_cells = " | ".join(
        f"{float(value):^{column_width}.4f}"
        for value in (original_value, combat_value, abaco_value)
    )
    print(f"{row_label:<{label_width}}{row_cells}")


                        ORIGINAL DATA       |    COMBAT CORRECTED DATA   |    ABACO CORRECTED DATA   
     kBET:             0.009      |       0.0013       |       0.0        
     iLISI:            1.5413     |       1.3926       |       1.5414        
     cLISI:            1.4752     |       1.1806       |       1.0923        
     ARI:              0.1647     |       0.6077       |       0.6365        
     ASW:              0.4068     |       0.4954       |       0.7786        


### Training ABaCo with other hyperparameters

In [14]:
# Number of training epochs for this second run
num_epochs = 500

# Sizes shared by the networks below (same values as passed to ABaCoDataLoader)
d_z = 128
input_size = 3327
n_batch_levels = 10
n_tissue_levels = 9

# Fresh autoencoder and auxiliary networks, created in the same order as before
model = ABaCo(d_z=d_z, input_size=input_size, batch_size=n_batch_levels).to(device)
batch_discriminator = BatchDiscriminator(
    input_size=input_size,
    batch_size=n_batch_levels,
    tissue_size=n_tissue_levels,
).to(device)
latent_classifier = TissueClassifier(input_size=d_z, tissue_size=n_tissue_levels).to(device)
output_classifier = TissueClassifier(input_size=input_size, tissue_size=n_tissue_levels).to(device)

# w_* / lr_* keyword arguments forwarded verbatim to train_model
# (presumably per-loss weights and learning rates — confirm against ABaCo.train_model)
training_options = dict(
    w_recon=1.0,
    lr_recon=2e-3,
    w_adver=5.0,
    lr_adver=1e-3,
    w_disc=5.0,
    lr_disc=1e-3,
    w_latent=1.0,
    lr_latent=1e-3,
    w_output=1.0,
    lr_output=1e-3,
)

# Training; the three returned values are not used in this notebook
_, _, _ = model.train_model(
    batch_model=batch_discriminator,
    latent_class_model=latent_classifier,
    out_class_model=output_classifier,
    train_loader=otu_dataloader,
    ohe_exp_loader=ohe_biome,
    num_epochs=num_epochs,
    device=device,
    model_name="ABC",
    **training_options,
)


reduction: 'mean' divides the total loss by both the batch size and the support size.'batchmean' divides only by the batch size, and aligns with the KL div math definition.'mean' will be changed to behave the same as 'batchmean' in the next major release.



Epoch 1/500 | Dis. Train Loss: 11.3989 | Adv. Train Loss: 0.0017 | Recon. Train Loss: 0.4512 | Lat. Train Loss: 2.1555 | Out. Train Loss: 0.4512
Epoch 2/500 | Dis. Train Loss: 11.2968 | Adv. Train Loss: 0.0043 | Recon. Train Loss: 0.4446 | Lat. Train Loss: 2.0788 | Out. Train Loss: 0.4446
Epoch 3/500 | Dis. Train Loss: 11.1490 | Adv. Train Loss: 0.0355 | Recon. Train Loss: 0.4611 | Lat. Train Loss: 1.8890 | Out. Train Loss: 0.4611
Epoch 4/500 | Dis. Train Loss: 9.6264 | Adv. Train Loss: 0.3952 | Recon. Train Loss: 0.9676 | Lat. Train Loss: 1.5941 | Out. Train Loss: 0.9676
Epoch 5/500 | Dis. Train Loss: 10.6070 | Adv. Train Loss: 0.0514 | Recon. Train Loss: 0.5346 | Lat. Train Loss: 1.7206 | Out. Train Loss: 0.5346
Epoch 6/500 | Dis. Train Loss: 10.0193 | Adv. Train Loss: 0.0697 | Recon. Train Loss: 0.4675 | Lat. Train Loss: 1.6926 | Out. Train Loss: 0.4675
Epoch 7/500 | Dis. Train Loss: 8.6480 | Adv. Train Loss: 0.2784 | Recon. Train Loss: 0.4997 | Lat. Train Loss: 1.5199 | Out. Train 

In [17]:
# Run the retrained ABaCo model over the whole dataset and collect its output
# as a DataFrame that also carries the batch, biome and sample columns.

# Re-create the loader with the same settings used for training.
# NOTE(review): the rows of the model output are labelled with otu_data.index below,
# which is only valid if this loader yields samples in otu_data order — confirm
# that ABaCoDataLoader does not shuffle.
one_batch_data, _, _, _, _, _ = ABaCoDataLoader(
    data,
    device=device,
    batch_label=batch_label,
    exp_label=exp_label,
    batch_size=777,
    total_size=3327,
    total_batch=10,
)

# Inference only: switch to evaluation mode and disable autograd so no graph is
# kept alive; the previous train/eval mode is restored afterwards.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        corrected_chunks = [model(x).detach().cpu() for x, _, _ in one_batch_data]
finally:
    model.train(was_training)

# Concatenate along the sample axis (this also works when the last mini-batch is
# smaller than the others) and flatten to a 2-D (rows, features) float64 array.
otu_batch_corrected = torch.cat(corrected_chunks, dim=0)
otu_batch_corrected = (
    otu_batch_corrected
    .reshape(-1, otu_batch_corrected.shape[-1])
    .numpy()
    .astype(np.float64)
)

otu_corrected_pd = pd.concat(
    [
        pd.DataFrame(otu_batch_corrected, index=otu_data.index, columns=otu_data.columns),
        otu_batch,
        otu_biome,
        data["sample"],
    ],
    axis=1,
)

In [18]:
# PCA plot of the output of the retrained ABaCo model, using the same label columns
plotPCA(
    otu_corrected_pd,
    sample_label="sample",
    batch_label=batch_label,
    experiment_label=exp_label,
)