In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer, AutoModelForSequenceClassification
from datasets import Dataset, load_dataset
import pandas as pd 
import math
import numpy as np
import torch, numpy as np
from tqdm.auto import tqdm
from sklearn.cluster import DBSCAN
import shutil
from sklearn.model_selection import train_test_split
import evaluate
import json

In [3]:
# The paths for the original version of the model with sentences format of clauses with 4 taxonomic level 
OUTPUT_DIR_sample_type = "/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/supervised_model/Sample_type"
OUTPUT_DIR_age = "/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/supervised_model/age"
DATA_DIR = "/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/sentences_with_labels"

In [3]:
# paths for the version of the model with sentences with only species level
# OUTPUT_DIR_sample_type = "/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/supervised_model_species_level/Sample_type"
# OUTPUT_DIR_age = "/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/supervised_model_species_level/age"
# DATA_DIR = "/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/sentences_with_labels_species_level"

In [7]:
MODEL_NAME = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"
TSV_FILE = f"{DATA_DIR}/data_for_fine_tuning.tsv"
SEED = 42
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
TARGET_LABEL_age = 'Age'  # Change this to 'Age' if you want to predict age / Sample_type
TARGET_LABEL_sample_type = 'Sample_type' 

In [8]:
#### age label ###

In [8]:
# Read the TSV file
print("Reading TSV file...")
df = pd.read_csv(TSV_FILE, sep='\t')
print(f"Loaded {len(df)} samples")
print(f"Columns: {df.columns.tolist()}")

# Display first few rows
print("\nFirst 3 samples:")
print(df[['Filename', 'Sentence', TARGET_LABEL_age]].head(3))

# Remove samples with missing target labels or sentences
df_clean = df.dropna(subset=['Sentence', TARGET_LABEL_age])
print(f"\nSamples after removing NaN: {len(df_clean)}")

# Check label distribution
print(f"\n{TARGET_LABEL_age} distribution:")
print(df_clean[TARGET_LABEL_age].value_counts())

# Prepare labels
unique_labels = sorted(df_clean[TARGET_LABEL_age].unique())
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

print(f"\nLabel mapping:")
for label, id in label2id.items():
    print(f"  {label}: {id}")

# Convert labels to numeric
df_clean['labels'] = df_clean[TARGET_LABEL_age].map(label2id)

# Extract texts and labels
texts = df_clean['Sentence'].tolist()
labels = df_clean['labels'].tolist()

print(f"\nTotal samples: {len(texts)}")
print(f"Number of classes: {len(unique_labels)}")

train_dataset = Dataset.from_dict({
    "text": texts,
    "labels": labels
})

Reading TSV file...
Loaded 5950 samples
Columns: ['Filename', 'Sentence', 'Location', 'Sample_type', 'Age', 'Lifestyle']

First 3 samples:
           Filename                                           Sentence    Age
0  Zeevi_ERR1110297  GGB28271_SGB40830 GGB28262_SGB40814 Phocaeicol...  adult
1  Zeevi_ERR1110298  GGB1364_SGB1834 Alistipes_putredinis Bacteroid...  adult
2  Zeevi_ERR1110299  Bacteroides_uniformis GGB1627_SGB2230 Phocaeic...  adult

Samples after removing NaN: 5950

Age distribution:
Age
adult    2935
1-3Y     1217
6-12M     569
1-4M      411
4-6M      373
0-1M      346
child      99
Name: count, dtype: int64

Label mapping:
  0-1M: 0
  1-3Y: 1
  1-4M: 2
  4-6M: 3
  6-12M: 4
  adult: 5
  child: 6

Total samples: 5950
Number of classes: 7


In [9]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)
model.to(DEVICE)

# Add custom tokens
new_tokens = ['acea', 'ales', 'um', 'bacter', 'coccus', 'bacill']
num_added = tokenizer.add_tokens(new_tokens)
print(f"Added {num_added} new tokens.")

# Resize model embeddings
model.resize_token_embeddings(len(tokenizer))
print(f"Resized embeddings to: {model.get_input_embeddings().weight.shape}")

# OPTIONAL: freeze all other parameters (only train embeddings)
# Freeze all parameters first
for param in model.parameters():
    param.requires_grad = False

# Unfreeze classifier
for param in model.classifier.parameters():
    param.requires_grad = True

# Unfreeze last 2 transformer layers (adjust number as needed)
for param in model.bert.encoder.layer[-2:].parameters():
    param.requires_grad = True

# Unfreeze new token embeddings
model.get_input_embeddings().weight.requires_grad = True

# Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )

# Tokenize dataset
print("\nTokenizing dataset...")
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
train_dataset.set_format("torch")

print(f"Tokenized dataset: {train_dataset}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Added 6 new tokens.




Resized embeddings to: torch.Size([30526, 768])

Tokenizing dataset...


Map:   0%|          | 0/5950 [00:00<?, ? examples/s]

Tokenized dataset: Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5950
})


In [10]:
# Load evaluation metrics
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Calculate accuracy
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)

    return {
        "accuracy": accuracy["accuracy"],
    }

In [11]:
# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR_age, # if age label - OUTPUT_DIR_age / OUTPUT_DIR_sample_type
    eval_strategy="no",  # No evaluation during training (training on full dataset)
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    weight_decay=0.01,
    learning_rate=2e-5,  # Common learning rate for classification
    fp16=True,
    seed=SEED,
    push_to_hub=False,
    report_to="none",
    logging_dir=f"{OUTPUT_DIR_age}/logs", # if age label - OUTPUT_DIR_age / OUTPUT_DIR_sample_type
    logging_steps=50,
    logging_first_step=True,
    save_strategy="epoch",
    save_total_limit=2,
    dataloader_num_workers=2,
)

# Calculate steps per epoch
steps_per_epoch = len(train_dataset) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)
print(f"\nSteps per epoch: {steps_per_epoch}")
print(f"Total training steps: {steps_per_epoch * training_args.num_train_epochs}")

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
print("\nStarting training...")
trainer.train()


Steps per epoch: 185
Total training steps: 555

Starting training...


  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
1,1.9399
50,1.4539
100,1.0384
150,0.869
200,0.8607
250,0.7969
300,0.751
350,0.7162
400,0.7339
450,0.7031


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=558, training_loss=0.8465914375893104, metrics={'train_runtime': 160.9818, 'train_samples_per_second': 110.882, 'train_steps_per_second': 3.466, 'total_flos': 4696743179520000.0, 'train_loss': 0.8465914375893104, 'epoch': 3.0})

In [12]:
# Age:
model.save_pretrained(f"{OUTPUT_DIR_age}/model")
tokenizer.save_pretrained(f"{OUTPUT_DIR_age}/tokenizer")

('/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/supervised_model_species_level/age/tokenizer/tokenizer_config.json',
 '/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/supervised_model_species_level/age/tokenizer/special_tokens_map.json',
 '/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/supervised_model_species_level/age/tokenizer/vocab.txt',
 '/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/supervised_model_species_level/age/tokenizer/added_tokens.json',
 '/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/supervised_model_species_level/age/tokenizer/tokenizer.json')

In [13]:
# Save label mappings - age
with open(f"{OUTPUT_DIR_age}/label_mappings.json", "w") as f:
    json.dump({
        "label2id": label2id,
        "id2label": id2label,
        "target_label": TARGET_LABEL_age
    }, f, indent=2)

In [14]:
def predict_sentences(sentences, model, tokenizer, id2label):
    """
    Predict labels for given sentences
    """
    model.eval()
    predictions = []
    probabilities = []

    for sentence in sentences:
        # Tokenize the sentence
        encoded = tokenizer(
            sentence,
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors="pt"
        ).to(DEVICE)

        # Make prediction
        with torch.no_grad():
            outputs = model(**encoded)
            logits = outputs.logits

            # Get probabilities
            probs = torch.nn.functional.softmax(logits, dim=-1)

            # Get predicted class
            predicted_class_id = torch.argmax(logits, dim=-1).item()
            predicted_label = id2label[predicted_class_id]

            predictions.append(predicted_label)
            probabilities.append(probs.cpu().numpy()[0])

    return predictions, probabilities

In [15]:
# Get a few random examples from your dataset
sample_indices = np.random.choice(len(texts), size=3, replace=False)
sample_texts = [texts[i] for i in sample_indices]
sample_true_labels = [id2label[labels[i]] for i in sample_indices]

# Make predictions
sample_predictions, sample_probabilities = predict_sentences(sample_texts, model, tokenizer, id2label)

print(f"\nPredictions for dataset examples:")
print("-" * 60)

for i, (text, true_label, pred, probs) in enumerate(zip(sample_texts, sample_true_labels, sample_predictions, sample_probabilities)):
    print(f"\nExample {i+1}:")
    print(f"Text: {text[:100]}...")  # Show first 100 characters
    print(f"True {TARGET_LABEL_age}: {true_label}")
    print(f"Predicted {TARGET_LABEL_age}: {pred}")
    print(f"Confidence: {max(probs):.4f}")

    # Check if prediction is correct
    is_correct = "✓" if pred == true_label else "✗"
    print(f"Correct: {is_correct}")


Predictions for dataset examples:
------------------------------------------------------------

Example 1:
Text: Veillonella_ratti Collinsella_sp_AK_207A Bifidobacterium_longum Bifidobacterium_bifidum Bifidobacter...
True Age: 6-12M
Predicted Age: 4-6M
Confidence: 0.3522
Correct: ✗

Example 2:
Text: Bifidobacterium_bifidum Limosilactobacillus_mucosae GGB4266_SGB5809 Megamonas_funiformis Megasphaera...
True Age: 6-12M
Predicted Age: 4-6M
Confidence: 0.3773
Correct: ✗

Example 3:
Text: Klebsiella_grimontii Enterobacter_kobei Enterococcus_faecalis Veillonella_parvula Robinsoniella_peor...
True Age: 4-6M
Predicted Age: 0-1M
Confidence: 0.5498
Correct: ✗


In [None]:
###### sample type label #####

In [16]:
# Read the TSV file
print("Reading TSV file...")
df = pd.read_csv(TSV_FILE, sep='\t')
print(f"Loaded {len(df)} samples")
print(f"Columns: {df.columns.tolist()}")

# Display first few rows
print("\nFirst 3 samples:")
print(df[['Filename', 'Sentence', TARGET_LABEL_sample_type]].head(3))

# Remove samples with missing target labels or sentences
df_clean = df.dropna(subset=['Sentence', TARGET_LABEL_sample_type])
print(f"\nSamples after removing NaN: {len(df_clean)}")

# Check label distribution
print(f"\n{TARGET_LABEL_sample_type} distribution:")
print(df_clean[TARGET_LABEL_sample_type].value_counts())

# Prepare labels
unique_labels = sorted(df_clean[TARGET_LABEL_sample_type].unique())
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

print(f"\nLabel mapping:")
for label, id in label2id.items():
    print(f"  {label}: {id}")

# Convert labels to numeric
df_clean['labels'] = df_clean[TARGET_LABEL_sample_type].map(label2id)

# Extract texts and labels
texts = df_clean['Sentence'].tolist()
labels = df_clean['labels'].tolist()

print(f"\nTotal samples: {len(texts)}")
print(f"Number of classes: {len(unique_labels)}")

train_dataset = Dataset.from_dict({
    "text": texts,
    "labels": labels
})

Reading TSV file...
Loaded 5950 samples
Columns: ['Filename', 'Sentence', 'Location', 'Sample_type', 'Age', 'Lifestyle']

First 3 samples:
           Filename                                           Sentence  \
0  Zeevi_ERR1110297  GGB28271_SGB40830 GGB28262_SGB40814 Phocaeicol...   
1  Zeevi_ERR1110298  GGB1364_SGB1834 Alistipes_putredinis Bacteroid...   
2  Zeevi_ERR1110299  Bacteroides_uniformis GGB1627_SGB2230 Phocaeic...   

  Sample_type  
0   adult_gut  
1   adult_gut  
2   adult_gut  

Samples after removing NaN: 5950

Sample_type distribution:
Sample_type
infant_gut    2916
adult_gut     1679
vaginal       1256
child_gut       99
Name: count, dtype: int64

Label mapping:
  adult_gut: 0
  child_gut: 1
  infant_gut: 2
  vaginal: 3

Total samples: 5950
Number of classes: 4


In [17]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)
model.to(DEVICE)

# Add custom tokens
new_tokens = ['acea', 'ales', 'um', 'bacter', 'coccus', 'bacill']
num_added = tokenizer.add_tokens(new_tokens)
print(f"Added {num_added} new tokens.")

# Resize model embeddings
model.resize_token_embeddings(len(tokenizer))
print(f"Resized embeddings to: {model.get_input_embeddings().weight.shape}")

# OPTIONAL: freeze all other parameters (only train embeddings)
# Freeze all parameters first
for param in model.parameters():
    param.requires_grad = False

# Unfreeze classifier
for param in model.classifier.parameters():
    param.requires_grad = True

# Unfreeze last 2 transformer layers (adjust number as needed)
for param in model.bert.encoder.layer[-2:].parameters():
    param.requires_grad = True

# Unfreeze new token embeddings
model.get_input_embeddings().weight.requires_grad = True


# Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )

# Tokenize dataset
print("\nTokenizing dataset...")
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
train_dataset.set_format("torch")

print(f"Tokenized dataset: {train_dataset}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Added 6 new tokens.
Resized embeddings to: torch.Size([30526, 768])

Tokenizing dataset...


Map:   0%|          | 0/5950 [00:00<?, ? examples/s]

Tokenized dataset: Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5950
})


In [19]:
# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR_sample_type, 
    eval_strategy="no",  # No evaluation during training (training on full dataset)
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    weight_decay=0.01,
    learning_rate=2e-5,  # Common learning rate for classification
    fp16=True,
    seed=SEED,
    push_to_hub=False,
    report_to="none",
    logging_dir=f"{OUTPUT_DIR_sample_type}/logs", 
    logging_steps=50,
    logging_first_step=True,
    save_strategy="epoch",
    save_total_limit=2,
    dataloader_num_workers=2,
)

# Calculate steps per epoch
steps_per_epoch = len(train_dataset) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)
print(f"\nSteps per epoch: {steps_per_epoch}")
print(f"Total training steps: {steps_per_epoch * training_args.num_train_epochs}")

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
print("\nStarting training...")
trainer.train()


Steps per epoch: 185
Total training steps: 555

Starting training...


  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
1,1.49
50,0.9214
100,0.4801
150,0.3282
200,0.2978
250,0.2596
300,0.2209
350,0.2189
400,0.2079
450,0.2159


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=558, training_loss=0.3227600805648339, metrics={'train_runtime': 156.1072, 'train_samples_per_second': 114.344, 'train_steps_per_second': 3.574, 'total_flos': 4696616674713600.0, 'train_loss': 0.3227600805648339, 'epoch': 3.0})

In [20]:
# sample_type:
model.save_pretrained(f"{OUTPUT_DIR_sample_type}/model")
tokenizer.save_pretrained(f"{OUTPUT_DIR_sample_type}/tokenizer")

('/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/supervised_model_species_level/Sample_type/tokenizer/tokenizer_config.json',
 '/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/supervised_model_species_level/Sample_type/tokenizer/special_tokens_map.json',
 '/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/supervised_model_species_level/Sample_type/tokenizer/vocab.txt',
 '/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/supervised_model_species_level/Sample_type/tokenizer/added_tokens.json',
 '/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/supervised_model_species_level/Sample_type/tokenizer/tokenizer.json')

In [21]:
# Save label mappings - sample type
with open(f"{OUTPUT_DIR_sample_type}/label_mappings.json", "w") as f:
    json.dump({
        "label2id": label2id,
        "id2label": id2label,
        "target_label": TARGET_LABEL_sample_type
    }, f, indent=2)

In [23]:
# Get a few random examples from your dataset
sample_indices = np.random.choice(len(texts), size=3, replace=False)
sample_texts = [texts[i] for i in sample_indices]
sample_true_labels = [id2label[labels[i]] for i in sample_indices]

# Make predictions
sample_predictions, sample_probabilities = predict_sentences(sample_texts, model, tokenizer, id2label)

print(f"\nPredictions for dataset examples:")
print("-" * 60)

for i, (text, true_label, pred, probs) in enumerate(zip(sample_texts, sample_true_labels, sample_predictions, sample_probabilities)):
    print(f"\nExample {i+1}:")
    print(f"Text: {text[:100]}...")  # Show first 100 characters
    print(f"True {TARGET_LABEL_sample_type}: {true_label}")
    print(f"Predicted {TARGET_LABEL_sample_type}: {pred}")
    print(f"Confidence: {max(probs):.4f}")

    # Check if prediction is correct
    is_correct = "✓" if pred == true_label else "✗"
    print(f"Correct: {is_correct}")


Predictions for dataset examples:
------------------------------------------------------------

Example 1:
Text: Veillonella_ratti Collinsella_sp_AK_207A Bifidobacterium_longum Bifidobacterium_bifidum Bifidobacter...
True Sample_type: infant_gut
Predicted Sample_type: infant_gut
Confidence: 0.9881
Correct: ✓

Example 2:
Text: Bifidobacterium_bifidum Limosilactobacillus_mucosae GGB4266_SGB5809 Megamonas_funiformis Megasphaera...
True Sample_type: infant_gut
Predicted Sample_type: infant_gut
Confidence: 0.9884
Correct: ✓

Example 3:
Text: Klebsiella_grimontii Enterobacter_kobei Enterococcus_faecalis Veillonella_parvula Robinsoniella_peor...
True Sample_type: infant_gut
Predicted Sample_type: infant_gut
Confidence: 0.9670
Correct: ✓
