In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from datasets import Dataset, load_dataset
import pandas as pd 
import math
import numpy as np
import torch, numpy as np
from tqdm.auto import tqdm
from sklearn.cluster import DBSCAN
import shutil
from sklearn.model_selection import train_test_split

In [None]:
# The paths for the original version of the model with sentences format of clauses with 4 taxonomic level 
OUTPUT_DIR = "/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/unsupervised_model"
DATA_DIR = "/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/sentences_with_labels"

In [2]:
# paths for the version of the model with sentences with only species level
# OUTPUT_DIR = "/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/unsupervised_model_species_level"
# DATA_DIR = "/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/sentences_with_labels_species_level"

In [2]:
# paths for the version of the model with sentences with only species level
# OUTPUT_DIR = "/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/unsupervised_model_regular_tokenizer"
# DATA_DIR = "/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/sentences_with_labels"

In [5]:
MODEL_NAME = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"
TSV_FILE = f"{DATA_DIR}/data_for_fine_tuning.tsv"
SEED = 42
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model     = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
model.to(DEVICE)

# Add custom tokens
new_tokens = ['acea', 'ales', 'um', 'bacter', 'coccus', 'bacill']
num_added = tokenizer.add_tokens(new_tokens)
print(f"Added {num_added} new tokens.")

# Resize model embeddings
model.resize_token_embeddings(len(tokenizer))
print(f"Resized embeddings to: {model.get_input_embeddings().weight.shape}")

# OPTIONAL: freeze all other parameters (only train embeddings)
for param in model.parameters():
    param.requires_grad = False
model.get_input_embeddings().weight.requires_grad = True

Some weights of the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Added 6 new tokens.


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Resized embeddings to: torch.Size([30526, 768])


In [6]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model     = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
model.to(DEVICE)

# OPTIONAL: freeze all other parameters (only train embeddings)
for param in model.parameters():
    param.requires_grad = False
model.get_input_embeddings().weight.requires_grad = True

Some weights of the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Read the TSV file
print("Reading TSV file...")
df = pd.read_csv(TSV_FILE, sep='\t')
print(f"Loaded {len(df)} samples")
print(f"Columns: {df.columns.tolist()}")

# Display first few rows to verify data
print("\nFirst 3 samples:")
print(df[['Filename', 'Sentence']].head(3))

# Extract sentences for training
sentences = df['Sentence'].dropna().tolist()
print(f"\nTotal sentences after removing NaN: {len(sentences)}")
train_ds = Dataset.from_dict({"text": sentences})

# Tokenization function
def tokenize(batch):
    return tokenizer(batch["text"],
                     truncation=True,
                     max_length=512,
                     return_special_tokens_mask=True)

# Map tokenization
print("\nTokenizing dataset...")
train_ds = train_ds.map(tokenize, batched=True, remove_columns=["text"])

# Set format for PyTorch
train_ds.set_format("torch")

# Create data collator
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)

print("\nDatasets prepared successfully!")

Reading TSV file...




Loaded 5950 samples
Columns: ['Filename', 'Sentence', 'Location', 'Sample_type', 'Age', 'Lifestyle']

First 3 samples:
           Filename                                           Sentence
0  Zeevi_ERR1110297  Bacteroidales Bacteroidaceae GGB28271 GGB28271...
1  Zeevi_ERR1110298  Bacteroidales Bacteroidaceae GGB1364 GGB1364_S...
2  Zeevi_ERR1110299  Bacteroidales Bacteroidaceae Bacteroides Bacte...

Total sentences after removing NaN: 5950

Tokenizing dataset...


Map:   0%|          | 0/5950 [00:00<?, ? examples/s]


Datasets prepared successfully!


In [8]:
training_args = TrainingArguments(
    OUTPUT_DIR,
    eval_strategy="no",  # No evaluation during training
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,    # effectively 32 samples per step
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    seed=SEED,
    push_to_hub=False,
    report_to="none",
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=50,  # Log training loss every 50 steps
    logging_first_step=True,  # Log the first step
    save_strategy="epoch",
    save_total_limit=2,
    dataloader_num_workers=2
)

# Calculate steps per epoch for reference
steps_per_epoch = len(train_ds) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)
print(f"Steps per epoch: {steps_per_epoch}")
print(f"Total training steps: {steps_per_epoch * training_args.num_train_epochs}")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    data_collator=data_collator,
)

trainer.train()

Steps per epoch: 185
Total training steps: 555


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
1,2.3096
50,1.9519
100,1.4424
150,1.1798
200,1.0523
250,0.9662
300,0.9364
350,0.8945
400,0.8887
450,0.86


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=558, training_loss=1.0756663284848669, metrics={'train_runtime': 183.72, 'train_samples_per_second': 97.159, 'train_steps_per_second': 3.037, 'total_flos': 4698205908480000.0, 'train_loss': 1.0756663284848669, 'epoch': 3.0})

In [9]:
model.save_pretrained(f"{OUTPUT_DIR}/model")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/tokenizer")

('/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/unsupervised_model_regular_tokenizer/tokenizer/tokenizer_config.json',
 '/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/unsupervised_model_regular_tokenizer/tokenizer/special_tokens_map.json',
 '/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/unsupervised_model_regular_tokenizer/tokenizer/vocab.txt',
 '/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/unsupervised_model_regular_tokenizer/tokenizer/added_tokens.json',
 '/sci/backup/morani/lab/Projects/Aluma/ANLP/Project/unsupervised_model_regular_tokenizer/tokenizer/tokenizer.json')