#### Credits
Adapted from Rubungo et al., "LLM-Prop: Predicting Physical and Electronic Properties of Crystalline Solids from Their Text Descriptions", 2023. <br> 
Vertaix, "LLM-Prop: A repository for property prediction using large language models", https://github.com/vertaix/LLM-Prop/tree/main, 2025.

Copyright (c) 2024 Vertaix
MIT License

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

In [1]:
# Mount Google Drive inside the Colab VM at /content/drive; the next cell
# changes into a project directory located under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/cours/cs224n/project/
# %cd cours/cs224n/student
# verify that you are in the right directory
%cd LLM-Prop/
! ls

/content/drive/MyDrive/cours/cs224n/project
/content/drive/MyDrive/cours/cs224n/project/LLM-Prop
checkpoints  LICENSE		     llmprop_model.py	  __pycache__	    scripts
data	     llmprop_args_parser.py  llmprop_train.py	  README.md	    statistics
embeddings   llmprop_dataset.py      llmprop_utils_OG.py  requirements.txt  stopwords
figures      llmprop_evaluate.py     llmprop_utils.py	  savings	    tokenizers


In [3]:
import re
import time
import glob
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import argparse
from tqdm import tqdm

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import AutoTokenizer, T5EncoderModel, T5Tokenizer
from tokenizers.pre_tokenizers import Whitespace
pre_tokenizer = Whitespace()

from statistics import stdev

# pre-defined functions
from llmprop_utils import *
from llmprop_dataset import *
from llmprop_args_parser import *
from llmprop_train import evaluate
from llmprop_train import train
from llmprop_model import T5Predictor




In [4]:
# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU.
import torch

if not torch.cuda.is_available():
    # CPU fallback: warn the user, then carry on with a CPU device.
    print("No GPU available, please connect to the GPU first or continue to use CPU instead")
    print('-'*50)
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    _visible_gpus = torch.cuda.device_count()
    print(f'Number of available devices: {_visible_gpus}')
    print(f'Current device is: {torch.cuda.current_device()}')
    print("Training and testing on", _visible_gpus, "GPUs!")
    print('-'*50)

Number of available devices: 1
Current device is: 0
Training and testing on 1 GPUs!
--------------------------------------------------


In [5]:
# Hyper-parameters are declared as command-line style options and parsed with
# an empty argv, so the notebook always starts from the defaults listed here.
# Each entry is (flag, help text, type, default).
_ARGUMENT_SPECS = [
    ('--epochs', 'Number of epochs', int, 200),
    ('--bs', 'Batch size', int, 64),
    ('--lr', 'Learning rate', float, 0.001),
    ('--max_len', 'Max input sequence length', int, 888),
    ('--dr', 'Drop rate', float, 0.2),
    ('--warmup_steps', 'Warmpup steps', int, 30000),
    ('--preprocessing_strategy',
     'Data preprocessing technique: "none", "bond_lengths_replaced_with_num", "bond_angles_replaced_with_ang", "no_stopwords", or "no_stopwords_and_lengths_and_angles_replaced"',
     str,
     "no_stopwords_and_lengths_and_angles_replaced"),
    ('--tokenizer', 'Tokenizer name: "t5_tokenizer" or "modified"', str, "modified"),
    ('--pooling', 'Pooling method. "cls" or "mean"', str, "cls"),
    ('--normalizer', 'Labels scaling technique. "z_norm", "mm_norm", or "ls_norm"', str, "z_norm"),
    ('--scheduler',
     'Learning rate scheduling technique. "linear", "onecycle", "step", or "lambda" (no scheduling))',
     str,
     "onecycle"),
    ('--property_name',
     'The name of the property to predict. "band_gap", "volume", or "is_gap_direct"',
     str,
     "band_gap"),
    ('--optimizer', 'Optimizer type. "adamw" or "sgd"', str, "adamw"),
    ('--task_name',
     'the name of the task: "regression" if propert_name is band_gap or volume or "classification" if property_name is is_gap_direct',
     str,
     "regression"),
    ('--train_data_path', "the path to the training data", str, "data/samples/textedge_prop_mp22_train.csv"),
    ('--valid_data_path', "the path to the valid data", str, "data/samples/textedge_prop_mp22_valid.csv"),
    ('--test_data_path', "the path to the test data", str, "data/samples/textedge_prop_mp22_test.csv"),
    ('--checkpoint', "the path to the the best checkpoint for evaluation", str, ""),
]

parser = argparse.ArgumentParser(description='LLM-Prop')
for _flag, _help_text, _value_type, _default_value in _ARGUMENT_SPECS:
    parser.add_argument(_flag, help=_help_text, type=_value_type, default=_default_value)

args = parser.parse_args([])
args_dict = vars(args)

# Expose every parsed option as a notebook-level variable under its flag name
# (epochs, warmup_steps, preprocessing_strategy, pooling, task_name, the three
# *_data_path values, ...).
globals().update(args_dict)

# Friendlier aliases for the options whose flag names are abbreviated.
max_length = args.max_len
learning_rate = args.lr
tokenizer_name = args.tokenizer
scheduler_type = args.scheduler
normalizer_type = args.normalizer
property = args.property_name  # NOTE: shadows the builtin `property`; later cells rely on this name
optimizer_type = args.optimizer
best_model_path = args.checkpoint

# Values deliberately different from the parser defaults (--bs 64, --dr 0.2).
batch_size = 32
drop_rate = 0.4

# Dataset locations: keep the sample test set path and point `test_data_path`
# at the pre-processed test file.
default_test_data_path = "data/samples/textedge_prop_mp22_test.csv"
test_data_path = "data/test_no_stopwords_and_lengths_and_angles_replaced.csv"


In [6]:
# Read the three splits. Training and validation come from fixed pre-processed
# CSV files rather than `train_data_path` / `valid_data_path`; the test split
# is read from `default_test_data_path`.
train_data, valid_data, test_data = (
    pd.read_csv(_csv_path)
    for _csv_path in (
        'data/train_pp_15000.csv',
        'data/val_pp_5000.csv',
        default_test_data_path,
    )
)

# For a quick smoke test, slice the frames here (e.g. train_data[:50], valid_data[:40]).

In [7]:
# Show (rows, columns) of each split, in train / valid / test order.
for _split_frame in (train_data, valid_data, test_data):
    print(_split_frame.shape)

(15000, 7)
(5000, 7)
(10, 7)


In [8]:


# Infer the task from the label column: boolean labels mean classification,
# any other dtype means regression.
# NOTE: the boolean column is converted to float below, so re-running this cell
# on already-converted frames will see a float dtype and select 'regression'.
_labels_are_boolean = train_data[property].dtype == 'bool'
task_name = 'classification' if _labels_are_boolean else 'regression'

if _labels_are_boolean:
    # converting True->1.0 and False->0.0 in every split
    for _frame in (train_data, valid_data, test_data):
        _frame[property] = _frame[property].astype(float)

# Label statistics of the training split; the training loop reads these
# notebook-level values when it de-normalizes predictions.
train_labels_array = np.array(train_data[property])
_train_labels_tensor = torch.tensor(train_labels_array)
train_labels_mean = _train_labels_tensor.mean()
train_labels_std = _train_labels_tensor.std()
train_labels_min = _train_labels_tensor.min()
train_labels_max = _train_labels_tensor.max()

In [9]:
# define loss functions: L1 (MAE) for regression, BCE-with-logits for classification
mae_loss_function = nn.L1Loss()
bce_loss_function = nn.BCEWithLogitsLoss()

freeze = False # a boolean variable to determine if we freeze the pre-trained T5 weights

# define the tokenizer
if tokenizer_name == 't5_tokenizer':
    tokenizer = AutoTokenizer.from_pretrained("t5-small")

elif tokenizer_name == 'modified':
    tokenizer = AutoTokenizer.from_pretrained("tokenizers/t5_tokenizer_trained_on_modified_part_of_C4_and_textedge")

else:
    # Fail loudly: without this branch `tokenizer` would silently keep whatever
    # value it already had (e.g. the option string injected by the config cell).
    raise ValueError(
        f'Unknown tokenizer name {tokenizer_name!r}; expected "t5_tokenizer" or "modified"'
    )

# Collect the special tokens required by the pooling / preprocessing choices,
# then register them one by one in a fixed order ([CLS], [NUM], [ANG]).
_special_tokens = []
if pooling == 'cls':
    _special_tokens.append("[CLS]")

if preprocessing_strategy == "bond_lengths_replaced_with_num":
    _special_tokens.append("[NUM]") # special token to replace bond lengths

elif preprocessing_strategy == "bond_angles_replaced_with_ang":
    _special_tokens.append("[ANG]") # special token to replace bond angles

elif preprocessing_strategy == "no_stopwords_and_lengths_and_angles_replaced":
    _special_tokens.extend(["[NUM]", "[ANG]"])

for _token in _special_tokens:
    tokenizer.add_tokens([_token])

print('-'*50)
print(f"train data = {len(train_data)} samples")
print(f"valid data = {len(valid_data)} samples")
print('-'*50)
print(f"training on {get_sequence_len_stats(train_data, tokenizer, max_length)}% samples with whole sequence")
print(f"validating on {get_sequence_len_stats(valid_data, tokenizer, max_length)}% samples with whole sequence")
print('-'*50)

print("labels statistics on training set:")
print("Mean:", train_labels_mean)
print("Standard deviation:", train_labels_std)
print("Max:", train_labels_max)
print("Min:", train_labels_min)
print("-"*50)

Token indices sequence length is longer than the specified maximum sequence length for this model (614 > 512). Running this sequence through the model will result in indexing errors


--------------------------------------------------
train data = 15000 samples
valid data = 5000 samples
--------------------------------------------------
training on 89.51333333333334% samples with whole sequence
validating on 89.9% samples with whole sequence
--------------------------------------------------
labels statistics on training set:
Mean: tensor(1.0225, dtype=torch.float64)
Standard deviation: tensor(1.5183, dtype=torch.float64)
Max: tensor(17.8914, dtype=torch.float64)
Min: tensor(0., dtype=torch.float64)
--------------------------------------------------


In [10]:
# define the model: load the pre-trained T5 encoder that serves as the backbone
base_model = T5EncoderModel.from_pretrained("google/t5-v1_1-small")
# Read the encoder hidden size from the checkpoint configuration instead of
# hard-coding it, so swapping in another T5 variant keeps the head consistent.
base_model_output_size = base_model.config.d_model

# freeze the pre-trained LM's parameters
if freeze:
    base_model.requires_grad_(False)

# resizing the model input embeddings matrix to adapt to newly added tokens by the new tokenizer
# this is to avoid the "RuntimeError: CUDA error: device-side assert triggered" error
base_model.resize_token_embeddings(len(tokenizer))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

Embedding(32106, 512)

In [11]:
    # instantiate the model
# Wrap the encoder in the prediction head, then place it on the available hardware.
model = T5Predictor(base_model, base_model_output_size, drop_rate=drop_rate, pooling=pooling)

device_ids = list(range(torch.cuda.device_count()))

if len(device_ids) > 1:
    # several GPUs visible: replicate the model across all of them
    model = nn.DataParallel(model, device_ids=device_ids).cuda()
else:
    model.to(device)

# print the model parameters (only those with requires_grad=True are counted)
model_trainable_params = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)
print(f"Total parameters = {model_trainable_params}")

Total parameters = 35322049


In [12]:
# Build one dataloader per split. Only the training loader requests label
# normalization; the training loop unpacks four tensors from its batches and
# three from validation batches accordingly.
_shared_loader_options = {"property_value": property, "pooling": pooling}

train_dataloader = create_dataloaders(
    tokenizer,
    train_data,
    max_length,
    batch_size,
    normalize=True,
    normalizer=normalizer_type,
    **_shared_loader_options,
)

valid_dataloader, test_dataloader = (
    create_dataloaders(tokenizer, _split, max_length, batch_size, **_shared_loader_options)
    for _split in (valid_data, test_data)
)

In [13]:
# define the optimizer
if optimizer_type == 'adamw':
    optimizer = AdamW(
        model.parameters(),
        lr=learning_rate
    )
elif optimizer_type == 'sgd':
    # Use the fully qualified torch class (a bare `SGD` is never imported in
    # this notebook) and the same `learning_rate` variable as the AdamW branch.
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=learning_rate
    )
else:
    # Fail early instead of leaving `optimizer` bound to a stale value.
    raise ValueError(
        f'Unknown optimizer type {optimizer_type!r}; expected "adamw" or "sgd"'
    )



In [14]:
# set up the scheduler
# The training loop calls scheduler.step() once per batch, so every step count
# below is expressed in batches rather than epochs.
steps_per_epoch = len(train_dataloader)
total_training_steps = steps_per_epoch * epochs

if scheduler_type == 'linear':
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_training_steps
    )

# from <https://github.com/usnistgov/alignn/blob/main/alignn/train.py>
elif scheduler_type == 'onecycle':
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        pct_start=0.3,  # fraction of the cycle spent increasing the learning rate
    )

elif scheduler_type == 'step':
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=warmup_steps
    )

elif scheduler_type == 'lambda':
    # always return multiplier of 1 (i.e. do nothing)
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: 1.0
    )

else:
    # Fail early instead of leaving `scheduler` bound to a stale value.
    raise ValueError(
        f'Unknown scheduler type {scheduler_type!r}; '
        'expected "linear", "onecycle", "step", or "lambda"'
    )

In [15]:
# Get GPU memory information (skipped on the CPU fallback, where the
# torch.cuda device queries below would raise).
if device.type == "cuda":
    total_memory = torch.cuda.get_device_properties(device).total_memory
    allocated_memory = torch.cuda.memory_allocated(device)
    reserved_memory = torch.cuda.memory_reserved(device)
    # "Free" here means total minus memory held by live tensors; memory that is
    # reserved by the caching allocator but unused is still counted as free.
    free_memory = total_memory - allocated_memory

    print(f"Total GPU memory: {total_memory / (1024**3):.2f} GB")
    print(f"Allocated GPU memory: {allocated_memory / (1024**3):.2f} GB")
    print(f"Reserved GPU memory: {reserved_memory / (1024**3):.2f} GB")
    print(f"Free GPU memory: {free_memory / (1024**3):.2f} GB")
else:
    print("Running on CPU: no GPU memory to report")

train_data.shape, valid_data.shape, test_data.shape

Total GPU memory: 39.56 GB
Allocated GPU memory: 0.13 GB
Reserved GPU memory: 0.15 GB
Free GPU memory: 39.43 GB


((15000, 7), (5000, 7), (10, 7))

In [16]:
def _denormalize_predictions(predictions, normalizer):
    """Map model outputs back to the original label scale.

    Uses the notebook-level training-label statistics (``train_labels_mean``,
    ``train_labels_std``, ``train_labels_min``, ``train_labels_max``).

    Raises:
        ValueError: if ``normalizer`` is not one of the supported names.
    """
    if normalizer == 'z_norm':
        return z_denormalize(predictions, train_labels_mean, train_labels_std)
    if normalizer == 'mm_norm':
        return mm_denormalize(predictions, train_labels_min, train_labels_max)
    if normalizer == 'ls_norm':
        return ls_denormalize(predictions)
    if normalizer == 'no_norm':
        return predictions
    raise ValueError(
        f'Unknown normalizer {normalizer!r}; expected "z_norm", "mm_norm", "ls_norm", or "no_norm"'
    )


def _record_best_epoch(model, task_name, epoch_number, epoch_stats, predictions_list,
                       training_stats, validation_predictions):
    """Persist the checkpoint and statistics of a new best epoch.

    Appends ``epoch_stats`` to ``training_stats`` and stores ``predictions_list``
    in ``validation_predictions`` (both mutated in place), then writes the
    checkpoint and both CSV files. File names use the notebook-level ``property``.
    """
    import os

    save_to_path = f"checkpoints/samples/{task_name}/best_checkpoint_for_{property}.pt"
    stats_dir = f"statistics/samples/{task_name}"
    # Make sure the output folders exist before writing into them.
    os.makedirs(os.path.dirname(save_to_path), exist_ok=True)
    os.makedirs(stats_dir, exist_ok=True)

    # Save the underlying module's weights when the model is wrapped in DataParallel.
    unwrapped_model = model.module if isinstance(model, nn.DataParallel) else model
    torch.save(unwrapped_model.state_dict(), save_to_path)
    compressCheckpointsWithTar(save_to_path)

    training_stats.append(epoch_stats)
    validation_predictions[f"epoch_{epoch_number}"] = predictions_list

    saveCSV(pd.DataFrame(data=training_stats), f"{stats_dir}/training_stats_for_{property}.csv")
    saveCSV(pd.DataFrame(validation_predictions), f"{stats_dir}/validation_stats_for_{property}.csv")


def train(
    model,
    optimizer,
    scheduler,
    bce_loss_function,
    mae_loss_function,
    epochs,
    train_dataloader,
    valid_dataloader,
    device,
    normalizer="z_norm",
    task_name="regression"
):
    """Train ``model`` and keep the checkpoint of the best validation epoch.

    Args:
        model: network called as ``model(inputs, masks)``; the second element of
            its return value is used as the prediction.
        optimizer: optimizer stepped once per training batch.
        scheduler: learning-rate scheduler, also stepped once per training batch.
        bce_loss_function: loss used when ``task_name == "classification"``.
        mae_loss_function: loss used for regression and for the reported MAE.
        epochs: number of passes over ``train_dataloader``.
        train_dataloader: yields ``(inputs, masks, labels, normalized_labels)``.
        valid_dataloader: yields ``(inputs, masks, labels)``.
        device: device every batch tensor is moved to.
        normalizer: label scaling used for regression: "z_norm", "mm_norm",
            "ls_norm", or "no_norm".
        task_name: "regression" or "classification".

    Returns:
        ``(training_stats, validation_predictions)``: a list with one dict per
        best-so-far epoch, and a dict mapping ``"epoch_<n>"`` to the validation
        predictions of those epochs.

    Raises:
        ValueError: for an unsupported ``task_name`` or ``normalizer``.

    Note:
        Relies on notebook-level names: ``property`` and the ``train_labels_*``
        statistics, plus helpers imported from the project modules.
    """
    # Validate the configuration up front rather than failing mid-epoch with a NameError.
    if task_name not in ("classification", "regression"):
        raise ValueError(
            f'Unknown task name {task_name!r}; expected "classification" or "regression"'
        )
    if task_name == "regression" and normalizer not in ("z_norm", "mm_norm", "ls_norm", "no_norm"):
        raise ValueError(
            f'Unknown normalizer {normalizer!r}; expected "z_norm", "mm_norm", "ls_norm", or "no_norm"'
        )

    training_starting_time = time.time()
    training_stats = []
    validation_predictions = {}

    best_loss = 1e10  # best (lowest) validation MAE seen so far
    best_roc = 0.0    # best (highest) validation ROC score seen so far
    best_epoch = None  # stays None if no epoch ever qualifies as best
    print(task_name)

    for epoch in range(epochs):
        print(f"========== Epoch {epoch+1}/{epochs} =========")

        epoch_starting_time = time.time()
        total_training_loss = 0

        model.train()

        # tqdm already reports per-step progress, so no per-step print is needed.
        for batch in tqdm(train_dataloader, total=len(train_dataloader)):
            batch_inputs, batch_masks, batch_labels, batch_norm_labels = tuple(b.to(device) for b in batch)

            _, predictions = model(batch_inputs, batch_masks)

            if task_name == 'classification':
                loss = bce_loss_function(predictions.squeeze(), batch_labels.squeeze())
                total_training_loss += loss.item()

            else:
                # Optimize against normalized labels, unless normalization is disabled.
                training_targets = batch_labels if normalizer == 'no_norm' else batch_norm_labels
                loss = mae_loss_function(predictions.squeeze(), training_targets.squeeze())

                # The reported training loss is the MAE on the original label
                # scale; it is only logged, so it is computed without gradients.
                predictions_denorm = _denormalize_predictions(predictions.detach(), normalizer)
                mae_loss = mae_loss_function(predictions_denorm.squeeze(), batch_labels.squeeze())
                total_training_loss += mae_loss.item()

            # back propagate
            loss.backward()

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # average training loss on actual output
        average_training_loss = total_training_loss/len(train_dataloader)

        epoch_ending_time = time.time()
        training_time = time_format(epoch_ending_time - epoch_starting_time)

        print(f"Average training loss = {average_training_loss}")
        print(f"Training for this epoch took {training_time}")

        # Validation
        print("")
        print("Running Validation ....")

        valid_start_time = time.time()

        model.eval()

        predictions_list = []
        targets_list = []

        for batch in tqdm(valid_dataloader, total=len(valid_dataloader)):
            batch_inputs, batch_masks, batch_labels = tuple(b.to(device) for b in batch)

            with torch.no_grad():
                _, predictions = model(batch_inputs, batch_masks)

                if task_name == "classification":
                    predictions_denorm = predictions
                else:
                    predictions_denorm = _denormalize_predictions(predictions, normalizer)

            predictions = predictions_denorm.detach().cpu().numpy()
            targets = batch_labels.detach().cpu().numpy()

            # Keep the first output column of every sample, paired with its target.
            predictions_list.extend(predictions[:, 0])
            targets_list.extend(targets)

        valid_ending_time = time.time()
        validation_time = time_format(valid_ending_time-valid_start_time)

        # save model checkpoint and the statistics of the epoch where the model performs the best
        if task_name == "classification":
            valid_performance = get_roc_score(predictions_list, targets_list)

            if valid_performance >= best_roc:
                best_roc = valid_performance
                best_epoch = epoch+1
                _record_best_epoch(
                    model, task_name, best_epoch,
                    {
                        "best_epoch": epoch + 1,
                        "training_loss": average_training_loss,
                        "validation_roc_score": valid_performance,
                        "training time": training_time,
                        "validation time": validation_time
                    },
                    predictions_list, training_stats, validation_predictions,
                )

            print(f"Validation roc score = {valid_performance}")

        else:
            predictions_tensor = torch.tensor(predictions_list)
            targets_tensor = torch.tensor(targets_list)
            # .item() yields a plain float so the statistics CSV stores a number
            # rather than the repr of a 0-dim tensor.
            valid_performance = mae_loss_function(
                predictions_tensor.squeeze(), targets_tensor.squeeze()
            ).item()

            if valid_performance <= best_loss:
                best_loss = valid_performance
                best_epoch = epoch+1
                _record_best_epoch(
                    model, task_name, best_epoch,
                    {
                        "best_epoch": epoch + 1,
                        "training mae loss": average_training_loss,
                        "validation mae loss": valid_performance,
                        "training time": training_time,
                        "validation time": validation_time
                    },
                    predictions_list, training_stats, validation_predictions,
                )

            print(f"Validation mae error = {valid_performance}")
        print(f"validation took {validation_time}")

    train_ending_time = time.time()
    total_training_time = train_ending_time-training_starting_time

    print("\n========== Training complete ========")
    print(f"Training LLM_Prop on {property} prediction took {time_format(total_training_time)}")

    if task_name == "classification":
        # ROC is maximized, so report the highest score.
        print(f"The highest roc score achieved on validation set on {property} is {best_roc} at {best_epoch}th epoch \n")

    else:
        print(f"The lowest mae error achieved on validation set on predicting {property} is {best_loss} at {best_epoch}th epoch \n")

    return training_stats, validation_predictions

In [19]:
# Sanity check of the effective batch size, dropout rate and split sizes before training.
(batch_size, drop_rate, train_data.shape, valid_data.shape)

(32, 0.4, (15000, 7), (5000, 7))

In [20]:
print("======= Training ... ========")
# Pass each loss in its own slot (the BCE slot must not receive the MAE loss)
# and forward the task inferred from the label dtype, so classification
# properties are not silently trained as regression.
training_stats, validation_predictions = train(
    model,
    optimizer,
    scheduler,
    bce_loss_function,
    mae_loss_function,
    epochs,
    train_dataloader,
    valid_dataloader,
    device,
    normalizer=normalizer_type,
    task_name=task_name,
)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
import gc

# Drop the notebook's reference to the model BEFORE emptying the cache:
# the caching allocator can only release blocks that no live tensor uses.
# NOTE: other objects that still reference the parameters (for example the
# optimizer) keep them alive until they are deleted as well.
del model
gc.collect()

if torch.cuda.is_available():
    torch.cuda.empty_cache()   # hand cached, unused blocks back to the driver
    torch.cuda.ipc_collect()
    # Resets the peak-memory counters only; it does not free any memory.
    torch.cuda.reset_peak_memory_stats()

torch.cuda.memory_allocated()

In [None]:
# Memory currently held by live tensors, converted from bytes to GiB.
torch.cuda.memory_allocated() / 1024**3

In [None]:
# torch.nn.Module does not provide a `to_device` attribute; look at the first
# parameter to see which device the model currently lives on.
# Requires `model` to still be defined in the kernel.
next(model.parameters()).device

In [None]:
# Move the model's parameters and buffers to host memory. `device` is rebound
# as well, so any cell executed afterwards that reads `device` targets the CPU.
device = torch.device("cpu")
model.cpu()


In [None]:
# ! bash scripts/llmprop_train.sh