#### Credits
Author: Alexis Geslin.\
Adapted from Rubungo et al., "LLM-Prop: Predicting physical and electronic properties of crystalline solids from their text descriptions", 2023. <br> 
Vertaix. LLM-Prop: A repository for property prediction using large language models. https://github.com/vertaix/LLM-Prop/tree/main, 2025

Copyright (c) 2024 Vertaix
MIT License

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

In [1]:
# Mount Google Drive at /content/drive so the project files stored there are
# reachable from this runtime. `google.colab` is only importable on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Move into the LLM-Prop project folder on the mounted Drive and list its contents.
# NOTE: the path is relative, so this cell only works while the working directory
# is still the one the Drive was mounted from; re-running it after the `%cd` fails.
%cd drive/MyDrive/cours/cs224n/project/LLM-Prop/
! ls

/content/drive/MyDrive/cours/cs224n/project/LLM-Prop
checkpoints  LICENSE		     llmprop_model.py	  __pycache__	    scripts
data	     llmprop_args_parser.py  llmprop_train.py	  README.md	    statistics
embeddings   llmprop_dataset.py      llmprop_utils_OG.py  requirements.txt  stopwords
figures      llmprop_evaluate.py     llmprop_utils.py	  savings	    tokenizers


In [3]:
import re
import time
import glob
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import argparse

from transformers import AutoTokenizer, T5EncoderModel, T5Tokenizer

from statistics import stdev

# pre-defined functions
from llmprop_utils import *
from llmprop_dataset import *
from llmprop_args_parser import *
from llmprop_train import evaluate
from llmprop_model import T5Predictor

In [4]:
# Pick the compute device once: CUDA when a GPU is visible, CPU otherwise.
gpu_is_visible = torch.cuda.is_available()
device = torch.device("cuda" if gpu_is_visible else "cpu")

if gpu_is_visible:
    visible_gpu_count = torch.cuda.device_count()
    print(f'Number of available devices: {visible_gpu_count}')
    print(f'Current device is: {torch.cuda.current_device()}')
    print("Testing on", visible_gpu_count, "GPUs!")
else:
    print("No GPU available, please connect to the GPU first or continue to use CPU instead")

# Both branches end their report with the same separator line.
print('-'*50)

Number of available devices: 1
Current device is: 0
Testing on 1 GPUs!
--------------------------------------------------


In [5]:
# ---------------------------------------------------------------------------
# Notebook configuration.
#
# The options mirror the command-line interface of the LLM-Prop scripts, but
# they are parsed from an empty argv (`parse_args([])`), so every option takes
# its default. Notebook-specific overrides are applied at the end of the cell.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='LLM-Prop')
parser.add_argument('--epochs',
                    help='Number of epochs',
                    type=int,
                    default=200)
parser.add_argument('--bs',
                    help='Batch size',
                    type=int,
                    default=64)
parser.add_argument('--lr',
                    help='Learning rate',
                    type=float,
                    default=0.001)
parser.add_argument('--max_len',
                    help='Max input sequence length',
                    type=int,
                    default=888)
parser.add_argument('--dr',
                    help='Drop rate',
                    type=float,
                    default=0.2)
parser.add_argument('--warmup_steps',
                    help='Warmup steps',
                    type=int,
                    default=30000)
parser.add_argument('--preprocessing_strategy',
                    help='Data preprocessing technique: "none", "bond_lengths_replaced_with_num", "bond_angles_replaced_with_ang", "no_stopwords", or "no_stopwords_and_lengths_and_angles_replaced"',
                    type=str,
                    default="no_stopwords_and_lengths_and_angles_replaced")
parser.add_argument('--tokenizer',
                    help='Tokenizer name: "t5_tokenizer" or "modified"',
                    type=str,
                    default="modified")
parser.add_argument('--pooling',
                    help='Pooling method. "cls" or "mean"',
                    type=str,
                    default="cls")
parser.add_argument('--normalizer',
                    help='Labels scaling technique. "z_norm", "mm_norm", "ls_norm", or "no_norm"',
                    type=str,
                    default="z_norm")
parser.add_argument('--scheduler',
                    help='Learning rate scheduling technique. "linear", "onecycle", "step", or "lambda" (no scheduling)',
                    type=str,
                    default="onecycle")
parser.add_argument('--property_name',
                    help='The name of the property to predict. "band_gap", "volume", or "is_gap_direct"',
                    type=str,
                    default="band_gap")
parser.add_argument('--optimizer',
                    help='Optimizer type. "adamw" or "sgd"',
                    type=str,
                    default="adamw")
parser.add_argument('--task_name',
                    help='the name of the task: "regression" if property_name is band_gap or volume, or "classification" if property_name is is_gap_direct',
                    type=str,
                    default="regression")
parser.add_argument('--train_data_path',
                    help="the path to the training data",
                    type=str,
                    default="data/samples/textedge_prop_mp22_train.csv")
parser.add_argument('--valid_data_path',
                    help="the path to the valid data",
                    type=str,
                    default="data/samples/textedge_prop_mp22_valid.csv")
parser.add_argument('--test_data_path',
                    help="the path to the test data",
                    type=str,
                    default="data/samples/textedge_prop_mp22_test.csv")
parser.add_argument('--checkpoint',
                    help="the path to the best checkpoint for evaluation",
                    type=str,
                    default="")
args = parser.parse_args([])
args_dict = vars(args)

# Expose every parsed option as a notebook-level variable (epochs, bs, lr,
# pooling, preprocessing_strategy, task_name, ...). Later cells rely on these
# names, so the injection is kept even though it hides where they come from.
globals().update(args_dict)

# Aliases with the names the rest of the notebook uses.
drop_rate = dr
max_length = max_len
tokenizer_name = tokenizer   # `tokenizer` itself is rebound to the tokenizer object in a later cell
normalizer_type = normalizer
property = property_name     # NOTE: shadows the `property` builtin; kept because later cells use this name

# Notebook-specific overrides of the parsed defaults.
batch_size = 32              # evaluation batch size; deliberately differs from the --bs default (64, still available as `bs`)

# Checkpoint to evaluate. Other checkpoints that have been used:
# best_model_path = f"checkpoints/llmprop_best_checkpoint_for_band_gap_regression_description.tar.gz"
default_best_model_path = "checkpoints/samples/regression/best_checkpoint_for_band_gap.tar.gz"
best_model_path = "checkpoints/checkpoint_train15000_200epoch.pt"

# Test-set path. Alternative: "data/test_no_stopwords_and_lengths_and_angles_replaced.csv"
# NOTE(review): the data-loading cell reads hard-coded CSV paths, so this value
# does not appear to be consumed there - confirm before relying on it.
default_test_data_path = "data/samples/textedge_prop_mp22_test.csv"
test_data_path = "data/test_pp_500.csv"


In [6]:
# Load the three dataset splits from their CSV files, in train / validation / test order.
train_data, valid_data, test_data = (
    pd.read_csv(split_csv)
    for split_csv in (
        "data/train_no_stopwords_and_lengths_and_angles_replaced.csv",
        "data/validation_no_stopwords_and_lengths_and_angles_replaced.csv",
        "data/test_no_stopwords_and_lengths_and_angles_replaced.csv",
    )
)

In [7]:
# Report (rows, columns) for each split, in train / validation / test order.
for split_frame in (train_data, valid_data, test_data):
    print(split_frame.shape)

(125098, 7)
(9945, 7)
(11531, 7)


In [8]:
# Check the property dtype to determine the task name (regression or classification).
# NOTE: the bool -> float conversion below mutates the frames in place, so
# re-running this cell after a conversion takes the 'regression' branch;
# reload the CSVs before re-running it for a boolean property.
if test_data[property].dtype == 'bool':
    task_name = 'classification'

    # Convert True -> 1.0 and False -> 0.0 on every split, including the
    # validation split, which is also turned into a dataloader later on.
    train_data[property] = train_data[property].astype(float)
    valid_data[property] = valid_data[property].astype(float)
    test_data[property] = test_data[property].astype(float)
else:
    task_name = 'regression'

# Label statistics of the training split; they are passed to the evaluation
# routine, which uses them to de-normalize predictions.
train_labels_array = np.array(train_data[property])
train_labels_tensor = torch.tensor(train_labels_array)  # built once, reused for all four statistics
train_labels_mean = torch.mean(train_labels_tensor)
train_labels_std = torch.std(train_labels_tensor)
train_labels_min = torch.min(train_labels_tensor)
train_labels_max = torch.max(train_labels_tensor)


In [9]:
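### ONLY NEEDED WHEN THE TEST DATA HAS NOT BEEN PREPROCESSED YET (left commented out: the CSVs loaded above appear to be preprocessed already, judging by their file names)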

# if preprocessing_strategy == "none":
#     test_data = test_data
#     print(test_data['description'][0])
#     print('-'*50)

# elif preprocessing_strategy == "bond_lengths_replaced_with_num":
#     test_data['description'] = test_data['description'].apply(replace_bond_lengths_with_num)
#     print(test_data['description'][0])
#     print('-'*50)

# elif preprocessing_strategy == "bond_angles_replaced_with_ang":
#     test_data['description'] = test_data['description'].apply(replace_bond_angles_with_ang)
#     print(test_data['description'][0])
#     print('-'*50)

# elif preprocessing_strategy == "no_stopwords":
#     stopwords = get_cleaned_stopwords()
#     test_data['description'] = test_data['description'].apply(remove_mat_stopwords)
#     print(test_data['description'][0])
#     print('-'*50)

# elif preprocessing_strategy == "no_stopwords_and_lengths_and_angles_replaced":
#     stopwords = get_cleaned_stopwords()
#     test_data['description'] = test_data['description'].apply(remove_mat_stopwords)
#     test_data['description'] = test_data['description'].apply(replace_bond_lengths_with_num)
#     test_data['description'] = test_data['description'].apply(replace_bond_angles_with_ang)
#     print(test_data['description'][0])
#     print('-'*50)

In [10]:
# Metric used for the regression task.
mae_loss_function = nn.L1Loss()

# Define the tokenizer. `tokenizer` currently holds the option string injected
# by the config cell; it is rebound to the tokenizer object here.
if tokenizer_name == 't5_tokenizer':
    tokenizer = AutoTokenizer.from_pretrained("t5-small")
elif tokenizer_name == 'modified':
    tokenizer = AutoTokenizer.from_pretrained("tokenizers/t5_tokenizer_trained_on_modified_part_of_C4_and_textedge")
else:
    # Fail here with a clear message instead of later, when `.add_tokens` is
    # called on the leftover option string.
    raise ValueError(
        f'Unknown tokenizer name {tokenizer_name!r}; expected "t5_tokenizer" or "modified"'
    )

# Special tokens to register, in the same order as before: [CLS] first (when
# CLS pooling is used), then the placeholders of the preprocessing strategy.
special_tokens = []
if pooling == 'cls':
    special_tokens.append("[CLS]")

if preprocessing_strategy == "bond_lengths_replaced_with_num":
    special_tokens.append("[NUM]")  # special token to replace bond lengths
elif preprocessing_strategy == "bond_angles_replaced_with_ang":
    special_tokens.append("[ANG]")  # special token to replace bond angles
elif preprocessing_strategy == "no_stopwords_and_lengths_and_angles_replaced":
    special_tokens.extend(["[NUM]", "[ANG]"])

# One call per token, exactly as before, so the resulting vocabulary is unchanged.
for special_token in special_tokens:
    tokenizer.add_tokens([special_token])

print(f"test data = {len(test_data)} samples")
print('-'*50)
print(f"testing on {get_sequence_len_stats(test_data, tokenizer, max_length)}% samples with whole sequence")
print('-'*50)

print("labels statistics on training set:")
print("Mean:", train_labels_mean)
print("Standard deviation:", train_labels_std)
print("Max:", train_labels_max)
print("Min:", train_labels_min)
print("-"*50)

Token indices sequence length is longer than the specified maximum sequence length for this model (890 > 512). Running this sequence through the model will result in indexing errors


test data = 11531 samples
--------------------------------------------------
testing on 90.32174139276732% samples with whole sequence
--------------------------------------------------
labels statistics on training set:
Mean: tensor(1.0258, dtype=torch.float64)
Standard deviation: tensor(1.5106, dtype=torch.float64)
Max: tensor(17.8914, dtype=torch.float64)
Min: tensor(0., dtype=torch.float64)
--------------------------------------------------


In [11]:
# Alternative checkpoints; uncomment one of these lines to override the path
# set in the configuration cell.
# best_model_path = f"checkpoints/llmprop_best_checkpoint_for_band_gap_regression_cif_structure.pt"
# best_model_path =default_best_model_path
# best_model_path ="checkpoint_train50_epoch200.pt"
# Display the checkpoint path that the following cells will load.
best_model_path

'checkpoints/checkpoint_train15000_200epoch.pt'

In [12]:
# Pretrained T5 encoder used as the backbone of the predictor.
base_model = T5EncoderModel.from_pretrained("google/t5-v1_1-small")
# Size handed to T5Predictor as the encoder output width.
# NOTE(review): presumably equals the encoder's hidden size - confirm against base_model.config.
base_model_output_size = 512
# The tokenizer received extra special tokens, so the embedding matrix has to
# be resized to the new vocabulary size.
base_model.resize_token_embeddings(len(tokenizer))

# Checkpoints may be shipped as "<name>.tar.gz". Test the actual suffix rather
# than a substring, so the suffix stripped below is the one that was detected.
compressed_suffix = ".tar.gz"
if best_model_path.endswith(compressed_suffix):
    print("decompressing checkpoint")
    decompressTarCheckpoints(best_model_path)
    # The decompressed checkpoint is expected next to the archive as "<name>.pt".
    best_model_path = best_model_path[:-len(compressed_suffix)] + ".pt"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

In [13]:
# Build the predictor around the resized encoder.
best_model = T5Predictor(base_model, base_model_output_size, drop_rate=drop_rate, pooling=pooling)

device_ids = list(range(torch.cuda.device_count()))

# Load the weights into the bare model before any DataParallel wrapping. This
# is equivalent to loading into `wrapper.module` and needs only one code path.
checkpoint_state = torch.load(best_model_path, map_location=torch.device(device))
load_report = best_model.load_state_dict(checkpoint_state, strict=False)

# strict=False skips mismatched keys silently; surface them so that a
# checkpoint that does not fit the model cannot go unnoticed.
if load_report.missing_keys or load_report.unexpected_keys:
    print(f"WARNING: checkpoint {best_model_path} does not fully match the model")
    print(f"  missing keys ({len(load_report.missing_keys)}): {load_report.missing_keys[:5]}")
    print(f"  unexpected keys ({len(load_report.unexpected_keys)}): {load_report.unexpected_keys[:5]}")

if torch.cuda.is_available():
    # Replicate across all visible GPUs, as before.
    best_model = nn.DataParallel(best_model, device_ids=device_ids).cuda()
else:
    best_model.to(device)

here


  best_model.module.load_state_dict(torch.load(best_model_path, map_location=torch.device(device)), strict=False)


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

In [14]:
# Add up the element counts of all parameters that require gradients.
model_trainable_params = 0
for model_param in best_model.parameters():
    if model_param.requires_grad:
        model_trainable_params += model_param.numel()

print(f"Total parameters = {model_trainable_params}")

Total parameters = 35322049


In [15]:
# if you need to reset the test_data
# test_data = pd.read_csv("data/test_no_stopwords_and_lengths_and_angles_replaced.csv")

In [16]:
# mini_test_data = test_data[:150]
# test_dataloader = create_dataloaders(
#     tokenizer,
#     mini_test_data,
#     max_length,
#     batch_size,
#     property_value=property,
#     pooling=pooling
# )


In [17]:

# create dataloaders
def _build_split_dataloader(split_frame):
    """Create a dataloader for one split using the notebook-wide settings.

    Every split uses the same tokenizer, maximum length, batch size, target
    property and pooling mode; only the source DataFrame differs.
    """
    return create_dataloaders(
        tokenizer,
        split_frame,
        max_length,
        batch_size,
        property_value=property,
        pooling=pooling,
        # label normalization is left disabled here:
        # normalize=True,
        # normalizer=normalizer_type
    )


# Built in the same order as before: train, validation, test.
train_dataloader = _build_split_dataloader(train_data)
valid_dataloader = _build_split_dataloader(valid_data)
test_dataloader = _build_split_dataloader(test_data)


In [18]:
def _denormalize_predictions(
    predictions,
    task_name,
    normalizer,
    train_labels_mean,
    train_labels_std,
    train_labels_min,
    train_labels_max,
):
    """Map raw model outputs back to the label scale for the given task."""
    # Classification outputs and un-normalized regression outputs are used as-is.
    if task_name == "classification" or normalizer == "no_norm":
        return predictions
    if normalizer == "z_norm":
        return z_denormalize(predictions, train_labels_mean, train_labels_std)
    if normalizer == "mm_norm":
        return mm_denormalize(predictions, train_labels_min, train_labels_max)
    if normalizer == "ls_norm":
        return ls_denormalize(predictions)
    raise ValueError(f"Unsupported normalizer: {normalizer!r}")


def evaluate_and_embed(
    model,
    mae_loss_function,
    test_dataloader,
    train_labels_mean,
    train_labels_std,
    train_labels_min,
    train_labels_max,
    property,
    device,
    task_name,
    normalizer="z_norm"
):
    """Run the model over a dataloader, collecting predictions and embeddings.

    For every batch the model is called as ``model(batch_inputs, batch_masks)``
    and must return an ``(embeddings, predictions)`` pair. Regression
    predictions are de-normalized according to ``normalizer``; classification
    predictions are used unchanged. The predictions are also written with
    ``saveCSV`` to ``statistics/samples/{task_name}/test_stats_for_{property}.csv``.

    Args:
        model: module to evaluate; it is switched to eval mode.
        mae_loss_function: callable computing the regression metric from
            (predictions, targets) tensors.
        test_dataloader: iterable of (inputs, masks, labels) batches.
        train_labels_mean, train_labels_std: statistics used by "z_norm".
        train_labels_min, train_labels_max: statistics used by "mm_norm".
        property: name of the predicted property (used for the CSV column,
            the file name and the log messages).
        device: device the batches are moved to.
        task_name: "classification" or "regression".
        normalizer: "z_norm", "mm_norm", "ls_norm" or "no_norm"; only used
            for regression.

    Returns:
        Tuple ``(predictions_list, test_performance, embeddings_all, targets_all)``:
        the per-sample predictions, the ROC score (classification) or MAE
        (regression), the embeddings of all batches concatenated along
        dim 0 on the CPU, and the per-sample targets as a Python list.

    Raises:
        ValueError: if ``task_name`` is unknown, if ``normalizer`` is unknown
            for a regression task, or if the dataloader yields no batches.
    """
    # Validate the configuration up front: these cases previously surfaced
    # only as an unbound-variable error in the middle of the evaluation.
    if task_name not in ("classification", "regression"):
        raise ValueError(
            f'Unsupported task_name: {task_name!r}; expected "classification" or "regression"'
        )
    if task_name == "regression" and normalizer not in ("z_norm", "mm_norm", "ls_norm", "no_norm"):
        raise ValueError(
            f'Unsupported normalizer: {normalizer!r}; expected "z_norm", "mm_norm", "ls_norm" or "no_norm"'
        )

    test_start_time = time.time()

    model.eval()

    predictions_list = []
    targets_list = []    # per-sample targets as numpy values, used for the metric
    embeddings_all = []
    targets_all = []     # same targets as plain Python values, returned to the caller

    for batch in test_dataloader:
        batch_inputs, batch_masks, batch_labels = tuple(b.to(device) for b in batch)

        with torch.no_grad():
            embeddings, predictions = model(batch_inputs, batch_masks)
            # Keep the embeddings on the CPU so they do not accumulate on the device.
            embeddings_all.append(embeddings.detach().cpu())

            predictions_denorm = _denormalize_predictions(
                predictions,
                task_name,
                normalizer,
                train_labels_mean,
                train_labels_std,
                train_labels_min,
                train_labels_max,
            )

        predictions = predictions_denorm.detach().cpu().numpy()
        targets = batch_labels.detach().cpu().numpy()
        # extend() instead of `list + list`, which re-copied the whole list on every batch.
        targets_all.extend(targets.tolist())

        for i in range(len(predictions)):
            predictions_list.append(predictions[i][0])
            targets_list.append(targets[i])

    if not embeddings_all:
        raise ValueError("test_dataloader yielded no batches; nothing to evaluate")

    test_predictions = {f"{property}": predictions_list}

    saveCSV(pd.DataFrame(test_predictions), f"statistics/samples/{task_name}/test_stats_for_{property}.csv")

    if task_name == "classification":
        test_performance = get_roc_score(predictions_list, targets_list)
        print(f"\n The roc score achieved on test set for predicting {property} is {test_performance}")

    else:  # regression (validated above)
        predictions_tensor = torch.tensor(predictions_list)
        targets_tensor = torch.tensor(targets_list)
        test_performance = mae_loss_function(predictions_tensor.squeeze(), targets_tensor.squeeze())
        print(f"\n The mae error achieved on test set for predicting {property} is {test_performance}")

    test_ending_time = time.time()
    testing_time = time_format(test_ending_time-test_start_time)
    print(f"testing took {testing_time} \n")

    embeddings_all = torch.cat(embeddings_all, dim=0)

    return predictions_list, test_performance, embeddings_all,targets_all

In [27]:
# Choose which split to evaluate; a later cell derives output names from this choice.
mydataloader = test_dataloader

# Run the evaluation and keep predictions, metric, embeddings and targets.
predictions_list, performance, embeddings_out, targets_list = evaluate_and_embed(
    model=best_model,
    mae_loss_function=mae_loss_function,
    test_dataloader=mydataloader,
    train_labels_mean=train_labels_mean,
    train_labels_std=train_labels_std,
    train_labels_min=train_labels_min,
    train_labels_max=train_labels_max,
    property=property,
    device=device,
    task_name=task_name,
    normalizer=normalizer_type,
)


 The mae error achieved on test set for predicting band_gap is 0.3624833154230082
testing took 0:02:21 



In [28]:
# Sanity check: recompute the MAE directly from the returned prediction and target lists.
mae_loss_function(
    torch.tensor(predictions_list),
    torch.tensor(targets_list),
)

tensor(0.3625)

In [29]:
# Tabular views of the evaluation outputs: one row per sample.
embeddings = embeddings_out.numpy()
df = pd.DataFrame(embeddings)
test_results = pd.DataFrame(np.array([predictions_list,targets_list]).T,columns = ['prediction','label'])

# Work out which split was evaluated. DataLoader objects are compared by identity.
for split_name, split_loader, split_frame in (
    ("train", train_dataloader, train_data),
    ("valid", valid_dataloader, valid_data),
    ("test", test_dataloader, test_data),
):
    if mydataloader is split_loader:
        dataset_used = split_name
        mylength = len(split_frame)
        break
else:
    # Fail loudly: otherwise `dataset_used` and `mylength` would be undefined,
    # or stale from an earlier run, and the files saved next would be mislabelled.
    raise ValueError("mydataloader is not one of train_dataloader, valid_dataloader or test_dataloader")

# Short tag for the checkpoint: the last two '_'-separated pieces of the file
# name, without the extension. Only the file name is parsed, so underscores or
# dots in directory names cannot leak into the tag.
if 'dummy' in best_model_path:
    model_used = 'dummy'
else:
    checkpoint_file = best_model_path.rsplit('/', 1)[-1]
    model_used = '_'.join(checkpoint_file.split('_')[-2:]).split('.')[0]

print(dataset_used, model_used, mylength)

test train15000_200epoch 11531


In [30]:
# Destination files; the names record the split, the checkpoint tag and the sample count.
embeddings_csv_path = f"embeddings/embeddings_{dataset_used}_{model_used}_{mylength}.csv"
pred_labels_csv_path = f"embeddings/pred_labels_{dataset_used}_{model_used}_{mylength}.csv"

# Embeddings are written as a bare matrix (no index, no header);
# predictions and labels keep their column names.
df.to_csv(embeddings_csv_path, index=False, header=False)
test_results.to_csv(pred_labels_csv_path, index=False, header=True)

Testing the saved files: reload the CSVs written above to check that they can be read back

In [None]:
# Read the saved embedding matrix back (the file has no header row) and view it as a float32 tensor.
mydata = pd.read_csv(
    f"embeddings/embeddings_{dataset_used}_{model_used}_{mylength}.csv",
    header=None,
)
torch.tensor(mydata.to_numpy(), dtype=torch.float32)

tensor([[ 0.2114, -0.6339,  0.1635,  ..., -0.1252, -0.3196,  0.1045],
        [ 0.3434,  0.1072, -0.0283,  ..., -0.2017, -0.0060, -0.4495],
        [ 0.0013, -0.9475,  0.3593,  ..., -0.2091,  0.0332, -0.4379],
        ...,
        [ 0.3059,  0.1230, -0.0065,  ..., -0.2250,  0.1165, -0.4974],
        [ 0.2718,  0.0973, -0.0276,  ..., -0.3071,  0.1327, -0.6291],
        [ 0.4251,  0.1203, -0.0181,  ..., -0.3184,  0.0837, -0.4275]])

In [None]:
# Read the saved predictions and labels back and display them.
mydata = pd.read_csv(
    f"embeddings/pred_labels_{dataset_used}_{model_used}_{mylength}.csv"
)
mydata

Unnamed: 0,prediction,label
0,3.097338,1.8906
1,-0.015010,1.4395
2,3.233179,2.6027
3,0.548115,0.0000
4,1.697269,0.0000
...,...,...
9940,0.021686,0.0000
9941,0.019668,0.0000
9942,-0.158934,0.0000
9943,0.327024,1.7621
