#### Credits
Adapted from Rubungo et al. Llm-prop: Predicting physical and electronic properties of crystalline solids from their text descriptions, 2023. <br> 
Vertaix. Llm-prop: A repository for property prediction using large language models. https: //github.com/vertaix/LLM-Prop/tree/main, 2025

Copyright (c) 2024 Vertaix
MIT License

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/cours/cs224n/project/LLM-Prop/
! ls

/content/drive/MyDrive/cours/cs224n/project/LLM-Prop
checkpoints  LICENSE		     llmprop_model.py	  __pycache__	    scripts
data	     llmprop_args_parser.py  llmprop_train.py	  README.md	    statistics
embeddings   llmprop_dataset.py      llmprop_utils_OG.py  requirements.txt  stopwords
figures      llmprop_evaluate.py     llmprop_utils.py	  savings	    tokenizers


In [None]:
import re
import time
import glob
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import argparse

from transformers import AutoTokenizer, T5EncoderModel, T5Tokenizer

from statistics import stdev

# pre-defined functions
from llmprop_utils import *
from llmprop_dataset import *
from llmprop_args_parser import *
from llmprop_train import evaluate
from llmprop_model import T5Predictor

In [None]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'Number of available devices: {torch.cuda.device_count()}')
    print(f'Current device is: {torch.cuda.current_device()}')
    print("Testing on", torch.cuda.device_count(), "GPUs!")
    print('-'*50)
else:
    print("No GPU available, please connect to the GPU first or continue to use CPU instead")
    print('-'*50)
    device = torch.device("cpu")

Number of available devices: 1
Current device is: 0
Testing on 1 GPUs!
--------------------------------------------------


In [None]:
parser = argparse.ArgumentParser(description='LLM-Prop')
parser.add_argument('--epochs',
                    help='Number of epochs',
                    type=int,
                    default=200)
parser.add_argument('--bs',
                    help='Batch size',
                    type=int,
                    default=64)
parser.add_argument('--lr',
                    help='Learning rate',
                    type=float,
                    default=0.001)
parser.add_argument('--max_len',
                    help='Max input sequence length',
                    type=int,
                    default=888)
parser.add_argument('--dr',
                    help='Drop rate',
                    type=float,
                    default=0.2)
parser.add_argument('--warmup_steps',
                    help='Warmpup steps',
                    type=int,
                    default=30000)
parser.add_argument('--preprocessing_strategy',
                    help='Data preprocessing technique: "none", "bond_lengths_replaced_with_num", "bond_angles_replaced_with_ang", "no_stopwords", or "no_stopwords_and_lengths_and_angles_replaced"',
                    type=str,
                    default="no_stopwords_and_lengths_and_angles_replaced")
parser.add_argument('--tokenizer',
                    help='Tokenizer name: "t5_tokenizer" or "modified"',
                    type=str,
                    default="modified")
parser.add_argument('--pooling',
                    help='Pooling method. "cls" or "mean"',
                    type=str,
                      default="cls")
parser.add_argument('--normalizer',
                      help='Labels scaling technique. "z_norm", "mm_norm", or "ls_norm"',
                      type=str,
                    default="z_norm")
parser.add_argument('--scheduler',
                    help='Learning rate scheduling technique. "linear", "onecycle", "step", or "lambda" (no scheduling))',
                    type=str,
                    default="onecycle")
parser.add_argument('--property_name',
                      help='The name of the property to predict. "band_gap", "volume", or "is_gap_direct"',
                      type=str,
                      default="band_gap")
parser.add_argument('--optimizer',
                    help='Optimizer type. "adamw" or "sgd"',
                    type=str,
                    default="adamw")
parser.add_argument('--task_name',
                    help='the name of the task: "regression" if propert_name is band_gap or volume or "classification" if property_name is is_gap_direct',
                    type=str,
                    default="regression")
parser.add_argument('--train_data_path',
                    help="the path to the training data",
                    type=str,
                    default="data/samples/textedge_prop_mp22_train.csv")
parser.add_argument('--valid_data_path',
                    help="the path to the valid data",
                    type=str,
                    default="data/samples/textedge_prop_mp22_valid.csv")
parser.add_argument('--test_data_path',
                    help="the path to the test data",
                    type=str,
                    default="data/samples/textedge_prop_mp22_test.csv")
parser.add_argument('--checkpoint',
                      help="the path to the the best checkpoint for evaluation",
                      type=str,
                      default="")
args = parser.parse_args([])
args_dict = vars(args)

# Load them into variables and use the correct variable names:
globals().update(args_dict)

# set parameters
batch_size = bs
drop_rate = dr
max_length = max_len
tokenizer_name = tokenizer
normalizer_type = normalizer
best_model_path = checkpoint
property = property_name

#set specific variables not default
# best_model_path = f"checkpoints/llmprop_best_checkpoint_for_band_gap_regression_description.tar.gz"
default_best_model_path = "checkpoints/samples/regression/best_checkpoint_for_band_gap.tar.gz"
best_model_path ="checkpoints/checkpoint_train50_200epoch.pt"

#define train and tests
default_test_data_path ="data/samples/textedge_prop_mp22_test.csv"
test_data_path = "data/test_no_stopwords_and_lengths_and_angles_replaced.csv"


In [None]:
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(default_test_data_path)
test_data = pd.read_csv(test_data_path)

In [None]:
print(test_data.shape)
# test_data =test_data.iloc[:100]
print(test_data.shape)

(11531, 7)
(11531, 7)


In [None]:
# check property type to determine the task name (whether it is regression or classification)
if test_data[property].dtype == 'bool':
    task_name = 'classification'

    #converting True->1.0 and False->0.0
    train_data[property] = train_data[property].astype(float)
    test_data[property] = test_data[property].astype(float)
else:
    task_name = 'regression'

train_labels_array = np.array(train_data[property])
train_labels_mean = torch.mean(torch.tensor(train_labels_array))
train_labels_std = torch.std(torch.tensor(train_labels_array))
train_labels_min = torch.min(torch.tensor(train_labels_array))
train_labels_max = torch.max(torch.tensor(train_labels_array))


In [None]:
### ONLY IF NOT DEALING WITH NON-PREPROCESSED DATA

# if preprocessing_strategy == "none":
#     test_data = test_data
#     print(test_data['description'][0])
#     print('-'*50)

# elif preprocessing_strategy == "bond_lengths_replaced_with_num":
#     test_data['description'] = test_data['description'].apply(replace_bond_lengths_with_num)
#     print(test_data['description'][0])
#     print('-'*50)

# elif preprocessing_strategy == "bond_angles_replaced_with_ang":
#     test_data['description'] = test_data['description'].apply(replace_bond_angles_with_ang)
#     print(test_data['description'][0])
#     print('-'*50)

# elif preprocessing_strategy == "no_stopwords":
#     stopwords = get_cleaned_stopwords()
#     test_data['description'] = test_data['description'].apply(remove_mat_stopwords)
#     print(test_data['description'][0])
#     print('-'*50)

# elif preprocessing_strategy == "no_stopwords_and_lengths_and_angles_replaced":
#     stopwords = get_cleaned_stopwords()
#     test_data['description'] = test_data['description'].apply(remove_mat_stopwords)
#     test_data['description'] = test_data['description'].apply(replace_bond_lengths_with_num)
#     test_data['description'] = test_data['description'].apply(replace_bond_angles_with_ang)
#     print(test_data['description'][0])
#     print('-'*50)

In [None]:
mae_loss_function = nn.L1Loss()

# define the tokenizer
if tokenizer_name == 't5_tokenizer':
    tokenizer = AutoTokenizer.from_pretrained("t5-small")

elif tokenizer_name == 'modified':
    tokenizer = AutoTokenizer.from_pretrained("tokenizers/t5_tokenizer_trained_on_modified_part_of_C4_and_textedge")

# add defined special tokens to the tokenizer
if pooling == 'cls':
    tokenizer.add_tokens(["[CLS]"])

if preprocessing_strategy == "bond_lengths_replaced_with_num":
    tokenizer.add_tokens(["[NUM]"]) # special token to replace bond lengths

elif preprocessing_strategy == "bond_angles_replaced_with_ang":
    tokenizer.add_tokens(["[ANG]"]) # special token to replace bond angles

elif preprocessing_strategy == "no_stopwords_and_lengths_and_angles_replaced":
    tokenizer.add_tokens(["[NUM]"])
    tokenizer.add_tokens(["[ANG]"])

print(f"test data = {len(test_data)} samples")
print('-'*50)
print(f"testing on {get_sequence_len_stats(test_data, tokenizer, max_length)}% samples with whole sequence")
print('-'*50)

print("labels statistics on training set:")
print("Mean:", train_labels_mean)
print("Standard deviation:", train_labels_std)
print("Max:", train_labels_max)
print("Min:", train_labels_min)
print("-"*50)

Token indices sequence length is longer than the specified maximum sequence length for this model (890 > 512). Running this sequence through the model will result in indexing errors


test data = 11531 samples
--------------------------------------------------
testing on 90.32174139276732% samples with whole sequence
--------------------------------------------------
labels statistics on training set:
Mean: tensor(0.9767, dtype=torch.float64)
Standard deviation: tensor(1.4387, dtype=torch.float64)
Max: tensor(4.9055, dtype=torch.float64)
Min: tensor(0., dtype=torch.float64)
--------------------------------------------------


In [None]:
# best_model_path = f"checkpoints/llmprop_best_checkpoint_for_band_gap_regression_cif_structure.pt"
# best_model_path =default_best_model_path
best_model_path

'checkpoints/checkpoint_train50_200epoch.pt'

In [None]:
base_model = T5EncoderModel.from_pretrained("google/t5-v1_1-small")
base_model_output_size = 512
base_model.resize_token_embeddings(len(tokenizer))

if "tar.gz" in best_model_path:
  print("decompressing checkpoint")
  decompressTarCheckpoints(best_model_path)
  best_model_path = best_model_path[0:-7] + ".pt"

In [None]:
best_model = T5Predictor(base_model, base_model_output_size, drop_rate=drop_rate, pooling=pooling)

device_ids = [d for d in range(torch.cuda.device_count())]

if torch.cuda.is_available():
    best_model = nn.DataParallel(best_model, device_ids=device_ids).cuda()

if isinstance(best_model, nn.DataParallel):
    print("here")
    best_model.module.load_state_dict(torch.load(best_model_path, map_location=torch.device(device)), strict=False)
else:
    best_model.load_state_dict(torch.load(best_model_path, map_location=torch.device(device)), strict=False)
    best_model.to(device)

here


  best_model.module.load_state_dict(torch.load(best_model_path, map_location=torch.device(device)), strict=False)


In [None]:
model_trainable_params = sum(p.numel() for p in best_model.parameters() if p.requires_grad)
print(f"Total parameters = {model_trainable_params}")

Total parameters = 35322049


In [None]:
test_dataloader = create_dataloaders(
    tokenizer,
    test_data,
    max_length,
    batch_size,
    property_value=property,
    pooling=pooling
)

In [None]:
predictions_list, performance = evaluate(best_model, mae_loss_function, test_dataloader, train_labels_mean, train_labels_std, train_labels_min, train_labels_max, property, device, task_name, normalizer=normalizer_type)


 The mae error achieved on test set for predicting band_gap is 0.9316902916325275
testing took 0:02:23 



In [None]:
# test_results = pd.DataFrame([predictions_list,test_data[property]],columns = ['a','aa'])
test_results = pd.DataFrame(np.array([predictions_list,test_data[property]]).T,columns = ['prediction','label'])
test_results

Unnamed: 0,prediction,label
0,-0.085124,0.0000
1,0.100780,1.9765
2,-0.131660,0.0000
3,0.100987,3.4571
4,0.982250,0.0000
...,...,...
11526,1.858977,1.5467
11527,2.398273,1.8794
11528,-0.008056,0.0000
11529,-0.060100,0.0000


In [None]:
# test_results.to_csv(f'/content/test_results_20250220.csv',index=False)
test_results.to_csv(f'savings/test_results_train50_200epoch.csv',index=False)