## In this notebook:

#### LLM-prop Preprocessing 
Adapted from Rubungo et al., "LLM-Prop: Predicting Physical and Electronic Properties of Crystalline Solids from Their Text Descriptions", 2023. <br> 
Vertaix. LLM-Prop: A repository for property prediction using large language models. https://github.com/vertaix/LLM-Prop/tree/main, 2025
Copyright (c) 2024 Vertaix
MIT License

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

#### Extracting numerical tokens features
Author: Alexis Geslin.\
Inspired by Vertaix. LLM-Prop: A repository for property prediction using large language models. https://github.com/vertaix/LLM-Prop/tree/main, 2025

#### Formatting dataframes
Author: Alexis Geslin.

#### Getting E5 embeddings
Author: Alexis Geslin.

In [1]:
import re
import time
import glob
import pandas as pd
import numpy as np
import argparse
from statistics import stdev
from llmprop_utils import *


## LLM-Prop preprocessing

In [2]:
# Configuration defaults, expressed as an argparse parser.
# The notebook never reads real command-line arguments: `parse_args([])` below is
# called with an empty argv, so every option simply takes its default value.
parser = argparse.ArgumentParser(description='LLM-Prop')
parser.add_argument('--epochs',
                    help='Number of epochs',
                    type=int,
                    default=200)
parser.add_argument('--bs',
                    help='Batch size',
                    type=int,
                    default=64)
parser.add_argument('--lr',
                    help='Learning rate',
                    type=float,
                    default=0.001)
parser.add_argument('--max_len',
                    help='Max input sequence length',
                    type=int,
                    default=888)
parser.add_argument('--dr',
                    help='Drop rate',
                    type=float,
                    default=0.2)
parser.add_argument('--warmup_steps',
                    help='Warmup steps',
                    type=int,
                    default=30000)
parser.add_argument('--preprocessing_strategy',
                    help='Data preprocessing technique: "none", "bond_lengths_replaced_with_num", "bond_angles_replaced_with_ang", "no_stopwords", or "no_stopwords_and_lengths_and_angles_replaced"',
                    type=str,
                    default="no_stopwords_and_lengths_and_angles_replaced")
parser.add_argument('--tokenizer',
                    help='Tokenizer name: "t5_tokenizer" or "modified"',
                    type=str,
                    default="modified")
parser.add_argument('--pooling',
                    help='Pooling method. "cls" or "mean"',
                    type=str,
                    default="cls")
parser.add_argument('--normalizer',
                    help='Labels scaling technique. "z_norm", "mm_norm", or "ls_norm"',
                    type=str,
                    default="z_norm")
parser.add_argument('--scheduler',
                    help='Learning rate scheduling technique. "linear", "onecycle", "step", or "lambda" (no scheduling)',
                    type=str,
                    default="onecycle")
parser.add_argument('--property_name',
                    help='The name of the property to predict. "band_gap", "volume", or "is_gap_direct"',
                    type=str,
                    default="band_gap")
parser.add_argument('--optimizer',
                    help='Optimizer type. "adamw" or "sgd"',
                    type=str,
                    default="adamw")
parser.add_argument('--task_name',
                    help='the name of the task: "regression" if property_name is band_gap or volume or "classification" if property_name is is_gap_direct',
                    type=str,
                    default="regression")
parser.add_argument('--train_data_path',
                    help="the path to the training data",
                    type=str,
                    default="data/samples/textedge_prop_mp22_train.csv")
parser.add_argument('--valid_data_path',
                    help="the path to the valid data",
                    type=str,
                    default="data/samples/textedge_prop_mp22_valid.csv")
parser.add_argument('--test_data_path',
                    help="the path to the test data",
                    type=str,
                    default="data/samples/textedge_prop_mp22_test.csv")
parser.add_argument('--checkpoint',
                    help="the path to the best checkpoint for evaluation",
                    type=str,
                    default="")
# Empty argv: use the defaults above regardless of how the kernel was launched.
args = parser.parse_args([])
# Plain dict view of the parsed namespace (option name -> value).
args_dict = vars(args)

# Publish every parsed option as a module-level variable (bs, dr, max_len, ...).
globals().update(args_dict)

# Readable aliases for the terse option names.
batch_size, drop_rate, max_length = bs, dr, max_len
tokenizer_name, normalizer_type = tokenizer, normalizer
best_model_path = checkpoint
# NOTE: this shadows the built-in `property`; the name is used to index label columns.
property = property_name

# Override the parser default for this run.
# Alternative value: "no_stopwords_and_lengths_and_angles_replaced"
preprocessing_strategy = "no_stopwords"

# Sample splits (same values as the parser defaults for the *_data_path options).
default_train_data_path, default_valid_data_path, default_test_data_path = (
    "data/samples/textedge_prop_mp22_train.csv",
    "data/samples/textedge_prop_mp22_valid.csv",
    "data/samples/textedge_prop_mp22_test.csv",
)
prepocessed_test_data_path = "data/test_no_stopwords_and_lengths_and_angles_replaced.csv"

# These replace the *_data_path values injected from the parser above.
train_data_path, valid_data_path, test_data_path = (
    "data/train.csv",
    "data/validation.csv",
    "data/test.csv",
)


In [3]:
# Read the three sample splits. To process the other files instead, iterate over
# (train_data_path, valid_data_path, test_data_path).
train_data, valid_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (default_train_data_path, default_valid_data_path, default_test_data_path)
)


In [4]:
# Active strategy followed by the (rows, columns) shape of each split.
print(preprocessing_strategy, *(split.shape for split in (train_data, valid_data, test_data)))

no_stopwords (50, 7) (10, 7) (10, 7)


In [5]:
# Infer the task from the label column: boolean labels mean classification,
# anything else is treated as regression.
label_is_boolean = test_data[property].dtype == 'bool'
task_name = 'classification' if label_is_boolean else 'regression'
if label_is_boolean:
    # Cast True/False to 1.0/0.0 on the train and test splits.
    for split in (train_data, test_data):
        split[property] = split[property].astype(float)

# Summary statistics of the training labels.
train_labels_array = np.array(train_data[property])
train_labels_tensor = torch.tensor(train_labels_array)
train_labels_mean = torch.mean(train_labels_tensor)
train_labels_std = torch.std(train_labels_tensor)
train_labels_min = torch.min(train_labels_tensor)
train_labels_max = torch.max(train_labels_tensor)

# Which transformations each strategy applies to the test descriptions.
# An unrecognised strategy does nothing and prints nothing.
combined_strategy = "no_stopwords_and_lengths_and_angles_replaced"
known_strategies = (
    "none",
    "bond_lengths_replaced_with_num",
    "bond_angles_replaced_with_ang",
    "no_stopwords",
    combined_strategy,
)
if preprocessing_strategy in known_strategies:
    if preprocessing_strategy == combined_strategy:
        print('in the right place')

    # Order matters for the combined strategy: stopwords, then lengths, then angles.
    if preprocessing_strategy in ("no_stopwords", combined_strategy):
        stopwords = get_cleaned_stopwords()
        test_data['description'] = test_data['description'].apply(remove_mat_stopwords)
    if preprocessing_strategy in ("bond_lengths_replaced_with_num", combined_strategy):
        test_data['description'] = test_data['description'].apply(replace_bond_lengths_with_num)
    if preprocessing_strategy in ("bond_angles_replaced_with_ang", combined_strategy):
        test_data['description'] = test_data['description'].apply(replace_bond_angles_with_ang)

    # Show the first processed description as a sanity check.
    print(test_data['description'][0])
    print('-'*50)

# define loss functions
# mae_loss_function = nn.L1Loss()

KPrNbMnO₆ (Cubic) Perovskite-derived structured crystallizes cubic F̅43m space group. K¹⁺ bonded equivalent O²⁻ atoms form KO₁₂ cuboctahedra share corners equivalent KO₁₂ cuboctahedra, six equivalent PrO₁₂ cuboctahedra, four equivalent NbO₆ octahedra, four equivalent MnO₆ octahedra. K–O bond lengths 2.83 Å. Pr³⁺ bonded equivalent O²⁻ atoms form PrO₁₂ cuboctahedra share corners equivalent PrO₁₂ cuboctahedra, six equivalent KO₁₂ cuboctahedra, four equivalent NbO₆ octahedra, four equivalent MnO₆ octahedra. Pr–O bond lengths 2.83 Å. Nb⁵⁺ bonded six equivalent O²⁻ atoms form NbO₆ octahedra share corners six equivalent MnO₆ octahedra, four equivalent KO₁₂ cuboctahedra, four equivalent PrO₁₂ cuboctahedra. corner-sharing octahedra tilted. Nb–O bond lengths 2.00 Å. Mn³⁺ bonded six equivalent O²⁻ atoms form MnO₆ octahedra share corners six equivalent NbO₆ octahedra, four equivalent KO₁₂ cuboctahedra, four equivalent PrO₁₂ cuboctahedra. corner-sharing octahedra tilted. Mn–O bond lengths 2.01 Å. O

In [6]:
# test_data.to_csv('data/train_no_stopwords.csv', index=False)

## Extracting Numerical tokens

In [7]:
# pp_* : stopwords removed and bond lengths/angles replaced by placeholders.
# np_* : stopwords removed only, so numeric values are still present in the text.
pp_test, np_test, np_train, np_valid = map(pd.read_csv, (
    'data/test_no_stopwords_and_lengths_and_angles_replaced.csv',
    'data/test_no_stopwords.csv',
    'data/train_no_stopwords.csv',
    'data/validation_no_stopwords.csv',
))
print(*(frame.shape for frame in (pp_test, np_test, np_train, np_valid)))

(11531, 7) (11531, 7) (125098, 7) (9945, 7)


In [8]:
def extract_bond_lengths_max(sentence, padding = 3):
    """Return the first ``padding`` bond lengths (numbers followed by Å) in ``sentence``.

    Only the number directly in front of the unit is captured, so a range such
    as "2.65–2.86 Å" contributes its upper bound (2.86).

    Args:
        sentence: Text description to scan.
        padding: Exact length of the returned list. Extra matches are dropped
            and missing ones are filled with 0.0.

    Returns:
        A list of ``padding`` floats.
    """
    # Capture the number itself so parsing does not depend on how much
    # whitespace (if any) separates it from the unit.
    lengths = [float(number) for number in re.findall(r"(\d+(?:\.\d+)?)\s*Å", sentence)]
    lengths = lengths[:padding]
    lengths += [0.0] * (padding - len(lengths))
    return lengths

def extract_bond_lengths(sentence, padding = 3):
    """Return the first ``padding`` bond lengths (numbers followed by Å) in ``sentence``.

    A single value such as "2.83 Å" is returned as is. A range such as
    "2.00–3.00 Å" is replaced by its midpoint, rounded to two decimals.

    Args:
        sentence: Text description to scan.
        padding: Exact length of the returned list. Extra matches are dropped
            and missing ones are filled with 0.0.

    Returns:
        A list of ``padding`` floats.
    """
    # Group 1 is the (lower) value, group 2 the upper bound of a range or "".
    # Capturing the numbers avoids assuming exactly one space before the unit.
    pairs = re.findall(r"(\d+(?:\.\d+)?)(?:–(\d+(?:\.\d+)?))?\s*Å", sentence)
    final_tokens = []
    for lower, upper in pairs[:padding]:
        if upper:
            final_tokens.append(round((float(lower) + float(upper)) / 2, 2))
        else:
            final_tokens.append(float(lower))

    final_tokens += [0.0] * (padding - len(final_tokens))
    return final_tokens

def extract_angles(sentence, padding_rad =3,padding_degree =1):
    """Extract angle values from ``sentence`` as a fixed-length list of floats.

    Two notations are searched separately:
      * numbers followed by the "°" symbol (the "rad" slots, ``padding_rad`` of them;
        despite the name these are the symbol-notation angles, not radians);
      * numbers followed by the word "degrees" (``padding_degree`` slots).

    Ranges such as "54–58" are replaced by their midpoint rounded to two
    decimals. Each group is truncated to its slot count and padded with 0.0.

    Returns:
        A list of ``padding_rad + padding_degree`` floats, "°" slots first.
    """
    def _to_value(token):
        # A single number converts directly; a range becomes its midpoint.
        if '–' not in token:
            return float(token)
        bounds = re.findall(r"\d+(?:\.\d+)?", token)
        if len(bounds) != 2:
            return None
        return round((float(bounds[0]) + float(bounds[1])) / 2, 2)

    # The lookaheads keep the unit (and any whitespace before it) out of the match.
    symbol_matches = re.findall(r"\d+(?:\.\d+)?(?:–\d+(?:\.\d+)?)?(?=\s*°)", sentence)
    word_matches = re.findall(r"\d+(?:\.\d+)?(?:–\d+(?:\.\d+)?)?(?=\s*degrees)", sentence)

    # "°" slots, padded up to padding_rad.
    final_tokens = [
        value for value in map(_to_value, symbol_matches[:padding_rad]) if value is not None
    ]
    final_tokens += [0.0] * (padding_rad - len(final_tokens))

    # "degrees" slots, padded up to the total length.
    final_tokens += [
        value for value in map(_to_value, word_matches[:padding_degree]) if value is not None
    ]
    total_slots = padding_rad + padding_degree
    final_tokens += [0.0] * (total_slots - len(final_tokens))

    assert(len(final_tokens) == padding_rad + padding_degree)

    return final_tokens

In [9]:
def mini_wrapper(sentence,padding=3, padding_rad =3,padding_degree =1):
    """Build the numeric feature list for one description.

    Returns the ``padding`` bond-length values from ``extract_bond_lengths``
    followed by the ``padding_rad + padding_degree`` angle values from
    ``extract_angles``, as a single flat list.
    """
    return [
        *extract_bond_lengths(sentence, padding),
        *extract_angles(sentence, padding_rad, padding_degree),
    ]

In [10]:
def numerical_token_wrapper(df,padding=3, padding_rad =3,padding_degree =1):
    """Extract bond-length and angle features for every row of ``df``.

    Args:
        df: DataFrame with a 'description' column of text.
        padding: Number of bond-length columns.
        padding_rad: Number of "°"-notation angle columns.
        padding_degree: Number of "degrees"-notation angle columns.

    Returns:
        A DataFrame sharing ``df``'s index, with columns
        bond_length_1..N, angle_rad_1..N, angle_degree_1..N (in that order).
    """
    labels = (
        [f'bond_length_{i}' for i in range(1, padding + 1)]
        + [f'angle_rad_{i}' for i in range(1, padding_rad + 1)]
        + [f'angle_degree_{i}' for i in range(1, padding_degree + 1)]
    )
    rows = [
        mini_wrapper(text, padding=padding, padding_rad=padding_rad, padding_degree=padding_degree)
        for text in df['description']
    ]
    # Build the frame in one step: this avoids creating one pd.Series per row
    # and still yields the labelled columns when `df` has no rows.
    return pd.DataFrame(rows, index=df.index, columns=labels)
    

In [11]:
# Feature widths: bond lengths / "°" angles / "degrees" angles kept per description.
padding, padding_rad, padding_degree = 5, 3, 1
extraction_settings = dict(padding=padding, padding_rad=padding_rad, padding_degree=padding_degree)

# Extract the numeric features for each split.
train_numerical_tokens = numerical_token_wrapper(np_train, **extraction_settings)
print("done with train")
valid_numerical_tokens = numerical_token_wrapper(np_valid, **extraction_settings)
print("done with valid")
test_numerical_tokens = numerical_token_wrapper(np_test, **extraction_settings)
print("done with test")

# Save each split; the file name records the three widths used.
for tokens, destination in (
    (train_numerical_tokens, f'data/train_numerical_tokens_{padding}_{padding_rad}_{padding_degree}.csv'),
    (valid_numerical_tokens, f'data/valid_numerical_tokens_{padding}_{padding_rad}_{padding_degree}.csv'),
    (test_numerical_tokens, f'data/test_numerical_tokens_{padding}_{padding_rad}_{padding_degree}.csv'),
):
    tokens.to_csv(destination, index=False)
print("Done")



done with train
done with valid
done with test
Done


In [173]:
# Shapes of the extracted feature tables, in train / valid / test order.
print(*(tokens.shape for tokens in (train_numerical_tokens, valid_numerical_tokens, test_numerical_tokens)))

(125098, 9) (9945, 9) (11531, 9)


testing

In [162]:
# Only the last expression of a cell is displayed, so the row count has to be
# printed explicitly to be visible.
print(len(np_train))
np_train.head(2)

Unnamed: 0,material_id,formula,cif_structure,description,band_gap,volume,is_gap_direct
0,mp-759684,Na3Bi(P2O7)2,# generated using pymatgen\ndata_Na3Bi(P2O7)2\...,Na₃Bi(P₂O₇)₂ crystallizes triclinic P̅1 space ...,0.0,633.18183,False
1,mp-1247592,SrCa7Ti2Mn6O23,# generated using pymatgen\ndata_SrCa7Ti2Mn6O2...,SrCa₇Ti₂Mn₆O₂₃ crystallizes triclinic P1 space...,0.0,454.9727,False


In [163]:

# Spot-check both extractors on the first 11 validation descriptions
# (fewer if the frame is shorter). `np_sentence` keeps the last one inspected.
for i in range(min(len(np_valid), 11)):
    np_sentence = np_valid['description'][i]
    for extractor in (extract_bond_lengths, extract_angles):
        print(extractor(np_sentence))
    print('-'*50)

[2.2, 2.06, 1.27]
[52.5, 0.0, 0.0, 120.0]
--------------------------------------------------
[3.02, 2.5, 0.0]
[0.0, 0.0, 0.0, 0.0]
--------------------------------------------------
[1.8, 1.81, 1.81]
[0.0, 0.0, 0.0, 0.0]
--------------------------------------------------
[2.0, 2.05, 0.0]
[28.0, 28.0, 0.0, 150.0]
--------------------------------------------------
[2.01, 2.08, 2.01]
[59.0, 0.0, 0.0, 0.0]
--------------------------------------------------
[1.72, 1.75, 0.0]
[0.0, 0.0, 0.0, 120.0]
--------------------------------------------------
[2.46, 2.47, 2.43]
[0.0, 0.0, 0.0, 0.0]
--------------------------------------------------
[3.94, 2.75, 2.91]
[0.0, 0.0, 0.0, 0.0]
--------------------------------------------------
[1.84, 1.99, 1.4]
[40.5, 0.0, 0.0, 150.0]
--------------------------------------------------
[3.23, 3.19, 2.95]
[0.0, 0.0, 0.0, 0.0]
--------------------------------------------------
[2.15, 2.17, 0.0]
[0.0, 0.0, 0.0, 0.0]
----------------------------------------------

In [164]:
# `new_valid` is not assigned anywhere else in this notebook, so this cell fails
# on a fresh kernel. Rebuild it with the default widths (3 bond lengths, 3 "°"
# angles, 1 "degrees" angle), which match the columns of the recorded output.
new_valid = numerical_token_wrapper(np_valid)
new_valid

Unnamed: 0,bond_length_1,bond_length_2,bond_length_3,angle_rad_1,angle_rad_2,angle_rad_3,angle_degree_1
0,2.20,2.06,1.27,52.5,0.0,0.0,120.0
1,3.02,2.50,0.00,0.0,0.0,0.0,0.0
2,1.80,1.81,1.81,0.0,0.0,0.0,0.0
3,2.00,2.05,0.00,28.0,28.0,0.0,150.0
4,2.01,2.08,2.01,59.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
9940,2.75,2.90,2.66,5.0,0.0,0.0,0.0
9941,2.98,3.17,2.61,0.0,0.0,0.0,0.0
9942,2.51,2.52,2.52,0.0,0.0,0.0,0.0
9943,2.37,1.32,1.10,0.0,0.0,0.0,120.0


In [120]:

# Scratch comparison of angle regexes on the current `np_sentence`.
print(np_sentence)

# Replacement works with either pattern form: re.sub substitutes the whole match.
mnp = re.sub(r"\d+(\.\d+)?(?:–\d+(\.\d+)?)?\s*°", "[ANG]", np_sentence)
mnp = re.sub(r"\d+(\.\d+)?(?:–\d+(\.\d+)?)?\s*degrees", "[ANG]", mnp)
print(mnp)

# With capturing groups, findall returns the groups (tuples), not the full match.
rad = re.findall(r"\d+(\.\d+)?(?:–\d+(\.\d+)?)?\s*°",np_sentence)
print(rad)
degrees = re.findall(r"\d+(\.\d+)?(?:–\d+(\.\d+)?)?\s*degrees",np_sentence)
print(degrees)

# With non-capturing groups, findall returns the full matched text.
# Named so they do not overwrite `dr`, the drop-rate option set by the config cell.
symbol_matches = re.findall(r"\d+(?:\.\d+)?(?:–\d+(?:\.\d+)?)?\s*°", np_sentence)
print(symbol_matches)
word_matches = re.findall(r"\d+(?:\.\d+)?(?:–\d+(?:\.\d+)?)?\s*degrees", np_sentence)
print(word_matches)

K₂MgWO₂(PO₄)₂ crystallizes triclinic P1 space group. eight inequivalent K¹⁺ sites. K¹⁺ site, K¹⁺ bonded 5-coordinate geometry five O²⁻ atoms. spread K–O bond distances ranging 2.65–2.86 Å. K¹⁺ site, K¹⁺ bonded 8-coordinate geometry eight O²⁻ atoms. spread K–O bond distances ranging 2.70–3.35 Å. K¹⁺ site, K¹⁺ bonded 8-coordinate geometry eight O²⁻ atoms. spread K–O bond distances ranging 2.68–3.28 Å. K¹⁺ site, K¹⁺ bonded 7-coordinate geometry seven O²⁻ atoms. spread K–O bond distances ranging 2.68–3.41 Å. K¹⁺ site, K¹⁺ bonded eight O²⁻ atoms form distorted KO₈ hexagonal bipyramids share cornercorner one MgO₆ octahedra, cornercorner one WO₆ octahedra, corners two PO₄ tetrahedra, edgeedge one WO₆ octahedra, edges two PO₄ tetrahedra, two MgO₆ octahedra. corner-sharing octahedral tilt angles range 54–58°. spread K–O bond distances ranging 2.64–3.12 Å. K¹⁺ site, K¹⁺ bonded 8-coordinate geometry eight O²⁻ atoms. spread K–O bond distances ranging 2.62–3.28 Å. K¹⁺ site, K¹⁺ bonded 8-coordinate 

In [None]:
def extract_bond_lengths(sentence, padding = 3):
    """Return the first ``padding`` bond lengths (numbers followed by Å) in ``sentence``.

    NOTE: this cell redefines the function of the same name from an earlier
    cell; whichever cell ran last is the definition in effect.

    A single value such as "2.83 Å" is returned as is. A range such as
    "2.00–3.00 Å" is replaced by its midpoint, rounded to two decimals.

    Args:
        sentence: Text description to scan.
        padding: Exact length of the returned list. Extra matches are dropped
            and missing ones are filled with 0.0.

    Returns:
        A list of ``padding`` floats.
    """
    # Group 1 is the (lower) value, group 2 the upper bound of a range or "".
    # Capturing the numbers avoids assuming exactly one space before the unit.
    pairs = re.findall(r"(\d+(?:\.\d+)?)(?:–(\d+(?:\.\d+)?))?\s*Å", sentence)
    final_tokens = []
    for lower, upper in pairs[:padding]:
        if upper:
            final_tokens.append(round((float(lower) + float(upper)) / 2, 2))
        else:
            final_tokens.append(float(lower))

    final_tokens += [0.0] * (padding - len(final_tokens))
    return final_tokens

In [130]:
# Step-by-step version of the angle extraction on the current `np_sentence`,
# printing the intermediate results.
print(np_sentence)
padding_rad, padding_degree = 3, 1

# The lookaheads keep the unit (and any whitespace before it) out of the match.
rad_tokens = re.findall(r"\d+(?:\.\d+)?(?:–\d+(?:\.\d+)?)?(?=\s*°)", np_sentence)
degree_tokens = re.findall(r"\d+(?:\.\d+)?(?:–\d+(?:\.\d+)?)?(?=\s*degrees)", np_sentence)
print(rad_tokens)
print(degree_tokens)


def _angle_value(token):
    # A single number converts directly; a range becomes its rounded midpoint.
    if '–' not in token:
        return float(token)
    bounds = re.findall(r"\d+(?:\.\d+)?", token)
    if len(bounds) != 2:
        return None
    return round((float(bounds[0]) + float(bounds[1])) / 2, 2)


# "°" slots first, padded up to padding_rad.
final_tokens = [value for value in map(_angle_value, rad_tokens[:padding_rad]) if value is not None]
final_tokens += [0.0] * (padding_rad - len(final_tokens))
print(final_tokens)

# Then the "degrees" slots, padded up to the total length.
final_tokens += [value for value in map(_angle_value, degree_tokens[:padding_degree]) if value is not None]
final_tokens += [0.0] * (padding_degree + padding_rad - len(final_tokens))

print(final_tokens)

K₂MgWO₂(PO₄)₂ crystallizes triclinic P1 space group. eight inequivalent K¹⁺ sites. K¹⁺ site, K¹⁺ bonded 5-coordinate geometry five O²⁻ atoms. spread K–O bond distances ranging 2.65–2.86 Å. K¹⁺ site, K¹⁺ bonded 8-coordinate geometry eight O²⁻ atoms. spread K–O bond distances ranging 2.70–3.35 Å. K¹⁺ site, K¹⁺ bonded 8-coordinate geometry eight O²⁻ atoms. spread K–O bond distances ranging 2.68–3.28 Å. K¹⁺ site, K¹⁺ bonded 7-coordinate geometry seven O²⁻ atoms. spread K–O bond distances ranging 2.68–3.41 Å. K¹⁺ site, K¹⁺ bonded eight O²⁻ atoms form distorted KO₈ hexagonal bipyramids share cornercorner one MgO₆ octahedra, cornercorner one WO₆ octahedra, corners two PO₄ tetrahedra, edgeedge one WO₆ octahedra, edges two PO₄ tetrahedra, two MgO₆ octahedra. corner-sharing octahedral tilt angles range 54–58°. spread K–O bond distances ranging 2.64–3.12 Å. K¹⁺ site, K¹⁺ bonded 8-coordinate geometry eight O²⁻ atoms. spread K–O bond distances ranging 2.62–3.28 Å. K¹⁺ site, K¹⁺ bonded 8-coordinate 

In [82]:
# Step-by-step version of the bond-length extraction on the current `np_sentence`.
# NOTE: this rebinds the module-level `padding`.
padding  = 8
matches = re.findall(r"\d+(?:\.\d+)?(?:–\d+(?:\.\d+)?)?\s*Å",np_sentence)
final_tokens = []
for match in matches:
    # Pull the numbers out with a regex instead of splitting on a space, so a
    # match without exactly one space before the unit still parses.
    numbers = [float(number) for number in re.findall(r"\d+(?:\.\d+)?", match)]
    if len(numbers) == 2:
        # Range: show both bounds, keep the rounded midpoint.
        a, b = numbers
        print(a,b)
        final_tokens.append(round((a+b)/2,2))
    else:
        final_tokens.append(numbers[0])
    if len(final_tokens) == padding:
        break

# Pad with zeros up to the requested width.
final_tokens += [0.0] * (padding - len(final_tokens))
final_tokens

2.05 2.16
2.12 2.15


[2.1, 2.1, 2.14, 2.12, 2.14, 2.06, 2.15, 2.13]

In [65]:
def replace_bond_lengths_and_angles_with_num_and_ang(sentence):
    """Replace numeric bond lengths and angles in ``sentence`` with placeholders.

    Values (or ranges) followed by "Å" become "[NUM]"; values (or ranges)
    followed by "°" or the word "degrees" become "[ANG]". The result is
    stripped of leading and trailing whitespace.
    """
    substitutions = (
        (r"\d+(\.\d+)?(?:–\d+(\.\d+)?)?\s*Å", "[NUM]"),        # bond lengths
        (r"\d+(\.\d+)?(?:–\d+(\.\d+)?)?\s*°", "[ANG]"),        # angles, symbol notation
        (r"\d+(\.\d+)?(?:–\d+(\.\d+)?)?\s*degrees", "[ANG]"),  # angles, word notation
    )
    for pattern, placeholder in substitutions:
        sentence = re.sub(pattern, placeholder, sentence)
    return sentence.strip()

In [31]:
# Display the test frame as it currently stands in the kernel.
# NOTE(review): the recorded output lists rows up to index 98, while the sample
# split loaded earlier printed shape (10, 7) — presumably a different kernel
# state produced this output; confirm before relying on it.
test_data

Unnamed: 0,material_id,formula,cif_structure,description,band_gap,volume,is_gap_direct
0,mp-1519998,KPrMnNbO6,# generated using pymatgen\ndata_KPrMnNbO6\n_s...,KPrNbMnO₆ (Cubic) Perovskite-derived structure...,0.0000,128.624052,False
1,mp-1038035,Mg30MnSiO32,# generated using pymatgen\ndata_Mg30MnSiO32\n...,Mg₃₀MnSiO₃₂ alpha Po-derived structured crysta...,1.9765,624.296565,False
2,mp-1206636,SrTl3,# generated using pymatgen\ndata_SrTl3\n_symme...,SrTl₃ Uranium Silicide structured crystallizes...,0.0000,120.860598,False
3,mp-555398,K2MgP2WO10,# generated using pymatgen\ndata_K2MgP2WO10\n_...,K₂MgWO₂(PO₄)₂ crystallizes triclinic P1 space ...,3.4571,942.118297,False
4,mp-642803,Zr2HBr2,# generated using pymatgen\ndata_Zr2HBr2\n_sym...,Zr₂HBr₂ crystallizes monoclinic C2/m space gro...,0.0000,212.606592,False
...,...,...,...,...,...,...,...
95,mp-1221849,Mn4P6SN12,# generated using pymatgen\ndata_Mn4P6SN12\n_s...,Mn₄P₆N₁₂S crystallizes tetragonal I̅42m space ...,0.0000,233.783450,False
96,mp-684427,LiSb(PO3)4,# generated using pymatgen\ndata_LiSb(PO3)4\n_...,LiSb(PO₃)₄ crystallizes monoclinic P2₁/c space...,4.5327,945.706367,False
97,mp-1193049,Cs2Sn(HO2)6,# generated using pymatgen\ndata_Cs2Sn(HO2)6\n...,Cs₂Sn(HO₂)₆ crystallizes trigonal P̅3 space gr...,2.5642,273.022836,False
98,mp-9732,BaNaP,# generated using pymatgen\ndata_BaNaP\n_symme...,NaBaP crystallizes hexagonal P̅62m space group...,0.8555,258.402162,True


'no_stopwords_and_lengths_and_angles_replaced'

## Making smaller dataframes

In [1]:
import pandas as pd

In [2]:
# Load the preprocessed splits (stopwords removed, lengths/angles replaced).
train, val, test = map(pd.read_csv, (
    'data/train_no_stopwords_and_lengths_and_angles_replaced.csv',
    'data/validation_no_stopwords_and_lengths_and_angles_replaced.csv',
    'data/test_no_stopwords_and_lengths_and_angles_replaced.csv',
))

# Report the shape of each split.
for caption, frame in (('train shape:', train), ('val shape:', val), ('test shape:', test)):
    print(caption, frame.shape)


train shape: (125098, 7)
val shape: (9945, 7)
test shape: (11531, 7)


In [3]:

# Write the first `limit` rows of every split to data/{split}_pp_{limit}.csv.
# train2 / val2 / test2 hold the subsets of the last limit after the loop.
for limit in [50,100,500,1000,5000]:
    train2 = train[:limit]
    val2 = val[:limit]
    test2 = test[:limit]
    print(train2.shape, val2.shape, test2.shape)
    train2.to_csv(f'data/train_pp_{limit}.csv', index=False)
    val2.to_csv(f'data/val_pp_{limit}.csv', index=False)
    test2.to_csv(f'data/test_pp_{limit}.csv', index=False)
    print(f'{limit} done')

(50, 7) (50, 7) (50, 7)
50 done
(100, 7) (100, 7) (100, 7)
100 done
(500, 7) (500, 7) (500, 7)
500 done
(1000, 7) (1000, 7) (1000, 7)
1000 done
(5000, 7) (5000, 7) (5000, 7)
5000 done


In [4]:
# Read the 500-row subsets back in (validation, train, test).
tv, tr, tt = map(pd.read_csv, (
    'data/val_pp_500.csv',
    'data/train_pp_500.csv',
    'data/test_pp_500.csv',
))

Just the training set

In [None]:
# Reload only the preprocessed training split; this rebinds `train`.
train = pd.read_csv('data/train_no_stopwords_and_lengths_and_angles_replaced.csv')


In [3]:
# Keep the first `newlimit` training rows as a larger training subset.
newlimit = 15000
train2 = train.iloc[:newlimit]
for caption, frame in (('train shape:', train), ('train2 shape:', train2)):
    print(caption, frame.shape)


train shape: (125098, 7)
train2 shape: (15000, 7)


In [4]:
# Save the subset; the file name records the row limit used (newlimit).
train2.to_csv(f'data/train_pp_{newlimit}.csv', index=False)

## E5 Embeddings 

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from tqdm import tqdm


In [3]:
# Load the E5 tokenizer and encoder by name.
# NOTE: this rebinds the module-level name `tokenizer`, which the configuration
# cell had set to the parser's --tokenizer string; `tokenizer_name` still holds
# that string.
model_name = "intfloat/e5-large-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

In [4]:
# Use the GPU when one is available, then put the model in evaluation mode.
# Both calls return the model, so the cell still displays its architecture.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, 

In [5]:
# Stopword-free splits whose descriptions still contain the numeric values.
np_test, np_train, np_valid = map(pd.read_csv, (
    'data/test_no_stopwords.csv',
    'data/train_no_stopwords.csv',
    'data/validation_no_stopwords.csv',
))

In [6]:
# Shapes in test / train / valid order.
print(*(frame.shape for frame in (np_test, np_train, np_valid)))

(11531, 7) (125098, 7) (9945, 7)


In [7]:
def get_sentence_embedding(text):
    """Embed ``text`` with the module-level ``tokenizer`` and ``model``.

    The input is truncated to 512 tokens, run through the model without
    gradient tracking, and the last hidden state is averaged over the token
    axis. Returns a NumPy array with one row per input sequence.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Mean over the token dimension, moved to the CPU for NumPy conversion.
    return hidden_states.mean(dim=1).cpu().numpy()

def embed_batch(text_list):
    """Embed a list of texts with the module-level ``tokenizer`` and ``model``.

    Texts are truncated to 512 tokens and padded to a common length. The last
    hidden state is averaged over real tokens only: padding positions are
    excluded via the attention mask, so a text's embedding does not depend on
    which other texts share its batch.

    Args:
        text_list: List of strings to embed.

    Returns:
        NumPy array with one embedding row per input text.
    """
    inputs = tokenizer(text_list, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    # (batch, tokens, 1) mask: 1 for real tokens, 0 for padding.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    token_sum = (hidden_states * mask).sum(dim=1)
    # clamp guards against division by zero for an all-padding row.
    token_count = mask.sum(dim=1).clamp(min=1)
    embeddings = (token_sum / token_count).cpu().numpy()
    return embeddings


In [8]:
# Smoke test: embed the first test description and check the output shape.
sentence = np_test['description'].loc[0]
embeddings = get_sentence_embedding(sentence)
embeddings.shape

(1, 1024)

In [9]:
# Embed the first 100 test descriptions in batches.
sentences = np_test['description'].iloc[:100].tolist()
# NOTE: this rebinds the module-level `batch_size`.
batch_size = 128 if device == "cuda" else 64
print(batch_size)

batch_starts = range(0, len(sentences), batch_size)
all_embeddings = np.vstack([
    embed_batch(sentences[start:start + batch_size])
    for start in tqdm(batch_starts)
])

all_embeddings.shape


64


  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:37<00:00, 18.79s/it]


(100, 1024)