### Import Packages

In [1]:
import os
import re
import json
import torch
import pandas as pd
from pandas import json_normalize
import numpy as np
from vllm import LLM, SamplingParams
from transformers import  AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments
from peft import LoraConfig, PeftModel
import math
# Environment setup: expose only GPU index 0 to this process via CUDA_VISIBLE_DEVICES.
# NOTE(review): this is set after `import torch`; it presumably still takes effect because
# no CUDA context has been created yet at this point — confirm if device selection looks wrong.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

### Load Fine-Tuned Model

In [2]:
# Directory of the fine-tuned checkpoint to evaluate. The same name is reused later
# to build the output CSV filename.
new_model_name = "saved_models/Mistral-7B-Instruct-v0.2/split_train_800_without_prompt_lr5e-06_bs1/checkpoint-1600"

# Decoding settings: temperature 0 with top_p 1, at most 4096 new tokens,
# and generation is cut at the '!!!' stop string.
sampling_params = SamplingParams(
    temperature=0,
    top_p=1,
    max_tokens=4096,
    stop=['!!!'],
)

# Build the vLLM engine on a single GPU (no tensor parallelism).
llm = LLM(
    model=new_model_name,
    tensor_parallel_size=1,
)

INFO 09-02 11:59:51 llm_engine.py:79] Initializing an LLM engine with config: model='saved_models/Meta-Llama-3-8B-Instruct/split_train_800_without_prompt_lr5e-06_bs1/checkpoint-3200', tokenizer='saved_models/Meta-Llama-3-8B-Instruct/split_train_800_without_prompt_lr5e-06_bs1/checkpoint-3200', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 09-02 11:59:58 llm_engine.py:337] # GPU blocks: 9559, # CPU blocks: 2048
INFO 09-02 11:59:59 model_runner.py:666] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-02 11:59:59 model_runner.py:670] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-02 12:00:04 model_runner.py:738] Graph capturing finished in 5 secs.


### Preprocess Data (ensure the instruction is the same as the one used in training)

In [3]:
# Data loading: read the evaluation set into `test_df`.

# Alternative (smaller) evaluation file, kept for quick runs:
# test_file = "../data/test/test_300.csv"
test_file = "../data/test/test_1022.csv"
# The CSV is read as UTF-8; later cells index it by the 'Paragraph' column and the
# seven NMR field columns used to build the reference answer.
test_df = pd.read_csv(test_file, encoding='utf-8')

def create_assistant_message(row):
    """Serialize one row's reference NMR fields into the compact JSON-like target string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the seven field
            names 'IUPAC', '1H NMR text', '1H NMR conditions', '1H NMR data',
            '13C NMR text', '13C NMR conditions' and '13C NMR data'.

    Returns:
        A string of the form {"IUPAC":"...","1H NMR text":"...",...} with the fields
        in the fixed order above and no whitespace between items.

    Note:
        Values are interpolated verbatim (no JSON escaping), so a value that itself
        contains a double quote yields a string that is not strictly valid JSON.
    """
    field_names = (
        "IUPAC",
        "1H NMR text",
        "1H NMR conditions",
        "1H NMR data",
        "13C NMR text",
        "13C NMR conditions",
        "13C NMR data",
    )
    # Each entry is rendered as "name":"value"; entries are comma-joined inside braces.
    rendered_fields = [f'"{name}":"{row[name]}"' for name in field_names]
    return "{" + ",".join(rendered_fields) + "}"
    
# Reference answer for every row, serialized in the same compact format as the target.
test_df['NMRInfo'] = test_df.apply(create_assistant_message, axis=1)

# Column holding the input paragraph and column holding the reference answer.
source_text = "Paragraph"
target_text = "NMRInfo"

# Extraction instruction prepended to every paragraph. It has to stay identical to the
# instruction used during fine-tuning, so the text below must not be edited casually.
# (A previous short placeholder instruction was assigned here and immediately
# overwritten; that dead assignment has been removed.)
instruction = '''Extract text containing 1H NMR and 13C NMR data, remove interference information such as reactants, raw materials, solvents and other non-final product names based on text semantics, and then extract the name, code or number of the final product. Please delete the IUPAC name Alias, numbers and ordinal numbers before and after fields, such as '2.1.3.', '(HL4)', '(9)', '(4d)'. NMR text should contain complete information, such as instrument power and solvent information, For example, "13C NMR text": "13C NMR (400 MHz, acetone-d6) 174.0 (C), 157.7 (C). Then split the NMR text. The content in NMR conditions is NMR instrument power and solvent information, such as "13C NMR conditions": "400MHz, acetone-d6". The content in the 13C NMR data removes information such as the position and shape of the peak,  such as "13C NMR data": "131.4-128.0, 157.7". The content in the 1H NMR data should include information such as the position and shape of the peak, such as "1H NMR data": "12.57 (s, 1H), 7.97-7.95 (d, J = 8.25 Hz, 2H)". Please keep the duplicate values of the original data and do not modify the number of decimal places. All responses must originate from information extracted from the given text, ensuring that the extracted content has not been modified or fragmented, and that capitalization and punctuation are exactly the same as the given text. Must end with {"IUPAC":"text","1H NMR text":"text","1H NMR conditions":"text","1H NMR data":"text","13C NMR text":"text","13C NMR conditions":"text","13C NMR data":"text"} format reply.'''

# Full prompt per row: instruction + paragraph wrapped in the "<s>[INST] ... [/INST]" markers.
# NOTE(review): this template looks like the Mistral-instruct chat format — confirm it
# matches the checkpoint being loaded.
test_df['text'] = f'<s>[INST] {instruction}' + test_df[source_text] + " [/INST]"

prompts = list(test_df['text'])
# Show the first few prompts as a sanity check.
prompts[:5]

['<s>[INST] Extract text containing 1H NMR and 13C NMR data, remove interference information such as reactants, raw materials, solvents and other non-final product names based on text semantics, and then extract the name, code or number of the final product. Please delete the IUPAC name Alias, numbers and ordinal numbers before and after fields, such as \'2.1.3.\', \'(HL4)\', \'(9)\', \'(4d)\'. NMR text should contain complete information, such as instrument power and solvent information, For example, "13C NMR text": "13C NMR (400 MHz, acetone-d6) 174.0 (C), 157.7 (C). Then split the NMR text. The content in NMR conditions is NMR instrument power and solvent information, such as "13C NMR conditions": "400MHz, acetone-d6". The content in the 13C NMR data removes information such as the position and shape of the peak,  such as "13C NMR data": "131.4-128.0, 157.7". The content in the 1H NMR data should include information such as the position and shape of the peak, such as "1H NMR data": 

### Inference

In [4]:
# Run batched generation over every prompt. `llm.generate` returns one result object
# per prompt, each carrying the prompt itself and its generated completion(s).
outputs = llm.generate(prompts, sampling_params)

# Collected per prompt, in the same order as `outputs`:
#   predictions      - stripped text of the first completion
#   predictions_prob - exp(cumulative log-probability) of that completion
predictions, predictions_prob = [], []

for output in outputs:
    completion = output.outputs[0]
    # Echo the prompt and the raw (unstripped) generation for manual inspection.
    print(f"Prompt: {output.prompt},\nGenerated text: {completion.text!r}")
    predictions.append(completion.text.strip())
    predictions_prob.append(math.exp(completion.cumulative_logprob))

Processed prompts: 100%|██████████| 1022/1022 [06:45<00:00,  2.52it/s]

Prompt: <s>[INST] Extract text containing 1H NMR and 13C NMR data, remove interference information such as reactants, raw materials, solvents and other non-final product names based on text semantics, and then extract the name, code or number of the final product. Please delete the IUPAC name Alias, numbers and ordinal numbers before and after fields, such as '2.1.3.', '(HL4)', '(9)', '(4d)'. NMR text should contain complete information, such as instrument power and solvent information, For example, "13C NMR text": "13C NMR (400 MHz, acetone-d6) 174.0 (C), 157.7 (C). Then split the NMR text. The content in NMR conditions is NMR instrument power and solvent information, such as "13C NMR conditions": "400MHz, acetone-d6". The content in the 13C NMR data removes information such as the position and shape of the peak,  such as "13C NMR data": "131.4-128.0, 157.7". The content in the 1H NMR data should include information such as the position and shape of the peak, such as "1H NMR data": "1




### Save the Predictions

In [5]:
# Assemble the results table: raw generation first, then the reference answer and the
# source paragraph (both aligned on test_df's index), then the generation probability.
pred_df = pd.DataFrame({'Generated Text': predictions}).assign(
    **{
        'Actual Text': test_df[target_text],
        'Paragraph': test_df[source_text],
        'Generated Text Prob': predictions_prob,
    }
)
def safe_json_loads(val):
    """Parse one model generation into a dict of extracted fields, never raising.

    Parsing is attempted in this order:
      1. the text as-is (valid JSON, possibly containing apostrophes in values);
      2. the text with single quotes replaced by double quotes (Python-dict style);
      3. a regex fallback that pulls out the seven expected `"key":"value"` pairs
         from malformed or truncated output, filling missing keys with 'N/A'.

    Args:
        val: The generated text. Missing values (None, NaN, pd.NA) are accepted.

    Returns:
        dict: `{}` for a missing value; the parsed JSON object when parsing succeeds;
        otherwise a dict containing at least the seven expected keys.
    """
    expected_keys = ["IUPAC", "1H NMR text", "1H NMR conditions", "1H NMR data", "13C NMR text", "13C NMR conditions", "13C NMR data"]

    # Identity comparison with np.nan misses None, pd.NA and NaNs that are not the
    # np.nan singleton, so test for missing values explicitly.
    if val is None or val is pd.NA or (isinstance(val, float) and np.isnan(val)):
        return {}

    text = str(val)

    # Try the raw text first: blindly swapping apostrophes for double quotes corrupts
    # valid JSON whose values contain an apostrophe (e.g. 2'-deoxy... names).
    candidates = [text]
    if "'" in text:
        candidates.append(text.replace("'", "\""))

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except (ValueError, TypeError):  # json.JSONDecodeError is a ValueError
            continue
        # Only a JSON object can be expanded into columns downstream.
        if isinstance(parsed, dict):
            return parsed

    # Fallback: salvage whatever well-formed "key":"value" pairs exist in the text.
    pattern = r'"(IUPAC|1H NMR text|1H NMR conditions|1H NMR data|13C NMR text|13C NMR conditions|13C NMR data)":"(.*?)"'
    result = dict(re.findall(pattern, text))
    for key in expected_keys:
        result.setdefault(key, 'N/A')
    return result

# Parse every raw generation into a dict, expand those dicts into one column per field
# with json_normalize, and attach the new columns to the remaining result columns.
pred_df['Generated Text'] = pred_df['Generated Text'].apply(safe_json_loads)
pred_df = pd.concat([pred_df.drop('Generated Text', axis=1), pd.json_normalize(pred_df['Generated Text'])], axis=1)
# Fields absent from a generation are reported as 'N/A' rather than NaN.
pred_df = pred_df.fillna('N/A')

# Make sure the output directory exists so the save does not fail on a fresh checkout.
output_dir = "results/predictions"
os.makedirs(output_dir, exist_ok=True)
# The filename encodes the test-set size and the checkpoint path ('/' replaced by '-').
output_path = f"{output_dir}/split_prediction_{len(test_df)}_of_{new_model_name.replace('/', '-')}.csv"
pred_df.to_csv(output_path, index=False)
pred_df

Unnamed: 0,Actual Text,Paragraph,Generated Text Prob,IUPAC,1H NMR text,1H NMR conditions,1H NMR data,13C NMR text,13C NMR conditions,13C NMR data,M.S.,13C NMR data removed,13C{1H} NMR text,77Se NMR text,77Se NMR conditions,77Se NMR data,19F NMR text,19F NMR conditions,19F NMR data
0,"{""IUPAC"":""nemorosone"",""1H NMR text"":""1H NMR (C...","Purity of isolated nemorosone was >99%, as det...",0.434644,nemorosone,"1H NMR (CD3OD, 400 MHz): 7.53 (br d, J = 7.7 H...","CD3OD, 400 MHz","7.53 (br d, J = 7.7 Hz, 2H), 7.43 (app tt, J =...","13C NMR (CD3OD, 100 MHz): 209.26 (C), 194.88 (...","CD3OD, 100 MHz","209.26, 194.88, 138.23, 135.25, 134.24, 133.66...",,,,,,,,,
1,"{""IUPAC"":""Mach"",""1H NMR text"":""1H-NMR (500 MHz...",The method used to isolate Mach from S. chinen...,0.677526,Mach,"1H-NMR (500 MHz, CDCl3) assign data: δ 6.92~6....","500 MHz, CDCl3","6.92~6.82 (6H, m, aromatic protons), 6.33 (1H,...","13C-NMR (125 MHz, CDCl3) assign data: δ 151.0 ...","125 MHz, CDCl3","151.0, 147.0, 146.8, 145.7, 133.7, 132.2, 130....",,,,,,,,,
2,"{""IUPAC"":""N-(3-chlorophenyl)-N-((5-(hydrazinec...","Compound 3 (0.35 g, 1.00 mmol, 1.0 equiv.) was...",0.719605,N-(3-chlorophenyl)-N-((5-(hydrazinecarbonyl)py...,"1H NMR (401 MHz, CD3OD): δ 8.92 (dd, J = 0.8, ...","401 MHz, CD3OD","8.92 (dd, J = 0.8, 2.2 Hz, 1H), 8.22 (dd, J = ...","13C NMR (101 MHz, CD3OD): δ 166.6, 162.3, 149....","101 MHz, CD3OD","166.6, 162.3, 149.3, 142.4, 137.8, 135.7, 131....",,,,,,,,,
3,"{""IUPAC"":""N-(3-chlorophenyl)-N-((5-(5-(difluor...","Compound 5 (24 mg, 0.05 mmol, 1.0 equiv.) and ...",0.590637,N-(3-chlorophenyl)-N-((5-(5-(difluoromethyl)-1...,"1H NMR (401 MHz, CDCl3): δ 9.24 (d, J = 1.6 Hz...","401 MHz, CDCl3","9.24 (d, J = 1.6 Hz, 1H), 8.37 (dd, J = 2.2, 8...","13C NMR (101 MHz, CDCl3): δ 164.1, 160.9, 158....","101 MHz, CDCl3","164.1, 160.9, 158.7(t, JC,F = 29.0 Hz), 148.0,...",,,,,,,,,
4,"{""IUPAC"":""Pulchin A"",""1H NMR text"":""The 1H NMR...",Pulchin A (1) was obtained in the form of whit...,0.133641,pulchin A,The 1H NMR spectral data exhibited four tertia...,,"0.91, 0.94, 1.05, 1.18, 3.48, 3.74, 4.71, 7.54...",The 20 carbon resonances in the spectrum were ...,,"14.4, 14.5, 17.1, 28.5, 61.7, 81.9, 210.6",,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017,"{""IUPAC"":""bis-S-thio­carbamate ester"",""1H NMR ...","3,3′-Diisopropyl-[1,1′-bi­naphthalene]-2,2′-di...",0.389673,"3,3′-Diisopropyl-[1,1′-bi­naphthalene]-2,2′-di...",1H NMR (200 MHz) δ (ppm): 1.34 and 1.46 [2 × (...,200 MHz,"1.34 and 1.46 [2 × (6H, d, CHMe)], 2.51 (12H, ...",13C NMR (50 MHz) δ (ppm): 23.4 and 24.8 (2 × C...,50 MHz,"23.4 and 24.8, 31.5, 36.7, 124.1, 125.1, 126.7...",,,,,,,,,
1018,"{""IUPAC"":""(2,6-Diiso­propyldi­naphtho­[2,1-d:1...","(2,6-Diiso­propyldi­naphtho­[2,1-d:1′,2′-f][1,...",0.466833,"(2,6-Diiso­propyldi­naphtho­[2,1-d:1′,2′-f][1,...","1H NMR (200 MHz) δ (ppm): 1.26, 1.34, 1.35 and...",200 MHz,"1.26, 1.34, 1.35 and 1.61 [4 × (3H, d, CHMe)],...","13C NMR (50 MHz) δ (ppm): 22.9, 24.0, 24.2 and...",50 MHz,"22.9, 24.0, 24.2 and 26.5 (4 × CHMe), 31.3 and...",,,,,,,,,
1019,"{""IUPAC"":""Isopropyl 2,6,6-trimethyl-4-(3-fluor...","Isopropyl 2,6,6-trimethyl-4-(3-fluoro-5-tri­fl...",0.681657,"Isopropyl 2,6,6-trimethyl-4-(3-fluoro-5-tri­fl...","1H NMR (400 MHz, DMSO-d 6): δ 0.82 (3H, s, 6-C...","400 MHz, DMSO-d 6","0.82 (3H, s, 6-CH3), 0.91 [3H, d, J = 6.4 Hz, ...","13C NMR (100 MHz, DMSO-d 6): δ 18.2 (2-CH3), 2...","100 MHz, DMSO-d 6","18.2, 21.2, 21.7, 22.9, 24.2, 24.7, 33.1, 34.0...",,,,,,,,,
1020,"{""IUPAC"":""2"",""1H NMR text"":""1H NMR (400 MHz, C...",The synthesis of (2) is shown schematically in...,0.350611,(2),"1H NMR (400 MHz, CDCl3): δ = 3.60 (s, 3H), 3.5...","400 MHz, CDCl3","3.60 (s, 3H), 3.59 (s, 3H), 3.21–3.12 (m, 1H),...","13C NMR (101 MHz, CDCl3): δ = 174.5, 148.0, 14...","101 MHz, CDCl3","174.5, 148.0, 147.8, 141.5, 132.5, 125.2, 119....",,,,,,,,,
