In [None]:
# Step 1: Install dependencies
!pip install transformers datasets scikit-learn evaluate -q
!pip install rouge_score


# Step 2: Import libraries
import os
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import numpy as np
import evaluate

# 🚫 Disable Weights & Biases
#os.environ["WANDB_DISABLED"] = "true"

# Step 3 + 4: Load every split and normalise its column names.
# The reaction column ships as "reactants>reagents>production" and the class
# column as "class"; they are renamed to "full" and "label" respectively.
_column_renames = {"reactants>reagents>production": "full", "class": "label"}
train_df, val_df, test_df = (
    pd.read_csv(csv_name).rename(columns=_column_renames)
    for csv_name in ("rxn_train.csv", "rxn_val.csv", "rxn_test.csv")
)

# Split 'reactants>reagents>product' into input/output
def split_input_output(df):
    """Add "input_text" and "target_text" columns derived from df["full"].

    Each "full" string is split on ">". The first two pieces are re-joined
    with ">" to form the model input; the third piece becomes the target.

    Args:
        df: DataFrame with a string column named "full".

    Returns:
        The same DataFrame object, modified in place.
    """
    parts = df["full"].str.split(">", expand=True)
    reactants, reagents = parts[0], parts[1]
    df["input_text"] = reactants + ">" + reagents
    df["target_text"] = parts[2]
    return df

# Derive the "input_text" / "target_text" columns for every split.
train_df, val_df, test_df = map(split_input_output, (train_df, val_df, test_df))

# Step 5: Wrap the two text columns of each frame in a HuggingFace Dataset.
_text_columns = ["input_text", "target_text"]
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame[_text_columns])
    for frame in (train_df, val_df, test_df)
)

# Step 6: Tokenizer and model both come from the same T5-small checkpoint.
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Step 7: Token budgets used for tokenization and generation.
# NOTE(review): the sample predictions recorded below stop well short of the
# expected products, which suggests 64 target tokens truncates them -- confirm
# against the tokenized target lengths.
max_input_length, max_target_length = 128, 64

def preprocess_function(examples):
    """Tokenize a batch of reactions into encoder inputs and decoder labels.

    Args:
        examples: Batch mapping with "input_text" and "target_text" lists.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry in
        which every padding position is -100.
    """
    inputs = tokenizer(examples["input_text"], max_length=max_input_length, padding="max_length", truncation=True)
    targets = tokenizer(examples["target_text"], max_length=max_target_length, padding="max_length", truncation=True)

    # Targets are padded to a fixed length, so the pad id must be swapped for
    # -100 (the index the loss ignores); otherwise the model is trained to
    # predict padding.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in targets["input_ids"]
    ]
    return inputs

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Step 8: Define training arguments
# The Seq2Seq argument/trainer classes make `predict` call `generate()` and
# return token ids. NOTE(review): a plain Trainer returns vocabulary logits for
# every test token, the likely cause of the CUDA OOM recorded in this cell's
# output.
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-chem-product-generator",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    predict_with_generate=True,
    # Without this the model's default generation length would cap outputs.
    generation_max_length=max_target_length,
    logging_dir="./logs",
    report_to="none",  # disables Weights & Biases cleanly
)


# Step 9: Define metric
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE between decoded predictions and decoded references.

    Args:
        eval_pred: `(predictions, labels)` pair of token-id arrays. Either may
            contain -100 (ignored label positions / batch padding).

    Returns:
        The dict returned by `rouge.compute`.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 is not a valid token id; map it to the pad id before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return rouge.compute(predictions=decoded_preds, references=decoded_labels)

# Step 10: Initialize Trainer
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated on the trainer (see the FutureWarning in this
    # cell's output); `processing_class` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Step 11: Train the model
trainer.train()

# Step 12: Generate predictions on test set
test_results = trainer.predict(test_dataset)
# Batches of different generated length are padded with -100 when gathered.
generated_ids = np.where(
    test_results.predictions != -100, test_results.predictions, tokenizer.pad_token_id
)
decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

# Display sample predictions (positional access; at most five rows)
for i in range(min(5, len(decoded_preds))):
    print(f"Input: {test_df['input_text'].iloc[i]}")
    print(f"Expected Output: {test_df['target_text'].iloc[i]}")
    print(f"Predicted Output: {decoded_preds[i]}")
    print("-" * 60)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/40008 [00:00<?, ? examples/s]

Map:   0%|          | 0/5001 [00:00<?, ? examples/s]

Map:   0%|          | 0/5007 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


Step,Training Loss
500,0.3794
1000,0.2271
1500,0.1912
2000,0.1666
2500,0.1414
3000,0.1293
3500,0.1178
4000,0.1038
4500,0.0962
5000,0.0903


Step,Training Loss
500,0.3794
1000,0.2271
1500,0.1912
2000,0.1666
2500,0.1414
3000,0.1293
3500,0.1178
4000,0.1038
4500,0.0962
5000,0.0903


OutOfMemoryError: CUDA out of memory. Tried to allocate 6.31 GiB. GPU 0 has a total capacity of 14.74 GiB of which 6.29 GiB is free. Process 14983 has 8.45 GiB memory in use. Of the allocated memory 7.21 GiB is allocated by PyTorch, and 1.11 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import torch



In [None]:
import torch
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# `generate` does not change the module's mode; make sure dropout is off so
# decoding is not perturbed by a model left in training mode.
model.eval()

batch_size = 8  # reduce if still OOM
decoded_preds = []

test_inputs_list = test_df["input_text"].tolist()

for i in tqdm(range(0, len(test_inputs_list), batch_size)):
    batch_inputs = test_inputs_list[i:i+batch_size]

    # Tokenize batch
    inputs = tokenizer(
        batch_inputs,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_input_length
    )

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Beam-search decoding without building an autograd graph.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_target_length,
            num_beams=4,
            early_stopping=True
        )

    preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_preds.extend(preds)

# Display some results (positional access; at most five rows)
for i in range(min(5, len(decoded_preds))):
    print(f"Input: {test_df['input_text'].iloc[i]}")
    print(f"Expected Output: {test_df['target_text'].iloc[i]}")
    print(f"Predicted Output: {decoded_preds[i]}")
    print("-" * 60)

# Save all predictions
test_df["predicted_output"] = decoded_preds
test_df.to_csv("t5_predictions.csv", index=False)


100%|██████████| 626/626 [09:38<00:00,  1.08it/s]

Input: CC(C)(C)OC(=O)O[C:6]([O:5][C:2]([CH3:1])([CH3:3])[CH3:4])=[O:7].[CH3:8][C:9](=[O:10])[c:11]1[cH:12][cH:13][c:14]2[nH:15][cH:16][cH:17][c:18]2[cH:19]1>
Expected Output: [CH3:1][C:2]([CH3:3])([CH3:4])[O:5][C:6](=[O:7])[n:15]1[c:14]2[cH:13][cH:12][c:11]([C:9]([CH3:8])=[O:10])[cH:19][c:18]2[cH:17][cH:16]1
Predicted Output: [CH3:1][C:2]([CH3:3])([CH3:4])[O:5][C:6](=[O:7])[n:15]1[c:14]2[cH:13]
------------------------------------------------------------
Input: CC(C)(C)OC(=O)O[C:6]([O:5][C:2]([CH3:1])([CH3:3])[CH3:4])=[O:7].[CH3:8][c:9]1[cH:10][cH:11][c:12]([S:13](=[O:14])(=[O:15])[O:16][C@@H:17]2[CH2:18][NH:19][C@H:20]3[C@@H:21]2[O:22][CH2:23][C@@H:24]3[OH:25])[cH:26][cH:27]1>
Expected Output: [CH3:1][C:2]([CH3:3])([CH3:4])[O:5][C:6](=[O:7])[N:19]1[CH2:18][C@@H:17]([O:16][S:13]([c:12]2[cH:11][cH:10][c:9]([CH3:8])[cH:27][cH:26]2)(=[O:14])=[O:15])[C@@H:21]2[C@H:20]1[C@@H:24]([OH:25])[CH2:23][O:22]2
Predicted Output: [CH3:1][C:2]([CH3:3])([CH3:4])[O:5][C:6](=[O:7])[N:22]1[CH2:21][CH2:20]




In [None]:
# Step 1: Install dependencies
!pip install transformers datasets scikit-learn evaluate -q
!pip install rouge_score

# Step 2: Import libraries
import os
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import numpy as np
import evaluate

# Step 3 + 4: Read the three CSV splits, renaming the raw reaction column to
# "full" and the "class" column to "label" as each file is loaded.
_column_renames = {"reactants>reagents>production": "full", "class": "label"}
train_df, val_df, test_df = [
    pd.read_csv(csv_name).rename(columns=_column_renames)
    for csv_name in ("rxn_train.csv", "rxn_val.csv", "rxn_test.csv")
]

def split_input_output(df):
    """Derive model input and target text from the "full" reaction column.

    "full" is split on ">"; pieces 0 and 1 are joined with ">" into
    "input_text" and piece 2 is stored as "target_text". The frame is updated
    in place and returned.
    """
    pieces = df["full"].str.split(">", expand=True)
    model_input = pieces[0] + ">" + pieces[1]
    df["input_text"] = model_input
    df["target_text"] = pieces[2]
    return df

# Attach the derived text columns to each split.
train_df, val_df, test_df = map(split_input_output, (train_df, val_df, test_df))

# Step 5: Only the two text columns are carried into the HuggingFace Datasets.
_text_columns = ["input_text", "target_text"]
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame[_text_columns])
    for frame in (train_df, val_df, test_df)
)

# Step 6: Tokenizer and model share one checkpoint name.
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Step 7: Token budgets for encoder inputs and decoder targets.
max_input_length, max_target_length = 128, 64

def preprocess_function(examples):
    """Tokenize a batch of reactions into encoder inputs and decoder labels.

    Args:
        examples: Batch mapping with "input_text" and "target_text" lists.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry in
        which every padding position is -100.
    """
    inputs = tokenizer(examples["input_text"], max_length=max_input_length, padding="max_length", truncation=True)
    targets = tokenizer(examples["target_text"], max_length=max_target_length, padding="max_length", truncation=True)
    # Fixed-length padding would otherwise be scored by the loss; -100 marks
    # positions the loss must ignore.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in targets["input_ids"]
    ]
    return inputs

# Tokenize all three splits with the batched preprocessing function.
train_dataset, val_dataset, test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Step 8: Training hyperparameters, collected in one mapping.
_training_options = {
    "output_dir": "./t5-chem-product-generator",
    "learning_rate": 5e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "weight_decay": 0.01,
    "save_total_limit": 2,
    "num_train_epochs": 20,
    "logging_dir": "./logs",
    "report_to": "none",
}
training_args = TrainingArguments(**_training_options)

# Step 9: ROUGE scorer used by compute_metrics.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE between decoded predictions and decoded references.

    Args:
        eval_pred: `(predictions, labels)` pair. Predictions may be token ids,
            raw logits of shape (batch, seq, vocab), or a tuple whose first
            element is one of those. Labels may contain -100.

    Returns:
        The dict returned by `rouge.compute`.
    """
    predictions, labels = eval_pred
    # A plain Trainer passes model outputs (logits, possibly in a tuple)
    # rather than token ids; reduce them to ids before decoding.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)
    # -100 is not a valid token id; map it to the pad id before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return rouge.compute(predictions=decoded_preds, references=decoded_labels)

# Step 10: Initialize Trainer
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated on Trainer (see the FutureWarning in this
    # cell's output); `processing_class` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Step 11: Train the model
trainer.train()

# Step 12: Manual prediction loop with beam search settings
import torch
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# This runs straight after training and `generate` does not change the
# module's mode, so switch dropout off explicitly before decoding.
model.eval()

batch_size = 8
decoded_preds = []

test_inputs_list = test_df["input_text"].tolist()

for i in tqdm(range(0, len(test_inputs_list), batch_size)):
    batch_inputs = test_inputs_list[i:i+batch_size]

    # Tokenize batch
    inputs = tokenizer(
        batch_inputs,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_input_length
    )

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Beam-search decoding without building an autograd graph.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_target_length,
            num_beams=8,          # more beams for better search
            length_penalty=1.0,
            early_stopping=True
        )

    preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_preds.extend(preds)

# Step 13: Display sample predictions (positional access; at most five rows)
for i in range(min(5, len(decoded_preds))):
    print(f"Input: {test_df['input_text'].iloc[i]}")
    print(f"Expected Output: {test_df['target_text'].iloc[i]}")
    print(f"Predicted Output: {decoded_preds[i]}")
    print("-" * 60)

# Step 14: Save predictions
test_df["predicted_output"] = decoded_preds
test_df.to_csv("t5_predictions.csv", index=False)





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/40008 [00:00<?, ? examples/s]

Map:   0%|          | 0/5001 [00:00<?, ? examples/s]

Map:   0%|          | 0/5007 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


Step,Training Loss
500,0.5687
1000,0.3636
1500,0.3062
2000,0.2765
2500,0.243
3000,0.2302
3500,0.219
4000,0.2009
4500,0.1924
5000,0.1852


 59%|█████▉    | 372/626 [05:45<03:55,  1.08it/s]

In [None]:
# Step 1: Install dependencies
!pip install transformers datasets scikit-learn evaluate python-Levenshtein -q
!pip install rouge_score

# Step 2: Import libraries
import os
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import numpy as np
import evaluate
import Levenshtein
import torch
from tqdm import tqdm

# Step 3 + 4: Load each split and rename its raw columns in a single pass
# ("reactants>reagents>production" -> "full", "class" -> "label").
_column_renames = {"reactants>reagents>production": "full", "class": "label"}
train_df, val_df, test_df = (
    pd.read_csv(csv_name).rename(columns=_column_renames)
    for csv_name in ("rxn_train.csv", "rxn_val.csv", "rxn_test.csv")
)

def split_input_output(df):
    """Populate "input_text" and "target_text" from the "full" column.

    The ">"-separated "full" string contributes its first two fields (joined
    again with ">") as the input and its third field as the target. Mutates
    and returns the given frame.
    """
    fields = df["full"].str.split(">", expand=True)
    first, second = fields[0], fields[1]
    df["input_text"] = first + ">" + second
    df["target_text"] = fields[2]
    return df

# Attach the derived text columns to each split.
train_df, val_df, test_df = map(split_input_output, (train_df, val_df, test_df))

# Step 5: Build HuggingFace Datasets from the two text columns.
_text_columns = ["input_text", "target_text"]
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame[_text_columns])
    for frame in (train_df, val_df, test_df)
)

# Step 6: Load the tokenizer and seq2seq model from the same checkpoint.
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Step 7: Token budgets for encoder inputs and decoder targets.
max_input_length, max_target_length = 128, 64

def preprocess_function(examples):
    """Tokenize a batch of reactions into encoder inputs and masked labels.

    Args:
        examples: Batch mapping with "input_text" and "target_text" lists.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry in
        which padding positions are -100 so the loss ignores them.
    """
    model_inputs = tokenizer(examples["input_text"], max_length=max_input_length, padding="max_length", truncation=True)
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context.
    labels = tokenizer(text_target=examples["target_text"], max_length=max_target_length, padding="max_length", truncation=True)

    # Replace pad token ids with -100 so they're ignored by the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Step 8: Define training arguments
# `predict_with_generate` belongs to Seq2SeqTrainingArguments and is only acted
# on by Seq2SeqTrainer, so both seq2seq classes are used here.
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-chem-product-generator",
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy`
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=20,
    predict_with_generate=True,
    # Without this the model's default generation length would cap outputs.
    generation_max_length=max_target_length,
    logging_dir="./logs",
    report_to="none"
)

# Step 9: Define metric with Levenshtein similarity
# Loaded once here instead of on every evaluation call.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Score generated sequences with ROUGE and mean Levenshtein ratio.

    Args:
        eval_pred: `(predictions, labels)` pair of token-id arrays. Either may
            contain -100 (masked label positions / batch padding).

    Returns:
        The ROUGE dict with an extra "levenshtein_similarity" entry holding
        the mean `Levenshtein.ratio` over all prediction/reference pairs.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 is not a valid token id; map it to the pad id before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute Levenshtein similarity for each prediction
    levenshtein_scores = [
        Levenshtein.ratio(pred.strip(), ref.strip()) for pred, ref in zip(decoded_preds, decoded_labels)
    ]
    avg_levenshtein = np.mean(levenshtein_scores)

    # Also return ROUGE for reference
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    rouge_result["levenshtein_similarity"] = avg_levenshtein
    return rouge_result

# Step 10: Initialize Trainer
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `processing_class` is the replacement for the deprecated `tokenizer=`.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Step 11: Train the model
trainer.train()

# Step 12: Generate predictions on test set
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# `generate` does not change the module's mode; make sure dropout is off.
model.eval()

batch_size = 8
decoded_preds = []
test_inputs_list = test_df["input_text"].tolist()

for i in tqdm(range(0, len(test_inputs_list), batch_size)):
    batch_inputs = test_inputs_list[i:i+batch_size]
    inputs = tokenizer(batch_inputs, return_tensors="pt", padding="max_length", truncation=True, max_length=max_input_length)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Beam-search decoding without building an autograd graph.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_target_length,
            num_beams=8,
            length_penalty=1.0,
            early_stopping=True
        )

    preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_preds.extend(preds)

# Step 13: Save and display results
test_df["predicted_output"] = decoded_preds
# Per-row similarity between each prediction and its reference product.
test_df["levenshtein_similarity"] = [
    Levenshtein.ratio(pred.strip(), ref.strip()) for pred, ref in zip(decoded_preds, test_df["target_text"])
]
test_df.to_csv("t5_predictions.csv", index=False)

# Positional access; at most five rows.
for i in range(min(5, len(decoded_preds))):
    print(f"Input: {test_df['input_text'].iloc[i]}")
    print(f"Expected Output: {test_df['target_text'].iloc[i]}")
    print(f"Predicted Output: {decoded_preds[i]}")
    print(f"Levenshtein Similarity: {test_df['levenshtein_similarity'].iloc[i]:.4f}")
    print("-" * 60)


In [None]:
# Step 1: Install dependencies
!pip install transformers datasets scikit-learn evaluate -q
!pip install rouge_score

# Step 2: Import libraries
import os
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import numpy as np
import evaluate

# Step 3 + 4: Read the CSV splits and apply the column renames on load
# ("reactants>reagents>production" -> "full", "class" -> "label").
_column_renames = {"reactants>reagents>production": "full", "class": "label"}
train_df, val_df, test_df = [
    pd.read_csv(csv_name).rename(columns=_column_renames)
    for csv_name in ("rxn_train.csv", "rxn_val.csv", "rxn_test.csv")
]

def split_input_output(df):
    """Add "input_text" and "target_text" columns derived from df["full"].

    Splits "full" on ">", joins the first two pieces with ">" as the input and
    stores the third piece as the target. Returns the same, mutated frame.
    """
    parts = df["full"].str.split(">", expand=True)
    left_side = parts[0] + ">" + parts[1]
    df["input_text"] = left_side
    df["target_text"] = parts[2]
    return df

# Attach the derived text columns to each split.
train_df, val_df, test_df = map(split_input_output, (train_df, val_df, test_df))

# Step 5: Build HuggingFace Datasets from the two text columns.
_text_columns = ["input_text", "target_text"]
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame[_text_columns])
    for frame in (train_df, val_df, test_df)
)

# Step 6: Load the tokenizer and seq2seq model from the same checkpoint.
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Step 7: Token budgets for encoder inputs and decoder targets.
max_input_length, max_target_length = 128, 256

def preprocess_function(examples):
    """Tokenize a batch of reactions into prefixed inputs and masked labels.

    Every input is prefixed with "predict product: " before tokenization.

    Args:
        examples: Batch mapping with "input_text" and "target_text" lists.

    Returns:
        The tokenizer output for the prefixed inputs, with an added "labels"
        entry in which padding positions are -100 so the loss ignores them.
    """
    inputs = ["predict product: " + ex for ex in examples["input_text"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context.
    labels = tokenizer(text_target=examples["target_text"], max_length=max_target_length, padding="max_length", truncation=True)
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token if token != pad_id else -100) for token in label]
        for label in labels["input_ids"]
    ]
    return model_inputs


# Tokenize all three splits with the batched preprocessing function.
train_dataset, val_dataset, test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Step 8: Training hyperparameters, collected in one mapping.
_training_options = {
    "output_dir": "./t5-chem-product-generator",
    "learning_rate": 5e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "weight_decay": 0.01,
    "save_total_limit": 2,
    "num_train_epochs": 1,
    "logging_dir": "./logs",
    "report_to": "none",
}
training_args = TrainingArguments(**_training_options)

# Step 9: ROUGE scorer used by compute_metrics.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE between decoded predictions and decoded references.

    Args:
        eval_pred: `(predictions, labels)` pair. Predictions may be token ids,
            raw logits of shape (batch, seq, vocab), or a tuple whose first
            element is one of those. Labels may contain -100.

    Returns:
        The dict returned by `rouge.compute`.
    """
    predictions, labels = eval_pred
    # A plain Trainer passes model outputs (logits, possibly in a tuple)
    # rather than token ids; reduce them to ids before decoding.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)
    # Labels are masked with -100 in preprocessing; -100 is not a token id.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return rouge.compute(predictions=decoded_preds, references=decoded_labels)

# Step 10: Initialize Trainer
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated on Trainer (see the FutureWarning in this
    # cell's output); `processing_class` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Step 11: Train the model
trainer.train()

# Step 12: Manual prediction loop with beam search settings
import torch
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# This runs straight after training and `generate` does not change the
# module's mode, so switch dropout off explicitly before decoding.
model.eval()

batch_size = 8
decoded_preds = []

# preprocess_function trains the model on inputs prefixed with
# "predict product: "; inference inputs must carry the same prefix.
test_inputs_list = ["predict product: " + text for text in test_df["input_text"].tolist()]

for i in tqdm(range(0, len(test_inputs_list), batch_size)):
    batch_inputs = test_inputs_list[i:i+batch_size]

    # Tokenize batch
    inputs = tokenizer(
        batch_inputs,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_input_length
    )

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Beam-search decoding without building an autograd graph.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_target_length,
            num_beams=12,          # more beams for better search
            length_penalty=1.0,
        )

    preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_preds.extend(preds)

# Step 13: Display sample predictions (positional access; at most five rows)
for i in range(min(5, len(decoded_preds))):
    print(f"Input: {test_df['input_text'].iloc[i]}")
    print(f"Expected Output: {test_df['target_text'].iloc[i]}")
    print(f"Predicted Output: {decoded_preds[i]}")
    print("-" * 60)

# Step 14: Save predictions
test_df["predicted_output"] = decoded_preds
test_df.to_csv("t5_predictions.csv", index=False)





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/40008 [00:00<?, ? examples/s]



Map:   0%|          | 0/5001 [00:00<?, ? examples/s]

Map:   0%|          | 0/5007 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


Step,Training Loss
500,0.7632
1000,0.473
1500,0.4091
2000,0.3711
2500,0.3406
3000,0.3229
3500,0.3151
4000,0.2991
4500,0.2966
5000,0.2991


  0%|          | 0/626 [00:00<?, ?it/s]


ValueError: The following `model_kwargs` are not used by the model: ['repeatation_penalty'] (note: typos in the generate arguments will also show up in this list)

In [None]:
# Step 12: Manual prediction loop with beam search settings
import torch
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# `generate` does not change the module's mode; make sure dropout is off.
model.eval()

batch_size = 8
decoded_preds = []

# NOTE(review): assumes `model` is the one trained by the preceding cell, whose
# preprocess_function prefixes every input with "predict product: " -- the
# same prefix is applied here so inference matches training.
test_inputs_list = ["predict product: " + text for text in test_df["input_text"].tolist()]

for i in tqdm(range(0, len(test_inputs_list), batch_size)):
    batch_inputs = test_inputs_list[i:i+batch_size]

    # Tokenize batch
    inputs = tokenizer(
        batch_inputs,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_input_length
    )

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Beam-search decoding without building an autograd graph.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_target_length,
            num_beams=12,          # more beams for better search
            length_penalty=1.0,
        )

    preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_preds.extend(preds)

# Step 13: Display sample predictions (positional access; at most five rows)
for i in range(min(5, len(decoded_preds))):
    print(f"Input: {test_df['input_text'].iloc[i]}")
    print(f"Expected Output: {test_df['target_text'].iloc[i]}")
    print(f"Predicted Output: {decoded_preds[i]}")
    print("-" * 60)

# Step 14: Save predictions
test_df["predicted_output"] = decoded_preds
test_df.to_csv("t5_predictions.csv", index=False)



 90%|█████████ | 565/626 [46:17<05:09,  5.07s/it]

In [None]:
# Step 1: Install dependencies
!pip install transformers datasets scikit-learn evaluate -q
!pip install rouge_score

# Step 2: Import libraries
import os
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import numpy as np
import evaluate

# Step 3 + 4: Load every split and normalise its column names
# ("reactants>reagents>production" -> "full", "class" -> "label").
_column_renames = {"reactants>reagents>production": "full", "class": "label"}
train_df, val_df, test_df = (
    pd.read_csv(csv_name).rename(columns=_column_renames)
    for csv_name in ("rxn_train.csv", "rxn_val.csv", "rxn_test.csv")
)

def split_input_output(df):
    """Derive model input and target text from the "full" reaction column.

    "full" is split on ">"; the first two pieces, joined with ">", become
    "input_text" and the third piece becomes "target_text". The frame is
    changed in place and returned.
    """
    segments = df["full"].str.split(">", expand=True)
    head, middle = segments[0], segments[1]
    df["input_text"] = head + ">" + middle
    df["target_text"] = segments[2]
    return df

# Attach the derived text columns to each split.
train_df, val_df, test_df = map(split_input_output, (train_df, val_df, test_df))

# Step 5: Build HuggingFace Datasets from the two text columns.
_text_columns = ["input_text", "target_text"]
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame[_text_columns])
    for frame in (train_df, val_df, test_df)
)

# Step 6: Load the tokenizer and seq2seq model from the same checkpoint.
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Step 7: Token budgets for encoder inputs and decoder targets.
max_input_length, max_target_length = 256, 512

def preprocess_function(examples):
    """Tokenize a batch of reactions into prefixed inputs and masked labels.

    Every input is prefixed with "predict product: " before tokenization.

    Args:
        examples: Batch mapping with "input_text" and "target_text" lists.

    Returns:
        The tokenizer output for the prefixed inputs, with an added "labels"
        entry in which padding positions are -100 so the loss ignores them.
    """
    inputs = ["predict product: " + ex for ex in examples["input_text"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context.
    labels = tokenizer(text_target=examples["target_text"], max_length=max_target_length, padding="max_length", truncation=True)
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token if token != pad_id else -100) for token in label]
        for label in labels["input_ids"]
    ]
    return model_inputs


# Tokenize all three splits with the batched preprocessing function.
train_dataset, val_dataset, test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Step 8: Training hyperparameters, collected in one mapping.
_training_options = {
    "output_dir": "./t5-chem-product-generator",
    "learning_rate": 5e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "weight_decay": 0.01,
    "save_total_limit": 2,
    "num_train_epochs": 10,
    "logging_dir": "./logs",
    "report_to": "none",
}
training_args = TrainingArguments(**_training_options)

# Step 9: ROUGE scorer used by compute_metrics.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE between decoded predictions and decoded references.

    Args:
        eval_pred: `(predictions, labels)` pair. Predictions may be token ids,
            raw logits of shape (batch, seq, vocab), or a tuple whose first
            element is one of those. Labels may contain -100.

    Returns:
        The dict returned by `rouge.compute`.
    """
    predictions, labels = eval_pred
    # A plain Trainer passes model outputs (logits, possibly in a tuple)
    # rather than token ids; reduce them to ids before decoding.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)
    # Labels are masked with -100 in preprocessing; -100 is not a token id.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return rouge.compute(predictions=decoded_preds, references=decoded_labels)

# Step 10: Initialize Trainer
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated on Trainer (the same call emits a
    # FutureWarning in earlier cells); `processing_class` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Step 11: Train the model
trainer.train()

# Step 12: Manual prediction loop with beam search settings
import torch
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# Training leaves the model in train mode and generate() does not change it;
# switch to eval mode so dropout is disabled while decoding.
model.eval()

batch_size = 8
decoded_preds = []

# Prepend the same task prefix that preprocess_function used during training;
# without it the model sees inputs formatted differently from what it learned on.
test_inputs_list = ["predict product: " + text for text in test_df["input_text"].tolist()]

for i in tqdm(range(0, len(test_inputs_list), batch_size)):
    batch_inputs = test_inputs_list[i:i+batch_size]

    # Tokenize batch; pad only to the longest sequence in this batch instead of
    # always padding to max_input_length.
    inputs = tokenizer(
        batch_inputs,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_input_length
    )

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_target_length,
            num_beams=12,          # more beams for better search
            length_penalty=1.0,
            # early_stopping is left at its default
        )

    preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_preds.extend(preds)

# Step 13: Display sample predictions
# Prints the first five test rows next to their predictions. Assumes at least
# five rows exist and that test_df has the default 0..n-1 index, so label `i`
# lines up with position `i` in decoded_preds — presumably true for a frame
# straight from read_csv; confirm if the frame was filtered or re-indexed.
for i in range(5):
    print(f"Input: {test_df['input_text'][i]}")
    print(f"Expected Output: {test_df['target_text'][i]}")
    print(f"Predicted Output: {decoded_preds[i]}")
    print("-" * 60)

# Step 14: Save predictions
# decoded_preds must hold exactly one entry per row of test_df.
test_df["predicted_output"] = decoded_preds
test_df.to_csv("t5_predictions.csv", index=False)



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=b9d4b35d974c8d48e7f2b93002fb6cee6a2d8a43714e8b68d3d2e57d7faac630
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/40008 [00:00<?, ? examples/s]



Map:   0%|          | 0/5001 [00:00<?, ? examples/s]

Map:   0%|          | 0/5007 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


Step,Training Loss
500,0.5547
1000,0.2692
1500,0.2153
2000,0.1842
2500,0.1571
3000,0.1418


In [None]:
# Step 1: Install dependencies
!pip install transformers datasets scikit-learn evaluate -q
!pip install rouge_score

# Step 2: Import libraries
import os
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import numpy as np
import evaluate

# Step 3: Load datasets
# The three CSV files are read from the current working directory.
train_df = pd.read_csv("rxn_train.csv")
val_df = pd.read_csv("rxn_val.csv")
test_df = pd.read_csv("rxn_test.csv")

# Step 4: Preprocess: separate input/output
# Give the reaction column the short name "full" that split_input_output reads.
# NOTE(review): rename() skips keys that are not present by default, so if the
# CSV header is not exactly "reactants>reagents>production" the "full" column
# will be missing later — confirm the header spelling.
train_df = train_df.rename(columns={"reactants>reagents>production": "full", "class": "label"})
val_df = val_df.rename(columns={"reactants>reagents>production": "full", "class": "label"})
test_df = test_df.rename(columns={"reactants>reagents>production": "full", "class": "label"})

def split_input_output(df):
    """Add model input and target columns derived from the "full" reaction string.

    The string is split on ">" into reactants, reagents and product. The first
    two parts are rejoined as "input_text" and the third becomes "target_text".

    Args:
        df: DataFrame with a string column named "full".

    Returns:
        The same DataFrame object, modified in place with the two new columns.
    """
    reaction_parts = df["full"].str.split(">", expand=True)
    reactant_side = reaction_parts[0] + ">" + reaction_parts[1]  # reactants>reagents
    df["input_text"] = reactant_side
    df["target_text"] = reaction_parts[2]  # product
    return df

# Add the "input_text" / "target_text" columns to each split.
train_df = split_input_output(train_df)
val_df = split_input_output(val_df)
test_df = split_input_output(test_df)

# Step 5: Convert to HuggingFace Datasets
# Only the two text columns are carried over into the Dataset objects.
train_dataset = Dataset.from_pandas(train_df[["input_text", "target_text"]])
val_dataset = Dataset.from_pandas(val_df[["input_text", "target_text"]])
test_dataset = Dataset.from_pandas(test_df[["input_text", "target_text"]])

# Step 6: Load tokenizer and model
# Tokenizer and weights both come from the same pretrained checkpoint name.
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Step 7: Tokenize data
# Truncation limits (in tokens) for the encoder input and the decoder target.
max_input_length = 256
max_target_length = 512

def preprocess_function(examples):
    """Tokenize one batch of reactions for seq2seq training.

    Args:
        examples: Batch dict as passed by `Dataset.map(..., batched=True)`;
            must contain "input_text" and "target_text" lists of strings.

    Returns:
        The tokenizer output for the prefixed inputs, with an added "labels"
        entry holding the target token ids.
    """
    # Task prefix; inference must prepend the exact same string.
    inputs = ["predict product: " + ex for ex in examples["input_text"]]

    # No padding here: the DataCollatorForSeq2Seq given to the Trainer pads each
    # batch to its longest member (and pads labels with -100), so padding every
    # example to the maximum length only wastes memory and compute.
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["target_text"], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Tokenize every split in batches; each name is rebound to the mapped dataset,
# so re-running this cell without rebuilding the datasets maps them a second time.
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Step 8: Define TrainingArguments
# NOTE(review): no evaluation schedule is passed here, so whether the validation
# set and compute_metrics are used during training depends on the library
# defaults — confirm.
training_args = TrainingArguments(
    output_dir="./t5-chem-product-generator",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    logging_dir="./logs",
    report_to="none"
)

# Step 9: Define metric
# ROUGE scorer used by compute_metrics below.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Decode predictions and references and score them with ROUGE.

    Args:
        eval_pred: The `(predictions, labels)` pair supplied by the Trainer.

    Returns:
        The dict returned by `rouge.compute`.
    """
    predictions, labels = eval_pred

    # Some models return a tuple of outputs; the first entry holds the logits.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    # A plain Trainer passes 3-D logits, not token ids. Take the most likely
    # token per position (teacher-forced argmax, not beam-search output).
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)
    labels = np.asarray(labels)

    # -100 marks ignored/padded positions and is not a valid token id, so it
    # must be replaced before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions == -100, pad_id, predictions)
    labels = np.where(labels == -100, pad_id, labels)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return rouge.compute(predictions=decoded_preds, references=decoded_labels)

# Step 10: Initialize Trainer
# Collator that assembles tokenized examples into batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# NOTE(review): the cell output contains a bare `trainer = Trainer(` line, which
# looks like a library warning pointing at this call — presumably about one of
# the keyword arguments below; confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Step 11: Train the model
# Fine-tunes `model` in place; the later cells save and reload it.
trainer.train()

# Saving, zipping and reloading the model, and the batched beam-search
# prediction loop, live in their own cells further down in this notebook.
# The copy that used to sit here was disabled by wrapping it in a triple-quoted
# string; as the cell's last expression that string was echoed back as cell
# output, and its prediction loop omitted the "predict product: " prefix.



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=fdd8d6f083cc4178b9a856800586314ef0a1b00b54ae066b563447106cd0ca55
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/40008 [00:00<?, ? examples/s]



Map:   0%|          | 0/5001 [00:00<?, ? examples/s]

Map:   0%|          | 0/5007 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


Step,Training Loss
500,0.554
1000,0.2713
1500,0.2175
2000,0.1864
2500,0.1596
3000,0.1444
3500,0.1341
4000,0.1171
4500,0.1142
5000,0.1107


Step,Training Loss
500,0.554
1000,0.2713
1500,0.2175
2000,0.1864
2500,0.1596
3000,0.1444
3500,0.1341
4000,0.1171
4500,0.1142
5000,0.1107


'trainer.save_model("./t5-chem-product-generator")\ntokenizer.save_pretrained("./t5-chem-product-generator")\n\nfrom google.colab import files\nimport shutil\n\n# Zip the model folder\nshutil.make_archive("t5-chem-product-generator", \'zip\', "./t5-chem-product-generator")\n\n# Download the zip file\nfiles.download("t5-chem-product-generator.zip")\n\nfrom transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n\n# Load model from local folder (unzipped)\nmodel = AutoModelForSeq2SeqLM.from_pretrained("./t5-chem-product-generator")\ntokenizer = AutoTokenizer.from_pretrained("./t5-chem-product-generator")\n\n# Move to device\nimport torch\ndevice = "cuda" if torch.cuda.is_available() else "cpu"\nmodel.to(device)\n\n\n# Step 12: Manual prediction loop with beam search settings\nimport torch\nfrom tqdm import tqdm\n\ndevice = "cuda" if torch.cuda.is_available() else "cpu"\nmodel.to(device)\n\nbatch_size = 8\ndecoded_preds = []\n\ntest_inputs_list = test_df["input_text"].tolist()\n\nfor i 

In [None]:
# Save model and tokenizer
# Both are written into the same folder so it can be reloaded with from_pretrained.
# NOTE(review): this is also the Trainer's output_dir, so any checkpoint
# sub-folders written during training would presumably end up in the zip too — confirm.
trainer.save_model("./t5-chem-product-generator")
tokenizer.save_pretrained("./t5-chem-product-generator")

# Colab-only helper: this import fails outside Google Colab.
from google.colab import files
import shutil

# Zip the model folder
# Creates "t5-chem-product-generator.zip" in the working directory, with the
# folder's contents at the archive root.
shutil.make_archive("t5-chem-product-generator", 'zip', "./t5-chem-product-generator")

# Download the zip file
files.download("t5-chem-product-generator.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load model from local folder (unzipped)
model = AutoModelForSeq2SeqLM.from_pretrained("./t5-chem-product-generator")
tokenizer = AutoTokenizer.from_pretrained("./t5-chem-product-generator")

# Move to device
device = "cuda" if torch.cuda.is_available() else "cpu"
# The trailing semicolon stops the notebook from printing the full module repr
# that `.to()` returns as the cell's last expression.
model.to(device);


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

KeyError: 'input_text'

In [None]:
import zipfile

# Location of the zipped model and the folder its contents are unpacked into.
zip_file_path = "/content/t5-chem-product-generator.zip"
extract_path = "/content/t5-chem-product-generator"

# Unpack every member of the archive; the context manager closes the file afterwards.
with zipfile.ZipFile(zip_file_path) as archive:
    archive.extractall(path=extract_path)


In [None]:
# ==========================
# Prediction Script
# ==========================
# Reloads the fine-tuned model, generates a product string for every test
# reaction with beam search, and writes the results to t5_predictions.csv.

import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm

# Load fine-tuned model & tokenizer
model_path = "./t5-chem-product-generator"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()  # make sure dropout is off while decoding

# Load test data
test_df = pd.read_csv("rxn_test.csv")

# Ensure consistent column naming
test_df = test_df.rename(columns={"reactants>reagents>production": "full"})
splits = test_df["full"].str.split(">", expand=True)
test_df["input_text"] = splits[0] + ">" + splits[1]   # reactants>reagents
test_df["target_text"] = splits[2]                    # product

# Prediction parameters
batch_size = 8
max_input_length = 256
max_target_length = 512

# Build the prefixed prompts once, instead of converting the whole column to a
# list again on every loop iteration. The prefix must match the one used in training.
prompts = ["predict product: " + text for text in test_df["input_text"].tolist()]

decoded_preds = []

# Loop through test set in batches
for i in tqdm(range(0, len(prompts), batch_size)):
    batch_prompts = prompts[i:i+batch_size]

    # Tokenize; pad only to the longest prompt in this batch rather than always
    # to max_input_length, so the encoder does not process needless padding.
    inputs = tokenizer(
        batch_prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_input_length
    )

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Generate predictions
    # NOTE(review): the recorded run of this cell was interrupted at roughly
    # 365 s per batch; 12-beam search up to 512 tokens is presumably the main
    # cost, so check that a GPU runtime is active before a full run.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_target_length,
            num_beams=12,         # beam search
            length_penalty=1.0
        )

    preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_preds.extend(preds)

# Add predictions to DataFrame
test_df["predicted_output"] = decoded_preds

# Show sample results (up to five rows; positional access so it does not depend
# on the frame's index labels)
for i in range(min(5, len(test_df))):
    print(f"Input: {test_df['input_text'].iloc[i]}")
    print(f"Expected: {test_df['target_text'].iloc[i]}")
    print(f"Predicted: {test_df['predicted_output'].iloc[i]}")
    print("-" * 60)

# Save predictions
test_df.to_csv("t5_predictions.csv", index=False)
print("✅ Predictions saved to t5_predictions.csv")


  1%|          | 7/626 [42:33<62:42:59, 364.75s/it]


KeyboardInterrupt: 