In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
import pandas as pd
import wandb

In [2]:
wandb.init(project="my-t5-project")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [3]:
# Load data from CSV
data = pd.read_csv('combined_dataset.csv')
data.head()
print(len(data))

1665820


In [4]:
df = data[data['Label'] == 1.0].copy()
df['Structure'] = df['Structure'].str.replace('\n', ',', regex=False)

In [5]:
df.head()

Unnamed: 0,Sequence,Structure,Energy,Label
0,[0 0 0 0],"[[0 1 0 1], [0 0 1 0]]",-0.0,1.0
1,[0 0 0 0],"[[0 1 0 1], [0 0 1 0]]",-0.0,1.0
2,[0 0 0 0],"[[ 0 1 0 -1], [ 0 0 1 0]]",-0.0,1.0
3,[0 0 0 0],"[[ 0 1 0 -1], [ 0 0 1 0]]",-0.0,1.0
4,[0 0 0 0],"[[0 1 0 0], [0 0 1 1]]",-0.0,1.0


In [6]:
# Function to process 'Sequence' column
def process_sequence(seq):
    return ' '.join(seq.strip('[]').split())

# Function to process 'Structure' column
def process_structure(struct):
    struct = struct.replace('[', '').replace(']', '')
    return ' '.join(struct.split())

# Apply the functions to process the columns
df['Sequence'] = df['Sequence'].apply(process_sequence)
df['Structure'] = df['Structure'].apply(process_structure)

In [7]:
df.head()

Unnamed: 0,Sequence,Structure,Energy,Label
0,0 0 0 0,"0 1 0 1, 0 0 1 0",-0.0,1.0
1,0 0 0 0,"0 1 0 1, 0 0 1 0",-0.0,1.0
2,0 0 0 0,"0 1 0 -1, 0 0 1 0",-0.0,1.0
3,0 0 0 0,"0 1 0 -1, 0 0 1 0",-0.0,1.0
4,0 0 0 0,"0 1 0 0, 0 0 1 1",-0.0,1.0


In [8]:
# Split data into training and evaluation sets (80% train, 20% eval)
train_data, eval_data = train_test_split(df, test_size=0.2, random_state=42)

In [9]:
# Load pretrained T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = model.to('cuda')
device = next(model.parameters()).device
print(device)  # This will print cuda:0 if the model is on a GPU, or cpu if the model is on the CPU

Downloading config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/231M [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/2.27k [00:00<?, ?B/s]

cuda:0


In [10]:
# Tokenize your data
train_input_encodings = tokenizer(train_data['Sequence'].tolist(), truncation=True, padding=True)
train_config_encodings = tokenizer(train_data['Structure'].tolist(), truncation=True, padding=True)

eval_input_encodings = tokenizer(eval_data['Sequence'].tolist(), truncation=True, padding=True)
eval_config_encodings = tokenizer(eval_data['Structure'].tolist(), truncation=True, padding=True)

In [11]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, input_encodings, config_encodings):
        self.input_encodings = input_encodings
        self.config_encodings = config_encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx], device='cpu') for key, val in self.input_encodings.items()}
        item['labels'] = torch.tensor(self.config_encodings['input_ids'][idx], device='cpu')
        item['decoder_attention_mask'] = torch.tensor(self.config_encodings['attention_mask'][idx], device='cpu')
        return item

    def __len__(self):
        return len(self.input_encodings['input_ids'])

In [12]:
# Create datasets
train_dataset = MyDataset(train_input_encodings, train_config_encodings)
eval_dataset = MyDataset(eval_input_encodings, eval_config_encodings)
first_item = train_dataset[0]
print(first_item['input_ids'].device)

cpu


In [13]:
training_args = TrainingArguments(
    output_dir='./output',  
    per_device_train_batch_size=512,
    per_device_eval_batch_size=512,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    do_train=True,
    do_eval=True,  
    evaluation_strategy="steps",
    report_to="wandb",  # Enable WandB logging
    load_best_model_at_end=True  
)


In [None]:
# Update the Trainer initialization with the eval_dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # provide the evaluation dataset here
)

# Now you can call trainer.train() without encountering the error
trainer.train()

***** Running training *****
  Num examples = 541587
  Num Epochs = 3
  Instantaneous batch size per device = 512
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 1
  Total optimization steps = 3174
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
10,4.0808,1.6737
20,2.0358,1.113816
30,1.4116,0.82895
40,0.9915,0.627512
50,0.7403,0.537747
60,0.6352,0.49406
70,0.5842,0.462835
80,0.5475,0.443368
90,0.5243,0.420399
100,0.5003,0.396898


***** Running Evaluation *****
  Num examples = 135397
  Batch size = 512
***** Running Evaluation *****
  Num examples = 135397
  Batch size = 512
***** Running Evaluation *****
  Num examples = 135397
  Batch size = 512
***** Running Evaluation *****
  Num examples = 135397
  Batch size = 512
***** Running Evaluation *****
  Num examples = 135397
  Batch size = 512
***** Running Evaluation *****
  Num examples = 135397
  Batch size = 512
***** Running Evaluation *****
  Num examples = 135397
  Batch size = 512
***** Running Evaluation *****
  Num examples = 135397
  Batch size = 512
***** Running Evaluation *****
  Num examples = 135397
  Batch size = 512
***** Running Evaluation *****
  Num examples = 135397
  Batch size = 512
***** Running Evaluation *****
  Num examples = 135397
  Batch size = 512
***** Running Evaluation *****
  Num examples = 135397
  Batch size = 512
***** Running Evaluation *****
  Num examples = 135397
  Batch size = 512
***** Running Evaluation *****
  Num e

In [19]:
model.save_pretrained('./your-model-name')
tokenizer.save_pretrained('./your-model-name')

Configuration saved in ./your-model-name/config.json
Model weights saved in ./your-model-name/pytorch_model.bin
tokenizer config file saved in ./your-model-name/tokenizer_config.json
Special tokens file saved in ./your-model-name/special_tokens_map.json


('./your-model-name/tokenizer_config.json',
 './your-model-name/special_tokens_map.json',
 './your-model-name/spiece.model',
 './your-model-name/added_tokens.json')