In [1]:
import pandas as pd

# Read the CSV file into a DataFrame
file_path = 'classifier_training_data/posts_articles_about.csv'
df = pd.read_csv(file_path)

# Display the DataFrame
df


Unnamed: 0,content,label
0,title: Timeless Existence and Principle of Cre...,rmrj_articles
1,title: Ratooning Response of Lowland Rice (Ory...,rmrj_articles
2,title: Paternal Resilience in Time of Pandemic...,rmrj_articles
3,title: An Inquiry into the Problems Concerning...,rmrj_articles
4,title: Correlating the Psychological and Spiri...,rmrj_articles
...,...,...
270,LIVE | Witness the 14th Pinning Ceremony of nu...,facebook_posts
271,LIVE | Watch the 113th USJ-R Commencement Exer...,facebook_posts
272,One hundred and fifty Josenian students were w...,facebook_posts
273,SED Recognizes Outstanding Josenian Educators ...,facebook_posts


In [12]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset

# Split the data into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Define the T5 dataset
class T5Dataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        input_text = "classify: " + row['content']
        target_text = row['label']

        inputs = self.tokenizer.encode_plus(
            input_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        targets = self.tokenizer.encode_plus(
            target_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': targets['input_ids'].flatten()
        }

tokenizer = T5Tokenizer.from_pretrained('t5-base')
train_dataset = T5Dataset(train_df, tokenizer, max_len=512)
val_dataset = T5Dataset(val_df, tokenizer, max_len=512)

model = T5ForConditionalGeneration.from_pretrained('t5-base')

# If MPS is available, move model to MPS device
if torch.backends.mps.is_available():
    model = model.to(torch.device("mps"))

training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=3,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=10,
    evaluation_strategy="steps", # evaluate the model every 'logging_steps'
    fp16=False, # add this line
    fp16_full_eval=False, # and this line
)

trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=val_dataset,            
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained('./t5_model')

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`