In [2]:
import pandas as pd

# Read the CSV file into a DataFrame
file_path = 'classifier_training_data/posts_articles_about.csv'
df = pd.read_csv(file_path)

# Display the DataFrame
df


Unnamed: 0,content,label
0,title: Timeless Existence and Principle of Cre...,rmrj_articles
1,title: Ratooning Response of Lowland Rice (Ory...,rmrj_articles
2,title: Paternal Resilience in Time of Pandemic...,rmrj_articles
3,title: An Inquiry into the Problems Concerning...,rmrj_articles
4,title: Correlating the Psychological and Spiri...,rmrj_articles
...,...,...
270,LIVE | Witness the 14th Pinning Ceremony of nu...,facebook_posts
271,LIVE | Watch the 113th USJ-R Commencement Exer...,facebook_posts
272,One hundred and fifty Josenian students were w...,facebook_posts
273,SED Recognizes Outstanding Josenian Educators ...,facebook_posts


In [6]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import string
nltk.download('wordnet')
nltk.download('stopwords')

# You may need to download WordNet data by running: nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# This line of code can be used to update the list of stopwords
# nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove stopwords and lemmatize
    text = [lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords]
    
    # Join the words back into one string
    text = ' '.join(text)
    
    return text

# Apply the preprocessing to every document in the data
df['content'] = df['content'].apply(preprocess_text)


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garfieldgreglim/nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/garfieldgreglim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Define the T5 dataset
class T5Dataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        input_text = "classify: " + row['content']
        target_text = row['label']

        inputs = self.tokenizer.encode_plus(
            input_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        targets = self.tokenizer.encode_plus(
            target_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': targets['input_ids'].flatten()
        }

tokenizer = T5Tokenizer.from_pretrained('t5-base')
train_dataset = T5Dataset(train_df, tokenizer, max_len=512)
val_dataset = T5Dataset(val_df, tokenizer, max_len=512)

model = T5ForConditionalGeneration.from_pretrained('t5-base')

training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=3,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=10,
    evaluation_strategy="steps", # evaluate the model every 'logging_steps'
)

trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=val_dataset,            
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained('./t5_model')


In [None]:
pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/nightly/cpu
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/nightly/cpu/torchaudio-2.1.0.dev20230716-cp310-cp310-macosx_10_13_x86_64.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m600.9 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
INFO: pip is looking at multiple versions of torchaudio to determine which version is compatible with other requirements. This could take a while.


In [None]:
import torch

if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)
else:
    print("MPS device not found.")
