# Import Libraries

In [2]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pytorch_lightning as pl
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel

# Define Transformer

In [3]:
class LitTransformer(pl.LightningModule):
    """BERT encoder followed by a single linear classification head.

    The pooled output returned by the BERT model is fed through one fully
    connected layer that produces one logit per class.

    Args:
        hidden_size: Input width of the linear head. It has to equal the
            width of the encoder's pooled output, otherwise ``forward``
            fails with a shape error.
        output_size: Number of logits (classes) the head produces.
        pretrained_model_name: Checkpoint name handed to
            ``BertModel.from_pretrained``. Defaults to the checkpoint that
            used to be hard-coded here.
        learning_rate: Learning rate handed to Adam. Defaults to the value
            that used to be hard-coded here.
    """

    def __init__(self, hidden_size, output_size,
                 pretrained_model_name='bert-base-uncased', learning_rate=2e-5):
        super().__init__()
        # Stored on the module so configure_optimizers can read it back.
        self.learning_rate = learning_rate
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of tokenized inputs.

        Args:
            input_ids: Token ids forwarded to the BERT encoder.
            attention_mask: Attention mask forwarded to the BERT encoder.

        Returns:
            The output of the linear head applied to the pooled output.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) is used for classification.
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        output = self.fc(pooled_output)
        return output

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one training batch.

        Args:
            batch: Tuple of ``(input_ids, attention_mask, labels)``.
            batch_idx: Index of the batch (unused).

        Returns:
            The cross-entropy loss between the logits and ``labels``.
        """
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids, attention_mask)
        loss = F.cross_entropy(outputs, labels)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters (encoder and head)."""
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

# Datasets

In [4]:
class ReviewsDataset(Dataset):
    """Dataset that tokenizes one review per item and pairs it with its label.

    Args:
        reviews: Indexable collection of review texts.
        ratings: Indexable collection of integer class labels, aligned with
            ``reviews``.
        tokenizer: Object exposing ``encode_plus`` (the notebook passes a
            ``BertTokenizer``).
        max_length: Length every encoded review is padded or truncated to.
    """

    def __init__(self, reviews, ratings, tokenizer, max_length):
        self.reviews = reviews
        self.ratings = ratings
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Tokenize the review at ``idx``.

        Returns:
            Tuple ``(input_ids, attention_mask, label)`` where the first two
            are the flattened tokenizer outputs and ``label`` is a scalar
            ``torch.long`` tensor.
        """
        review = self.reviews[idx]
        rating = self.ratings[idx]
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_length,
            # Padding alone does not shorten long reviews; without explicit
            # truncation they come back longer than max_length and the
            # default collate function cannot stack them into a batch.
            truncation=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        # return_tensors='pt' adds a leading batch dimension; flatten drops it
        # so the DataLoader can add its own.
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
        # Class-index targets for cross-entropy have to be integer (long).
        label = torch.tensor(rating, dtype=torch.long)
        return input_ids, attention_mask, label

# Load data and pre-processing

In [5]:
# Read the CSV export and pull out the two columns used for training.
data = pd.read_csv("reviews_suicide_squad.csv", encoding='utf-8')
reviews, ratings = data['review'].tolist(), data['rating'].tolist()

# Replace the raw rating values with integer class ids. The fitted encoder is
# kept at cell level because later cells read its classes_.
label_encoder = LabelEncoder()
ratings = label_encoder.fit_transform(ratings)

# Tokenizer settings shared by the train and test datasets.
max_length = 128
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# the same across runs.
(train_reviews, test_reviews,
 train_ratings, test_ratings) = train_test_split(
    reviews,
    ratings,
    test_size=0.2,
    random_state=42,
)

train_dataset, test_dataset = (
    ReviewsDataset(texts, labels, tokenizer, max_length)
    for texts, labels in (
        (train_reviews, train_ratings),
        (test_reviews, test_ratings),
    )
)

# Only the training loader shuffles its samples.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)



# Train Model

In [6]:
# Seed the global RNGs before the classification head is initialised and
# before the shuffling training loader is iterated, so that repeated runs of
# this cell are more repeatable. 42 matches the random_state used for the
# train/test split.
pl.seed_everything(42)

# Input width of the linear head; it has to match the width of the encoder's
# pooled output.
hidden_size = 768
# One logit per distinct rating seen by the label encoder.
output_size = len(label_encoder.classes_)
model = LitTransformer(hidden_size, output_size)

# No validation loader is passed, so only the training loss is logged.
trainer = pl.Trainer(accelerator='tpu', devices=4, max_epochs=100)
trainer.fit(model, train_loader)

GPU available: False, used: False
TPU available: True, using: 4 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/song/ML/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
I0000 00:00:1715310370.047047  697586 pjrt_api.cc:100] GetPjrtApi was found for tpu at /home/song/ML/.venv/lib/python3.10/site-packages/libtpu/libtpu.so
I0000 00:00:1715310370.047047  697580 pjrt_api.cc:100] GetPjrtApi was found for tpu at /home/song/ML/.venv/lib/python3.10/site-packages/libtpu/libtpu.so
I0000 00:00:171

Epoch 99: 100%|██████████| 31/31 [00:15<00:00,  1.98it/s, v_num=2]

`Trainer.fit` stopped: `max_epochs=100` reached.


Epoch 99: 100%|██████████| 31/31 [00:18<00:00,  1.70it/s, v_num=2]


# Evaluate Model

In [9]:
# Evaluate on the held-out split. A prediction counts as correct when the
# predicted class index is at most one away from the label's class index, so
# the printed figure is a tolerance accuracy rather than an exact-match one.
model.eval()
total = 0
correct = 0
with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        logits = model(input_ids, attention_mask)
        predicted = logits.argmax(dim=1)
        within_one = (predicted - labels).abs() <= 1
        correct += int(within_one.sum())
        total += labels.shape[0]

accuracy = correct / total
print('Test Accuracy: {:.2f}%'.format(100 * accuracy))

Test Accuracy: 52.12%


# Predict ratings for some random comments

In [31]:
review_text = "This is a masterpiece of my life"

# Review pre-processing: reuse the max_length the datasets were built with and
# truncate explicitly so an over-long review cannot exceed it.
encoded_review = tokenizer.encode_plus(
    review_text,
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    return_token_type_ids=False,
    padding='max_length',
    return_attention_mask=True,
    return_tensors='pt'
)


# Predict the rating. eval() is called here so that this cell does not depend
# on the evaluation cell having been run first to switch off dropout.
model.eval()
with torch.no_grad():
    input_ids = encoded_review['input_ids']
    attention_mask = encoded_review['attention_mask']
    output = model(input_ids, attention_mask)
    _, predicted_rating = torch.max(output, 1)

# Print the predicted rating. The model outputs a class index produced by the
# label encoder, so it is mapped back to the original rating value first.
predicted_label = label_encoder.inverse_transform([predicted_rating.item()])[0]
print("Predicted Rating:", predicted_label)

Predicted Rating: 9
