# Import Library

In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pytorch_lightning as pl
import torch.nn.functional as F

# Split the data into train and test sets
from sklearn.model_selection import train_test_split
# Text to vector
from sklearn.feature_extraction.text import CountVectorizer
# Rating to Vector
from sklearn.preprocessing import LabelEncoder


# My RNN

In [2]:
class LitRatingRNN(pl.LightningModule):
    """Single-layer RNN classifier wrapped as a LightningModule.

    The hidden state at the last time step is fed through a linear layer
    to produce one logit per output class.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the RNN hidden state.
        output_size: Number of logits (classes) produced by the final layer.
        lr: Learning rate handed to Adam. Defaults to 0.001.
    """

    def __init__(self, input_size, hidden_size, output_size, lr=0.001):
        super().__init__()
        self.hidden_size = hidden_size
        self.lr = lr
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return class logits for ``x`` of shape (batch, seq_len, input_size)."""
        # Build the initial hidden state on the same device AND dtype as the
        # input so the RNN never sees mismatched tensors.
        h0 = torch.zeros(
            1, x.size(0), self.hidden_size, device=x.device, dtype=x.dtype
        )
        out, _ = self.rnn(x, h0)
        # Only the last time step is used for classification.
        return self.fc(out[:, -1, :])

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one batch."""
        X, y = batch
        # Each sample is a flat feature vector; unsqueeze(1) turns it into a
        # length-1 sequence as expected by the batch_first RNN.
        outputs = self(X.to(self.device).unsqueeze(1))
        loss = F.cross_entropy(outputs, y.to(outputs.device))
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        """Use Adam over all parameters with the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)
    

# Using Comment CSV file

In [3]:
# Read the review CSV into a DataFrame
data = pd.read_csv("reviews_suicide_squad.csv", encoding='utf-8')

# Pull the review texts and their ratings out as plain Python lists
reviews, ratings = (data[column].tolist() for column in ('review', 'rating'))

# Data check: print how many reviews and ratings were loaded
print(len(reviews), len(ratings))

2471 2471


# Preprocess Data


In [8]:
# Data pre-processing: convert each review into a dense token-count vector
vectorizer = CountVectorizer()
review_counts = vectorizer.fit_transform(reviews)  # sparse document-term matrix
X = review_counts.toarray()

# Label encoding: map each distinct rating value to an integer class id
label_encoder = LabelEncoder().fit(ratings)
y = label_encoder.transform(ratings)

# Data separation: hold out 30% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

AttributeError: 'numpy.ndarray' object has no attribute 'todevice'

# Set Hyperparameter

In [5]:
# One input feature per column of the vectorised review matrix.
input_size = X.shape[1]
hidden_size = 16
# One logit per distinct rating label. Hard-coding 2 makes cross_entropy fail
# as soon as the data contains more than two rating values, because the
# encoded targets would fall outside [0, output_size).
output_size = len(label_encoder.classes_)
lr = 0.001
num_epochs = 150

# Generate Data Loader

In [6]:
class ReviewsDataset(Dataset):
    """Map-style dataset pairing feature vectors with integer class labels.

    Args:
        X: Array-like of per-sample features; stored as a float32 tensor.
        y: Array-like of per-sample labels; stored as an int64 tensor.
    """

    def __init__(self, X, y):
        features = torch.tensor(X, dtype=torch.float32)
        labels = torch.tensor(y, dtype=torch.int64)
        self.X, self.y = features, labels

    def __len__(self):
        """Number of samples, taken from the feature tensor."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        sample = (self.X[idx], self.y[idx])
        return sample
        
# Build the training DataLoader: shuffled mini-batches of 4 samples

train_dataset = ReviewsDataset(X_train, y_train)
train_loader = DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=4,
)

# Train!

In [7]:
# Seed Python, NumPy and torch RNGs so weight initialisation and batch
# shuffling are reproducible across runs (same seed as the train/test split).
pl.seed_everything(42, workers=True)

trainer = pl.Trainer(accelerator='tpu', devices=4, max_epochs=num_epochs)

# Train the model
model = LitRatingRNN(input_size, hidden_size, output_size)
trainer.fit(model, train_loader)

GPU available: False, used: False
TPU available: True, using: 4 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/song/ML/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
I0000 00:00:1715301196.343083  546446 pjrt_api.cc:100] GetPjrtApi was found for tpu at /home/song/ML/.venv/lib/python3.10/site-packages/libtpu/libtpu.so
I0000 00:00:1715301196.343185  546446 pjrt_api.cc:79] PJRT_Api is set for device type tpu
I0000 00:00:1715301196.343197  546446 pjrt_api.cc:146] The PJRT plugin has PJR

Epoch 0:   0%|          | 0/109 [00:00<?, ?it/s] 

RuntimeError: Input and hidden tensors are not at the same device, found input tensor at xla:0 and hidden tensor at cpu

# Check Validity

In [None]:
# Evaluate classification accuracy on the held-out test split.
test_dataset = ReviewsDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
model.eval()

correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        # Keep inputs and labels on the model's device so the comparison
        # below never mixes devices.
        inputs = inputs.to(model.device)
        labels = labels.to(model.device)
        # Each sample is a flat vector; add a length-1 sequence dimension.
        outputs = model(inputs.unsqueeze(1))
        predicted = outputs.argmax(dim=1)
        # Count samples, not batches, so the result stays correct for any
        # batch_size.
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Guard against an empty test set.
accuracy = correct / total if total else 0.0
print('Test Accuracy: {:.2f}%'.format(accuracy * 100))