In [1]:
!pip install Unidecode

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting Unidecode
  Obtaining dependency information for Unidecode from https://files.pythonhosted.org/packages/84/b7/6ec57841fb67c98f52fc8e4a2d96df60059637cba077edc569a302a8ffc7/Unidecode-1.3.8-py3-none-any.whl.metadata
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: Unidecode
Successfully installed Unidecode-1.3.8

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [15]:
import os
import re
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Thư viện xử lý văn bản
import nltk
import unidecode
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Thư viện chia train/val/test
from sklearn.model_selection import train_test_split

# -----------------------------
# 1. Thiết lập random seed
# -----------------------------
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Đảm bảo đã cài đặt nltk stopwords
nltk.download('stopwords')

# -----------------------------
# 2. Đọc dữ liệu
# -----------------------------
"""
Giả sử file all-data.csv có cấu trúc hai cột:
- sentiment: 'positive', 'negative' hoặc 'neutral'
- content: nội dung văn bản tin tức
Ví dụ:
sentiment,content
neutral,"According to Gran, the company..."
negative,"The company laid off tens of employees..."
positive,"This move would increase capacity..."
"""

dataset_path = "all-data.csv"  # chỉnh lại đường dẫn file nếu cần
headers = ["sentiment", "content"]

df = pd.read_csv(
    dataset_path,
    names=headers,
    encoding="ISO-8859-1"
)

# Bỏ các dòng trống hoặc NaN (nếu có)
df.dropna(subset=["sentiment", "content"], inplace=True)

# -----------------------------
# 3. Gán label dạng chuỗi -> ID (0/1/2)
# -----------------------------
unique_sentiments = df["sentiment"].unique().tolist()  # ví dụ: ['neutral', 'negative', 'positive']
classes = {class_name: idx for idx, class_name in enumerate(unique_sentiments)}
df["sentiment"] = df["sentiment"].apply(lambda x: classes[x])

# -----------------------------
# 4. Tiền xử lý dữ liệu văn bản
# -----------------------------
# - lower casing
# - bỏ dấu unicode
# - bỏ stopwords
# - stemming
# - xóa dấu câu (punctuation)
# -----------------------------
english_stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def text_normalize(text):
    # 1) lower
    text = text.lower()
    # 2) unidecode
    text = unidecode.unidecode(text)
    # 3) strip
    text = text.strip()
    # 4) remove punctuation
    text = re.sub(r"[^\w\s]", "", text)
    # 5) remove stopwords
    words = [w for w in text.split() if w not in english_stop_words]
    # 6) stemming
    words = [stemmer.stem(w) for w in words]
    return " ".join(words)

df["content"] = df["content"].apply(text_normalize)

# -----------------------------
# 5. Tạo vocabulary
# -----------------------------
vocab = set()
for sentence in df["content"].tolist():
    for token in sentence.split():
        vocab.add(token)

# Thêm token đặc biệt
vocab = list(vocab)
vocab.append("UNK")
vocab.append("PAD")

# Tạo word_to_idx
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
vocab_size = len(word_to_idx)

# -----------------------------
# 6. Hàm chuyển văn bản -> chuỗi ID
# -----------------------------
def transform(text, word_to_idx, max_seq_len=32):
    tokens = []
    for w in text.split():
        if w in word_to_idx:
            tokens.append(word_to_idx[w])
        else:
            tokens.append(word_to_idx["UNK"])

    # padding / cắt chuỗi
    if len(tokens) < max_seq_len:
        tokens += [word_to_idx["PAD"]] * (max_seq_len - len(tokens))
    else:
        tokens = tokens[:max_seq_len]

    return tokens

# -----------------------------
# 7. Chia dữ liệu train/val/test
#    tỉ lệ ví dụ: train 70%, val 20%, test 10%
# -----------------------------
texts = df["content"].tolist()
labels = df["sentiment"].tolist()

# 70% train, còn lại 30% -> chia tiếp val/test (2/3 val, 1/3 test)
X_train, X_temp, y_train, y_temp = train_test_split(
    texts, labels,
    test_size=0.30,
    random_state=seed,
    shuffle=True
)

# Trong 30% còn lại, ta chia 2/3 val, 1/3 test => val=0.2, test=0.1 (tương đối)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.33,  # khoảng ~10% của tổng
    random_state=seed,
    shuffle=True
)

# -----------------------------
# 8. Tạo Dataset và DataLoader
# -----------------------------
class FinancialNews(Dataset):
    def __init__(self, X, y, word_to_idx, max_seq_len=32, transform=None):
        self.X = X
        self.y = y
        self.word_to_idx = word_to_idx
        self.max_seq_len = max_seq_len
        self.transform = transform

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        text = self.X[idx]
        label = self.y[idx]

        if self.transform:
            text = self.transform(text, self.word_to_idx, self.max_seq_len)
        text_tensor = torch.tensor(text, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.long)
        return text_tensor, label_tensor

max_seq_len = 32

train_dataset = FinancialNews(
    X_train, y_train,
    word_to_idx=word_to_idx,
    max_seq_len=max_seq_len,
    transform=transform
)

val_dataset = FinancialNews(
    X_val, y_val,
    word_to_idx=word_to_idx,
    max_seq_len=max_seq_len,
    transform=transform
)

test_dataset = FinancialNews(
    X_test, y_test,
    word_to_idx=word_to_idx,
    max_seq_len=max_seq_len,
    transform=transform
)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=32, shuffle=False)

# -----------------------------
# 9. Định nghĩa mô hình RNN
# -----------------------------
class SentimentClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, n_layers, n_classes, dropout_prob=0.2):
        super(SentimentClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Ở đây dùng RNN thường. Nếu muốn LSTM thì thay thành nn.LSTM(...)
        # hoặc BiLSTM: nn.LSTM(..., bidirectional=True)
        self.rnn = nn.RNN(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=n_layers,
            batch_first=True
        )
        self.norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc1 = nn.Linear(hidden_size, 16)
        self.fc2 = nn.Linear(16, n_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        # x shape: (batch_size, seq_len)
        x = self.embedding(x)     # -> (batch_size, seq_len, embedding_dim)
        rnn_out, h_n = self.rnn(x)  # -> (batch_size, seq_len, hidden_size)
        # Lấy hidden state tại time step cuối
        x = rnn_out[:, -1, :]     # -> (batch_size, hidden_size)
        x = self.norm(x)
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# -----------------------------
# 10. Tạo instance model
# -----------------------------
n_classes = len(unique_sentiments)  # số nhãn (3: pos, neg, neutral)
embedding_dim = 64
hidden_size = 64
n_layers = 2
dropout_prob = 0.2

device = "cuda" if torch.cuda.is_available() else "cpu"

model = SentimentClassifier(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    n_layers=n_layers,
    n_classes=n_classes,
    dropout_prob=dropout_prob
).to(device)

# -----------------------------
# 11. Loss, optimizer
# -----------------------------
lr = 1e-4
epochs = 10

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# -----------------------------
# 12. Định nghĩa hàm train + evaluate
# -----------------------------
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_loss = total_loss / len(dataloader)
    acc = correct / total
    return avg_loss, acc


def fit(model, train_loader, val_loader, criterion, optimizer, device, epochs=10):
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Tính loss trên tập train & val
        train_loss = running_loss / len(train_loader)
        val_loss, val_acc = evaluate(model, val_loader, criterion, device)

        print(f"Epoch [{epoch+1}/{epochs}] "
              f"Train Loss: {train_loss:.4f} | "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    print("Training finished!")

# -----------------------------
# 13. Huấn luyện mô hình
# -----------------------------
fit(model, train_loader, val_loader, criterion, optimizer, device, epochs=epochs)

# -----------------------------
# 14. Đánh giá mô hình trên tập test
# -----------------------------
test_loss, test_acc = evaluate(model, test_loader, criterion, device)
print(f"Test Loss = {test_loss:.4f} | Test Accuracy = {test_acc:.4f}")


[nltk_data] Downloading package stopwords to /home/daoan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch [1/10] Train Loss: 1.0355 | Val Loss: 0.9417, Val Acc: 0.5873
Epoch [2/10] Train Loss: 0.9321 | Val Loss: 0.9269, Val Acc: 0.5873
Epoch [3/10] Train Loss: 0.9263 | Val Loss: 0.9250, Val Acc: 0.5873
Epoch [4/10] Train Loss: 0.9240 | Val Loss: 0.9245, Val Acc: 0.5873
Epoch [5/10] Train Loss: 0.9290 | Val Loss: 0.9247, Val Acc: 0.5873
Epoch [6/10] Train Loss: 0.9226 | Val Loss: 0.9247, Val Acc: 0.5873
Epoch [7/10] Train Loss: 0.9234 | Val Loss: 0.9251, Val Acc: 0.5873
Epoch [8/10] Train Loss: 0.9236 | Val Loss: 0.9249, Val Acc: 0.5873
Epoch [9/10] Train Loss: 0.9269 | Val Loss: 0.9245, Val Acc: 0.5873
Epoch [10/10] Train Loss: 0.9258 | Val Loss: 0.9253, Val Acc: 0.5873
Training finished!
Test Loss = 0.9511 | Test Accuracy = 0.5708


In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Giả sử bạn đã có sẵn word_to_idx, vocab_size, max_seq_len và model 
# đã được huấn luyện hoặc sẵn sàng để export.
# Dưới đây minh hoạ ngắn gọn một mô hình tương tự SentimentClassifier
# rồi export sang ONNX.

class SentimentClassifier(nn.Module):
    def __init__(
        self, 
        vocab_size, 
        embedding_dim=64,
        hidden_size=64, 
        n_layers=2, 
        n_classes=3,     # ví dụ 3 class: positive/negative/neutral
        dropout_prob=0.2
    ):
        super(SentimentClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=n_layers,
            batch_first=True
        )
        self.norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc1 = nn.Linear(hidden_size, 16)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(16, n_classes)

    def forward(self, x):
        """
        x.shape = (batch_size, seq_len)
        Mỗi phần tử trong x là chỉ mục (index) của từ trong vocab
        """
        x = self.embedding(x)   # => (batch_size, seq_len, embedding_dim)
        # RNN trả về (batch_size, seq_len, hidden_size)
        x, hn = self.rnn(x)
        # Ta lấy hidden state ở time step cuối:
        # x[:, -1, :] => (batch_size, hidden_size)
        x = x[:, -1, :]
        x = self.norm(x)
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x


def main():
    # ------------------------------
    # 1. Khởi tạo mô hình (đã huấn luyện hoặc load checkpoint)
    # ------------------------------
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    # Giả sử vocab_size = 5000, số class = 3
    vocab_size = 5000
    n_classes = 3
    
    model = SentimentClassifier(
        vocab_size=vocab_size,
        embedding_dim=64,
        hidden_size=64,
        n_layers=2,
        n_classes=n_classes,
        dropout_prob=0.2
    ).to(device)
    
    # Ở đây giả sử mô hình đã load trọng số (nếu có), ví dụ:
    # model.load_state_dict(torch.load("sentiment_model.pth", map_location=device))
    # model.eval()
    
    # ------------------------------
    # 2. Tạo dummy input để export
    # ------------------------------
    max_seq_len = 32  # độ dài cố định mỗi batch
    batch_size = 1     # xuất mô hình cho batch_size=1 (thường để dynamic_axes)
    
    # Tạo random đầu vào (mô phỏng tokens ID)
    # Mỗi token ID nằm trong [0, vocab_size)
    dummy_input = torch.randint(
        low=0,
        high=vocab_size,  # exclusive
        size=(batch_size, max_seq_len),
        dtype=torch.long
    ).to(device)
    
    # ------------------------------
    # 3. Export sang ONNX
    # ------------------------------
    # Filename ONNX
    onnx_filename = "sentiment_classifier.onnx"
    
    # Đặt chế độ eval
    model.eval()
    
    # `dynamic_axes` giúp mô hình linh hoạt kích thước batch hoặc seq_len
    #  - "input": {0: "batch_size", 1: "seq_len"}
    #  - "output": {0: "batch_size"}
    # opset_version >= 11 để compatible nhiều công cụ
    torch.onnx.export(
        model,              # mô hình
        dummy_input,        # data giả
        onnx_filename,      # tên file .onnx
        input_names=["input"],
        output_names=["output"],
        dynamic_axes={
            "input": {0: "batch_size", 1: "seq_len"},
            "output": {0: "batch_size"}
        },
        opset_version=11
    )
    
    print(f"Exported model to {onnx_filename}")


if __name__ == "__main__":
    main()


Exported model to sentiment_classifier.onnx


