<a href="https://colab.research.google.com/github/geexe/thai-clickbait-classification-with-pretrained-nlp/blob/main/2024_12_03_Clickbait_Classification_BERT_with_PyThaiNLP_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preparation (Run Once)

In [5]:
import pandas as pd

# File paths
positive_train_file = '/content/dataset-train-clickbait.txt'  # Path to the positive train .txt file
positive_test_file = '/content/dataset-test-clickbait.txt'    # Path to the positive test .txt file
negative_train_file = '/content/prachatai_train.csv'  # Path to the negative train .csv file
negative_test_file = '/content/prachatai_test.csv'    # Path to the negative test .csv file

# Column name in the CSV file that contains sentences
csv_sentence_column = 'title'

# Function to load and label positive .txt files
def load_positive_txt(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = [line.strip().replace('|', '') for line in file if line.strip()]  # Replace "|" with space
    return pd.DataFrame({'sentence': sentences, 'label': 1})

# Function to load and label negative .csv files
def load_negative_csv(file_path):
    negative_df = pd.read_csv(file_path)
    if csv_sentence_column not in negative_df.columns:
        raise ValueError(f"The CSV file must contain a column named '{csv_sentence_column}'.")
    return negative_df[[csv_sentence_column]].rename(columns={csv_sentence_column: 'sentence'}).assign(label=0)

# Load datasets
positive_train_df = load_positive_txt(positive_train_file)
positive_test_df = load_positive_txt(positive_test_file)
negative_train_df = load_negative_csv(negative_train_file)
negative_test_df = load_negative_csv(negative_test_file)

# Combine datasets
train_data = pd.concat([positive_train_df, negative_train_df], ignore_index=True).sample(frac=0.2, random_state=33)
test_data = pd.concat([positive_test_df, negative_test_df], ignore_index=True).sample(frac=1, random_state=42)

# Remove "!" and "?" from the sentences
train_data['sentence'] = train_data['sentence'].str.replace(r'[!?“”"]', '', regex=True)
test_data['sentence'] = test_data['sentence'].str.replace(r'[!?“”"]', '', regex=True)

#train_data['sentence_length'] = train_data['sentence'].str.len()
#test_data['sentence_length'] = test_data['sentence'].str.len()

In [6]:
print("Negative: ", len(train_data[train_data['label'] == 0]))
print("Positive: " , len(train_data[train_data['label'] == 1]))

Negative:  10765
Positive:  7586


In [7]:
# Display the datasets
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(test_data)}")

# Example output
print("Sample training data:")
print(train_data.head())

print("Sample validation data:")
print(test_data.head())

Training set size: 18351
Validation set size: 16133
Sample training data:
                                                sentence  label
51791  อรรถจักร์ สัตยานุรักษ์: ศาสนาในพื้นที่สาธารณะใหม่      0
84577  Nick Nostitz รายงานจากสนามข่าว วันเสาร์สีแดง ว...      0
3906   ชีวิต ดี๊ดี   เคย เห็น ยัง 11 ภาพ ไลฟ์สไตล์ คว...      1
81218        100 ปีพุทธทาส : ธรรมะดับร้อนยุคนายกฯ ทักษิณ      0
88529  จ่ายยาเม็ดคุมกำเนิดครั้งละมากๆ ช่วยผู้หญิงคุมก...      0
Sample validation data:
                                                sentence  label
7900   ไม่ธรรมดา   ไป ฟัง วลี เด็ด ใหม่ ของ  โค้ช ซิ ...      1
2254   อัพเดท ก่อน ใคร   โปรแกรม การแข่งขัน  วอลเลย์บ...      1
13538      พล.อ. สุรยุทธ์ เรียกร้อง WTO พื้นเจรจารอบโดฮา      0
7171   มาแรง โค้งสุดท้าย   เลขเด็ด  หลวงพ่อปากแดง  จั...      1
6051   ถึงกับ เงิบ    เผยโฉม หน้า เก่า ของ  เปรี้ยว  ...      1


In [8]:
# Save datasets to CSV files (optional)
train_data.to_csv('train_dataset.csv', index=False, encoding='utf-8')
test_data.to_csv('test_dataset.csv', index=False, encoding='utf-8')

import gc

# Clear memory
#del positive_train_df, positive_test_df, negative_train_df, negative_test_df, train_data, test_data  # Delete DataFrames
gc.collect()  # Run garbage collection

# Confirmation
print("Data saved and memory cleared.")

Data saved and memory cleared.


# Word Tokenize

## Run once (for each runtime initialization)

In [None]:
# Install required libraries (if not already installed)
!pip install transformers pythainlp wandb

# Import necessary libraries
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from pythainlp.tokenize import word_tokenize
from pythainlp.util import normalize
import wandb

# Initialize W&B (login only once)
wandb.login

# Thai text preprocessing utility
def preprocess_text(text):
    text = normalize(text)  # Normalize Thai characters
    tokens = word_tokenize(text, engine="newmm")  # Tokenize with PyThaiNLP
    return " ".join(tokens)

# Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Model training function
def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct = 0

    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        correct += (logits.argmax(dim=-1) == labels).sum().item()

    accuracy = correct / len(dataloader.dataset)
    return total_loss / len(dataloader), accuracy

# Model evaluation function
def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)

            total_loss += loss.item()
            correct += (logits.argmax(dim=-1) == labels).sum().item()

    accuracy = correct / len(dataloader.dataset)
    return total_loss / len(dataloader), accuracy

Collecting pythainlp
  Downloading pythainlp-5.0.4-py3-none-any.whl.metadata (7.7 kB)
Downloading pythainlp-5.0.4-py3-none-any.whl (17.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.9/17.9 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pythainlp
Successfully installed pythainlp-5.0.4


## Run many times (to have multiple runs of the same setup so we can report mean +- SD)

### BERT Automodel

In [None]:
# Initialize W&B for each experiment
wandb.init(project="thai-clickbait-classification", config={
    "model_name": "bert-multilingual-uncased",
    "max_length": 128,
    "batch_size": 8,
    "learning_rate": 1e-4,
    "epochs": 10,
    "architecture": 'AutoModelForSequenceClassification',
    "preprocessing": 'pythainlp.tokenize.word_tokenize'
})

# Access W&B configuration
config = wandb.config

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load and preprocess data
train_df = pd.read_csv('/content/train_dataset.csv')
test_df = pd.read_csv('/content/test_dataset.csv')

train_texts = [preprocess_text(text) for text in train_df['sentence']]
test_texts = [preprocess_text(text) for text in test_df['sentence']]

label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df['label'])
test_labels = label_encoder.transform(test_df['label'])

# Dataset and DataLoader
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
train_dataset = TextDataset(train_texts, train_labels, tokenizer, config.max_length)
test_dataset = TextDataset(test_texts, test_labels, tokenizer, config.max_length)

train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=config.batch_size)

# Model setup
model = AutoModelForSequenceClassification.from_pretrained(config.model_name, num_labels=2)
model.to(device)

# Optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
criterion = torch.nn.CrossEntropyLoss()

# Freeze BERT layers (optional for parameter-efficient fine-tuning)
for param in model.bert.parameters():
    param.requires_grad = False

# Training loop
for epoch in range(config.epochs):
    train_loss, train_accuracy = train_model(model, train_loader, optimizer, criterion, device)
    val_loss, val_accuracy = evaluate_model(model, test_loader, criterion, device)

    print(f"Epoch {epoch + 1}/{config.epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # Log metrics to W&B
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "train_accuracy": train_accuracy,
        "val_loss": val_loss,
        "val_accuracy": val_accuracy,
    })

# Save model
# wandb.save("model.pt")
wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: 'DistilBertForSequenceClassification' object has no attribute 'bert'

In [None]:
import torch

# Predict function for PyTorch
def predict(texts, model, tokenizer, max_length):
    model.eval()  # Set model to evaluation mode
    encodings = tokenizer(
        texts,
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="pt"  # Use PyTorch tensors
    )

    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1).cpu().numpy()  # Convert to NumPy array

    return predictions

# Predict on the test set
test_predictions = predict(test_texts, model, tokenizer, config.max_length)

AttributeError: 'Tensor' object has no attribute 'logits'

In [None]:
from sklearn.metrics import classification_report

# Convert encoded labels back to original string labels
target_names = [str(label) for label in label_encoder.classes_]

# Generate classification report
print(classification_report(test_labels, test_predictions, target_names=target_names))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86       698
           1       0.91      0.86      0.88       915

    accuracy                           0.87      1613
   macro avg       0.87      0.88      0.87      1613
weighted avg       0.88      0.87      0.87      1613



In [None]:
import pandas as pd

# Convert the true labels and predictions back to their original label names
true_labels = label_encoder.inverse_transform(test_labels)
predicted_labels = label_encoder.inverse_transform(test_predictions)

# Create a DataFrame with test sentences, true labels, and predicted labels
results_df = pd.DataFrame({
    "Sentence": test_texts,
    "True Label": true_labels,
    "Predicted Label": predicted_labels
})

# Display the first few rows of the DataFrame
print(results_df[results_df['True Label'] != results_df['Predicted Label']])
print(results_df[results_df['True Label'] == results_df['Predicted Label']])

                                               Sentence  True Label  \
1     อัพเดท   ก่อน   ใคร   โปรแกรม   การแข่งขัน   ว...           1   
6     ยิ่ง ลักษณ์   ขอ ศาล เลื่อน พิพากษา ระบุ น้ำ ใ...           0   
8     เผย ความประสงค์ พ.อ. อภิวันท์ ก่อน สิ้นใจ   ขอ...           0   
16    หมอ   ก้อง   ส   รวิชญ์   ซาบซึ้งใจ   น้อง   โ...           1   
20    โปรด เก็บ บัญชี ผู้ มี สิทธิ ประชามติ ไว้ ไกล ...           0   
...                                                 ...         ...   
1536  คนงาน หญิง กัมพูชา ถูก ตำรวจ ไล่ ทุบ   หลัง ปร...           0   
1538  โคตร เท่ห์   กิ้งก่า   นักรบ   สายพันธุ์   หา ...           1   
1542  ถก   พ.ร.บ. ชุมนุม   กฤษฎีกา เชื่อ ช่วย สอน คน...           0   
1574  อัยการสูงสุด ชี้   กกต. ไม่ ชี้ มูล ความผิด พร...           0   
1588  หนุ่ม   สั่ง   กางเกง   ยีนส์   จาก   เว็บ   อ...           1   

      Predicted Label  
1                   0  
6                   1  
8                   1  
16                  0  
20                  1  
...

In [None]:
# Inference example
def predict(text, model, tokenizer, device):
    model.eval()
    text = preprocess_text(text)  # Preprocess the input text
    encoding = tokenizer(
        text,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_class = logits.argmax(dim=-1).item()

    return predicted_class

example_text = "ควยควยควย โมะโมะโมะ"
predicted_label = predict(example_text, model, tokenizer, device)
print(f"Predicted label: {label_encoder.inverse_transform([predicted_label])[0]}")

Predicted label: 1


### BERT Custom Layer

In [None]:
from transformers import AutoModel
import torch.nn as nn

def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            # The custom model returns logits directly
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(logits, labels)

            total_loss += loss.item()
            correct += (logits.argmax(dim=-1) == labels).sum().item()

    accuracy = correct / len(dataloader.dataset)
    return total_loss / len(dataloader), accuracy

# Define a custom model class with a classification head
class CustomBERTModel(nn.Module):
    def __init__(self, model_name, num_labels, dropout_rate=0.3):
        super(CustomBERTModel, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        self.norm = nn.LayerNorm(self.bert.config.hidden_size)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Get the [CLS] token output (pooled output)
        cls_output = outputs.last_hidden_state[:, 0, :]
        # Apply normalization and dropout
        norm_output = self.norm(cls_output)
        dropout_output = self.dropout(norm_output)
        # Pass through the classifier
        logits = self.classifier(dropout_output)
        return logits

# Initialize W&B for each experiment
wandb.init(project="thai-clickbait-classification", config={
    "model_name": "bert-base-multilingual-cased",
    "max_length": 128,
    "batch_size": 8,
    "learning_rate": 1e-4,
    "epochs": 10,
    "dropout_rate": 0.3,
    "architecture": 'CustomBERTModel',
    "preprocessing": 'pythainlp.tokenize.word_tokenize'
})

# Access W&B configuration
config = wandb.config

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load and preprocess data
train_df = pd.read_csv('/content/train_dataset.csv')
test_df = pd.read_csv('/content/test_dataset.csv')

train_texts = [preprocess_text(text) for text in train_df['sentence']]
test_texts = [preprocess_text(text) for text in test_df['sentence']]

label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df['label'])
test_labels = label_encoder.transform(test_df['label'])

# Dataset and DataLoader
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
train_dataset = TextDataset(train_texts, train_labels, tokenizer, config.max_length)
test_dataset = TextDataset(test_texts, test_labels, tokenizer, config.max_length)

train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=config.batch_size)

# Model setup
model = CustomBERTModel(config.model_name, num_labels=2, dropout_rate=config.dropout_rate)
model.to(device)

# Optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
criterion = torch.nn.CrossEntropyLoss()

# Freeze BERT layers (optional for parameter-efficient fine-tuning)
for param in model.bert.parameters():
    param.requires_grad = False

# Training loop
for epoch in range(config.epochs):
    model.train()
    total_train_loss = 0
    correct_train = 0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        correct_train += (logits.argmax(dim=-1) == labels).sum().item()

    train_loss = total_train_loss / len(train_loader)
    train_accuracy = correct_train / len(train_dataset)

    val_loss, val_accuracy = evaluate_model(model, test_loader, criterion, device)

    print(f"Epoch {epoch + 1}/{config.epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # Log metrics to W&B
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "train_accuracy": train_accuracy,
        "val_loss": val_loss,
        "val_accuracy": val_accuracy,
    })

# Save model (optional)
# wandb.save("model.pt")
wandb.finish()

Epoch 1/10
Train Loss: 0.4996, Train Accuracy: 0.7622
Validation Loss: 0.3482, Validation Accuracy: 0.8847
Epoch 2/10
Train Loss: 0.3772, Train Accuracy: 0.8459
Validation Loss: 0.3202, Validation Accuracy: 0.9020
Epoch 3/10
Train Loss: 0.3471, Train Accuracy: 0.8659
Validation Loss: 0.2802, Validation Accuracy: 0.9120
Epoch 4/10
Train Loss: 0.3267, Train Accuracy: 0.8695
Validation Loss: 0.2630, Validation Accuracy: 0.9151
Epoch 5/10
Train Loss: 0.3193, Train Accuracy: 0.8738
Validation Loss: 0.2429, Validation Accuracy: 0.9175
Epoch 6/10
Train Loss: 0.3152, Train Accuracy: 0.8736
Validation Loss: 0.2701, Validation Accuracy: 0.9089
Epoch 7/10
Train Loss: 0.3123, Train Accuracy: 0.8770
Validation Loss: 0.2534, Validation Accuracy: 0.9188
Epoch 8/10
Train Loss: 0.3162, Train Accuracy: 0.8736
Validation Loss: 0.2563, Validation Accuracy: 0.9169
Epoch 9/10
Train Loss: 0.3149, Train Accuracy: 0.8752
Validation Loss: 0.2267, Validation Accuracy: 0.9206
Epoch 10/10
Train Loss: 0.3078, Train

VBox(children=(Label(value='0.014 MB of 0.014 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇██████
train_loss,█▄▂▂▁▁▁▁▁▁
val_accuracy,▁▄▆▇▇▆█▇██
val_loss,█▆▄▃▂▄▃▃▁▂

0,1
epoch,10.0
train_accuracy,0.87843
train_loss,0.30776
val_accuracy,0.92064
val_loss,0.23585


In [None]:
wandb.login

SyntaxError: invalid syntax (<ipython-input-10-041154708e5d>, line 1)

### Other Models with Custom Layer

In [None]:
from transformers import AutoModel
import torch.nn as nn

def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            # The custom model returns logits directly
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(logits, labels)

            total_loss += loss.item()
            correct += (logits.argmax(dim=-1) == labels).sum().item()

    accuracy = correct / len(dataloader.dataset)
    return total_loss / len(dataloader), accuracy

# Define a custom model class with a classification head
class CustomBERTModel(nn.Module):
    def __init__(self, model_name, num_labels, dropout_rate=0.3):
        super(CustomBERTModel, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        self.norm = nn.LayerNorm(self.bert.config.hidden_size)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Get the [CLS] token output (pooled output)
        cls_output = outputs.last_hidden_state[:, 0, :]
        # Apply normalization and dropout
        norm_output = self.norm(cls_output)
        dropout_output = self.dropout(norm_output)
        # Pass through the classifier
        logits = self.classifier(dropout_output)
        return logits

# Initialize W&B for each experiment
wandb.init(project="thai-clickbait-classification", config={
    "model_name": "FacebookAI/xlm-roberta-base",
    "max_length": 128,
    "batch_size": 8,
    "learning_rate": 1e-4,
    "epochs": 10,
    "dropout_rate": 0.3,
    "architecture": 'CustomBERTModel',
    "preprocessing": 'pythainlp.tokenize.word_tokenize'
})

# Access W&B configuration
config = wandb.config

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load and preprocess data
train_df = pd.read_csv('/content/train_dataset.csv')
test_df = pd.read_csv('/content/test_dataset.csv')

train_texts = [preprocess_text(text) for text in train_df['sentence']]
test_texts = [preprocess_text(text) for text in test_df['sentence']]

label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df['label'])
test_labels = label_encoder.transform(test_df['label'])

# Dataset and DataLoader
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
train_dataset = TextDataset(train_texts, train_labels, tokenizer, config.max_length)
test_dataset = TextDataset(test_texts, test_labels, tokenizer, config.max_length)

train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=config.batch_size)

# Model setup
model = CustomBERTModel(config.model_name, num_labels=2, dropout_rate=config.dropout_rate)
model.to(device)

# Optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
criterion = torch.nn.CrossEntropyLoss()

# Freeze BERT layers (optional for parameter-efficient fine-tuning)
for param in model.bert.parameters():
    param.requires_grad = False

# Training loop
for epoch in range(config.epochs):
    model.train()
    total_train_loss = 0
    correct_train = 0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        correct_train += (logits.argmax(dim=-1) == labels).sum().item()

    train_loss = total_train_loss / len(train_loader)
    train_accuracy = correct_train / len(train_dataset)

    val_loss, val_accuracy = evaluate_model(model, test_loader, criterion, device)

    print(f"Epoch {epoch + 1}/{config.epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # Log metrics to W&B
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "train_accuracy": train_accuracy,
        "val_loss": val_loss,
        "val_accuracy": val_accuracy,
    })

# Save model (optional)
# wandb.save("model.pt")
wandb.finish()

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

Epoch 1/10
Train Loss: 0.6516, Train Accuracy: 0.5915
Validation Loss: 0.6500, Validation Accuracy: 0.4989
Epoch 2/10
Train Loss: 0.5802, Train Accuracy: 0.7183
Validation Loss: 0.5689, Validation Accuracy: 0.8341
Epoch 3/10
Train Loss: 0.5183, Train Accuracy: 0.8310
Validation Loss: 0.5201, Validation Accuracy: 0.8206
Epoch 4/10
Train Loss: 0.4658, Train Accuracy: 0.8707
Validation Loss: 0.4699, Validation Accuracy: 0.8485
Epoch 5/10
Train Loss: 0.4224, Train Accuracy: 0.8867
Validation Loss: 0.4054, Validation Accuracy: 0.9100
Epoch 6/10
Train Loss: 0.3853, Train Accuracy: 0.8949
Validation Loss: 0.3715, Validation Accuracy: 0.9104
Epoch 7/10
Train Loss: 0.3550, Train Accuracy: 0.8998
Validation Loss: 0.3487, Validation Accuracy: 0.9050
Epoch 8/10
Train Loss: 0.3326, Train Accuracy: 0.9026
Validation Loss: 0.3275, Validation Accuracy: 0.9055
Epoch 9/10
Train Loss: 0.3108, Train Accuracy: 0.9037
Validation Loss: 0.2889, Validation Accuracy: 0.9260
Epoch 10/10
Train Loss: 0.2981, Train

# Vectorizer Sentence

## Run Once

In [51]:
# Install required libraries (if not already installed)
!pip install transformers pythainlp wandb

# Import necessary libraries
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from pythainlp.tokenize import word_tokenize
from pythainlp.util import normalize
import wandb
from pythainlp.word_vector import WordVector
import numpy as np

wandb.login

# Thai text preprocessing utility
def preprocess_text(text):
    text = normalize(text)  # Normalize Thai characters
    tokens = word_tokenize(text, engine="newmm")  # Tokenize with PyThaiNLP
    return " ".join(tokens)

# Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Load Thai word vectors
wv = WordVector()
#wv_model = wv.get_model()

# Function to generate sentence embeddings
def sentence_embedding(sentence):
    vector = wv.sentence_vectorizer(sentence)
    return vector

# Custom Dataset class to include sentence embeddings
class CombinedTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length, sentence_vectors):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.sentence_vectors = torch.tensor(sentence_vectors, dtype=torch.float32)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        sentence_vector = self.sentence_vectors[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "sentence_vector": sentence_vector,
            "label": torch.tensor(label, dtype=torch.long),
        }

from torch import nn
import torch.nn.functional as F

class EnhancedBERTModel(nn.Module):
    def __init__(self, model_name, num_labels, sentence_vector_dim, dropout_rate, hidden_dim=256):
        super(EnhancedBERTModel, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)

        # Dimension of combined features
        combined_dim = self.bert.config.hidden_size + sentence_vector_dim

        # Add dense layers
        self.fc1 = nn.Linear(combined_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)

        # Normalization, Dropout, and Classifier
        self.norm = nn.LayerNorm(hidden_dim // 2)
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(hidden_dim // 2, num_labels)

    def forward(self, input_ids, attention_mask, sentence_vectors):
        # BERT output
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS] token

        # Mean pooling of sentence vectors
        sentence_vectors_pooled = sentence_vectors.mean(dim=1)

        # Combine features
        combined_features = torch.cat([cls_output, sentence_vectors_pooled], dim=1)

        # Pass through dense layers
        x = F.relu(self.fc1(combined_features))
        x = F.relu(self.fc2(x))

        # Apply normalization and dropout
        x = self.norm(x)
        x = self.dropout(x)

        # Final classification
        logits = self.classifier(x)
        return logits



In [None]:
# Load and preprocess data
train_df = pd.read_csv('/content/train_dataset.csv')
test_df = pd.read_csv('/content/test_dataset.csv')

train_texts = [preprocess_text(text) for text in train_df['sentence']]
test_texts = [preprocess_text(text) for text in test_df['sentence']]

# Generate sentence embeddings for train and test sets
train_sentence_vectors = np.array([sentence_embedding(text) for text in train_texts])
test_sentence_vectors = np.array([sentence_embedding(text) for text in test_texts])

label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df['label'])
test_labels = label_encoder.transform(test_df['label'])

## Run Many times

### Pre-trained with Custom Layers

In [None]:
from transformers import AutoModel
import torch.nn as nn

def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            # Generate sentence vectors for the batch
            sentence_vectors = batch['sentence_vector'].to(device)

            # The custom model returns logits directly
            logits = model(input_ids=input_ids, attention_mask=attention_mask, sentence_vectors=sentence_vectors)
            loss = criterion(logits, labels)

            total_loss += loss.item()
            correct += (logits.argmax(dim=-1) == labels).sum().item()

    accuracy = correct / len(dataloader.dataset)
    return total_loss / len(dataloader), accuracy

# Initialize W&B for each experiment
wandb.init(project="thai-clickbait-classification", config={
    "model_name": "monsoon-nlp/bert-base-thai",
    "max_length": 128,
    "batch_size": 8,
    "learning_rate": 1e-4,
    "epochs": 10,
    "dropout_rate": 0.3,
    "architecture": 'CustomBERTModel',
    "preprocessing": 'pythainlp.word_vector.WordVector'
})

# Access W&B configuration
config = wandb.config

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset and DataLoader
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
train_dataset = TextDataset(train_texts, train_labels, tokenizer, config.max_length)
test_dataset = TextDataset(test_texts, test_labels, tokenizer, config.max_length)

# Update Datasets and DataLoader
train_dataset = CombinedTextDataset(train_texts, train_labels, tokenizer, config.max_length, train_sentence_vectors)
test_dataset = CombinedTextDataset(test_texts, test_labels, tokenizer, config.max_length, test_sentence_vectors)

train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=config.batch_size)

# Model initialization
sentence_vector_dim = train_sentence_vectors.shape[2]  # Dimensionality of word vectors
model = EnhancedBERTModel(
    config.model_name,
    num_labels=2,
    sentence_vector_dim=sentence_vector_dim,
    dropout_rate=config.dropout_rate
)
model.to(device)

# Optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
criterion = torch.nn.CrossEntropyLoss()

# Freeze BERT layers (optional for parameter-efficient fine-tuning)
for param in model.bert.parameters():
    param.requires_grad = False

# Training loop
for epoch in range(config.epochs):
    model.train()
    total_train_loss = 0
    correct_train = 0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Generate sentence vectors for the batch
        sentence_vectors = batch['sentence_vector'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids=input_ids, attention_mask=attention_mask, sentence_vectors=sentence_vectors)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        correct_train += (logits.argmax(dim=-1) == labels).sum().item()

    train_loss = total_train_loss / len(train_loader)
    train_accuracy = correct_train / len(train_dataset)

    val_loss, val_accuracy = evaluate_model(model, test_loader, criterion, device)

    print(f"Epoch {epoch + 1}/{config.epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # Log metrics to W&B
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "train_accuracy": train_accuracy,
        "val_loss": val_loss,
        "val_accuracy": val_accuracy,
    })

# Save model (optional)
# wandb.save("model.pt")
wandb.finish()

VBox(children=(Label(value='0.013 MB of 0.013 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Some weights of the model checkpoint at monsoon-nlp/bert-base-thai were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/10
Train Loss: 0.2786, Train Accuracy: 0.8789
Validation Loss: 0.1255, Validation Accuracy: 0.9559
Epoch 2/10
Train Loss: 0.1740, Train Accuracy: 0.9335
Validation Loss: 0.1252, Validation Accuracy: 0.9527
Epoch 3/10
Train Loss: 0.1528, Train Accuracy: 0.9423
Validation Loss: 0.1007, Validation Accuracy: 0.9645
Epoch 4/10
Train Loss: 0.1376, Train Accuracy: 0.9499
Validation Loss: 0.1040, Validation Accuracy: 0.9627
Epoch 5/10
Train Loss: 0.1355, Train Accuracy: 0.9518
Validation Loss: 0.1422, Validation Accuracy: 0.9474
Epoch 6/10
Train Loss: 0.1254, Train Accuracy: 0.9552
Validation Loss: 0.1007, Validation Accuracy: 0.9638
Epoch 7/10
Train Loss: 0.1222, Train Accuracy: 0.9547
Validation Loss: 0.1044, Validation Accuracy: 0.9640
Epoch 8/10
Train Loss: 0.1216, Train Accuracy: 0.9553
Validation Loss: 0.1740, Validation Accuracy: 0.9380
Epoch 9/10
Train Loss: 0.1173, Train Accuracy: 0.9565
Validation Loss: 0.0936, Validation Accuracy: 0.9655


In [None]:
wandb.finish()

In [2]:
# Initialize the API
!pip install wandb
import wandb
api = wandb.Api()

run = api.run("thanyakorn-h-NIDA/thai-clickbait-classification/runs/xojn0e4z")
print("Before",run.name, run.config)
run.config.update({"preprocessing": 'pythainlp.word_vector.WordVector'})
print("After", run.config)
run.save()



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Before sparkling-cloud-66 {'epochs': 10, 'batch_size': 8, 'max_length': 128, 'model_name': 'airesearch/wangchanberta-base-att-spm-uncased', 'architecture': 'CustomBERTModel', 'dropout_rate': 0.3, 'learning_rate': 0.0001, 'preprocessing': 'pythainlp.tokenize.word_tokenize'}
After {'epochs': 10, 'batch_size': 8, 'max_length': 128, 'model_name': 'airesearch/wangchanberta-base-att-spm-uncased', 'architecture': 'CustomBERTModel', 'dropout_rate': 0.3, 'learning_rate': 0.0001, 'preprocessing': 'pythainlp.word_vector.WordVector'}


### Inference

In [29]:
import torch
from transformers import AutoModel
import torch.nn as nn

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model
model_path = "/content/model.pt"
config={
    "model_name": "airesearch/wangchanberta-base-att-spm-uncased",
    "max_length": 128,
    "batch_size": 8,
    "learning_rate": 1e-4,
    "epochs": 10,
    "dropout_rate": 0.3,
    "architecture": 'CustomBERTModel',
    "preprocessing": 'pythainlp.word_vector.WordVector'
}
sentence_vector_dim = 1069  # Dimensionality of word vectors
model = EnhancedBERTModel(
    config['model_name'],
    num_labels=2,
    sentence_vector_dim=sentence_vector_dim,
    dropout_rate=config['dropout_rate']
)
model = torch.load(model_path, map_location=torch.device('cpu'))
model.to(device)
model.eval()  # Set model to evaluation mode

  model = torch.load(model_path, map_location=torch.device('cpu'))


EnhancedBERTModel(
  (bert): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(25005, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [12]:
from sklearn.preprocessing import LabelEncoder

# Load and preprocess data
test_df = pd.read_csv('/content/test_dataset.csv')

test_texts = [preprocess_text(text) for text in test_df['sentence']]
test_sentence_vectors = np.array([sentence_embedding(text) for text in test_texts])

label_encoder = LabelEncoder()
test_labels = label_encoder.fit_transform(test_df['label'])

In [85]:
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer

# Tokenizer setup
tokenizer = AutoTokenizer.from_pretrained(config['model_name'])

def predict_with_attention(texts, sentence_vectors, true_labels, model, tokenizer, max_length, device):
    """
    Predict with attention visualization, probabilities, and original text.
    """
    model.eval()  # Set model to evaluation mode
    encodings = tokenizer(
        texts,
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="pt"  # Use PyTorch tensors
    )

    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)
    sentence_vectors = torch.tensor(sentence_vectors, dtype=torch.float32).to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, sentence_vectors=sentence_vectors)
        logits = outputs  # Classification logits
        probabilities = F.softmax(logits, dim=-1).cpu().numpy()  # Probabilities
        predictions = torch.argmax(logits, dim=-1).cpu().numpy()  # Predicted labels

    # Prepare results
    result_data = []
    for i, text in enumerate(texts):
        result_data.append({
            "original_text": text,
            "true_label": test_labels[i],
            "predicted_label": predictions[i],
            "probabilities": [round(prob, 4) for prob in probabilities[i]]
        })

    # Convert results to a pandas DataFrame
    results_df = pd.DataFrame(result_data)

    return results_df

# Example usage
pd.set_option('display.max_colwidth',50)
results_df = predict_with_attention(
    texts=test_texts[:10],
    sentence_vectors=test_sentence_vectors[:10],
    true_labels=test_labels[:10],  # Provide true labels if available
    model=model,
    tokenizer=tokenizer,
    max_length=config['max_length'],
    device=device
)

# Display the DataFrame
print(results_df)

                                       original_text  true_label  \
0  ไม่ ธรรมดา   ไป   ฟัง   วลี   เด็ด   ใหม่   ขอ...           1   
1  อัพเดท   ก่อน   ใคร   โปรแกรม   การแข่งขัน   ว...           1   
2  พล . อ.   สุรยุทธ์   เรียกร้อง   WTO   พื้น เจ...           0   
3  มาแรง   โค้งสุดท้าย   เลขเด็ด   หลวงพ่อ ปาก แด...           1   
4  ถึงกับ   เงิบ   เผยโฉม   หน้า   เก่า   ของ   เ...           1   
5  เมื่อ   ผู้ชม ทางบ้าน   โดน ปั่นหัว ด้วย ตลก ร้าย           0   
6  ยิ่ง ลักษณ์   ขอ ศาล เลื่อน พิพากษา ระบุ น้ำ ใ...           0   
7  หนุ่ม   จอม   ลวนลาม   โดน   แอบ   ถ่าย   ขณะ ...           1   
8  เผย ความประสงค์ พ.อ. อภิวันท์ ก่อน สิ้นใจ   ขอ...           0   
9     รายงาน   :   เมื่อ เยาวชน เดนมาร์ก ลุกขึ้น สู้           0   

   predicted_label     probabilities  
0                1  [0.0004, 0.9996]  
1                1  [0.0078, 0.9922]  
2                0  [0.9993, 0.0007]  
3                1  [0.0008, 0.9992]  
4                1  [0.0002, 0.9998]  
5                

In [96]:
def predict_input_text(input_text, sentence_vectors, model, tokenizer, max_length, device):
    """
    Predict for a single input text.
    """
    model.eval()  # Set model to evaluation mode
    encodings = tokenizer(
        [input_text],  # Wrap input text in a list for batch processing
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="pt"  # Use PyTorch tensors
    )

    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)
    sentence_vectors = torch.tensor(sentence_vectors, dtype=torch.float32).to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, sentence_vectors=sentence_vectors)
        logits = outputs  # Classification logits
        probabilities = F.softmax(logits, dim=-1).cpu().numpy()  # Probabilities
        prediction = torch.argmax(logits, dim=-1).cpu().numpy()  # Predicted label

    return {
        "input_text": input_text,
        "predicted_label": prediction[0],
        "probabilities": probabilities[0]
    }

# Example of predicting from a new input text
#new_input_text = "หนุ่มจอมลวนลามโดนแอบถ่ายขณะเอามือคลึงสาวนั่งหลับในรถไฟฟ้าภัยสังคมตัวจริง"
#new_input_text = "วลีเด็ดนายก คนฟังถึงกับน้ำตาซึม"
#new_input_text = "ชาวเน็ตจวกยับ ตะลึง น้ำท่วมภาคใต้ 10 ปีซ้อน รัฐบาลเมิน ไม่เยียวยา"
#new_input_text = "เด็กไทย อ่านหนังสือเพิ่มขึ้นจาก 4 บรรทัด เป็น 8 บรรทัด"
#new_input_text = "ภาพนี้ไม่น่าจะเกิดขึ้นในงานแต่งงาน"
new_input_text = "ย้อนเหตุการณ์ โรเซ่ร้องไห้กลางทัวร์ Blackpink ที่แท้รู้สาเหตุแล้ว"
input_text = preprocess_text(new_input_text)
sentence_vectors = np.array([sentence_embedding(input_text)]) # Provide the sentence vectors associated with this input
result = predict_input_text(new_input_text, sentence_vectors, model, tokenizer, config['max_length'], device)

# Print prediction result
print(new_input_text)
print("Prediction: ", result['predicted_label'])
print(result['probabilities'])

ย้อนเหตุการณ์ โรเซ่ร้องไห้กลางทัวร์ Blackpink ที่แท้รู้สาเหตุแล้ว
Prediction:  1
[0.16193607 0.83806396]
