In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load data
df = pd.read_csv('sentiment_2000.csv')

# Preprocess data
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def preprocess_long_text(text, max_length=512, stride=256):
    tokens = tokenizer.tokenize(text)
    segments = []
    for i in range(0, len(tokens), max_length - stride):
        segment = tokens[i:i+max_length]
        segments.append(segment)
    return segments

max_length = 128  # Max length for each segment
stride = 64       # Stride for segment overlap

texts = []
labels = []

for idx, row in df.iterrows():
    segments = preprocess_long_text(row['content'], max_length, stride)
    for segment in segments:
        texts.append(tokenizer.convert_tokens_to_string(segment))
        labels.append(row['sentimen'])

# Convert labels to numerical format
label_map = {'positif': 1, 'netral': 0, 'negatif': -1}
labels = [label_map[label] for label in labels]

# Convert text segments to input IDs and attention masks
input_ids = []
attention_masks = []

for text in texts:
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens=True,
                        max_length=max_length,
                        padding='max_length',
                        truncation=True,
                        return_attention_mask=True,
                        return_tensors='pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

KeyError: 1

In [None]:
# Split data into train and validation sets
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=42, test_size=0.3)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=42, test_size=0.3)

# Define DataLoader for train and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, shuffle=False)

# Define model
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)

# Send model to GPU, if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Training loop
epochs = 3

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}

        optimizer.zero_grad()        

        outputs = model(**inputs)
        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)

    # Validation
    model.eval()
    total_eval_accuracy = 0

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}

        with torch.no_grad():        
            outputs = model(**inputs)
        
        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].to('cpu').numpy()

        total_eval_accuracy += accuracy_score(np.argmax(logits, axis=1), label_ids)

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    
    print(f'Epoch {epoch + 1}:')
    print(f'  Training Loss: {avg_train_loss}')
    print(f'  Validation Accuracy: {avg_val_accuracy}')