In [4]:
import os
import csv
import re
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        if self.labels is not None:
            return {
                'text': self.texts[idx],
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'label': torch.tensor(self.labels[idx], dtype=torch.long)
            }
        else:
            return {
                'text': self.texts[idx],
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten()
            }

class TextClassifier(nn.Module):
    def __init__(self, bert_model, n_classes):
        super(TextClassifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]  # https://huggingface.co/transformers/v3.0.2/model_doc/bert.html#bertmodel
        output = self.dropout(pooled_output)
        return self.out(output)

# Load the data
train_df = pd.read_csv('../input/train-test-filled/train_df_filled_null.csv')
test_df = pd.read_csv('../input/train-test-filled/test_df_filled_null.csv')

train_texts = train_df['text'].tolist()
train_labels = train_df['label'].tolist()

test_texts = test_df['text'].tolist()
test_domains = test_df['domain'].tolist()

# Check for CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the GreekBERT tokenizer
model_name = "nlpaueb/bert-base-greek-uncased-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create the train and test datasets and dataloaders
train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_len=512)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

test_dataset = TextDataset(test_texts, None, tokenizer, max_len=512)
test_loader = DataLoader(test_dataset, batch_size=16)

# Load the GreekBERT model
bert_model = AutoModel.from_pretrained(model_name)

# Initialize the sentiment classifier with GreekBERT
model = TextClassifier(bert_model, n_classes=9).to(device)

# Loss function and optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_function = torch.nn.CrossEntropyLoss()

epochs = 6

# Train the model
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_function(outputs, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}")

# Save the fine-tuned model and tokenizer
model_save_path = 'fine_tuned_bert_6'
model.bert.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

# Make predictions
model.eval()
y_pred = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        y_pred.append(outputs.cpu())

y_pred = torch.cat(y_pred, dim=0).numpy()

# Write predictions to a file
with open('sample_submission.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    header = ['domain_name'] + [f'class_{i}' for i in range(9)]
    writer.writerow(header)
    for i, domain in enumerate(test_domains):
        row = [domain] + y_pred[i].tolist()
        writer.writerow(row)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/6, Loss: 1.7186581136887533
Epoch 2/6, Loss: 1.244075699333559
Epoch 3/6, Loss: 1.0137851065710972
Epoch 4/6, Loss: 0.8393957847565935
Epoch 5/6, Loss: 0.7092764347530248
Epoch 6/6, Loss: 0.5898335444038374
