In [None]:
# ------------------ 1. Εισαγωγή βιβλιοθηκών ------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

# Disable Weights & Biases logging
import os
os.environ["WANDB_DISABLED"] = "true"

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Device: {device}")


In [None]:
# ------------------ 2. Data loading and preprocessing ------------------
df = pd.read_csv("/kaggle/input/tweets/Tweets.csv")
# Keep only the tweet text and its sentiment annotation.
df = df[['text', 'airline_sentiment']]
label2id = {'negative': 0, 'neutral': 1, 'positive': 2}
# Derive the inverse mapping so the two dictionaries cannot drift apart.
id2label = {idx: name for name, idx in label2id.items()}
df['label'] = df['airline_sentiment'].map(label2id)

# `.map` yields NaN for any sentiment that is missing from `label2id`, which
# would silently turn the whole column into floats. Fail fast with a clear
# message instead of letting training break later on.
unmapped = df.loc[df['label'].isna(), 'airline_sentiment'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected airline_sentiment values: {unmapped.tolist()}")
df['label'] = df['label'].astype('int64')


In [None]:
# ------------------ 3. Train / val / test split ------------------
# 70% train, 30% held out. `stratify` keeps the class proportions of every
# split equal to those of the full dataset, so validation and test metrics
# are not skewed by an unlucky draw.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.3, random_state=42,
    stratify=df['label'].tolist())

# Split the held-out 30% in half: 15% validation, 15% test.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42,
    stratify=temp_labels)


In [None]:
# ------------------ 4. Tokenization ------------------
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Every split is tokenized with the same settings: truncation and padding
# enabled, with a maximum length of 128.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=128)

train_encodings, val_encodings, test_encodings = (
    tokenizer(texts, **tokenize_kwargs)
    for texts in (train_texts, val_texts, test_texts)
)


In [None]:
# ------------------ 5. Dataset Class ------------------
class TweetDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-sample labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            holding one entry per sample.
        labels: Sequence of labels, one per sample.

    Raises:
        ValueError: If any encoding field does not contain exactly one entry
            per label.
    """

    def __init__(self, encodings, labels):
        # A length mismatch would otherwise surface only as an IndexError in
        # the middle of training, or silently drop samples.
        for key, val in encodings.items():
            if len(val) != len(labels):
                raise ValueError(
                    f"Encoding field '{key}' has {len(val)} entries "
                    f"but {len(labels)} labels were given"
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the label under ``labels``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

# Wrap the encodings and labels of each split in a TweetDataset.
train_dataset, val_dataset, test_dataset = (
    TweetDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)


In [None]:
# ------------------ 6. Model loading ------------------
# Seed PyTorch before loading: the layers of the classification head that are
# not in the pretrained checkpoint are initialised randomly, so without a
# seed every run starts from different head weights.
torch.manual_seed(42)

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    # Derived from the label map so the head size always matches the labels.
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)
model.to(device)


In [None]:
# ------------------ 7. Phase 1: Freeze the backbone ------------------
# Turn off gradients for every parameter under `model.distilbert`, so only
# the remaining layers (the classification head) are updated in this phase.
for param in model.distilbert.parameters():
    param.requires_grad = False

# Newer transformers releases renamed `evaluation_strategy` to
# `eval_strategy`. Pick whichever keyword the installed version accepts so
# this cell runs on both old and new releases.
eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in TrainingArguments.__dataclass_fields__
    else 'evaluation_strategy'
)

training_args_1 = TrainingArguments(
    output_dir='./results_phase1',
    save_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate on the validation set at the end of every epoch.
    **{eval_strategy_key: 'epoch'},
)

trainer_1 = Trainer(
    model=model,
    args=training_args_1,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

print("===== Phase 1: Training classification head only =====")
trainer_1.train()


In [None]:
# ------------------ 8. Phase 2: Unfreeze all layers ------------------
# Re-enable gradients for every parameter under `model.distilbert`, so the
# whole network is fine-tuned in this phase.
for param in model.distilbert.parameters():
    param.requires_grad = True

# Newer transformers releases renamed `evaluation_strategy` to
# `eval_strategy`. Pick whichever keyword the installed version accepts so
# this cell runs on both old and new releases.
eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in TrainingArguments.__dataclass_fields__
    else 'evaluation_strategy'
)

training_args_2 = TrainingArguments(
    output_dir='./results_phase2',
    save_strategy='epoch',
    # Lower learning rate than phase 1 (2e-5 vs 5e-5) for full fine-tuning.
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs_phase2',
    logging_steps=10,
    # Evaluate on the validation set at the end of every epoch.
    **{eval_strategy_key: 'epoch'},
)

trainer_2 = Trainer(
    model=model,
    args=training_args_2,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

print("===== Phase 2: Fine-tuning all layers =====")
trainer_2.train()


In [None]:
# ------------------ 9. Evaluation on the test set ------------------
preds = trainer_2.predict(test_dataset)
# The predicted class is the index of the largest logit of each sample.
y_pred = np.argmax(preds.predictions, axis=1)

# Fix the class order explicitly. The matrix rows/columns and the report
# rows then always line up with the class names, even if some class never
# appears in `test_labels` or `y_pred`.
label_ids = sorted(id2label)
label_names = [id2label[i] for i in label_ids]

cm = confusion_matrix(test_labels, y_pred, labels=label_ids)
sns.heatmap(cm, annot=True, fmt='d', xticklabels=label_names, yticklabels=label_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

print(classification_report(test_labels, y_pred, labels=label_ids, target_names=label_names))


In [None]:
# ------------------ 10. Training curves ------------------
logs = trainer_2.state.log_history
# Training loss is logged every `logging_steps` steps and validation loss
# once per epoch, so the two series have different lengths.
train_logs = [x for x in logs if 'loss' in x]
eval_logs = [x for x in logs if 'eval_loss' in x]
train_loss = [x['loss'] for x in train_logs]
eval_loss = [x['eval_loss'] for x in eval_logs]

# Plot both series against the global step stored with each log entry.
# Plotting by list index would squeeze the few validation points to the far
# left of the axis instead of placing them at the end of their epochs.
train_steps = [x['step'] for x in train_logs]
eval_steps = [x['step'] for x in eval_logs]

plt.plot(train_steps, train_loss, label='Train Loss')
plt.plot(eval_steps, eval_loss, marker='o', label='Validation Loss')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.legend()
plt.title('Training vs Validation Loss')
plt.show()


Το μοντέλο που επιλέξαμε είναι το *DistilBERT*, ένα ελαφρύτερο και ταχύτερο παράγωγο του BERT. Διαθέτει 6 layers (αντί για 12) και προσφέρει πολύ καλή απόδοση με μικρότερο κόστος υπολογισμού.

Εφαρμόσαμε εκπαίδευση σε δύο φάσεις (μια απλοποιημένη μορφή του gradual unfreezing):
- Στην *1η φάση*, εκπαιδεύσαμε μόνο τον ταξινομητή κρατώντας το υπόλοιπο μοντέλο "παγωμένο".
- Στη *2η φάση*, ξεπαγώσαμε όλα τα layers και κάναμε πλήρες fine-tuning.

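Αυτό βοηθά να διατηρηθούν οι προεκπαιδευμένες γνώσεις, επειδή ο νέος ταξινομητής εκπαιδεύεται πρώτα μόνος του και δεν αλλοιώνει απότομα τα βάρη του backbone· έτσι το μοντέλο προσαρμόζεται καλύτερα στα νέα δεδομένα και περιορίζεται ο κίνδυνος overfitting.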
