In [None]:
# Import Necessary Libraries

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset
from transformers import DataCollatorWithPadding as DataCollator
from transformers import AutoTokenizer, AutoModel
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Load the raw CSV into a pandas DataFrame.
# NOTE(review): 'dataset_path' looks like a placeholder — point it at the real file.

df = pd.read_csv(filepath_or_buffer=r'dataset_path')

In [None]:
# Turn the discourse_type categories into integer ids, stored in a new 'labels' column.
# The fitted encoder is kept so the ids can be mapped back to category names later.

label_encoder = LabelEncoder()
encoded_types = label_encoder.fit_transform(df['discourse_type'])
df['labels'] = encoded_types

In [None]:
# Convert Pandas DataFrame to Hugging Face Dataset.
# `df` is rebound from a DataFrame to a Dataset here. The isinstance guard keeps this
# cell safe to re-run: on a second execution `df` is already a Dataset and must not be
# passed to Dataset.from_pandas again.

if isinstance(df, pd.DataFrame):
    df = Dataset.from_pandas(df)

In [None]:
# Checkpoint identifier of the base model; it is handed to `from_pretrained` for both
# the tokenizer and the encoder.

model_path = "microsoft/deberta-v3-base"

In [None]:
# Load the tokenizer for the base checkpoint and define the tokenization step

tokenizer = AutoTokenizer.from_pretrained(model_path)


def preprocess(text):
    """Tokenize the ``discourse_text`` entry of a dataset example or batch.

    Inputs longer than 256 tokens are truncated. No padding is applied here
    (``padding=False``); batches are padded later by the data collator.

    Args:
        text: Mapping that contains a ``discourse_text`` entry.

    Returns:
        The tokenizer output for that text.
    """
    return tokenizer(
        text['discourse_text'],
        truncation=True,
        max_length=256,
        padding=False,
    )

In [None]:
# Apply `preprocess` to the whole dataset, processing it in batches

tokenized_df = df.map(function=preprocess, batched=True)

In [None]:
# Hold out 20% of the tokenized examples for evaluation; the fixed seed keeps the
# split identical between runs.

split_dataset = tokenized_df.train_test_split(seed=42, test_size=0.2)
train_df, test_df = split_dataset['train'], split_dataset['test']

In [None]:
# Computing Class Weights to Overcome Class Imbalance.
# The class ids are taken from the fitted label encoder rather than a hard-coded
# [0..6] list, so they always match the ids stored in the 'labels' column.
# The labels column is converted to a NumPy array before it is handed to scikit-learn.

weights = compute_class_weight(
    class_weight="balanced",
    classes=np.arange(len(label_encoder.classes_)),
    y=np.asarray(train_df["labels"]),
)
# float32 tensor, used as the per-class weight of the cross-entropy loss
class_weights = torch.tensor(weights, dtype=torch.float)

In [None]:
# Custom PyTorch Module For Attention Pooling

class AttentionPooler(nn.Module):
    """Pool a sequence of token embeddings into one vector using learned attention.

    A single linear layer produces one score per token. The scores are
    softmax-normalised over the sequence dimension (dim=1) and used as weights
    for a sum of the token embeddings.

    Args:
        hidden_size: Size of the last dimension of the embeddings to pool.
    """

    # Score written to masked positions before the softmax; after normalisation
    # those positions receive (numerically) zero weight.
    _MASKED_SCORE = -1e4

    def __init__(self, hidden_size):
        super().__init__()
        self.attention = nn.Linear(hidden_size, 1)

    def forward(self, last_hidden_state, attention_mask=None):
        """Return the attention-weighted sum of ``last_hidden_state`` over dim 1.

        Args:
            last_hidden_state: Token embeddings; the last dimension must equal
                ``hidden_size`` and dim 1 is treated as the sequence dimension.
            attention_mask: Optional mask whose zero entries mark positions to
                ignore. It is unsqueezed on the last axis to line up with the
                per-token scores. When ``None``, every position takes part.

        Returns:
            Tensor with the sequence dimension summed out.
        """
        attention_scores = self.attention(last_hidden_state)
        if attention_mask is not None:
            ignored = attention_mask.unsqueeze(-1) == 0
            # Out-of-place fill: does not overwrite a tensor tracked by autograd.
            attention_scores = attention_scores.masked_fill(ignored, self._MASKED_SCORE)
        attention_weights = torch.softmax(attention_scores, dim=1)
        return torch.sum(attention_weights * last_hidden_state, dim=1)

# Custom PyTorch Model

class ParagraphClassifier(nn.Module):
    """Pretrained encoder, attention pooling and a linear head for classification.

    Args:
        model: Checkpoint name or path passed to ``AutoModel.from_pretrained``.
        num_labels: Number of output classes. Defaults to 7, the value that was
            previously hard-coded.
        loss_weights: Optional tensor of per-class weights for the cross-entropy
            loss. When omitted, the notebook-level ``class_weights`` tensor is
            used, which is the previous behaviour.
    """

    def __init__(self, model, num_labels=7, loss_weights=None):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model)
        hidden = self.transformer.config.hidden_size
        self.pooler = AttentionPooler(hidden_size=hidden)
        self.classifier = nn.Linear(hidden, num_labels)

        # Weighted Loss for Class Imbalance
        if loss_weights is None:
            loss_weights = class_weights
        self.loss_fn = nn.CrossEntropyLoss(weight=loss_weights)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Encode, pool and classify a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Mask forwarded to the encoder and the pooler. When
                ``None``, an all-ones mask shaped like ``input_ids`` is used so
                that the pooler attends to every position.
            labels: Optional target class ids; when given, the weighted
                cross-entropy loss is computed as well.

        Returns:
            ``{"loss": ..., "logits": ...}`` when ``labels`` is provided,
            otherwise ``{"logits": ...}``.
        """
        if attention_mask is None and input_ids is not None:
            attention_mask = torch.ones_like(input_ids)

        out = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = out.last_hidden_state
        pooled = self.pooler(last_hidden_state=last_hidden, attention_mask=attention_mask)
        logits = self.classifier(pooled)

        if labels is not None:
            loss = self.loss_fn(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}

In [None]:
# Dynamic padding: sequences are left unpadded at tokenization time and padded
# per batch by the collator, which returns PyTorch tensors.

data_collator = DataCollator(tokenizer=tokenizer, return_tensors='pt', padding=True)

In [None]:
# Function for Metrics Calculation (Accuracy, F1-Macro)

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(references, predicted),
        "f1": f1_score(references, predicted, average='macro'),
    }

In [None]:
# Training Hyperparameters

training_args = TrainingArguments(
  output_dir='checkpoints',
  # Save, log and evaluate once per epoch
  save_strategy='epoch',
  logging_strategy='epoch',
  eval_strategy='epoch',
  # Memory Optimization
  per_device_train_batch_size=16,
  per_device_eval_batch_size=16,
  save_total_limit=4,
  # Training
  learning_rate=1e-5,
  num_train_epochs=10,
  weight_decay=0.02,
  # Evaluation: the best checkpoint is chosen by the 'f1' value from compute_metrics
  metric_for_best_model='f1',
  greater_is_better=True,
  load_best_model_at_end=True,
  # For GPU: enable fp16 mixed precision only when CUDA is available, so the
  # notebook does not fail at this cell on a CPU-only machine
  fp16=torch.cuda.is_available(),
  # Other
  report_to='none'
)

In [None]:
# Loading Model
# Seed PyTorch before the model is built so that the randomly initialised pooler and
# classifier layers are the same on every run (the seed comes from training_args).
torch.manual_seed(training_args.seed)
model = ParagraphClassifier(model=model_path)

# Defining Trainer
trainer = Trainer(
  model=model,
  args=training_args,
  processing_class=tokenizer,
  train_dataset=train_df,
  eval_dataset=test_df,
  compute_metrics=compute_metrics,
  data_collator=data_collator,
  # Stop when the monitored metric has not improved for 2 consecutive evaluations
  callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [None]:
# Training
# Runs fine-tuning with the Trainer configured above (up to 10 epochs, evaluation
# every epoch, early stopping with patience 2). The call's return value is the
# cell output.

trainer.train()