In [None]:

!pip install torch transformers datasets tqdm scikit-learn accelerate imbalanced-learn joblib

In [None]:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import LongformerTokenizer, LongformerForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from datasets import Dataset as HFDataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from accelerate import Accelerator, notebook_launcher
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
import torch.nn.functional as F
import joblib

In [None]:

class Config:
    """Central place for every tunable used by the notebook."""

    # --- data selection -----------------------------------------------------
    min_text_length = 0              # lower bound on the word count kept by clean_data
    content_length_percentile = 90   # percentile of word counts used as the upper cutoff
    subset_fraction = 0.01           # fraction sampled when train_on_subset is True
    train_on_subset = False          # toggle for training on a small sample only

    # --- model ----------------------------------------------------------------
    pretrained_model = "allenai/longformer-base-4096"  # checkpoint for tokenizer and model
    max_length = None                # filled in by clean_data
    dropout_rate = 0.2               # not referenced by the code visible in this notebook

    # --- optimisation ---------------------------------------------------------
    num_epochs = 4
    batch_size = 4                   # per-step DataLoader batch size
    gradient_accumulation_steps = 8  # passed to the Accelerator
    learning_rate = 2e-5
    weight_decay = 0.01
    warmup_ratio = 0.1               # share of scheduler steps spent warming up

    # --- runtime --------------------------------------------------------------
    seed = 428                       # shared by torch, numpy, sampler and split
    mixed_precision = "fp16"         # passed to the Accelerator


# Single shared instance read (and, for max_length, written) by the other cells.
config = Config()

In [None]:

# Seed the global NumPy and PyTorch random number generators from the shared
# config value.
np.random.seed(config.seed)
torch.manual_seed(config.seed)

def clean_data(df, model_max_length=4096):
    """Drop rows without content, filter rows by word count, and set ``config.max_length``.

    Rows whose ``content`` is null are removed. A ``text_length`` column holding the
    whitespace-separated word count of ``content`` is added, and only rows whose
    word count lies between ``config.min_text_length`` and the
    ``config.content_length_percentile``-th percentile (inclusive) are kept.

    Side effect: ``config.max_length`` is set to the percentile threshold rounded
    up to a multiple of 2048, clamped to ``[2048, model_max_length]`` (or to
    ``model_max_length`` itself when that is below 2048).

    Args:
        df: DataFrame with at least a ``content`` column.
        model_max_length: Upper bound for ``config.max_length``. The default of 4096
            is chosen to match the ``-4096`` checkpoint configured in ``Config``;
            change it if a different pretrained model is used.

    Returns:
        The filtered copy of ``df`` including the ``text_length`` column.

    Raises:
        ValueError: If no row has non-null ``content``.
    """
    df = df.dropna(subset=['content']).copy()
    if df.empty:
        # np.percentile on an empty column fails with an obscure error; fail clearly instead.
        raise ValueError("clean_data: no rows with non-null 'content' to process")

    df['text_length'] = df['content'].apply(lambda x: len(str(x).split()))
    length_threshold = np.percentile(df['text_length'], config.content_length_percentile)
    df = df[(df['text_length'] >= config.min_text_length) &
            (df['text_length'] <= length_threshold)]

    # Round the threshold up to a multiple of 2048, but never beyond what the model
    # can embed: an uncapped value (e.g. 6144) would be used as the padding length
    # and overflow the checkpoint's position embeddings. The floor covers the
    # degenerate case of a zero threshold.
    rounded_length = int(np.ceil(length_threshold / 2048) * 2048)
    config.max_length = min(max(rounded_length, 2048), model_max_length)
    print(f"Using dynamic max length: {config.max_length}")

    return df

In [None]:

# Load the raw training split and apply the length-based filtering
# (clean_data also sets config.max_length as a side effect).
df_train = pd.read_csv("/kaggle/input/dataset-gods/train.csv")
df_train = clean_data(df_train)

# Model input is "<title> <content>". Missing titles are replaced with "" first:
# NaN + str yields NaN, which the dataset would later stringify to "nan" and
# the row's content would be lost.
df_train['text'] = df_train['title'].fillna("") + " " + df_train['content']

# Encode the string labels as integer class ids.
label_encoder = LabelEncoder()
df_train["target"] = label_encoder.fit_transform(df_train["target"])
num_classes = len(label_encoder.classes_)

# Balance the classes by random under-sampling. fit_resample returns the
# features AND the matching labels in the same row order, so the two are paired
# by position; this does not depend on whether the resampled frame keeps the
# original index labels.
rus = RandomUnderSampler(random_state=config.seed)
df_resampled, y_resampled = rus.fit_resample(df_train[['text', 'text_length']], df_train['target'])
df_train = pd.DataFrame({
    'text': df_resampled['text'].to_numpy(),
    'target': np.asarray(y_resampled),
})

# Optional: work on a small random sample for quick experiments.
if config.train_on_subset:
    df_train = df_train.sample(frac=config.subset_fraction, random_state=config.seed)

# Hold out 20% of the (balanced) data for validation.
from sklearn.model_selection import train_test_split
df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=config.seed)

In [None]:

class MentalHealthDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    ``texts`` and ``labels`` are accessed positionally through ``.iloc``, so they
    are expected to be pandas objects of equal length. Every item is padded or
    truncated to ``config.max_length`` tokens.
    """

    def __init__(self, texts, labels, tokenizer):
        self.texts, self.labels, self.tokenizer = texts, labels, tokenizer

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        raw_text = str(self.texts.iloc[idx])
        target = self.labels.iloc[idx]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=config.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        # The tokenizer output is flattened so each item is a 1-D tensor.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

In [None]:

# Tokenizer loaded from the same checkpoint as the model; shared by the
# training and validation datasets built in training_function.
tokenizer = LongformerTokenizer.from_pretrained(config.pretrained_model)

def initialize_model(num_classes):
    """Load the configured checkpoint with a ``num_classes``-way classification head.

    Args:
        num_classes: Number of target labels, forwarded as ``num_labels``.

    Returns:
        The object returned by ``LongformerForSequenceClassification.from_pretrained``.
    """
    checkpoint = config.pretrained_model
    return LongformerForSequenceClassification.from_pretrained(checkpoint, num_labels=num_classes)

In [None]:

def training_function():
    """Fine-tune the classifier with Accelerate and checkpoint the best epoch.

    Reads the notebook-level ``config``, ``num_classes``, ``tokenizer``,
    ``df_train`` and ``df_val``. After every epoch the model is evaluated on the
    training and validation loaders; whenever the validation weighted-F1 improves,
    the accelerator state is saved to ``best_model``.

    Returns:
        The prepared model as it is after the final epoch (not necessarily the
        checkpointed best epoch).
    """
    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
    )

    model = initialize_model(num_classes)

    # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps=1e-6
    # matches the default of the implementation it replaces.
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config.learning_rate,
        weight_decay=config.weight_decay,
        eps=1e-6,
    )

    train_dataset = MentalHealthDataset(df_train['text'], df_train['target'], tokenizer)
    val_dataset = MentalHealthDataset(df_val['text'], df_val['target'], tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False, num_workers=2)

    total_steps = len(train_loader) * config.num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(total_steps * config.warmup_ratio),
        num_training_steps=total_steps
    )

    # Inverse-sqrt-frequency class weights. minlength keeps the weight vector
    # aligned with the classifier head even if the highest class id is absent
    # from the training split, and the floor of 1 prevents an infinite weight
    # for a class with no training examples.
    class_counts = np.bincount(df_train['target'], minlength=num_classes)
    safe_counts = np.maximum(class_counts, 1)
    class_weights = torch.tensor(1. / np.sqrt(safe_counts), dtype=torch.float32)
    criterion = nn.CrossEntropyLoss(weight=class_weights.to(accelerator.device))

    model, optimizer, train_loader, val_loader, scheduler = accelerator.prepare(
        model, optimizer, train_loader, val_loader, scheduler
    )

    def evaluate(model, loader):
        """Return ``(accuracy, weighted F1)`` of ``model`` over ``loader``."""
        model.eval()
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in tqdm(loader, desc="Evaluating"):
                outputs = model(**batch)
                preds = torch.argmax(outputs.logits, dim=-1)

                # Store plain Python ints rather than 0-d tensors for sklearn.
                all_preds.extend(accelerator.gather_for_metrics(preds).cpu().tolist())
                all_labels.extend(accelerator.gather_for_metrics(batch['labels']).cpu().tolist())

        accuracy = accuracy_score(all_labels, all_preds)
        f1 = f1_score(all_labels, all_preds, average='weighted')
        return accuracy, f1

    # Start below any attainable F1 so the first epoch is always checkpointed.
    best_f1 = float("-inf")
    for epoch in range(config.num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.num_epochs}", leave=False)
        for batch in progress_bar:
            with accelerator.accumulate(model):
                outputs = model(**batch)
                loss = criterion(outputs.logits, batch['labels'])

                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(model.parameters(), 1.0)

                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

                # `loss` is already a per-batch mean, so the running average is
                # taken over batches, not over individual samples.
                total_loss += loss.item()
                num_batches += 1
                progress_bar.set_postfix(loss=total_loss / num_batches)

        train_accuracy, train_f1 = evaluate(model, train_loader)
        val_accuracy, val_f1 = evaluate(model, val_loader)
        print(f"Epoch {epoch+1}:")
        print(f"  Train Accuracy: {train_accuracy:.4f}, Train F1: {train_f1:.4f}")
        print(f"  Validation Accuracy: {val_accuracy:.4f}, Validation F1: {val_f1:.4f}")

        if val_f1 > best_f1:
            best_f1 = val_f1
            print("Saving model with best validation F1 score...")
            accelerator.save_state("best_model")

    return model

In [None]:

# Launch training through Accelerate's notebook launcher, using a single process.
if __name__ == "__main__":
    notebook_launcher(training_function, num_processes=1)