<h1 style="text-align:center; font-size: 32px; color: #2c3e50;">🏆 4th Place Solution 🏆</h1>
<h2 style="text-align:center; font-size: 24px; color: #34495e;">GODS Hackathon</h2>
<h3 style="text-align:center; font-size: 20px; color: #2980b9;">NLP for Mental Health</h3>
<p style="text-align:center; font-size: 18px; color: #555;">
   This notebook presents the <strong>4th place solution</strong> in the <strong>GODS Hackathon</strong>, where we applied <strong>Natural Language Processing (NLP)</strong> 
   to tackle challenges in <strong>mental health analysis and intervention</strong>.  
</p>
<hr style="border: 1px solid #ddd;">
<p style="text-align:center; font-size: 16px; color: #666;">
   🏅 Ranked <strong>4th place</strong> among top AI &amp; NLP solutions.<br>
   💡 Utilized <strong>state-of-the-art language models</strong> to analyze mental health data.
</p>

In [3]:
import torch
import pandas as pd
import numpy as np
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import (
    DebertaV2Tokenizer, DebertaV2Model,
    RobertaTokenizer, RobertaModel,
    AdamW, get_linear_schedule_with_warmup
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from collections import Counter
from tqdm import tqdm
import os
import warnings
warnings.filterwarnings('ignore')

# Configuration for all models

In [4]:
class Config:
    """Hyperparameters and runtime settings shared by the training and inference code.

    All values are class attributes and are read directly, e.g. ``Config.epochs``.
    """

    # Per-model settings keyed by a short model type. ``name`` is the identifier
    # handed to ``from_pretrained``; the key itself is also used to build the
    # checkpoint filename (``best_<key>.pth``).
    models = {
        'deberta': {
            'name': 'microsoft/deberta-v3-base',
            'batch_size': 8,
            'lr': 3e-5,
            'max_len': 512
        },
        'roberta-base': {
            'name': 'roberta-base',
            'batch_size': 8,
            'lr': 2e-5,
            'max_len': 512
        },
        'roberta-large': {
            'name': 'roberta-large',
            'batch_size': 4,
            'lr': 1e-5,
            'max_len': 512
        }
    }
    # Number of training epochs per model.
    epochs = 3
    # Output size of every classification head.
    num_classes = 5
    # Use the GPU when one is available, otherwise the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # os.cpu_count() returns None when the CPU count cannot be determined, and
    # DataLoader needs an integer; 0 means "load batches in the main process".
    num_workers = os.cpu_count() or 0
    # Patience, in epochs, intended for early stopping on the validation score.
    early_stopping_patience = 2

In [11]:
# Data augmentation: synthesize extra samples for under-represented classes with a causal language model
class DataAugmentor:
    """Produce variations of a text by sampling from a causal language model.

    The tokenizer and model are loaded once in the constructor and the model is
    moved to ``Config.device``.
    """

    def __init__(self, model_name="gpt2"):
        """Load the tokenizer and causal LM identified by ``model_name``."""
        # The notebook's import cell does not bring these two classes into
        # scope, so on a fresh kernel this constructor raised NameError.
        from transformers import AutoModelForCausalLM, AutoTokenizer

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.model.to(Config.device)

    def generate_similar_text(self, text, num_variations=1, max_length=512):
        """Generate variations of the input text using the LLM.

        Args:
            text: Source text. Only its first 200 characters go into the prompt.
            num_variations: Number of sequences to sample for this text.
            max_length: Kept for backward compatibility; it is not used. The
                generated length is bounded by ``max_new_tokens`` below.

        Returns:
            A list of generated continuations with the prompt removed. Empty
            generations are dropped, so the list may be shorter than
            ``num_variations``.
        """
        max_input_length = 200  # Character budget for the source text inside the prompt
        text = text[:max_input_length]

        prompt = f"Generate a similar text to: {text}\nSimilar text:"

        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(Config.device)
        input_length = input_ids.shape[1]
        # The prompt is a single un-padded sequence, so every position is a real
        # token. Passing the mask explicitly avoids relying on generate() to
        # infer it, which it cannot do when pad and EOS share the same id.
        attention_mask = torch.ones_like(input_ids)

        outputs = self.model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=100,  # Control the length of generated text
            num_return_sequences=num_variations,
            temperature=0.8,
            top_p=0.9,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id,
            no_repeat_ngram_size=3,  # Prevent repetitive text
            length_penalty=1.0  # Neutral value; only consulted by beam-based decoding
        )

        generated_texts = []
        for output in outputs:
            # Each output starts with the prompt tokens; keep only what follows.
            generated_text = self.tokenizer.decode(output[input_length:], skip_special_tokens=True).strip()
            if generated_text:  # Only add non-empty generations
                generated_texts.append(generated_text)

        return generated_texts

def balance_dataset(df, augment_classes=(0, 4), augmentor=None):
    """Balance the dataset by augmenting the specified classes.

    For every label in ``augment_classes`` that has fewer rows than the largest
    class, new rows are generated from that class's existing rows until the
    class reaches the largest class's size or its rows run out (at most five
    variations are requested per source row).

    Args:
        df: Frame with at least ``title``, ``content``, ``target`` and ``label``
            columns. It is not modified.
        augment_classes: Iterable of label ids to top up.
        augmentor: Optional object exposing
            ``generate_similar_text(text, num_variations=...)``. When omitted, a
            ``DataAugmentor`` is created the first time a class actually needs
            new samples, so nothing is loaded if the data is already balanced.

    Returns:
        A new frame with a fresh 0..n-1 index holding the original rows followed
        by the generated ones. Generated rows carry ``title``, ``content``,
        ``target`` and ``label``, plus ``text`` when ``df`` has that column.
    """
    class_counts = df['label'].value_counts()
    max_class_count = class_counts.max()
    # Downstream code reads a combined `text` column when it exists; generated
    # rows must fill it too or they end up with NaN there after the concat.
    has_text_column = 'text' in df.columns

    augmented_data = []

    for cls in augment_classes:
        class_samples = df[df['label'] == cls]
        samples_needed = max_class_count - len(class_samples)

        if samples_needed <= 0:
            print(f"Class {cls} doesn't need augmentation.")
            continue

        if class_samples.empty:
            # Nothing to paraphrase from, so skip before loading any model.
            print(f"Class {cls} has no samples to augment from.")
            continue

        print(f"Generating {samples_needed} new samples for class {cls}...")

        if augmentor is None:
            augmentor = DataAugmentor()

        for idx, row in tqdm(class_samples.iterrows(), total=len(class_samples), desc=f"Augmenting class {cls}"):
            # Stop generating as soon as the quota is met instead of running the
            # model on every remaining row and throwing the output away.
            if samples_needed <= 0:
                break

            num_variations = samples_needed // len(class_samples) + 1
            original_text = f"{row['title']} {row['content']}"

            try:
                generated_texts = augmentor.generate_similar_text(
                    original_text,
                    num_variations=min(num_variations, 5)  # Limit variations per sample
                )

                for text in generated_texts:
                    if samples_needed <= 0:
                        break

                    split_point = min(len(text) // 4, 50)  # Use first quarter or 50 chars as title
                    new_title = text[:split_point].strip()
                    new_content = text[split_point:].strip()

                    if not (new_title and new_content):
                        # Unusable generation: skip it without spending quota.
                        continue

                    record = {
                        'title': new_title,
                        'content': new_content,
                        'target': row['target'],
                        'label': cls
                    }
                    if has_text_column:
                        record['text'] = f"{new_title} {new_content}"
                    augmented_data.append(record)
                    samples_needed -= 1

            except Exception as e:
                # Best effort: a failed generation for one row must not abort the run.
                print(f"Error generating text for sample {idx} in class {cls}: {str(e)}")
                continue

    print(f"Successfully generated {len(augmented_data)} new samples")
    if not augmented_data:
        # Same shape of result as the concat below, without concatenating an
        # empty, column-less frame.
        return df.reset_index(drop=True)

    augmented_df = pd.DataFrame(augmented_data)
    balanced_df = pd.concat([df, augmented_df], ignore_index=True)
    return balanced_df

In [5]:
class MentalHealthDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its target.

    Each item is a dict with 1-D ``input_ids`` and ``attention_mask`` tensors
    and a scalar ``targets`` tensor of dtype ``torch.long``.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts, self.targets = texts, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Every sample is truncated/padded to exactly `max_len` tokens.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(str(self.texts[idx]), **tokenizer_options)

        # The tokenizer returns batched tensors; collapse them to 1-D per sample.
        item = {
            field: encoded[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        item['targets'] = torch.tensor(self.targets[idx], dtype=torch.long)
        return item

In [6]:
# Model Architecture
class DebertaClassifier(nn.Module):
    """DeBERTa-v2 encoder followed by dropout and a linear classification head."""

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.deberta = DebertaV2Model.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        hidden_dim = self.deberta.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_classes)
        # Only the head's weight matrix is re-initialised; its bias keeps the
        # nn.Linear default.
        nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids, attention_mask):
        """Return the head's output for each sequence in the batch."""
        encoder_out = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        # The hidden state at sequence position 0 is used as the sequence summary.
        first_token_state = encoder_out.last_hidden_state[:, 0]
        logits = self.classifier(self.dropout(first_token_state))
        return logits

class RobertaClassifier(nn.Module):
    """RoBERTa encoder followed by dropout and a linear classification head."""

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        hidden_dim = self.roberta.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_classes)
        # Only the head's weight matrix is re-initialised; its bias keeps the
        # nn.Linear default.
        nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids, attention_mask):
        """Return the head's output for each sequence in the batch."""
        encoder_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # The hidden state at sequence position 0 is used as the sequence summary.
        first_token_state = encoder_out.last_hidden_state[:, 0]
        logits = self.classifier(self.dropout(first_token_state))
        return logits

In [9]:
# Trainer Class
class Trainer:
    """Run mixed-precision training and validation for one classifier.

    Owns the optimizer, a linear-decay learning-rate schedule sized for
    ``Config.epochs`` passes over ``train_loader``, a class-weighted
    cross-entropy loss and a gradient scaler.
    """

    def __init__(self, model, model_cfg, train_loader, val_loader, class_weights):
        """Set up the model, optimizer, schedule, loss and scaler.

        Args:
            model: Module called as ``model(input_ids=..., attention_mask=...)``.
            model_cfg: Dict providing at least ``'lr'``.
            train_loader: Loader yielding dicts with a ``'targets'`` entry plus
                the model's keyword inputs.
            val_loader: Loader with the same batch format, used by ``validate``.
            class_weights: Per-class weight tensor for the loss.
        """
        # Wrap in DataParallel only when more than one GPU is visible.
        if torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)
        self.model = model.to(Config.device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.optimizer = AdamW(self.model.parameters(), lr=model_cfg['lr'])
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=0,
            num_training_steps=len(train_loader) * Config.epochs
        )
        self.criterion = nn.CrossEntropyLoss(weight=class_weights)
        self.scaler = torch.cuda.amp.GradScaler()

    def train_epoch(self):
        """Train for one pass over ``train_loader``.

        Returns:
            Mean training loss per batch for the epoch.
        """
        self.model.train()
        total_loss = 0
        progress_bar = tqdm(self.train_loader, desc='Training')
        for step, batch in enumerate(progress_bar, start=1):
            inputs = {k: v.to(Config.device) for k, v in batch.items() if k != 'targets'}
            targets = batch['targets'].to(Config.device)

            with torch.cuda.amp.autocast():
                outputs = self.model(**inputs)
                loss = self.criterion(outputs, targets)

            self.scaler.scale(loss).backward()
            scale_before = self.scaler.get_scale()
            self.scaler.step(self.optimizer)
            self.scaler.update()
            self.optimizer.zero_grad()
            # update() lowers the scale only when inf/NaN gradients made step()
            # skip the optimizer. In that case the schedule must not advance,
            # otherwise it runs ahead of the real number of optimizer steps.
            if self.scaler.get_scale() >= scale_before:
                self.scheduler.step()

            total_loss += loss.item()
            # Running mean over the batches seen so far in this epoch.
            progress_bar.set_postfix({'loss': total_loss / step})
        return total_loss / len(self.train_loader)

    def validate(self):
        """Evaluate on ``val_loader`` and print a classification report.

        Returns:
            Tuple ``(mean validation loss per batch, macro-averaged F1)``.
        """
        self.model.eval()
        val_loss = 0
        all_preds = []
        all_targets = []
        with torch.no_grad():
            for batch in tqdm(self.val_loader, desc='Validating'):
                inputs = {k: v.to(Config.device) for k, v in batch.items() if k != 'targets'}
                targets = batch['targets'].to(Config.device)
                outputs = self.model(**inputs)
                loss = self.criterion(outputs, targets)
                val_loss += loss.item()
                all_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
                all_targets.extend(targets.cpu().numpy())

        f1 = f1_score(all_targets, all_preds, average='macro')
        print(classification_report(all_targets, all_preds))
        return val_loss / len(self.val_loader), f1

In [None]:
def train_model(model_type):
    """Train one model of the ensemble and checkpoint its best epoch.

    Reads the training CSV, tops up classes 0 and 4 with generated samples,
    makes a stratified 80/20 split, trains for up to ``Config.epochs`` epochs
    and writes the weights of the epoch with the best validation macro-F1 to
    ``best_<model_type>.pth``.

    Args:
        model_type: Key into ``Config.models``. Keys containing ``'deberta'``
            use the DeBERTa tokenizer and classifier; all others use RoBERTa.
    """
    # Load data
    df = pd.read_csv('/kaggle/input/gods-dataset/Train.csv')
    # fillna on both parts: a missing title would otherwise turn the whole text into NaN.
    df['text'] = df['title'].fillna('') + ' ' + df['content'].fillna('')
    # Label ids follow the order in which targets first appear in the file.
    class_names = df['target'].unique()
    label2id = {cls: i for i, cls in enumerate(class_names)}
    df['label'] = df['target'].map(label2id)
    print("Balancing dataset with augmented samples for classes 0 and 4...")
    balanced_df = balance_dataset(df, augment_classes=[0, 4])
    # Generated rows are built from title/content only, so rebuild `text` for
    # every row; otherwise they would reach the tokenizer as the string 'nan'.
    balanced_df['text'] = balanced_df['title'].fillna('') + ' ' + balanced_df['content'].fillna('')
    print(f"Original class distribution: {df['label'].value_counts()}")
    print(f"Balanced class distribution: {balanced_df['label'].value_counts()}")

    # Split data
    # Stratify on the frame actually being split: the labels of the original
    # `df` have a different length as soon as any row was generated.
    # NOTE(review): augmentation runs before the split, so generated variants of
    # training rows can land in the validation set and inflate the score;
    # consider splitting first and augmenting only the training part.
    train_df, val_df = train_test_split(
        balanced_df,
        test_size=0.2,
        stratify=balanced_df['label'],
        random_state=42
    )

    # Model setup
    model_cfg = Config.models[model_type]
    tokenizer = DebertaV2Tokenizer.from_pretrained(model_cfg['name']) if 'deberta' in model_type else RobertaTokenizer.from_pretrained(model_cfg['name'])

    # Datasets
    train_dataset = MentalHealthDataset(
        train_df['text'].values,
        train_df['label'].values,
        tokenizer,
        model_cfg['max_len']
    )
    val_dataset = MentalHealthDataset(
        val_df['text'].values,
        val_df['label'].values,
        tokenizer,
        model_cfg['max_len']
    )

    # DataLoaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=model_cfg['batch_size'],
        shuffle=True,
        num_workers=Config.num_workers
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=model_cfg['batch_size']*2,
        num_workers=Config.num_workers
    )

    # Model initialization
    if 'deberta' in model_type:
        model = DebertaClassifier(model_cfg['name'], Config.num_classes)
    else:
        model = RobertaClassifier(model_cfg['name'], Config.num_classes)

    # Class weights
    # Inverse-frequency weights, one slot per output class so the tensor always
    # matches the classifier head; a class absent from the split gets weight 0.
    class_counts = Counter(train_df['label'])
    class_weights = torch.tensor([
        1.0 / class_counts[i] if class_counts[i] else 0.0
        for i in range(Config.num_classes)
    ], dtype=torch.float32).to(Config.device)

    # Training
    trainer = Trainer(model, model_cfg, train_loader, val_loader, class_weights)
    # Start below any reachable F1 so the first epoch is always checkpointed and
    # the inference step always finds a file to load.
    best_f1 = float('-inf')
    epochs_without_improvement = 0
    for epoch in range(Config.epochs):
        print(f'\nEpoch {epoch+1}/{Config.epochs}')
        trainer.train_epoch()
        _, val_f1 = trainer.validate()

        if val_f1 > best_f1:
            best_f1 = val_f1
            epochs_without_improvement = 0
            # Save the bare module's weights so they load without DataParallel.
            torch.save(trainer.model.module.state_dict() if isinstance(trainer.model, nn.DataParallel)
                      else trainer.model.state_dict(), f'best_{model_type}.pth')
            print(f'Best model saved for {model_type} with F1: {best_f1:.4f}')
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= Config.early_stopping_patience:
                print(f'Early stopping: no F1 improvement for {epochs_without_improvement} epochs')
                break

In [12]:
def voting_submission():
    """Predict the test set with every trained model and write a majority-vote submission.

    For each key in ``Config.models`` the checkpoint ``best_<key>.pth`` is
    loaded and used to predict a label id per test row. The final label per row
    is the most frequent prediction; when there is a tie, the vote of the model
    listed earliest in ``Config.models`` wins. The result is written to
    ``voted_submission.csv`` with ``id`` and ``target`` columns.
    """
    test_df = pd.read_csv('/kaggle/input/gods-dataset/Test.csv')
    # fillna on both parts: a missing title would otherwise turn the whole text into NaN.
    test_df['text'] = test_df['title'].fillna('') + ' ' + test_df['content'].fillna('')

    # Load label mappings
    # Rebuilt the same way as at training time: ids follow the order in which
    # targets first appear in the training file.
    train_df = pd.read_csv('/kaggle/input/gods-dataset/Train.csv')
    class_names = train_df['target'].unique()
    id2label = dict(enumerate(class_names))

    # One list of predicted label ids per model, in Config.models order.
    all_model_preds = []
    for model_type in Config.models:
        model_cfg = Config.models[model_type]
        tokenizer = DebertaV2Tokenizer.from_pretrained(model_cfg['name']) if 'deberta' in model_type else RobertaTokenizer.from_pretrained(model_cfg['name'])

        # Create dataset
        test_dataset = MentalHealthDataset(
            test_df['text'].values,
            np.zeros(len(test_df), dtype=np.int64),  # Dummy targets
            tokenizer,
            model_cfg['max_len']
        )
        test_loader = DataLoader(
            test_dataset,
            batch_size=model_cfg['batch_size']*2,
            num_workers=Config.num_workers
        )

        # Load model
        if 'deberta' in model_type:
            model = DebertaClassifier(model_cfg['name'], Config.num_classes)
        else:
            model = RobertaClassifier(model_cfg['name'], Config.num_classes)

        model.load_state_dict(torch.load(f'best_{model_type}.pth', map_location=Config.device))
        model.to(Config.device)
        model.eval()

        # Predict
        preds = []
        with torch.no_grad():
            for batch in tqdm(test_loader, desc=f'Predicting with {model_type}'):
                inputs = {k: v.to(Config.device) for k, v in batch.items() if k != 'targets'}
                outputs = model(**inputs)
                preds.extend(torch.argmax(outputs, dim=1).cpu().tolist())

        all_model_preds.append(preds)
        # Release this model before loading the next one.
        del model
        torch.cuda.empty_cache()

    # Voting
    # zip(*...) groups the votes per test row for however many models ran.
    # most_common keeps first-seen order among equal counts, which makes the
    # tie-break deterministic in favour of the earliest model.
    final_preds = [
        Counter(row_votes).most_common(1)[0][0]
        for row_votes in zip(*all_model_preds)
    ]

    submission = pd.DataFrame({
        'id': test_df['id'],
        'target': [id2label[p] for p in final_preds]
    })
    submission.to_csv('voted_submission.csv', index=False)
    print("Voted submission saved!")

In [13]:
if __name__ == '__main__':
    # Fit and checkpoint each configured model in turn
    for model_type in Config.models:
        print(f'\n{"="*40}\nTraining {model_type}\n{"="*40}')
        train_model(model_type)

    # Combine the checkpointed models' predictions into the final submission
    voting_submission()


Training deberta


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

KeyboardInterrupt: 

# Thank You 