# Imports and Setup

In [None]:
# Importing necessary libraries for data manipulation, deep learning, and tokenization
import pandas as pd
import torch
from torch import nn
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers.modeling_outputs import SequenceClassifierOutput
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import numpy as np


# Data Preprocessing and Label Mapping

In [None]:
# Read the raw CSV into a DataFrame (point this at your own dataset file)
data = pd.read_csv('/content/dataset.csv')  # Adjust the path for your environment

# Encode both target columns as integers. Distinct values are numbered in the
# order returned by Series.unique(), and each column is overwritten in place
# with its integer codes.
emotion_labels = data['label'].unique()
label_mapping = dict(zip(emotion_labels, range(len(emotion_labels))))
data['label'] = data['label'].map(label_mapping)

health_labels = data['health'].unique()
health_mapping = dict(zip(health_labels, range(len(health_labels))))
data['health'] = data['health'].map(health_mapping)

# Show the original-value -> integer-code tables so the encoding can be checked
print("Emotion Label Mapping:", label_mapping)
print("Health Label Mapping:", health_mapping)


Emotion Label Mapping: {1: 0, 0: 1}
Health Label Mapping: {'ptsd': 0, 'assistance': 1, 'relationships': 2, 'survivorsofabuse': 3, 'domesticviolence': 4, 'anxiety': 5, 'homeless': 6, 'stress': 7, 'almosthomeless': 8, 'food_pantry': 9}


# Computing Class Weights

In [None]:
# "Balanced" per-class weights for each target, used to counter class imbalance.
# NOTE(review): the weights are derived from every row of `data`, i.e. before
# any train/test split is applied -- confirm this is intended.
emotion_class_ids = np.asarray([*label_mapping.values()])  # integer codes as a NumPy array
emotion_class_weights = compute_class_weight(
    class_weight="balanced",
    classes=emotion_class_ids,
    y=data['label'].values,
)

health_class_ids = np.asarray([*health_mapping.values()])  # integer codes as a NumPy array
health_class_weights = compute_class_weight(
    class_weight="balanced",
    classes=health_class_ids,
    y=data['health'].values,
)


# Data Splitting

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the emotion
# label only (not on the health column) and seeded for repeatability.
(
    train_texts, test_texts,
    train_labels, test_labels,
    train_health, test_health,
) = train_test_split(
    data['text'],
    data['label'],
    data['health'],
    test_size=0.2,
    stratify=data['label'],
    random_state=42,
)


# BERT Tokenization

In [None]:
# Load the pretrained tokenizer that matches the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_texts(texts):
    """Encode ``texts`` with the module-level ``tokenizer``.

    Padding and truncation are both enabled and ``max_length`` is 128.

    Args:
        texts: The text input(s) handed straight to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for ``texts``.
    """
    return tokenizer(
        texts,
        max_length=128,
        truncation=True,
        padding=True,
    )


# Encode the two splits (the Series are converted to plain lists first)
train_encodings = tokenize_texts(train_texts.tolist())
test_encodings = tokenize_texts(test_texts.tolist())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# Dataset Preparation

In [None]:
# Dataset pairing tokenized inputs with two targets per example (emotion and health)
class MultiLabelDataset(Dataset):
    """Map-style dataset yielding encoded inputs plus two target values.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per example.
        labels: Indexable collection of emotion targets; its length defines
            the dataset size.
        health_conditions: Indexable collection of health targets.
    """

    def __init__(self, encodings, labels, health_conditions):
        self.encodings = encodings
        self.labels = labels
        self.health_conditions = health_conditions

    def __len__(self):
        # The dataset size is taken from the emotion targets.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict for example ``idx``.

        Every encoding field is converted to a tensor; the two targets are
        added under the keys ``'labels'`` and ``'health'`` exactly as stored.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = self.labels[idx]
        sample['health'] = self.health_conditions[idx]
        return sample

# Wrap each split in a MultiLabelDataset; targets are converted to tensors up front
train_dataset = MultiLabelDataset(
    encodings=train_encodings,
    labels=torch.tensor(train_labels.tolist()),
    health_conditions=torch.tensor(train_health.tolist()),
)
test_dataset = MultiLabelDataset(
    encodings=test_encodings,
    labels=torch.tensor(test_labels.tolist()),
    health_conditions=torch.tensor(test_health.tolist()),
)


# Custom Model Definition

In [None]:
# BERT classifier whose single output layer scores two targets (emotion and health)
class CustomBertForMultiLabelClassification(BertForSequenceClassification):
    """BERT sequence classifier trained jointly on an emotion and a health target.

    The logits are read as two adjacent groups of columns: the first
    ``len(emotion_weights)`` columns score the emotion classes and all
    remaining columns score the health classes.

    Args:
        config: Model configuration forwarded to the parent constructor.
        emotion_weights: Per-class loss weights for the emotion target.
        health_weights: Per-class loss weights for the health target.
    """

    def __init__(self, config, emotion_weights, health_weights):
        super().__init__(config)
        # Keep the class weights as float32 tensors; they are moved to the
        # logits' device each time a loss is computed.
        self.emotion_weights = torch.tensor(emotion_weights, dtype=torch.float32)
        self.health_weights = torch.tensor(health_weights, dtype=torch.float32)

    def forward(self, input_ids=None, attention_mask=None, labels=None, health=None, **kwargs):
        """Run the parent forward pass and, when both targets are given, add a loss.

        Args:
            input_ids: Forwarded to the parent ``forward``.
            attention_mask: Forwarded to the parent ``forward``.
            labels: Emotion targets. Not forwarded to the parent ``forward``.
            health: Health targets.
            **kwargs: Any other keyword arguments, forwarded to the parent.

        Returns:
            SequenceClassifierOutput carrying the full logits and ``loss``,
            which is the sum of the two weighted cross-entropy losses, or
            ``None`` unless both ``labels`` and ``health`` are provided.
        """
        base_outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        logits = base_outputs.logits

        # A loss is only defined when both targets are available.
        if labels is None or health is None:
            return SequenceClassifierOutput(loss=None, logits=logits)

        # Column index where the emotion scores end and the health scores begin.
        split_at = len(self.emotion_weights)

        emotion_loss = nn.functional.cross_entropy(
            logits[:, :split_at],
            labels,
            weight=self.emotion_weights.to(logits.device),
        )
        health_loss = nn.functional.cross_entropy(
            logits[:, split_at:],
            health,
            weight=self.health_weights.to(logits.device),
        )

        return SequenceClassifierOutput(loss=emotion_loss + health_loss, logits=logits)


# Model Initialization

In [None]:
# Build the model from the pretrained checkpoint. The output layer needs one
# logit per emotion class plus one per health class.
total_output_classes = len(label_mapping) + len(health_mapping)
model = CustomBertForMultiLabelClassification.from_pretrained(
    'bert-base-uncased',
    emotion_weights=emotion_class_weights,
    health_weights=health_class_weights,
    num_labels=total_output_classes,
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of CustomBertForMultiLabelClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training Configuration

In [None]:
# Hyperparameters and bookkeeping options consumed by the Trainer
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # Training length and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Optimisation schedule / regularisation
    warmup_steps=500,
    weight_decay=0.01,
    # Log every 10 steps; evaluate and save once per epoch
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)




# Trainer Setup and Model Training

In [None]:
# Wire the model, both datasets and the training configuration into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Run training; as the cell's last expression, the returned value is displayed
trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,2.2997,2.305978
2,1.947,1.966677
3,1.2481,1.650122


TrainOutput(global_step=852, training_loss=2.0597073413396667, metrics={'train_runtime': 261.4549, 'train_samples_per_second': 26.047, 'train_steps_per_second': 3.259, 'total_flos': 447986791065600.0, 'train_loss': 2.0597073413396667, 'epoch': 3.0})

# Evaluation and Model Saving

In [None]:
# Score the model on the Trainer's evaluation dataset and show the metrics
evaluation_results = trainer.evaluate()
print("Evaluation Results:", evaluation_results)

# Write the model and the tokenizer into the same directory
saved_model_dir = './mental_health_bert_model'
model.save_pretrained(saved_model_dir)
tokenizer.save_pretrained(saved_model_dir)


Evaluation Results: {'eval_loss': 1.650122046470642, 'eval_runtime': 4.002, 'eval_samples_per_second': 141.931, 'eval_steps_per_second': 8.996, 'epoch': 3.0}


('./mental_health_bert_model/tokenizer_config.json',
 './mental_health_bert_model/special_tokens_map.json',
 './mental_health_bert_model/vocab.txt',
 './mental_health_bert_model/added_tokens.json')

# Prediction Function

In [None]:
# Inference helper: turn one text into an (emotion, health) pair of original labels
def predict_emotion_and_health(text, model, tokenizer, label_map, health_map):
    """Predict the emotion and health labels for ``text``.

    The model is moved to CUDA when available (otherwise CPU) and switched to
    eval mode. Its logits are split after the first ``len(label_map)`` columns:
    the leading group is decoded with ``label_map`` and the rest with
    ``health_map``.

    Args:
        text: Input handed to ``tokenizer``. ``.item()`` is called on each
            argmax, so the encoded batch must contain exactly one sequence.
        model: Callable model whose output exposes ``.logits``.
        tokenizer: Callable tokenizer; its result must support ``.to(device)``
            and ``**`` unpacking.
        label_map: Mapping from original emotion label to integer index.
        health_map: Mapping from original health label to integer index.

    Returns:
        Tuple ``(predicted_emotion, predicted_health)`` made of keys from
        ``label_map`` and ``health_map``.

    Raises:
        IndexError: If a predicted index is not a value of its mapping.
    """

    def _first_key_for(mapping, wanted_index):
        # First key (in mapping order) whose value equals the predicted index.
        matches = [key for key, value in mapping.items() if value == wanted_index]
        return matches[0]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt',
    ).to(device)

    # Inference only, so gradient tracking is disabled for the forward pass.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Emotion scores occupy the leading columns; health scores take the rest.
    split_at = len(label_map)
    emotion_idx = logits[:, :split_at].argmax(dim=1).item()
    health_idx = logits[:, split_at:].argmax(dim=1).item()

    predicted_emotion = _first_key_for(label_map, emotion_idx)
    predicted_health = _first_key_for(health_map, health_idx)
    return predicted_emotion, predicted_health


# Testing Predictions

In [None]:
# A couple of hand-written inputs for a quick sanity check of the trained model
sample_texts = [
    "I have been living on the streets for weeks now, and it's hard to find any hope.",
    "I feel like everything is falling apart around me. My relationships are in shambles."
]

# Run each sample through the prediction helper and print the decoded labels
for text in sample_texts:
    emotion, health = predict_emotion_and_health(
        text,
        model=model,
        tokenizer=tokenizer,
        label_map=label_mapping,
        health_map=health_mapping,
    )
    print(f"Text: {text}")
    print(f"Predicted Emotion: {emotion}, Predicted Health: {health}\n")


Text: I have been living on the streets for weeks now, and it's hard to find any hope.
Predicted Emotion: 1, Predicted Health: homeless

Text: I feel like everything is falling apart around me. My relationships are in shambles.
Predicted Emotion: 1, Predicted Health: relationships

