In [None]:
!pip install -q transformers

In [None]:
!huggingface-cli login

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score
from torch import nn
# Placeholder paths: replace both values with the actual locations of your
# training and testing CSV files before running this script.
train_data_path = 'path for training data'
test_data_path = 'path for testing data'

# Name of the column that holds the input text in the CSV files.
text_column = 'text'

# Names of the label columns in the CSV files. Their order defines the label
# index used by the model (len(label_columns) is passed as num_labels below).
label_columns = ['Spiritual', 'Physical', 'Intellectual', 'Social', 'Vocational', 'Emotional']

# Load datasets: each CSV is read fully into a pandas DataFrame.
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Tokenize function
def tokenize_data(data, max_length=512):
    """Tokenize the configured text column of ``data`` into PyTorch tensors.

    The column is looked up through the module-level ``text_column`` setting
    (rather than a hard-coded name) so that changing that setting is enough to
    point the script at a differently named column.

    Args:
        data: DataFrame containing the column named by ``text_column``.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 512, the value this script has always used.

    Returns:
        The output of the module-level ``tokenizer`` call, requested as
        PyTorch tensors (``return_tensors="pt"``).
    """
    texts = data[text_column].tolist()
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

# PyTorch Dataset
class MentalDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with their label rows.

    ``inputs`` is a mapping from feature name to an indexable collection, and
    ``labels`` is an indexable collection of per-sample label rows. The length
    of the dataset is the length of ``labels``.
    """

    def __init__(self, inputs, labels):
        self.labels = labels
        self.inputs = inputs

    def __len__(self):
        """Return the number of samples (one per label row)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the features at ``idx`` plus a float32 ``labels`` tensor."""
        sample = {}
        for name, column in self.inputs.items():
            sample[name] = column[idx]
        # Labels are converted to float32 on access.
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

def compute_metrics(pred):
    """Return accuracy and weighted precision/recall/F1 for a prediction object.

    ``pred`` must expose ``label_ids`` (ground truth) and ``predictions``
    (model scores). Scores strictly greater than zero are treated as a
    positive prediction; presumably the scores are raw logits, in which case
    this matches a 0.5 probability cut-off.
    """
    y_true = pred.label_ids
    y_pred = (pred.predictions > 0).astype(int)  # binarise the scores at zero

    scores = {"accuracy": accuracy_score(y_true, y_pred)}
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    scores.update(precision=precision, recall=recall, f1=f1)
    return scores


from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, matthews_corrcoef

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, matthews_corrcoef


# compute_metrics2 generates overall metrics plus a breakdown for each label.
def compute_metrics2(pred):
    """Return overall accuracy/MCC together with per-label metrics.

    ``pred`` must expose ``label_ids`` and ``predictions``; scores strictly
    greater than zero count as a positive prediction.

    The returned dict holds ``"accuracy"`` and ``"overall_mcc"`` (MCC over
    the flattened label matrix), followed by ``"<label>_accuracy"``,
    ``"<label>_precision"``, ``"<label>_recall"``, ``"<label>_f1"`` and
    ``"<label>_mcc"`` for every name in the module-level ``label_columns``.
    """
    y_true = pred.label_ids
    y_pred = (pred.predictions > 0).astype(int)  # binarise the scores at zero

    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        # Overall MCC treats every (sample, label) cell as one binary decision.
        "overall_mcc": matthews_corrcoef(y_true.ravel(), y_pred.ravel()),
    }

    # average=None yields one precision/recall/F1 value per label column.
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average=None)
    columns = range(y_true.shape[1])
    mcc = [matthews_corrcoef(y_true[:, col], y_pred[:, col]) for col in columns]
    label_accuracy = [accuracy_score(y_true[:, col], y_pred[:, col]) for col in columns]

    # Flatten the per-label values into "<label>_<metric>" entries.
    for position, label in enumerate(label_columns):
        metrics.update({
            f"{label}_accuracy": label_accuracy[position],
            f"{label}_precision": precision[position],
            f"{label}_recall": recall[position],
            f"{label}_f1": f1[position],
            f"{label}_mcc": mcc[position],
        })

    return metrics



# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", cache_dir='proj')

# Configure the model for multi-label classification.
# - num_labels already sizes the classification head to one output per label,
#   so the head does not need to be replaced by hand (doing so would only
#   discard the library's own initialisation of that layer).
# - problem_type="multi_label_classification" makes the model's forward pass
#   use BCEWithLogitsLoss. Assigning a loss object to `model.loss` has no
#   effect, because the forward pass never reads such an attribute.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    cache_dir='proj',
    num_labels=len(label_columns),
    problem_type="multi_label_classification",
)

# Record the human-readable label names in the config so that saved
# checkpoints carry the index <-> name mapping.
model.config.id2label = dict(enumerate(label_columns))
model.config.label2id = {label: i for i, label in enumerate(label_columns)}

 

# Tokenize data: each split is tokenized in a single call over the whole
# DataFrame.
tokenized_train_data = tokenize_data(train_data)
tokenized_test_data = tokenize_data(test_data)

# Create datasets: pair the tokenized inputs with the label columns, passed
# as a NumPy array (one row per sample, one column per entry in label_columns).
train_dataset = MentalDataset(tokenized_train_data, train_data[label_columns].values)
test_dataset = MentalDataset(tokenized_test_data, test_data[label_columns].values)

# Training arguments: hyperparameters and output locations handed to the
# Trainer below.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=100,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=100,
)

# Trainer
# NOTE: no eval_dataset is supplied, so the metrics below are computed only on
# the explicit predict() call made after training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()

# Evaluate on test dataset
test_results = trainer.predict(test_dataset)
test_metrics = compute_metrics(test_results)
print("Test set results:", test_metrics)
test_metrics2 = compute_metrics2(test_results)

# Calculate and print classification report separately
labels = test_results.label_ids
preds = (test_results.predictions > 0).astype(int)  # same zero threshold as the metric functions
class_report = classification_report(labels, preds, target_names=label_columns)
print("Classification report:\n", class_report)

# Save predictions to CSV file
# Build the frame directly from the prediction matrix: assigning a 2-D array
# to several new columns of an empty DataFrame is not reliable across pandas
# versions. The saved values are the model's raw scores, not thresholded labels.
predictions_df = pd.DataFrame(test_results.predictions, columns=label_columns)
# Read the text through the configured column name and copy it positionally so
# each row stays matched to its prediction regardless of test_data's index.
predictions_df["text"] = test_data[text_column].to_numpy()
predictions_df.to_csv("path/to/save/predictions", index=False)

#2nd metrics
print("Test set results:", test_metrics2)
print(f"Accuracy: {test_metrics2['accuracy']:.4f}")
print(f"Overall MCC: {test_metrics2['overall_mcc']:.4f}")

for label in label_columns:
    print(f"\n{label}:")
    print(f"  Accuracy: {test_metrics2[f'{label}_accuracy']:.4f}")
    print(f"  Precision: {test_metrics2[f'{label}_precision']:.4f}")
    print(f"  Recall: {test_metrics2[f'{label}_recall']:.4f}")
    print(f"  F1 Score: {test_metrics2[f'{label}_f1']:.4f}")
    print(f"  MCC: {test_metrics2[f'{label}_mcc']:.4f}")


