In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install --upgrade transformers
!pip install tensorflow-io
!pip install tensorflow-text

Collecting tensorflow-io
  Downloading tensorflow_io-0.37.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (49.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-io
Successfully installed tensorflow-io-0.37.0
Collecting tensorflow-text
  Downloading tensorflow_text-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow<2.17,>=2.16.1 (from tensorflow-text)
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting h5py>=3.10.0 (from tensorflow<2.17,>=2.16.1->tensorflow-text)
  Downloading h5py-3.11.0-cp310-cp310-manylinux_

In [3]:
!pip uninstall -y tensorflow
!pip install tensorflow==2.15.0

Found existing installation: tensorflow 2.16.1
Uninstalling tensorflow-2.16.1:
  Successfully uninstalled tensorflow-2.16.1
Collecting tensorflow==2.15.0
  Downloading tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting ml-dtypes~=0.2.0 (from tensorflow==2.15.0)
  Downloading ml_dtypes-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m75.1 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.16,>=2.15 (from tensorflow==2.15.0)
  Downloading tensorboard-2.15.2-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m103.1 MB/s[0m eta [36m0:00:00[0m
Collecting keras<2.16,>=2.15.0 (from tensorflow==2.15.0)
  Downloading keras-2.15.0-py3-none-any.whl (1.7 MB)


In [4]:
import pandas as pd
import numpy as np

import os
import time
import torch
import tensorflow as tf
import keras
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score, auc, roc_curve, roc_auc_score
import transformers
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, RobertaTokenizer, TFRobertaForSequenceClassification, AlbertTokenizer, TFAlbertForSequenceClassification, ElectraTokenizer, TFElectraForSequenceClassification, DebertaTokenizer, TFDebertaForSequenceClassification, BertTokenizer, TFBertForSequenceClassification

# Functions

In [5]:
def load_data(filepath):
  data = pd.read_csv(filepath)
  print("Shape", data.shape)
  print(data.head())

  return data

In [6]:
def OOS_performance_measure(model, tokenizer, OOS_set):
    encodings = {
        "input_ids_OOS": [],
        "attention_masks_OOS": []
    }

    for text_row in OOS_set["usertext"]:
        # text_str = " ".join(text_row)
        tokens = tokenizer(text_row, truncation=True, padding="max_length", max_length=64, return_tensors="tf")
        encodings["input_ids_OOS"].append(tokens["input_ids"])
        encodings["attention_masks_OOS"].append(tokens["attention_mask"])

    input_ids_OOS = np.array(encodings["input_ids_OOS"])
    attention_masks_OOS = np.array(encodings["attention_masks_OOS"])
    input_ids_OOS = np.squeeze(input_ids_OOS, axis=1)
    attention_masks_OOS = np.squeeze(attention_masks_OOS, axis=1)

    OOS_labels = np.array(OOS_set["label"])

    OOS_predictions = model.predict([input_ids_OOS, attention_masks_OOS])
    OOS_proba = OOS_predictions.logits[:, 0]
    OOS_probabilities = tf.nn.sigmoid(OOS_proba).numpy()
    OOS_predictions_class = (OOS_proba > 0.5).astype(int)

    accuracy_OOS = accuracy_score(OOS_labels, OOS_predictions_class)
    precision_OOS = precision_score(OOS_labels, OOS_predictions_class)
    recall_OOS = recall_score(OOS_labels, OOS_predictions_class)
    f1_OOS = f1_score(OOS_labels, OOS_predictions_class)
    roc_auc_OOS = roc_auc_score(OOS_labels, OOS_probabilities)

    return accuracy_OOS, precision_OOS, recall_OOS, f1_OOS, roc_auc_OOS

In [7]:
def save_result_to_csv(accuracy, precision, recall, f1_score, roc_auc, csv_filepath):
    if os.path.exists(csv_filepath):
        existing_df = pd.read_csv(csv_filepath)
    else:
        existing_df = pd.DataFrame()

    new_row = pd.DataFrame({
        "Accuracy": [accuracy],
        "Precision": [precision],
        "Recall": [recall],
        "F1_Score": [f1_score],
        "ROC AUC": [roc_auc]
    })

    existing_df = pd.concat([existing_df, new_row], ignore_index=True)
    existing_df.to_csv(csv_filepath, index=False)

    return existing_df

# Data Loading

In [8]:
data = load_data("/content/drive/MyDrive/MDS_FYP/Data/suicidal_ideation_reddit_annotated.csv")

Shape (12656, 2)
                                            usertext  label
0  I just want to end my life so badly. My life i...      1
1  My relationship is complicated and painful, bu...      1
2  I owe a lot of money , so I have to work.The c...      1
3  On the 2 of October I overdosed I just felt so...      1
4  Everyone tells me how wonderful I am, but not ...      1


In [9]:
data["label"].value_counts()

label
1    6609
0    6047
Name: count, dtype: int64

In [10]:
data = data.dropna(subset=["usertext"])

# ALBERT

In [11]:
MODEL_FILE_ALBERT = "/content/drive/MyDrive/MDS_FYP/Albert_Intermediate/Model/albert_v2"
loaded_model_ALBERT = TFAlbertForSequenceClassification.from_pretrained(MODEL_FILE_ALBERT)
MODEL_NAME = "albert-base-v2"
loaded_tokenizer_ALBERT = AlbertTokenizer.from_pretrained(MODEL_NAME)

accuracy_OOS, precision_OOS, recall_OOS, f1_OOS, roc_auc_OOS = OOS_performance_measure(loaded_model_ALBERT, loaded_tokenizer_ALBERT, data)

All model checkpoint layers were used when initializing TFAlbertForSequenceClassification.

All the layers of TFAlbertForSequenceClassification were initialized from the model checkpoint at /content/drive/MyDrive/MDS_FYP/Albert_Intermediate/Model/albert_v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFAlbertForSequenceClassification for predictions without further training.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]



In [12]:
CSV_FILE_PATH = "/content/drive/MyDrive/MDS_FYP/OOS/OOS_metric_results_albert_v2.csv"
OOS_eval = save_result_to_csv(accuracy_OOS, precision_OOS, recall_OOS, f1_OOS, roc_auc_OOS, CSV_FILE_PATH)

print("OOS Accuracy:", accuracy_OOS)
print("OOS Precision:", precision_OOS)
print("OOS Recall:", recall_OOS)
print("OOS F1-score:", f1_OOS)
print("OOS ROC-AUC:", roc_auc_OOS)

OOS Accuracy: 0.6344827586206897
OOS Precision: 0.653249804228661
OOS Recall: 0.6350487210718636
OOS F1-score: 0.6440206901876013
OOS ROC-AUC: 0.6892732718753846


# BERT

In [13]:
MODEL_FILE_BERT = "/content/drive/MyDrive/MDS_FYP/BERT_Intermediate/Model/bert"
loaded_model_BERT = TFBertForSequenceClassification.from_pretrained(MODEL_FILE_BERT)
MODEL_NAME = "bert-base-cased"
loaded_tokenizer_BERT = BertTokenizer.from_pretrained(MODEL_NAME)

accuracy_OOS, precision_OOS, recall_OOS, f1_OOS, roc_auc_OOS = OOS_performance_measure(loaded_model_BERT, loaded_tokenizer_BERT, data)

Some layers from the model checkpoint at /content/drive/MyDrive/MDS_FYP/BERT_Intermediate/Model/bert were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /content/drive/MyDrive/MDS_FYP/BERT_Intermediate/Model/bert.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further t

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [14]:
CSV_FILE_PATH = "/content/drive/MyDrive/MDS_FYP/OOS/OOS_metric_results_bert.csv"
OOS_eval = save_result_to_csv(accuracy_OOS, precision_OOS, recall_OOS, f1_OOS, roc_auc_OOS, CSV_FILE_PATH)

print("OOS Accuracy:", accuracy_OOS)
print("OOS Precision:", precision_OOS)
print("OOS Recall:", recall_OOS)
print("OOS F1-score:", f1_OOS)
print("OOS ROC-AUC:", roc_auc_OOS)

OOS Accuracy: 0.8135552913198573
OOS Precision: 0.8163265306122449
OOS Recall: 0.8282582216808769
OOS F1-score: 0.822249093107618
OOS ROC-AUC: 0.8784453898179244


# DeBERTa

In [15]:
MODEL_FILE_DeBERTa = "/content/drive/MyDrive/MDS_FYP/DeBERTa_Intermediate/Model/deberta"
loaded_model_DeBERTa = TFDebertaForSequenceClassification.from_pretrained(MODEL_FILE_DeBERTa)
MODEL_NAME = "microsoft/deberta-base"
loaded_tokenizer_DeBERTa = DebertaTokenizer.from_pretrained(MODEL_NAME)

accuracy_OOS, precision_OOS, recall_OOS, f1_OOS, roc_auc_OOS = OOS_performance_measure(loaded_model_DeBERTa, loaded_tokenizer_DeBERTa, data)

All model checkpoint layers were used when initializing TFDebertaForSequenceClassification.

All the layers of TFDebertaForSequenceClassification were initialized from the model checkpoint at /content/drive/MyDrive/MDS_FYP/DeBERTa_Intermediate/Model/deberta.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaForSequenceClassification for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]



In [16]:
CSV_FILE_PATH = "/content/drive/MyDrive/MDS_FYP/OOS/OOS_metric_results_deberta.csv"
OOS_eval = save_result_to_csv(accuracy_OOS, precision_OOS, recall_OOS, f1_OOS, roc_auc_OOS, CSV_FILE_PATH)

print("OOS Accuracy:", accuracy_OOS)
print("OOS Precision:", precision_OOS)
print("OOS Recall:", recall_OOS)
print("OOS F1-score:", f1_OOS)
print("OOS ROC-AUC:", roc_auc_OOS)

OOS Accuracy: 0.7319857312722949
OOS Precision: 0.8599503049469166
OOS Recall: 0.5796285018270402
OOS F1-score: 0.6924965893587994
OOS ROC-AUC: 0.8851250365841107


# DistilBERT

In [17]:
MODEL_FILE_DistilBERT = "/content/drive/MyDrive/MDS_FYP/DistilBert_Intermediate/Model/distilbert_v2"
loaded_model_DistilBert = TFDistilBertForSequenceClassification.from_pretrained(MODEL_FILE_DistilBERT)
MODEL_NAME = "distilbert-base-uncased"
loaded_tokenizer_DistilBert = DistilBertTokenizer.from_pretrained(MODEL_NAME)

accuracy_OOS, precision_OOS, recall_OOS, f1_OOS, roc_auc_OOS = OOS_performance_measure(loaded_model_DistilBert, loaded_tokenizer_DistilBert, data)

Some layers from the model checkpoint at /content/drive/MyDrive/MDS_FYP/DistilBert_Intermediate/Model/distilbert_v2 were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/MDS_FYP/DistilBert_Intermediate/Model/distilbert_v2 and are newly initialized: ['dropout_62']
You should probably TRAIN this model on a down-stream task to be able

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



In [18]:
CSV_FILE_PATH = "/content/drive/MyDrive/MDS_FYP/OOS/OOS_metric_results_distilbert_v2.csv"
OOS_eval = save_result_to_csv(accuracy_OOS, precision_OOS, recall_OOS, f1_OOS, roc_auc_OOS, CSV_FILE_PATH)

print("OOS Accuracy:", accuracy_OOS)
print("OOS Precision:", precision_OOS)
print("OOS Recall:", recall_OOS)
print("OOS F1-score:", f1_OOS)
print("OOS ROC-AUC:", roc_auc_OOS)

OOS Accuracy: 0.7659135949266745
OOS Precision: 0.8209909429941395
OOS Recall: 0.7038672350791717
OOS F1-score: 0.7579309779490122
OOS ROC-AUC: 0.8607945887543114


# ELECTRA

In [19]:
MODEL_FILE_ELECTRA = "/content/drive/MyDrive/MDS_FYP/Electra_Intermediate/Model/electra"
loaded_model_ELECTRA = TFElectraForSequenceClassification.from_pretrained(MODEL_FILE_ELECTRA)
MODEL_NAME = "google/electra-base-discriminator"
loaded_tokenizer_ELECTRA = ElectraTokenizer.from_pretrained(MODEL_NAME)

accuracy_OOS, precision_OOS, recall_OOS, f1_OOS, roc_auc_OOS = OOS_performance_measure(loaded_model_ELECTRA, loaded_tokenizer_ELECTRA, data)

All model checkpoint layers were used when initializing TFElectraForSequenceClassification.

All the layers of TFElectraForSequenceClassification were initialized from the model checkpoint at /content/drive/MyDrive/MDS_FYP/Electra_Intermediate/Model/electra.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFElectraForSequenceClassification for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]



In [20]:
CSV_FILE_PATH = "/content/drive/MyDrive/MDS_FYP/OOS/OOS_metric_results_electra.csv"
OOS_eval = save_result_to_csv(accuracy_OOS, precision_OOS, recall_OOS, f1_OOS, roc_auc_OOS, CSV_FILE_PATH)

print("OOS Accuracy:", accuracy_OOS)
print("OOS Precision:", precision_OOS)
print("OOS Recall:", recall_OOS)
print("OOS F1-score:", f1_OOS)
print("OOS ROC-AUC:", roc_auc_OOS)

OOS Accuracy: 0.7988109393579073
OOS Precision: 0.8509230233368165
OOS Recall: 0.7439098660170523
OOS F1-score: 0.7938261575954508
OOS ROC-AUC: 0.8897164180021419


# RoBERTa

In [21]:
MODEL_FILE_RoBERTa = "/content/drive/MyDrive/MDS_FYP/RoBERTa_Intermediate/Model/roberta_v2"
loaded_model_RoBERTa = TFRobertaForSequenceClassification.from_pretrained(MODEL_FILE_RoBERTa)
MODEL_NAME = "roberta-base"
loaded_tokenizer_RoBERTa = RobertaTokenizer.from_pretrained(MODEL_NAME)

accuracy_OOS, precision_OOS, recall_OOS, f1_OOS, roc_auc_OOS = OOS_performance_measure(loaded_model_RoBERTa, loaded_tokenizer_RoBERTa, data)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at /content/drive/MyDrive/MDS_FYP/RoBERTa_Intermediate/Model/roberta_v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



In [22]:
CSV_FILE_PATH = "/content/drive/MyDrive/MDS_FYP/OOS/OOS_metric_results_roberta_v2.csv"
OOS_eval = save_result_to_csv(accuracy_OOS, precision_OOS, recall_OOS, f1_OOS, roc_auc_OOS, CSV_FILE_PATH)

print("OOS Accuracy:", accuracy_OOS)
print("OOS Precision:", precision_OOS)
print("OOS Recall:", recall_OOS)
print("OOS F1-score:", f1_OOS)
print("OOS ROC-AUC:", roc_auc_OOS)

OOS Accuracy: 0.8154577883472057
OOS Precision: 0.8360811667723526
OOS Recall: 0.8029841656516443
OOS F1-score: 0.8191985088536814
OOS ROC-AUC: 0.8841197288918494
