### Klasifikasi Teks Untuk Mendeteksi Depresi dan Kecemasan pada Pengguna Twitter (X) dengan Transformers Model

### Dengan Model Transformers (BERT/DistilBERT/BERTweet/XLM-R)

In [16]:
# If running on Colab, run this once:
# !pip install transformers datasets scikit-learn torch -q

import os
import json
import numpy as np
import pandas as pd

In [17]:
# Raw string literal: the Windows path contains backslashes followed by letters
# ("\P", "\M", "\D", "\C"). In a normal string these are invalid escape sequences,
# which only work by accident and trigger a warning on recent Python versions.
# The resulting path value is identical to the original one.
DATA_PATH = r"D:\Portfolio project\Mental Health Sentiment Analysis in Twitter\Data\Cleaned Combined Data.csv"

df = pd.read_csv(DATA_PATH)

# Only the last expression of a cell is displayed, so show the column names here.
df.columns

Index(['status', 'cleaned_statements'], dtype='object')

In [18]:
# Keep only the text and status columns, renamed to the generic "text"/"label"
# names used by the rest of the notebook.
column_aliases = {"cleaned_statements": "text", "status": "label"}
df = df[list(column_aliases)].rename(columns=column_aliases)

df.head(), df['label'].value_counts()

(                                                text    label
 0                                         oh my gosh  Anxiety
 1  trouble sleeping, confused mind, restless hear...  Anxiety
 2  All wrong, back off dear, forward doubt. Stay ...  Anxiety
 3  I've shifted my focus to something else but I'...  Anxiety
 4  I'm restless and restless, it's been a month n...  Anxiety,
 label
 Normal                  16343
 Depression              15404
 Suicidal                10652
 Anxiety                  3841
 Bipolar                  2777
 Stress                   2587
 Personality disorder     1077
 Name: count, dtype: int64)

### Batasi Jumlah Data

In [19]:
# Upper bound on the number of rows used for training.
MAX_SAMPLES = 6000

# Draw a reproducible random subset when the frame is larger than the cap;
# otherwise work on a copy so the loaded frame is not aliased.
df = (
    df.sample(n=MAX_SAMPLES, random_state=42).reset_index(drop=True)
    if len(df) > MAX_SAMPLES
    else df.copy()
)

print("Dipakai untuk training:", len(df))
df["label"].value_counts()

Dipakai untuk training: 6000


label
Normal                  1894
Depression              1730
Suicidal                1219
Anxiety                  429
Stress                   304
Bipolar                  293
Personality disorder     131
Name: count, dtype: int64

### Splitting Data

In [20]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

In [21]:
# Step 1: hold out 20% of the rows, stratified on the label column.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

# Step 2: split the held-out rows in half (validation / test), again stratified.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label'],
    random_state=42,
)

print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")

Train size: 4800
Validation size: 600
Test size: 600


### Convert ke Hugging Face Datasets

In [22]:
# Convert each pandas split into a Hugging Face Dataset WITHOUT rebinding
# train_df / val_df / test_df. Overwriting those names with Dataset objects would
# make this cell non-idempotent (a second run would pass Datasets, not DataFrames,
# to Dataset.from_pandas) and would make the "_df" names misleading.
split_frames = {
    'train': train_df,
    'validation': val_df,
    'test': test_df,
}

# preserve_index=False keeps the pandas index out of the dataset columns.
raw_datasets = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in split_frames.items()
})

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 4800
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 600
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 600
    })
})

### Label Mapping

In [23]:
# Sorted unique class names; sorting makes the name -> id assignment deterministic.
label_list = sorted(df['label'].unique())
num_labels = len(label_list)

# Two lookup tables over the same enumeration: name -> id and id -> name.
label2id = dict(zip(label_list, range(num_labels)))
id2label = dict(enumerate(label_list))

print('Label list:', label_list)
print('Number of labels:', num_labels)
print('Label to ID mapping:', label2id)
print('ID to label mapping:', id2label)

Label list: ['Anxiety', 'Bipolar', 'Depression', 'Normal', 'Personality disorder', 'Stress', 'Suicidal']
Number of labels: 7
Label to ID mapping: {'Anxiety': 0, 'Bipolar': 1, 'Depression': 2, 'Normal': 3, 'Personality disorder': 4, 'Stress': 5, 'Suicidal': 6}
ID to label mapping: {0: 'Anxiety', 1: 'Bipolar', 2: 'Depression', 3: 'Normal', 4: 'Personality disorder', 5: 'Stress', 6: 'Suicidal'}


### Encode Label dalam dataset

In [24]:
def encode_labels(ex):
    """Replace the string label of one example with its integer id.

    The example is updated in place (its ``"label"`` entry is overwritten using
    the module-level ``label2id`` mapping) and the same object is returned.
    """
    label_name = ex["label"]
    ex["label"] = label2id[label_name]
    return ex

# Apply the label encoding to every example of every split.
encoded_datasets = raw_datasets.map(function=encode_labels)

# Show one encoded training example as a sanity check.
encoded_datasets['train'][0]

Map: 100%|██████████| 4800/4800 [00:00<00:00, 15757.50 examples/s]
Map: 100%|██████████| 600/600 [00:00<00:00, 16995.32 examples/s]
Map: 100%|██████████| 600/600 [00:00<00:00, 16879.62 examples/s]


{'text': 'is really really tired and hasn t slept in day can barely keep my eye open really missing my sanity',
 'label': 3}

--------

In [25]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import accuracy_score, f1_score

### Tokenizer

In [26]:
# Checkpoint name; the same constant is used to load the tokenizer here and the
# classification model later in the notebook.
MODEL_NAME = "prajjwal1/bert-mini"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_NAME)

In [27]:
def tokenize_function(example):
    """Tokenize the ``text`` field with the module-level ``tokenizer``.

    Every sequence is padded to, and truncated at, 128 tokens. The function is
    applied through ``Dataset.map(..., batched=True)``, so ``example["text"]``
    is presumably a list of strings.
    """
    encoding_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(example['text'], **encoding_options)
    
# Tokenize all splits in batches, then drop the raw text column, which is no
# longer needed once the token ids exist.
tokenized_datasets = (
    encoded_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(['text'])
)

# Return PyTorch tensors when the datasets are indexed.
tokenized_datasets.set_format("torch")
tokenized_datasets

Map: 100%|██████████| 4800/4800 [00:01<00:00, 2799.98 examples/s]
Map: 100%|██████████| 600/600 [00:00<00:00, 3306.14 examples/s]
Map: 100%|██████████| 600/600 [00:00<00:00, 4347.54 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4800
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 600
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 600
    })
})

### Model Transformers

In [28]:
import torch

# Classification-head settings: number of classes plus both label lookup tables,
# so the saved config carries the human-readable class names.
classifier_kwargs = {
    "num_labels": num_labels,
    "id2label": id2label,
    "label2id": label2id,
}

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **classifier_kwargs)

#device = torch.device("cpu")
# Optional smoke test (disabled): run one sample through the model first to make sure the forward pass does not crash.
#sample = tokenized_datasets["train"][0]
#input_ids = sample["input_ids"].unsqueeze(0).to(device)
#attention_mask = sample["attention_mask"].unsqueeze(0).to(device)
#labels = torch.tensor([sample["label"]]).to(device)

#with torch.no_grad():
    #outputs = model(
        #input_ids=input_ids,
        #attention_mask=attention_mask,
        #labels=labels,
    #)
#print("Test forward pass OK, loss:", outputs.loss.item())

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
def compute_metrics(p):
    """Return accuracy and weighted F1 for an evaluation prediction object.

    ``p`` must expose ``predictions`` (per-class scores; the arg-max over axis 1
    is taken as the predicted class) and ``label_ids`` (the reference class ids).
    """
    predicted_ids = np.asarray(p.predictions).argmax(axis=1)
    reference_ids = p.label_ids
    return {
        "accuracy": accuracy_score(reference_ids, predicted_ids),
        "f1": f1_score(reference_ids, predicted_ids, average='weighted'),
    }

In [34]:
from transformers import TrainingArguments, Trainer

# Directory where the Trainer writes its per-epoch checkpoints.
OUTPUT_DIR = "results"

# Evaluate and checkpoint once per epoch; the two strategies must match because
# load_best_model_at_end=True restores the checkpoint with the best validation F1.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # key returned by compute_metrics
    greater_is_better=True,
    seed=42,  # same as the library default; stated explicitly for reproducibility
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    # `tokenizer=` is deprecated on Trainer in recent transformers releases
    # (it emits a warning at construction time); `processing_class=` is its
    # replacement and still saves the tokenizer alongside checkpoints.
    processing_class=tokenizer,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.1105,1.075156,0.561667,0.477984
2,0.9639,0.96579,0.63,0.60667
3,0.918,0.940916,0.645,0.628717




TrainOutput(global_step=900, training_loss=1.0893277825249565, metrics={'train_runtime': 747.1903, 'train_samples_per_second': 19.272, 'train_steps_per_second': 1.205, 'total_flos': 35689619865600.0, 'train_loss': 1.0893277825249565, 'epoch': 3.0})

Karena keterbatasan device, akurasi model transformers cukup rendah, yaitu sekitar 64% pada data validasi.

In [35]:
# Final check on the held-out test split, which was not used during training
# or for selecting the best checkpoint.
test_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
test_metrics



{'eval_loss': 0.9540854692459106,
 'eval_accuracy': 0.6516666666666666,
 'eval_f1': 0.6290671919631035,
 'eval_runtime': 5.3965,
 'eval_samples_per_second': 111.184,
 'eval_steps_per_second': 3.521,
 'epoch': 3.0}

### Save model & tokenizer

In [36]:
# Output folder for the fine-tuned model, tokenizer and label mapping.
MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Model and tokenizer go into the same folder.
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

# Store the label lookup tables next to the model. JSON object keys are strings,
# so the integer ids of id2label are converted explicitly.
label_mapping = {
    "label_list": label_list,
    "label2id": label2id,
    "id2label": {str(idx): name for idx, name in id2label.items()},
}
mapping_path = os.path.join(MODEL_DIR, "label_mapping.json")
with open(mapping_path, "w") as f:
    json.dump(label_mapping, f, indent=2)

print("Model & tokenizer disimpan di:", MODEL_DIR)


Model & tokenizer disimpan di: models


In [37]:
import torch, transformers
# Report the library version and whether a CUDA device is visible to PyTorch.
cuda_ready = torch.cuda.is_available()

print("Transformers version:", transformers.__version__)
print("CUDA available:", cuda_ready)
if cuda_ready:
    print("GPU name:", torch.cuda.get_device_name(0))

Transformers version: 4.57.1
CUDA available: False
