# Load Dataset

In [7]:
import pandas as pd
from google.colab import drive

# Mount Google Drive so files under MyDrive are reachable from this Colab runtime.
drive.mount('/content/drive')

# Location of the raw dataset on Google Drive.
dataset_path = '/content/drive/MyDrive/TugasAkhirElsa/Suicidal Ideation Detection Reddit Dataset-Version 2.csv'

# Read the CSV into a DataFrame; the bare last expression renders a preview
# of the first rows as the cell output.
df = pd.read_csv(filepath_or_buffer=dataset_path)
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Title,Post,Label
0,I'm looking for a girl I've met at the Polish ...,I've tried Polish spotted pages but I guess sh...,Non-Suicidal
1,I got a guy kicked off a domestic flight today.,I was in a forward row as we were boarding for...,Non-Suicidal
2,"My youngest got in school suspension, I’m so p...",So according to witness testimonies a boy grab...,Non-Suicidal
3,"I'm a cashier who switched from ""sir/mam"" to ""...","So as said in the title, I'm a cashier (well, ...",Non-Suicidal
4,My whole class complimented me and didn’t real...,"Today in class, we were doing this activity ca...",Non-Suicidal


# Load Model sentinet/suicidality dan menerapkan model untuk prediksi label dan confidence pada kolom Title

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

# Load the pretrained sequence-classification model and its tokenizer by name.
model_name = "sentinet/suicidality"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Show how many classes the model exposes and how class ids map to label names.
print(f"num_labels: {model.config.num_labels}")
print(f"id2label: {model.config.id2label}")

# Wrap model + tokenizer in a text-classification pipeline that reports a
# score for every label rather than only the best one.
# NOTE(review): `return_all_scores` is reportedly deprecated in recent
# transformers releases in favour of `top_k=None` — confirm against the
# installed version before switching, because predict_suicidal indexes the
# result as `clf(text)[0]` and relies on the current output nesting.
clf = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
)


# Read the dataset (same path literal as `dataset_path` in the loading cell)
# and list its columns.
df = pd.read_csv('/content/drive/MyDrive/TugasAkhirElsa/Suicidal Ideation Detection Reddit Dataset-Version 2.csv')
print(df.columns)

# Prediction helper
def predict_suicidal(text):
    """Classify one text and return its best-scoring label and score.

    Args:
        text: The value handed to the module-level ``clf`` pipeline.

    Returns:
        A ``(label, score)`` tuple taken from the highest-scoring entry of
        ``clf(text)[0]``. When several entries share the top score, the one
        that appears first in the pipeline output is returned.
    """
    ranked = list(clf(text)[0])
    # Stable descending sort: the winning candidate ends up at position 0.
    ranked.sort(key=lambda candidate: candidate['score'], reverse=True)
    top = ranked[0]
    return top['label'], top['score']


# Apply the classifier to every row of the Title column.
# Each row yields one label and one confidence; rows that cannot be classified
# get the placeholder label "Error" with confidence 0.0 so that the result
# lists stay aligned with the DataFrame rows.
pred_labels = []
pred_confidences = []

for title in df['Title']:
    if pd.isna(title):
        # A missing title would otherwise be stringified to the literal text
        # "nan" and classified as if it were real content; flag it instead.
        label, conf = "Error", 0.0
        print("Error pada teks:", title, "→", "judul kosong (NaN)")
    else:
        try:
            label, conf = predict_suicidal(str(title))
        except Exception as e:
            # Best effort: report the failing text and keep going so a single
            # bad row does not abort the whole run.
            label, conf = "Error", 0.0
            print("Error pada teks:", title, "→", e)
    pred_labels.append(label)
    pred_confidences.append(conf)

df['Predicted_Label'] = pred_labels
df['Confidence'] = pred_confidences


# Save the labelled dataset to a new CSV (the row index is not written).
output_path = '/content/drive/MyDrive/TugasAkhirElsa/textsuicidal Part 1.csv'
df.to_csv(output_path, index=False)
print(f"\n File hasil klasifikasi disimpan ke: {output_path}")


Device set to use cuda:0


num_labels: 2
id2label: {0: 'LABEL_0', 1: 'LABEL_1'}




Index(['Title', 'Post', 'Label'], dtype='object')

 File hasil klasifikasi disimpan ke: /content/drive/MyDrive/TugasAkhirElsa/textsuicidal Part 1.csv


# Menghapus data dengan label 0 (non-suicidal) dari hasil prediksi dan menyimpan data tersisa ke file baru

In [10]:
# Read the classification results written by the previous step.
input_path = '/content/drive/MyDrive/TugasAkhirElsa/textsuicidal Part 1.csv'
df = pd.read_csv(input_path)

# Drop rows predicted as 'LABEL_0' (treated in this notebook as the
# non-suicidal class) and rows carrying the "Error" placeholder.
# The classification step writes "Error" with confidence 0.0 for rows whose
# prediction raised; a plain `!= 'LABEL_0'` test would let those unclassified
# rows through and inflate the counts reported below.
excluded_labels = ['LABEL_0', 'Error']
filtered_df = df[~df['Predicted_Label'].isin(excluded_labels)]

# Save the remaining rows to a new file (the row index is not written).
output_path = '/content/drive/MyDrive/TugasAkhirElsa/textsuicidal Part 2.csv'
filtered_df.to_csv(output_path, index=False)

print(f"File berhasil disimpan ke: {output_path}")
print(f"Jumlah data sebelum: {len(df)}, sesudah: {len(filtered_df)}")


File berhasil disimpan ke: /content/drive/MyDrive/TugasAkhirElsa/textsuicidal Part 2.csv
Jumlah data sebelum: 15477, sesudah: 5509


# Sortir / Filter 800 data dengan nilai confidence tertinggi

In [12]:
# Load the filtered results produced by the previous step.
df = pd.read_csv('/content/drive/MyDrive/TugasAkhirElsa/textsuicidal Part 2.csv')

# Total number of rows, and how many of them have Confidence strictly above 0.98.
total = df.shape[0]
count_high_conf = df['Confidence'].gt(0.98).sum()

# Report the absolute count and its share of all rows.
print(f"Jumlah data dengan Confidence > 0.98: {count_high_conf}")
print(f"Dari total {total} data ({count_high_conf/total*100:.2f}%)")


Jumlah data dengan Confidence > 0.98: 2579
Dari total 5509 data (46.81%)


In [14]:
# Load the filtered results produced by the filtering step.
input_path = '/content/drive/MyDrive/TugasAkhirElsa/textsuicidal Part 2.csv'
df = pd.read_csv(input_path)

# Order rows by Confidence, highest first.
df_sorted = df.sort_values('Confidence', ascending=False)

# Keep the first 800 rows of the sorted frame, i.e. the 800 highest confidences
# (fewer if the frame has fewer than 800 rows).
top_800 = df_sorted.iloc[:800]

# Save the selection (the row index is not written) and report its confidence range.
# NOTE(review): the file name has a space before ".csv" — looks unintentional;
# confirm before renaming, since other steps may read this exact path.
output_path = '/content/drive/MyDrive/TugasAkhirElsa/textsuicidal Part 3 .csv'
top_800.to_csv(output_path, index=False)
print(f" File hasil (800 data dengan Confidence tertinggi) disimpan ke: {output_path}")
print(f"Nilai Confidence tertinggi: {top_800['Confidence'].max():.4f}")
print(f"Nilai Confidence terendah (dari 800 teratas): {top_800['Confidence'].min():.4f}")


 File hasil (800 data dengan Confidence tertinggi) disimpan ke: /content/drive/MyDrive/TugasAkhirElsa/textsuicidal Part 3 .csv
Nilai Confidence tertinggi: 0.9989
Nilai Confidence terendah (dari 800 teratas): 0.9930
