In [14]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd

In [15]:
df = pd.read_csv("/kaggle/input/cleaned-dpr/cleaned_dpr.csv")

In [16]:
model_name = "mdhugol/indonesia-bert-sentiment-classification"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

sentiment_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [17]:
def label_comment(text):
    try:
        result = sentiment_pipeline(str(text)[:512])[0]  # ensure string, truncate if >512
        return result['label']
    except:
        return "Unknown"

# Inference on both raw and clean text
df["label_raw"] = df["Comment"].apply(label_comment)          # raw comment
df["label_clean"] = df["clean_comment"].apply(label_comment)  # cleaned comment

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [18]:
print("Raw Comment Distribution:")
df["label_raw"].value_counts(normalize=True)

Raw Comment Distribution:


label_raw
LABEL_2    0.703610
LABEL_0    0.180241
LABEL_1    0.116150
Name: proportion, dtype: float64

In [19]:
print("\nClean Comment Distribution:")
df["label_clean"].value_counts(normalize=True)


Clean Comment Distribution:


label_clean
LABEL_2    0.704904
LABEL_0    0.180672
LABEL_1    0.114424
Name: proportion, dtype: float64

In [20]:
# compare agreement between raw and clean labels
df["agree"] = df["label_raw"] == df["label_clean"]

agreement_rate = df["agree"].mean() * 100
print(f"Agreement rate between raw and clean labels: {agreement_rate:.2f}%")

# first 20 mismatches
mismatches = df[df["agree"] == False][["Comment", "clean_comment", "label_raw", "label_clean"]]
print("\nFirst 20 mismatches:\n")
mismatches.head(20)

Agreement rate between raw and clean labels: 99.77%

First 20 mismatches:



Unnamed: 0,Comment,clean_comment,label_raw,label_clean
119,​@@semestaraya562oooh gitu ya,​@ gitu ya,LABEL_0,LABEL_2
207,@@RadioSantuyID Siap gaspool,@ siap gaspool,LABEL_1,LABEL_0
211,@@RadioSantuyID untuk mereka yg menempatkan di...,@ untuk mereka yg menempatkan diri sebagai opo...,LABEL_0,LABEL_2
312,​@@RadioSantuyIDhayu atuh Wang kompak Ken,​@ atuh wang kompak ken,LABEL_1,LABEL_0
469,​@@RadioSantuyIDia mang😮😅,​@ mang😮😅,LABEL_1,LABEL_2
1186,iya emang cmn rakyat yg b​isa kang...kedaulata...,iya emang cmn rakyat yg b​isa kang...kedaulata...,LABEL_1,LABEL_2
1329,"Maklum,di senayan taman bermain para bocil2🤣​@...","maklum,di senayan taman bermain para bocil2🤣​@",LABEL_1,LABEL_2
1526,​@@RadioSantuyIDtah kitu kang,​@ kitu kang,LABEL_1,LABEL_2
1582,​@@RadioSantuyIDemang omon omon😂,​@ omon omon😂,LABEL_1,LABEL_2
1688,​@@hhhhhh4808ambyar gak tuh,​@ gak tuh,LABEL_0,LABEL_2


In [None]:
df = df[["Comment", "label_raw"]]
df.rename(columns={"label_raw":"auto_label"}, inplace=True)

In [24]:
label_map = {"LABEL_0": "negative", "LABEL_1": "neutral", "LABEL_2": "positive"}
df["auto_label"] = df["auto_label"].map(label_map)
df.head(5)

Unnamed: 0,Comment,auto_label
0,Samber geledek DPR biar angus,positive
1,Kalau begini jadi kangen gaya kepemimpinan pak...,negative
2,Gak ngertiin perasaan masyarakat.yang sekarang...,positive
3,BUBARKAN MRP DPR. ALOKASI GAJI MEREKA BUAT RAK...,positive
4,Bubarin DPR biar rakyat sejahtera,positive


In [25]:
df.to_csv("autolabeled_bert.csv", index=False)