In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd

In [2]:
df = pd.read_csv("cleaned_dpr.csv")

In [3]:
model_name = "w11wo/indonesian-roberta-base-sentiment-classifier"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

sentiment_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Device set to use cuda:0


In [4]:
def label_comment(text):
    try:
        result = sentiment_pipeline(str(text)[:512])[0]  # ensure string, truncate if >512
        return result['label']
    except:
        return "Unknown"

# Inference on both raw and clean text
df["label_raw"] = df["Comment"].apply(label_comment)          # raw comment
df["label_clean"] = df["clean_comment"].apply(label_comment)  # cleaned comment

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [5]:
print("Raw Comment Distribution:")
df["label_raw"].value_counts(normalize=True)

Raw Comment Distribution:


Unnamed: 0_level_0,proportion
label_raw,Unnamed: 1_level_1
negative,0.681942
positive,0.207421
neutral,0.110445
Unknown,0.000192


In [6]:
print("\nClean Comment Distribution:")
df["label_clean"].value_counts(normalize=True)


Clean Comment Distribution:


Unnamed: 0_level_0,proportion
label_clean,Unnamed: 1_level_1
negative,0.648675
positive,0.204209
neutral,0.146925
Unknown,0.000192


In [9]:
# compare agreement between raw and clean labels
df["agree"] = df["label_raw"] == df["label_clean"]

agreement_rate = df["agree"].mean() * 100
print(f"Agreement rate between raw and clean labels: {agreement_rate:.2f}%")

# first 20 mismatches
mismatches = df[df["agree"] == False][["Comment", "clean_comment", "label_raw", "label_clean"]]
print("\nFirst 20 mismatches:\n")
mismatches.head(20)

Agreement rate between raw and clean labels: 87.96%

First 20 mismatches:



Unnamed: 0,Comment,clean_comment,label_raw,label_clean
4,Bubarin DPR biar rakyat sejahtera,bubarin dpr biar rakyat sejahtera,negative,neutral
8,Wakil Rakyat Seharusnya Merakyat!!!....,wakil rakyat seharusnya merakyat!!!....,negative,positive
25,Emang bangsa* DPR berjoget diatas penderitaan ...,emang bangsa* dpr berjoget diatas penderitaan ...,negative,neutral
29,Dari sabang sampe marauke siap dukung bubarkan...,dari sabang sampe marauke siap dukung bubarkan...,negative,neutral
34,Geleh aink mah,geleh aink mah,positive,negative
45,Hidup KDM❤❤,hidup kdm❤❤,positive,neutral
73,Gak ada yg bisa bubarin DPR bang....kekuasaan ...,gak ada yg bisa bubarin dpr bang....kekuasaan ...,positive,neutral
82,Dewan BEBAN BESAR utk RAKYAT,dewan beban besar utk rakyat,neutral,positive
85,Dewan Perampok Rakyat,dewan perampok rakyat,negative,neutral
96,ALHAMDULILLAH\nSUDAH 3 KALI PEMILU GA PERNAH I...,alhamdulillah sudah 3 kali pemilu ga pernah ik...,negative,positive


In [7]:
df.to_csv("autolabeled_roberta.csv", index=False)