# **Pseudo-Labeling**

## _**Import Libraries**_

In [2]:
# Library umum
import pandas as pd
import torch
from tqdm import tqdm

# Transformers (Hugging Face)
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax


## _**Load Data**_

In [3]:
# Load the preprocessed dataset
df = pd.read_csv("../data/processed/full_cleaned.csv")

# Preview the first 5 rows
df.head()

Unnamed: 0,full_text,stemming
0,@IqbalRa17664055 @rwp1byte aku nyusul kerja aj...,susul kerja kaburajadulu bawa istri wkwkwk
1,#KaburAjaDulu bole juga ni hashtag,kaburajadulu nih hashtag
2,#KaburAjaDulu,kaburajadulu
3,Ehhh ini Mas Adith #KaburAjaDulu ya? Mohon ma...,eh mas adith kaburajadulu ya mohon maaf ya ya ...
4,Dengan ini saya resmikan pemilik hashtag #Kabu...,resmi milik hashtag kaburajadulu mas adith sah...


## _**Load Model**_

In [4]:
# Hugging Face Hub checkpoint used for pseudo-labeling
# (per its id: a BERT-base Indonesian sentiment-analysis model published under "ayameRushia")
model_name = "ayameRushia/bert-base-indonesian-1.5G-sentiment-analysis-smsa"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# .eval() returns the module itself, so loading and switching to evaluation mode fit in one statement
model = AutoModelForSequenceClassification.from_pretrained(model_name).eval()

In [5]:
# Inspect the model's class-id -> label-name mapping
print(model.config.id2label)


{0: 'Positive', 1: 'Neutral', 2: 'Negative'}


In [6]:
from torch.nn.functional import softmax

# Class id the loaded checkpoint uses for "Neutral"
# (model.config.id2label prints {0: 'Positive', 1: 'Neutral', 2: 'Negative'}).
NEUTRAL_LABEL_ID = 1

# Texts consisting solely of the campaign hashtag are labeled neutral without running the model.
HASHTAG_ONLY_TEXTS = frozenset({"kaburajadulu", "#kaburajadulu"})


def predict_sentiment_custom(text):
    """Predict the sentiment class id of a single text.

    Parameters
    ----------
    text : str
        Text to classify. Non-string values (for example the NaN that pandas
        produces for empty CSV cells, or None) are treated as having no content.

    Returns
    -------
    int
        Index of the highest-scoring class from the model, or
        ``NEUTRAL_LABEL_ID`` when the input is not a string, is blank, or is
        only the hashtag "kaburajadulu" / "#kaburajadulu" (case-insensitive,
        surrounding whitespace ignored).
    """
    # Guard first: calling .lower() on NaN/None would raise AttributeError inside Series.apply.
    if not isinstance(text, str):
        return NEUTRAL_LABEL_ID

    clean_text = text.lower().strip()

    # Nothing to classify, or only the hashtag itself -> neutral.
    if not clean_text or clean_text in HASHTAG_ONLY_TEXTS:
        return NEUTRAL_LABEL_ID

    # Regular model prediction; the text is passed to the tokenizer unmodified.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    with torch.no_grad():
        logits = model(**inputs).logits

    # softmax is monotonic, so the argmax of the raw logits is the same class.
    return logits.argmax(dim=1).item()


In [7]:
# Count the Python types present in the 'stemming' column
df['stemming'].map(type).value_counts()

stemming
<class 'str'>    16905
Name: count, dtype: int64

In [8]:
# Register tqdm's pandas integration so progress_apply shows a progress bar
tqdm.pandas()

# Predict a numeric class id for every stemmed text
df['label_num'] = df['stemming'].progress_apply(predict_sentiment_custom)

# Build the id -> text mapping from the loaded model's own config rather than typing it by hand,
# so the names cannot drift from the checkpoint. Lower-cased to keep the
# 'positive' / 'neutral' / 'negative' spelling used in the output column.
label_map = {label_id: name.lower() for label_id, name in model.config.id2label.items()}
df['pseudo_label'] = df['label_num'].map(label_map)

# Series.map yields NaN for ids missing from the mapping; fail loudly instead of keeping unlabeled rows.
assert df['pseudo_label'].notna().all(), "Some predicted class ids have no entry in label_map"

# Preview the first results
df[['stemming', 'label_num', 'pseudo_label']].head(10)



100%|██████████| 16905/16905 [13:12<00:00, 21.34it/s]


Unnamed: 0,stemming,label_num,pseudo_label
0,susul kerja kaburajadulu bawa istri wkwkwk,1,neutral
1,kaburajadulu nih hashtag,2,negative
2,kaburajadulu,1,neutral
3,eh mas adith kaburajadulu ya mohon maaf ya ya ...,1,neutral
4,resmi milik hashtag kaburajadulu mas adith sah...,1,neutral
5,undang lunch party event company terbang indon...,1,neutral
6,kali ya ngikut tren kaburajadulu,2,negative
7,keluarga hasil kaburajadulu jepang,2,negative
8,pengin ajak danilla kaburajadulu jepang,1,neutral
9,teman teman software engineer pengin kaburajad...,1,neutral


In [9]:
# Save the pseudo-labeled dataset for use in model training;
# index=False keeps the DataFrame's row index out of the CSV.
df.to_csv("../data/processed/full_cleaned_labeled.csv", index=False)
print("✅ Dataset berhasil disimpan ke full_cleaned_labeled.csv")

✅ Dataset berhasil disimpan ke full_cleaned_labeled.csv


In [10]:
# Count how many rows received each numeric label (0, 1 and 2) in the fully cleaned dataset
label_counts = df['label_num'].value_counts()

# Show the counts
print(label_counts)


label_num
2    9857
1    5748
0    1300
Name: count, dtype: int64
