In [1]:
# IMPORT LIBRARIES
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.semi_supervised import SelfTrainingClassifier
from scipy.sparse import vstack

# 1. LOAD & PREPARE DATA
# Rows whose 'labelContent' is NaN are treated as the unlabeled pool.
data = pd.read_csv("/content/labeled_data.csv")
# Presumably cast to object so the real labels and the -1 placeholder used
# for unlabeled rows can live in one target array (SelfTrainingClassifier
# rejects string-dtype targets) -- confirm against the actual label values.
data['labelContent'] = data['labelContent'].astype('object')
labeled_data = data.dropna(subset=['labelContent'])
# Explicit copy: this frame is written to in step 6. Assigning into a bare
# filtered selection of `data` raises pandas' SettingWithCopyWarning.
unlabeled_data = data[data['labelContent'].isna()].copy()
print(f"Total data berlabel     : {len(labeled_data)}")
print(f"Total data tak berlabel : {len(unlabeled_data)}")

# 2. TF-IDF TRANSFORMATION
# The vocabulary / IDF weights are fitted on the labeled text only and then
# reused to vectorize the unlabeled text, so both share one feature space.
tfidf = TfidfVectorizer()
train_features = tfidf.fit_transform(labeled_data['content'])
train_labels   = labeled_data['labelContent']
test_features  = tfidf.transform(unlabeled_data['content'])

# 3. SELF-TRAINING MODEL SETUP
# Logistic regression is the base learner wrapped by the self-training loop;
# threshold=0.75 is the confidence cut-off passed to SelfTrainingClassifier.
# NOTE(review): `base_estimator` is deprecated in newer scikit-learn releases
# in favour of `estimator`; kept here because the installed version is unknown.
# NOTE(review): the recorded run ends at iteration 10 while still adding
# labels, which looks like the default iteration cap -- consider setting
# max_iter on SelfTrainingClassifier explicitly if more rounds are wanted.
log_model = LogisticRegression(max_iter=250)
semi_supervised_model = SelfTrainingClassifier(
    base_estimator=log_model,
    threshold=0.75,
    verbose=True
)

# 4. COMBINE LABELED & UNLABELED
# Unlabeled rows get the label -1, as the Self-Training API requires.
placeholder_labels = np.full(len(unlabeled_data), -1)
# Row order must match between features and labels: labeled first, then unlabeled.
combined_features = vstack([train_features, test_features])
combined_labels   = np.concatenate([train_labels, placeholder_labels])

# 5. TRAIN SELF-TRAINING
semi_supervised_model.fit(combined_features, combined_labels)
print("\n>>> Training Self-Training Selesai <<<\n")

# 6. GENERATE PSEUDO-LABEL
# predict() labels every unlabeled row, including rows that never passed the
# confidence threshold during training.
predicted_labels = semi_supervised_model.predict(test_features)
unlabeled_data['labelContent'] = predicted_labels
# Merge the originally labeled rows with the pseudo-labeled rows. The index is
# reset, so the output lists labeled rows first rather than the input order.
final_output = pd.concat([labeled_data, unlabeled_data], ignore_index=True)

# 7. SAVE OUTPUT
output_name = "data_hasil_selftraining.csv"
final_output.to_csv(output_name, index=False)
print(f"File output disimpan sebagai: {output_name}")


Total data berlabel     : 250
Total data tak berlabel : 750
End of iteration 1, added 250 new labels.
End of iteration 2, added 59 new labels.
End of iteration 3, added 22 new labels.
End of iteration 4, added 14 new labels.
End of iteration 5, added 5 new labels.
End of iteration 6, added 2 new labels.


  warn(


End of iteration 7, added 6 new labels.
End of iteration 8, added 1 new labels.
End of iteration 9, added 2 new labels.
End of iteration 10, added 2 new labels.

>>> Training Self-Training Selesai <<<

File output disimpan sebagai: data_hasil_selftraining.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled_data['labelContent'] = predicted_labels
