<a href="https://colab.research.google.com/github/codjere/RESEARCHMETHOD-Data/blob/main/Datalama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ================================
# 1) IMPORT LIBRARY & LOAD DATA
# ================================
import pandas as pd
import numpy as np
import os, re, glob

# Input CSV files (absolute paths under /content).
csv_files = [
    "/content/data_tiktok_1.csv",
    "/content/data_tiktok_2.csv",
    "/content/data_tiktok_3.csv",
    "/content/data_tiktok_4.csv",
    "/content/data_tiktok_5.csv"
]

# Read every file as UTF-8 and fall back to Latin-1 only when the bytes are
# not valid UTF-8. Any other failure (missing file, malformed CSV, ...) is
# left to propagate instead of being hidden behind a second read attempt.
dfs = []
for file in csv_files:
    try:
        df_tmp = pd.read_csv(file, encoding="utf-8")
    except UnicodeDecodeError:
        df_tmp = pd.read_csv(file, encoding="latin-1")
    # Remember which input file each row came from.
    df_tmp["source_file"] = os.path.basename(file)
    dfs.append(df_tmp)

# Stack all files into one frame with a fresh 0..n-1 index.
raw_df = pd.concat(dfs, ignore_index=True)
print("Jumlah data sebelum preprocessing:", len(raw_df))

Jumlah data sebelum preprocessing: 1066


In [None]:
# ================================
# 2) COLUMN NORMALISATION
# ================================
# Source column name -> snake_case name used by the later cells.
rename_map = {
    "videoWebUrl": "video_url",
    "createTimeISO": "created_at",
    "text": "text",
    "uniqueId": "username",
    "uid": "user_id",
    "diggCount": "like_count",
    "replyCommentTotal": "reply_count",
    "likedByAuthor": "liked_by_author",
}

# Rename in one call, restricted to the source columns that actually exist.
present = {src: dst for src, dst in rename_map.items() if src in raw_df.columns}
raw_df = raw_df.rename(columns=present)

print("Kolom setelah normalisasi:", raw_df.columns.tolist())

Kolom setelah normalisasi: ['video_url', 'submittedVideoUrl', 'input', 'cid', 'createTime', 'created_at', 'text', 'like_count', 'liked_by_author', 'pinnedByAuthor', 'repliesToId', 'reply_count', 'user_id', 'username', 'avatarThumbnail', 'mentions', 'detailedMentions', 'source_file']


In [None]:
# ================================
# 3) SELECT THE COLUMNS THAT MATTER
# ================================
# Columns the later cells work with; any that are absent from raw_df are
# skipped instead of raising a KeyError.
wanted_cols = (
    "video_url", "created_at", "text", "username", "user_id",
    "like_count", "reply_count", "liked_by_author",
)
important_cols = [col for col in wanted_cols if col in raw_df.columns]
# Work on an independent copy so raw_df stays untouched.
df = raw_df.loc[:, important_cols].copy()
print("Kolom yang dipakai:", df.columns.tolist())

Kolom yang dipakai: ['video_url', 'created_at', 'text', 'username', 'user_id', 'like_count', 'reply_count', 'liked_by_author']


In [None]:
# ================================
# 4) TEXT CLEANING
# ================================
url_pat = re.compile(r"http\S+")
# Handles may contain dots ("@nama.user"), so dot-separated word segments are
# consumed too. A dot only matches when another word character follows it, so
# a sentence-ending period after a mention is left in place.
mention_pat = re.compile(r"@\w+(?:\.\w+)*")
ws_pat = re.compile(r"\s+")


def clean_text(s):
    """Return a normalised version of one comment.

    Steps, in order: remove anything starting with ``http`` up to the next
    whitespace, replace every ``@mention`` with the placeholder ``@user``,
    collapse runs of whitespace into a single space and trim both ends.

    Args:
        s: The raw comment. Non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as "no comment".

    Returns:
        The cleaned text, or ``""`` for missing input.
    """
    # Missing values must not become the literal words "None" / "nan".
    if s is None or (isinstance(s, float) and s != s):
        return ""
    s = str(s)
    s = url_pat.sub("", s)                # drop URLs
    s = mention_pat.sub("@user", s)       # normalise mentions
    s = ws_pat.sub(" ", s).strip()        # collapse repeated whitespace
    return s

# First cleaning pass (URLs, mentions, repeated whitespace). Missing comments
# are replaced by empty strings before clean_text sees them.
text_filled = df["text"].fillna("")
df["text_clean"] = text_filled.map(clean_text)

In [None]:
# ================================
# 5) PARSE TIMESTAMPS & DROP DUPLICATES
# ================================
# Unparseable timestamps become NaT instead of raising.
df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
# Fill missing usernames *before* casting: astype(str) would otherwise turn
# NaN into the literal string "nan", which then looks like a real account and
# slips through any later `username != ""` filter.
df["username"] = df["username"].fillna("").astype(str).str.strip()

# Keep only the first occurrence of each (username, cleaned text) pair.
df = df.drop_duplicates(
    subset=["username", "text_clean"],
    keep="first"
)
print("Jumlah data bersih:", len(df))

# Persist the cleaned dataset.
os.makedirs("data", exist_ok=True)
df.to_csv("data/tiktok_dataset_clean.csv", index=False)

Jumlah data bersih: 1058


In [None]:
# ================================
# 6) SAMPLE COMMENTS FOR MANUAL SENTIMENT LABELLING
# ================================
# Only comments that still have text after cleaning can be labelled.
df_sent_base = df[df["text_clean"].str.strip() != ""].copy()

# Draw at most N_SENTIMENT comments; the fixed seed keeps the draw repeatable.
N_SENTIMENT = 400
sent_sample = df_sent_base.sample(
    n=min(N_SENTIMENT, len(df_sent_base)),
    random_state=42
)

sent_for_label = sent_sample[
    ["created_at", "username", "text_clean",
     "like_count", "reply_count", "video_url"]
].reset_index(drop=True)

# 1-based row id plus an empty column to be filled in by hand.
sent_for_label.insert(0, "id", range(1, len(sent_for_label) + 1))
sent_for_label["sentiment_label"] = ""

# Excel fix (required): the Excel writer does not accept timezone-aware
# datetimes, so the timezone is dropped before exporting.
sent_for_label["created_at"] = (
    pd.to_datetime(sent_for_label["created_at"], errors="coerce")
    .dt.tz_localize(None)
)

# Save the labelling sheets, but never overwrite an existing one: a later cell
# reads labeling/sentiment_labels_manual.csv back as the *labelled* data, so
# re-running this cell must not wipe labels that were already entered.
os.makedirs("labeling", exist_ok=True)
sent_csv_path = "labeling/sentiment_labels_manual.csv"
sent_xlsx_path = "labeling/sentiment_labels_manual.xlsx"

if os.path.exists(sent_csv_path):
    print("Skipped (already exists, labels preserved):", sent_csv_path)
else:
    sent_for_label.to_csv(sent_csv_path, index=False)

if os.path.exists(sent_xlsx_path):
    print("Skipped (already exists, labels preserved):", sent_xlsx_path)
else:
    sent_for_label.to_excel(sent_xlsx_path, index=False)

print("Komentar untuk sentiment labeling:", len(sent_for_label))

Komentar untuk sentiment labeling: 400


In [None]:
# ================================
# 7) SAMPLE ACCOUNTS FOR MANUAL BUZZER LABELLING
# ================================
# Rows without a username cannot be attributed to an account.
df_user_base = df[df["username"] != ""].copy()

# Number of comments per account.
user_counts = (
    df_user_base
    .groupby("username", as_index=False)
    .agg(total_comments=("text_clean", "count"))
)

# Keep the N_USERS most active accounts.
N_USERS = 250
user_sample = (
    user_counts
    .sort_values("total_comments", ascending=False)
    .head(N_USERS)
)

# Earliest comment of every account, found in one sorted pass instead of
# re-filtering the whole frame once per account. Rows with a missing timestamp
# sort last; the stable sort keeps the original row order among equal times.
first_comment = (
    df_user_base
    .sort_values("created_at", kind="stable")
    .drop_duplicates(subset="username", keep="first")
    .set_index("username")["text_clean"]
)

buzzer_for_label = user_sample.reset_index(drop=True).copy()
buzzer_for_label["sample_comment"] = buzzer_for_label["username"].map(first_comment)
buzzer_for_label["buzzer_label"] = ""  # to be filled in by hand

# Save the labelling sheets, but never overwrite an existing one: a later cell
# reads labeling/buzzer_labels_manual.csv back as the *labelled* data, and
# overwriting it with blank labels leaves no labelled account to train on.
os.makedirs("labeling", exist_ok=True)
buzzer_csv_path = "labeling/buzzer_labels_manual.csv"
buzzer_xlsx_path = "labeling/buzzer_labels_manual.xlsx"

if os.path.exists(buzzer_csv_path):
    print("Skipped (already exists, labels preserved):", buzzer_csv_path)
else:
    buzzer_for_label.to_csv(buzzer_csv_path, index=False)

if os.path.exists(buzzer_xlsx_path):
    print("Skipped (already exists, labels preserved):", buzzer_xlsx_path)
else:
    buzzer_for_label.to_excel(buzzer_xlsx_path, index=False)

print("Akun untuk buzzer labeling:", len(buzzer_for_label))

Akun untuk buzzer labeling: 250


In [None]:
# ================================
# 8) LOAD MANUAL SENTIMENT & BUZZER LABELS
# ================================
df = pd.read_csv("data/tiktok_dataset_clean.csv")
# read_csv loads empty cells as NaN; restore empty strings so later string
# operations on text_clean (length, join) do not trip over floats.
df["text_clean"] = df["text_clean"].fillna("")

sentiment_manual = pd.read_csv(
    "labeling/sentiment_labels_manual.csv"
)
# Drop a stray unnamed column if present (presumably left behind by editing
# the sheet in a spreadsheet program -- harmless when it is absent).
sentiment_manual = sentiment_manual.drop(
    columns=["Unnamed: 8"], errors="ignore"
)

# One label per distinct text. The sheet can contain the same text more than
# once (different accounts posting identical comments); merging against those
# repeats would multiply rows in df and inflate every per-account count.
sentiment_lookup = (
    sentiment_manual[["text_clean", "sentiment_label"]]
    .dropna(subset=["sentiment_label"])
    .drop_duplicates(subset="text_clean", keep="first")
)
df = df.merge(
    sentiment_lookup,
    on="text_clean",
    how="left",
    validate="many_to_one"
)

buzzer_manual = pd.read_csv("labeling/buzzer_labels_manual.csv")
# Same protection for the account-level labels: one row per username.
buzzer_lookup = (
    buzzer_manual[["username", "buzzer_label"]]
    .drop_duplicates(subset="username", keep="first")
)
df = df.merge(
    buzzer_lookup,
    on="username",
    how="left",
    validate="many_to_one"
)

In [None]:
# ================================
# 9) BUZZER FEATURE ENGINEERING (XGBOOST)
# ================================
# Per-account features: comment_count, sentiment_mean, max_duplicate, length_mean

# comment_count: number of rows per account.
freq = df.groupby('username').size().reset_index(name='comment_count')
df = df.merge(freq, on='username', how='left')

sentiment_map = {"negative": -1, "neutral": 0, "positive": 1}


def _sentiment_to_num(label):
    """Map a hand-typed sentiment label to -1/0/1; anything else becomes NaN.

    Leading/trailing whitespace and letter case are ignored so that entries
    such as "Positive" or "neutral " are not silently dropped.
    """
    if isinstance(label, str):
        return sentiment_map.get(label.strip().lower(), np.nan)
    return np.nan


df['sentiment_num'] = df['sentiment_label'].map(_sentiment_to_num)

# sentiment_mean: average numeric sentiment per account (NaN when the account
# has no labelled comment).
sent_avg = df.groupby('username')['sentiment_num'].mean().reset_index(name='sentiment_mean')
df = df.merge(sent_avg, on='username', how='left')

# max_duplicate: how often an account's most repeated comment occurs.
# NOTE(review): the cleaned dataset is de-duplicated on (username, text_clean)
# before it is saved, so this is expected to be 1 for every account -- confirm
# the feature should be computed on the raw data instead.
dup = df.groupby(['username', 'text_clean']).size().reset_index(name='duplicate_count')
dup_user = dup.groupby('username')['duplicate_count'].max().reset_index(name='max_duplicate')
df = df.merge(dup_user, on='username', how='left')

# length_mean: average comment length in characters. Missing comments count
# as length 0 rather than as the three characters of the string "nan".
df['comment_length'] = df['text_clean'].fillna("").astype(str).apply(len)
length_avg = df.groupby('username')['comment_length'].mean().reset_index(name='length_mean')
df = df.merge(length_avg, on='username', how='left')

# Collapse to one row per account (every feature is constant within a group).
df_user = df.groupby('username').agg({
    'comment_count': 'first',
    'sentiment_mean': 'first',
    'max_duplicate': 'first',
    'length_mean': 'first',
    'buzzer_label': 'first'
}).reset_index()
# Only manually labelled accounts can be used for supervised training.
df_user = df_user[df_user['buzzer_label'].notna()].copy()
df_user['buzzer_label'] = df_user['buzzer_label'].astype(int)

print("Distribusi kelas (persentase) sebelum balancing:")
print(df_user['buzzer_label'].value_counts(normalize=True))

Distribusi kelas (persentase) sebelum balancing:
Series([], Name: proportion, dtype: float64)


In [None]:
# FINAL PER-ACCOUNT TABLE

# One row per username, taking the first non-missing value of each column.
user_feature_cols = [
    "comment_count",
    "sentiment_mean",
    "max_duplicate",
    "length_mean",
    "buzzer_label",
]
df_user = df.groupby("username")[user_feature_cols].first().reset_index()

# Keep labelled accounts only, then store the label as an integer.
df_user = df_user.dropna(subset=["buzzer_label"])
df_user["buzzer_label"] = df_user["buzzer_label"].astype(int)

print("Distribusi kelas:")
class_counts = df_user["buzzer_label"].value_counts()
print(class_counts)

Distribusi kelas:
Series([], Name: count, dtype: int64)


In [None]:
# Sanity check: table size and label counts, with missing labels listed too.
print("df_user shape:", df_user.shape)
label_counts = df_user["buzzer_label"].value_counts(dropna=False)
print(label_counts)

df_user shape: (0, 6)
Series([], Name: count, dtype: int64)


In [None]:
# ================================
# 10) SPLIT DATA
# ================================
from sklearn.model_selection import train_test_split

# Fail early with an actionable message: with no labelled account the split
# below raises a generic "train set will be empty" error that does not point
# at the real cause (unfilled buzzer labels).
if df_user.empty:
    raise ValueError(
        "df_user is empty: no account has a buzzer_label. Fill in "
        "labeling/buzzer_labels_manual.csv before running the split."
    )

# Feature matrix and target.
feature_cols = ["comment_count", "sentiment_mean",
                "max_duplicate", "length_mean"]
X = df_user[feature_cols]
y = df_user["buzzer_label"]

# 80/20 split, stratified so both parts keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train:", X_train.shape)
print("Test :", X_test.shape)

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
# ================================
# 11) MODELING XGBOOST (3 VARIANTS)
# ================================
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# --- a) XGBoost baseline (no imbalance handling) ---
base_params = {
    "n_estimators": 300,
    "max_depth": 5,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "logloss",
    "random_state": 42,
}
xgb_base = xgb.XGBClassifier(**base_params)

# Train on the training split, score on the held-out split.
xgb_base.fit(X_train, y_train)
pred_base = xgb_base.predict(X_test)

print("=== XGBoost Default ===")
report_base = classification_report(y_test, pred_base, digits=3)
print(report_base)

In [None]:
# --- b) XGBoost with class weighting ---
# scale_pos_weight = negatives / positives in the training split; it falls
# back to 1.0 when the split contains no positive sample.
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
if pos > 0:
    scale_pos = neg / pos
else:
    scale_pos = 1.0

weighted_params = {
    "n_estimators": 300,
    "max_depth": 5,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "logloss",
    "scale_pos_weight": scale_pos,
    "random_state": 42,
}
xgb_weight = xgb.XGBClassifier(**weighted_params)

# Train on the training split, score on the held-out split.
xgb_weight.fit(X_train, y_train)
pred_weight = xgb_weight.predict(X_test)

print("=== XGBoost + Class Weight ===")
report_weight = classification_report(y_test, pred_weight, digits=3)
print(report_weight)

In [None]:
# --- c) XGBoost + SMOTE ---
from sklearn.impute import SimpleImputer

# k_neighbors=3 means the minority class needs at least 4 training samples.
sm = SMOTE(random_state=42, k_neighbors=3)
xgb_smote = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

# XGBoost copes with NaN on its own, but SMOTE refuses input that contains
# NaN. sentiment_mean is NaN for accounts with no labelled comment, so missing
# values are filled first; 0.0 corresponds to "neutral" on the -1..1 sentiment
# scale. (The other features are presumably never missing -- verify.)
pipe_smote = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="constant", fill_value=0.0)),
    ('scaler', StandardScaler()),   # puts features on one scale for SMOTE's neighbour search
    ('smote', sm),                  # oversampling is applied during fit only
    ('clf', xgb_smote),
])
pipe_smote.fit(X_train, y_train)
pred_smote = pipe_smote.predict(X_test)
print("=== XGBoost + SMOTE ===")
print(classification_report(y_test, pred_smote, digits=3))

In [None]:
# ================================
# 12) FEATURE IMPORTANCE
# ================================
import matplotlib.pyplot as plt

# Gain-based importance of the class-weighted model, as {feature: score}.
importance_dict = xgb_weight.get_booster().get_score(importance_type="gain")

# Two-column table, sorted ascending so the largest bar ends up on top.
importance_df = pd.DataFrame({
    "Feature": list(importance_dict.keys()),
    "Importance": list(importance_dict.values()),
})
importance_df = importance_df.sort_values(by="Importance", ascending=True)

# Figure height grows with the number of features, but is never below 6.
fig_height = max(6, len(importance_df) * 0.3)
plt.figure(figsize=(10, fig_height))
plt.barh(importance_df["Feature"], importance_df["Importance"])
plt.xlabel("Feature Importance (Gain)")
plt.ylabel("Feature")
plt.title("XGBoost Feature Importance (Weighted Class)")
plt.tight_layout()
plt.show()


In [None]:
# ================================
# 13) KLASIFIKASI BUZZER PAKAI INDOBERTWEET
# ================================
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

import inspect

from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer

# One document per account: all of its cleaned comments joined by spaces.
# Missing comments (NaN, e.g. empty cells after reloading the CSV) are skipped
# so that join() only ever sees strings.
df_text_user = (
    df.groupby('username')['text_clean']
    .apply(lambda x: " ".join(x.dropna().astype(str)))
    .reset_index()
)
df_text_user = df_text_user.merge(df_user[['username', 'buzzer_label']], on='username', how='left')
df_text_user = df_text_user[df_text_user['buzzer_label'].notna()].copy()
# The left merge leaves NaN for unlabelled accounts, which turns the column
# into floats; the classifier needs integer class ids.
df_text_user['buzzer_label'] = df_text_user['buzzer_label'].astype(int)

# Train/test split (80/20, stratified on the label).
X_text = df_text_user['text_clean'].tolist()
y_text = df_text_user['buzzer_label'].tolist()
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(
    X_text, y_text, test_size=0.2, random_state=42, stratify=y_text
)

MODEL = "indolem/indobertweet-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)


class BuzzerDataset(Dataset):
    """Torch dataset pairing account texts with their buzzer labels.

    Texts are tokenised on access with the module-level ``tokenizer``,
    truncated/padded to 128 tokens.
    """

    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        encoded = tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=128,
            padding="max_length",
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading batch dimension of 1; remove it.
        item = {k: v.squeeze(0) for k, v in encoded.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


def compute_metrics(pred):
    """Return accuracy and F1 for one evaluation pass of the Trainer."""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {"accuracy": acc, "f1": f1}


train_dataset = BuzzerDataset(X_train_text, y_train_text)
test_dataset = BuzzerDataset(X_test_text, y_test_text)

# Newer transformers releases call this argument `eval_strategy`, older ones
# `evaluation_strategy`; use whichever the installed version accepts.
_ta_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_kw = "eval_strategy" if "eval_strategy" in _ta_params else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    save_strategy="no",
    logging_steps=10,
    seed=42,
    **{_eval_kw: "epoch"}
)

# compute_metrics has to be handed to the Trainer; merely defining it (as
# before, after training) leaves evaluate() reporting the loss only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

# Evaluate on the held-out accounts.
results = trainer.evaluate()
print("=== IndoBERTweet Classification Buzzer ===")
print(results)