In [None]:
# Step 1: Install library
!pip install datasets transformers huggingface_hub --quiet
!pip install -U datasets

# Step 2: Import library
import tensorflow as tf
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer
from datasets import load_dataset
from huggingface_hub import notebook_login, create_repo, upload_folder



In [None]:
# Step 3: Log in to Hugging Face
notebook_login()  # Enter a token from https://huggingface.co/settings/tokens


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Step 4: Load the "train" split of the emotion dataset and hold out 20% of it
# (fixed seed, so the split is repeatable) to use as validation data.
dataset = load_dataset("dair-ai/emotion", split="train").train_test_split(
    test_size=0.2, seed=42
)
train_data, val_data = dataset["train"], dataset["test"]

# Step 5: Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Step 6: Tokenization
def tokenize(example):
    """Tokenize the "text" field of ``example``.

    Truncates to at most 128 tokens and pads to that fixed length
    (``padding="max_length"``). Not called by the encoding code below, which
    pads to the longest sequence in each split instead.
    """
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=128)

# Read the "text" column directly instead of looping over the dataset row by
# row; list() hands the tokenizer a plain list of strings.
train_encodings = tokenizer(list(train_data["text"]), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(list(val_data["text"]), truncation=True, padding=True, max_length=128)

# Step 7: Convert to tf.data.Dataset
# Labels are read as whole columns instead of iterating the datasets row by row.
train_labels = list(train_data["label"])
val_labels = list(val_data["label"])

# The shuffle buffer spans the whole training set so every epoch gets a uniform
# reshuffle; a smaller buffer only mixes examples that are close together.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
    .shuffle(len(train_labels))
    .batch(16)
)

# Validation data keeps its order; it is only batched.
val_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(val_encodings), val_labels)
).batch(16)

# Step 8: Load the model and compile it
# Class names come from the dataset's label feature, so the number of output
# classes is not hard-coded and the saved config maps class ids to names.
# Assumes "label" is a ClassLabel feature — TODO confirm if the dataset changes.
label_names = train_data.features["label"].names
model = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_names),
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# from_logits=True: the loss is given raw logits rather than probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Step 9: Train for three epochs, evaluating on the validation set each epoch
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
)

# Step 10: Save the model and the tokenizer into the same local folder
output_dir = "emotion-model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


README.md:   0%|          | 0.00/9.05k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/127k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/129k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/3
Epoch 2/3
Epoch 3/3


('emotion-model/tokenizer_config.json',
 'emotion-model/special_tokens_map.json',
 'emotion-model/vocab.txt',
 'emotion-model/added_tokens.json')

In [None]:
# Step 11: Upload to Hugging Face
# Replace "farizkuy" with your own Hugging Face username.
username = "farizkuy"
repo_name = "emotion_tf"
# Built from the two parts above; resolves to "farizkuy/emotion_tf".
model_id = f"{username}/{repo_name}"

# Create the repo if it does not exist yet
create_repo(repo_id=model_id, exist_ok=True)

# Upload the saved model folder
upload_folder(
    folder_path="emotion-model",
    repo_id=model_id,
    commit_message="Upload DistilBERT emotion classifier (TF)"
)


tf_model.h5:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/farizkuy/emotion_tf/commit/29c7d887b54958399af89506e244750bd831574c', commit_message='Upload DistilBERT emotion classifier (TF)', commit_description='', oid='29c7d887b54958399af89506e244750bd831574c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/farizkuy/emotion_tf', endpoint='https://huggingface.co', repo_type='model', repo_id='farizkuy/emotion_tf'), pr_revision=None, pr_num=None)