In [None]:
!pip install numpy==1.26.4 --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0m

# Import Libraries

In [None]:
import os

# Set the WANDB_DISABLED flag for this process; this cell runs ahead of the
# cell that imports transformers.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
import pandas as pd
import gdown
import re
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from datasets import DatasetDict

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

import torch
import numpy as np
import shutil

# Dataset

## Sample

In [None]:
# Source file:
# https://drive.google.com/file/d/13mfZ7ftVwnm_x_vd11FERn3jZ7mpfxAB/view?usp=sharing
# The Drive file id is stored as `sample_file_id` rather than `id`, so the
# builtin id() is not shadowed for the rest of the notebook.
sample_file_id = "13mfZ7ftVwnm_x_vd11FERn3jZ7mpfxAB"
gdown.download(id=sample_file_id, output='sample_reviews.csv', quiet=False)

Downloading...
From: https://drive.google.com/uc?id=13mfZ7ftVwnm_x_vd11FERn3jZ7mpfxAB
To: /content/sample_reviews.csv
100%|██████████| 1.21M/1.21M [00:00<00:00, 109MB/s]


'sample_reviews.csv'

In [None]:
# Read the sample reviews, keep the four columns used downstream, and rename
# the text/label columns to the names used in the rest of the notebook.
df_sample = (
    pd.read_csv('sample_reviews.csv')[['title', 'username', 'clean_text', 'sentiment']]
    .rename(columns={'clean_text': 'Text', 'sentiment': 'Sentiment'})
)
df_sample.head()

Unnamed: 0,title,username,Text,Sentiment
0,Cargo,moviemenfes,this mvs is very sad nangiiis in this scene th...,Positive
1,Layar,bicaraboxoffice,if for example there is one film that aired on...,Neutral
2,What If,moviemenfes,mvs what if today? how come it s not in disney...,Neutral
3,Layar,WatchmenID,for those who miss seeing arini on the big screen,Neutral
4,Lightyear,ErikDavis,new trailer chris evans stars as buzz in a new...,Neutral


## Full Data

In [None]:
# Source file:
# https://drive.google.com/file/d/1qJzBf6VtIYgFepTX8yL7N-k3_zjz2UJz/view?usp=sharing
# The Drive file id is stored as `full_file_id` rather than `id`, so the
# builtin id() is not shadowed for the rest of the notebook.
full_file_id = "1qJzBf6VtIYgFepTX8yL7N-k3_zjz2UJz"
gdown.download(id=full_file_id, output='distilbert_predicted_sentiment.csv', quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1qJzBf6VtIYgFepTX8yL7N-k3_zjz2UJz
To: /content/distilbert_predicted_sentiment.csv
100%|██████████| 9.11M/9.11M [00:00<00:00, 33.6MB/s]


'distilbert_predicted_sentiment.csv'

In [None]:
# Read the full data set, rename its text/label columns to 'Text' and
# 'Sentiment', then keep the same four columns as the sample frame.
df_left = (
    pd.read_csv('distilbert_predicted_sentiment.csv')
    .rename(columns={'cleaned_text': 'Text', 'sentiment': 'Sentiment'})
    [['title', 'username', 'Text', 'Sentiment']]
)
df_left.head()

Unnamed: 0,title,username,Text,Sentiment
0,Tanda Tanya,AnakNonton,update wow! the enlightenment dara s house and...,Positive
1,Village,AnakNonton,hmm maybe a little longer try a little a littl...,Positive
2,Layar,AnakNonton,news june 575 cinema 21 screens use digital te...,Neutral
3,Layar,AnakNonton,film it by stephen king will be made a big scr...,Positive
4,Layar,AnakNonton,sony pictures and ubisoft are ready to lift th...,Positive


In [None]:
# Stack the sample rows on top of the full-data rows and renumber the index.
frames_to_stack = [df_sample, df_left]
df = pd.concat(frames_to_stack, ignore_index=True)
df

Unnamed: 0,title,username,Text,Sentiment
0,Cargo,moviemenfes,this mvs is very sad nangiiis in this scene th...,Positive
1,Layar,bicaraboxoffice,if for example there is one film that aired on...,Neutral
2,What If,moviemenfes,mvs what if today? how come it s not in disney...,Neutral
3,Layar,WatchmenID,for those who miss seeing arini on the big screen,Neutral
4,Lightyear,ErikDavis,new trailer chris evans stars as buzz in a new...,Neutral
...,...,...,...,...
34081,Rise,zavvi,the droids hold a special place in my heart an...,Positive
34082,Rise,zavvi,okay campers rise and shine! this sunday at 7p...,Positive
34083,Rise,zavvi,star wars episode ix the rise of skywalker zav...,Positive
34084,Seasons,zavvi,season 1 6 blu ray boxset only 69 99 in our !,Negative


# Preprocessing

In [None]:
# Filter the rows and encode the sentiment labels as integers.
label_map = {"Negative": 0, "Neutral": 1, "Positive": 2}

# Keep only complete rows whose sentiment is one of the known classes.
# .copy() makes the filtered result an independent frame, so the column
# assignment below never writes into a slice of another frame (which pandas
# may flag with SettingWithCopyWarning).
df = df[['title', 'username', 'Text', 'Sentiment']].dropna()
df = df[df['Sentiment'].isin(label_map.keys())].copy()

df['label'] = df['Sentiment'].map(label_map)

In [None]:
# Show how many rows fall into each sentiment class.
sentiment_counts = df['Sentiment'].value_counts()
print(sentiment_counts)

Sentiment
Positive    17962
Negative    14316
Neutral      1790
Name: count, dtype: int64


In [None]:
# Compute 'balanced' class weights from the integer labels and store them as a
# float tensor for use in the loss.
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df['label']),
    y=df['label'],
)
class_weights = torch.tensor(balanced_weights, dtype=torch.float)

print("Class weights:", class_weights)

Class weights: tensor([0.7932, 6.3441, 0.6322])


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split stratified on the label column; the fixed random_state keeps
# the split the same from run to run.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [None]:
# NOTE: this import rebinds the name `Dataset` (imported earlier from
# torch.utils.data) to datasets.Dataset for every cell that runs after it.
from datasets import DatasetDict, Dataset

# Build one Hugging Face dataset per split from the 'Text' and 'label' columns.
hf_dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame[['Text', 'label']].reset_index(drop=True))
    for split_name, frame in (("train", train_df), ("test", test_df))
})

# Fine-tune DistilBERT

In [None]:
# Tokenizer
model_checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)


def tokenize_function(example):
    """Tokenize the "Text" field with max-length padding and truncation at 128."""
    return tokenizer(
        example["Text"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )


# Apply the tokenizer to every split in batches, then expose the model inputs
# and the label column as torch tensors.
tokenized_datasets = hf_dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'label'],
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/27254 [00:00<?, ? examples/s]

Map:   0%|          | 0/6814 [00:00<?, ? examples/s]

In [None]:
# Load the pretrained checkpoint with a classification head sized from
# label_map (instead of a hard-coded 3), and record the label names in the
# model config so the saved model carries the Negative/Neutral/Positive names
# alongside the integer ids.
model = DistilBertForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_map),
    id2label={index: name for name, index in label_map.items()},
    label2id=dict(label_map),
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration; arguments are grouped by purpose.
training_args = TrainingArguments(
    # Output locations and reporting.
    output_dir="./distilbert_sentiment_model",
    logging_dir="./logs_distilbert",
    report_to="none",
    # Optimisation settings.
    num_train_epochs=7,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save once per epoch, with a checkpoint limit of two.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # At the end of training, load the best model as judged by "accuracy".
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [None]:
from transformers import Trainer


class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the global `class_weights`."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the class-weighted cross-entropy loss for one batch.

        Args:
            model: the model to run; called as ``model(**inputs)``.
            inputs: batch mapping; its "labels" entry is popped before the
                forward pass and used as the loss target.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                the loss alone.
            **kwargs: any extra arguments passed by the caller; not used here.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Place the weights on the device of the logits rather than reading
        # `model.device`: a wrapped model (e.g. torch.nn.DataParallel) does
        # not expose a `.device` attribute, while the logits always carry one.
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Metrics
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a ``(logits, labels)`` pair.

    The predicted class of each row is the argmax over the last axis of the
    logits. Returns a dict with the keys "accuracy" and "f1".
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1"] = f1_score(references, predicted, average='weighted')
    return metrics

In [None]:
# Trainer
# NOTE(review): the "test" split is also the evaluation set that drives early
# stopping and best-model selection, so the metrics reported on it are not
# from held-out data; consider carving out a separate validation split.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # triggered a warning when this cell was run.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    # Stop once the monitored metric has failed to improve for 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

  trainer = WeightedTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5709,0.510048,0.880393,0.884395
2,0.5004,0.636851,0.888465,0.88709
3,0.4015,0.675436,0.893308,0.893103
4,0.3566,0.777435,0.889786,0.892034
5,0.2187,0.891094,0.889639,0.891794


TrainOutput(global_step=8520, training_loss=0.40393934809545956, metrics={'train_runtime': 1654.264, 'train_samples_per_second': 115.325, 'train_steps_per_second': 7.21, 'total_flos': 4512913583685120.0, 'train_loss': 0.40393934809545956, 'epoch': 5.0})

In [None]:
# Run evaluation with the trainer and report the accuracy as a percentage.
eval_results = trainer.evaluate()
accuracy_percent = 100 * eval_results['eval_accuracy']
print(f"Distilbert Model Accuracy: {accuracy_percent:.2f}%")

Distilbert Model Accuracy: 89.33%


In [None]:
# Save model
# The model and the tokenizer are written to the same directory.
save_dir = "./distilbert_sentiment_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

print("\nFine-tuned distilbert model saved to ./distilbert_sentiment_model")


Fine-tuned distilbert model saved to ./distilbert_sentiment_model


In [None]:
import shutil

# Zip the saved model directory into distilbert_sentiment_model.zip.
# NOTE(review): this directory is also the Trainer `output_dir`, so any
# checkpoint sub-folders kept there are presumably included in the archive —
# confirm, and save to a separate directory if only the final model is wanted.
shutil.make_archive('distilbert_sentiment_model', 'zip', 'distilbert_sentiment_model')
# The message names DistilBERT, which is the model actually fine-tuned here.
print("\nFine-tuned DistilBERT model saved and zipped as 'distilbert_sentiment_model.zip'")


Fine-tuned BERT model saved and zipped as 'distilbert_sentiment_model.zip'


# Sentiment Score

In [None]:
from sklearn.preprocessing import LabelEncoder

# Re-encode the sentiment strings as integer labels for the prediction pass.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['Sentiment'])

# The model was trained on the ids in label_map. The encoder assigns its own
# ids independently, so check that both mappings agree before overwriting the
# 'label' column; otherwise the labels would silently disagree with the model.
encoder_mapping = {str(name): index for index, name in enumerate(label_encoder.classes_)}
if encoder_mapping != label_map:
    raise ValueError(
        f"LabelEncoder mapping {encoder_mapping} does not match label_map {label_map}"
    )

df['label'] = encoded_labels

In [None]:
# Tokenize every text in the frame, with padding enabled and truncation at
# 128 tokens.
all_texts = df['Text'].tolist()
full_encodings = tokenizer(all_texts, truncation=True, padding=True, max_length=128)

In [None]:
# Custom dataset for prediction
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style torch dataset over tokenizer encodings and integer labels.

    The base class is spelled out as ``torch.utils.data.Dataset`` because the
    bare name ``Dataset`` is rebound to ``datasets.Dataset`` by a later import
    in this notebook, so which class it means depends on cell order.
    """

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-example sequences.
        # labels: one label per example, indexable by position.
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the tensors of example `idx`, with its label under 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label list."""
        return len(self.labels)

In [None]:
# Build the prediction dataset straight from the tokenizer output.
# datasets.Dataset is imported under an explicit alias because the bare name
# `Dataset` is bound to two different classes over the course of this notebook
# (torch.utils.data.Dataset first, datasets.Dataset later).
from datasets import Dataset as HFDataset

full_dataset_for_prediction = HFDataset.from_dict({
    'input_ids': full_encodings['input_ids'],
    'attention_mask': full_encodings['attention_mask'],
    'labels': df['label'].tolist(),
})

# Use this dataset for prediction
predictions = trainer.predict(full_dataset_for_prediction)
logits = predictions.predictions

# Convert logits to probabilities using softmax
probabilities = torch.softmax(torch.tensor(logits), dim=1).numpy()

# Polarity score: probability-weighted average of the per-class scores
# (class 0 -> 1, class 1 -> 3, class 2 -> 5), computed for all rows at once.
class_scores = np.array([1, 3, 5], dtype=probabilities.dtype)
distilbert_scores = probabilities @ class_scores

# The scores are assigned by position, so there must be exactly one per row.
assert len(distilbert_scores) == len(df), "prediction count does not match row count"

# Add the score to the DataFrame
df['sentiment_score'] = distilbert_scores

In [None]:
# List the columns now present in the frame.
current_columns = df.columns
print(current_columns)


Index(['title', 'username', 'Text', 'Sentiment', 'label', 'sentiment_score'], dtype='object')


In [None]:
# Preview the identifying columns next to the new sentiment score.
preview_columns = ['title', 'username', 'Text', 'Sentiment', 'sentiment_score']
df[preview_columns]

Unnamed: 0,title,username,Text,Sentiment,sentiment_score
0,Cargo,moviemenfes,this mvs is very sad nangiiis in this scene th...,Positive,4.971964
1,Layar,bicaraboxoffice,if for example there is one film that aired on...,Neutral,1.021166
2,What If,moviemenfes,mvs what if today? how come it s not in disney...,Neutral,1.014182
3,Layar,WatchmenID,for those who miss seeing arini on the big screen,Neutral,3.042384
4,Lightyear,ErikDavis,new trailer chris evans stars as buzz in a new...,Neutral,1.935174
...,...,...,...,...,...
34081,Rise,zavvi,the droids hold a special place in my heart an...,Positive,4.986464
34082,Rise,zavvi,okay campers rise and shine! this sunday at 7p...,Positive,4.981035
34083,Rise,zavvi,star wars episode ix the rise of skywalker zav...,Positive,4.919442
34084,Seasons,zavvi,season 1 6 blu ray boxset only 69 99 in our !,Negative,1.067592


In [None]:
# Write the selected columns to CSV without the index column.
export_columns = ['title', 'username', 'Text', 'Sentiment', 'sentiment_score']
df[export_columns].to_csv('distilbert_sentiment_scores.csv', index=False)

In [None]:
from google.colab import files

# Hand the exported CSV to the Colab download helper.
scores_csv = 'distilbert_sentiment_scores.csv'
files.download(scores_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>