In [None]:
!pip install numpy==1.26.4 --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m107.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0m

# Import Libraries

In [None]:
import os
# Set WANDB_DISABLED in the process environment before the transformers
# imports in the following cell run.
# Presumably this switches off the Weights & Biases integration — TODO confirm
# it is still needed, since TrainingArguments below also passes report_to="none".
os.environ["WANDB_DISABLED"] = "true"

In [None]:
import pandas as pd
import gdown
import re
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from datasets import DatasetDict
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
import torch
import numpy as np
import shutil

# Dataset

## Sample

In [None]:
# Sample reviews CSV on Google Drive:
# https://drive.google.com/file/d/13mfZ7ftVwnm_x_vd11FERn3jZ7mpfxAB/view?usp=sharing
# The Drive file id gets its own name so the builtin `id` is not shadowed.
sample_file_id = "13mfZ7ftVwnm_x_vd11FERn3jZ7mpfxAB"
gdown.download(id=sample_file_id, output='sample_reviews.csv', quiet=False)

Downloading...
From: https://drive.google.com/uc?id=13mfZ7ftVwnm_x_vd11FERn3jZ7mpfxAB
To: /content/sample_reviews.csv
100%|██████████| 1.21M/1.21M [00:00<00:00, 12.0MB/s]


'sample_reviews.csv'

In [None]:
# Read the downloaded sample, keep the four columns used below, and rename the
# text/label columns to the 'Text' / 'Sentiment' names the rest of the notebook uses.
sample_columns = ['title', 'username', 'clean_text', 'sentiment']
sample_renames = {'clean_text': 'Text', 'sentiment': 'Sentiment'}
df_sample = pd.read_csv('sample_reviews.csv')[sample_columns].rename(columns=sample_renames)
df_sample.head()

Unnamed: 0,title,username,Text,Sentiment
0,Cargo,moviemenfes,this mvs is very sad nangiiis in this scene th...,Positive
1,Layar,bicaraboxoffice,if for example there is one film that aired on...,Neutral
2,What If,moviemenfes,mvs what if today? how come it s not in disney...,Neutral
3,Layar,WatchmenID,for those who miss seeing arini on the big screen,Neutral
4,Lightyear,ErikDavis,new trailer chris evans stars as buzz in a new...,Neutral


## Full Data

In [None]:
# Remaining ("left") reviews CSV on Google Drive:
# https://drive.google.com/file/d/1PqocXgtELsgfl-5LtrbTFSuhLhamuQ_X/view?usp=sharing
# The Drive file id gets its own name so the builtin `id` is not shadowed.
left_file_id = "1PqocXgtELsgfl-5LtrbTFSuhLhamuQ_X"
gdown.download(id=left_file_id, output='left_data.csv', quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1PqocXgtELsgfl-5LtrbTFSuhLhamuQ_X
To: /content/left_data.csv
100%|██████████| 4.47M/4.47M [00:00<00:00, 31.7MB/s]


'left_data.csv'

In [None]:
# Load the second CSV downloaded above and preview its first rows.
left_csv_path = 'left_data.csv'
df_left = pd.read_csv(left_csv_path)
df_left.head()

Unnamed: 0,title,username,Text,Predicted_Sentiment
0,Tanda Tanya,AnakNonton,update wow! the enlightenment dara s house and...,Positive
1,Village,AnakNonton,hmm maybe a little longer try a little a littl...,Negative
2,Layar,AnakNonton,news june 575 cinema 21 screens use digital te...,Positive
3,Layar,AnakNonton,film it by stephen king will be made a big scr...,Neutral
4,Layar,AnakNonton,sony pictures and ubisoft are ready to lift th...,Positive


In [None]:
# Give the label column the same name ("Sentiment") that df_sample uses so the
# two frames line up when concatenated.
# NOTE(review): the source column is called "Predicted_Sentiment", which suggests
# these labels are model predictions rather than annotations — confirm before
# treating them as ground truth for training/evaluation.
df_left = df_left.rename(columns={"Predicted_Sentiment": "Sentiment"})

In [None]:
# Stack the sample and the remaining reviews into one frame with a fresh 0..n-1 index.
review_frames = [df_sample, df_left]
df = pd.concat(review_frames, ignore_index=True)
df

Unnamed: 0,title,username,Text,Sentiment
0,Cargo,moviemenfes,this mvs is very sad nangiiis in this scene th...,Positive
1,Layar,bicaraboxoffice,if for example there is one film that aired on...,Neutral
2,What If,moviemenfes,mvs what if today? how come it s not in disney...,Neutral
3,Layar,WatchmenID,for those who miss seeing arini on the big screen,Neutral
4,Lightyear,ErikDavis,new trailer chris evans stars as buzz in a new...,Neutral
...,...,...,...,...
33133,Rise,zavvi,the droids hold a special place in my heart an...,Positive
33134,Rise,zavvi,okay campers rise and shine! this sunday at 7p...,Positive
33135,Rise,zavvi,star wars episode ix the rise of skywalker zav...,Neutral
33136,Seasons,zavvi,season 1 6 blu ray boxset only 69 99 in our !,Neutral


# Preprocessing

In [None]:
# Filter rows and encode the sentiment label as an integer.
label_map = {"Negative": 0, "Neutral": 1, "Positive": 2}

# Done as one chain that produces a new frame: adding a column to the result of
# a boolean filter with `df['label'] = ...` can raise SettingWithCopyWarning.
# Steps: keep the needed columns, drop rows with missing values, keep only the
# sentiments present in label_map, then derive the integer `label` column.
df = (
    df[['title', 'username', 'Text', 'Sentiment']]
    .dropna()
    .loc[lambda frame: frame['Sentiment'].isin(list(label_map))]
    .assign(label=lambda frame: frame['Sentiment'].map(label_map))
)

In [None]:
# Show how many rows each Sentiment value has after filtering (class balance check).
print(df['Sentiment'].value_counts())

Sentiment
Negative    15664
Positive    13513
Neutral      3959
Name: count, dtype: int64


In [None]:
# Compute class weights ('balanced' mode) from the integer labels and keep them
# as a float tensor for the weighted loss.
# NOTE(review): the weights are computed on the full frame, before the
# train/test split below — confirm that is intended.
all_labels = df['label']
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(all_labels),
    y=all_labels,
)
class_weights = torch.tensor(balanced_weights, dtype=torch.float)

print("Class weights:", class_weights)

Class weights: tensor([0.7051, 2.7899, 0.8174])


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the integer label, with a fixed seed.
# NOTE(review): only two splits are made; the "test" split is later passed to the
# trainer as eval_dataset, so it also drives best-model selection and early stopping.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [None]:
# Note: this import rebinds the name `Dataset` (imported earlier from
# torch.utils.data) to datasets.Dataset for the rest of the notebook.
from datasets import DatasetDict, Dataset

# Wrap each split as a datasets.Dataset holding only the text and its label;
# the pandas index is reset so it is not carried along.
model_columns = ['Text', 'label']
hf_dataset = DatasetDict({
    split_name: Dataset.from_pandas(split_df[model_columns].reset_index(drop=True))
    for split_name, split_df in (("train", train_df), ("test", test_df))
})

# Fine-tune BERT

In [None]:
# Tokenizer
model_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(example):
    """Tokenize the "Text" field, truncating or padding every sequence to 128 tokens.

    Called by ``DatasetDict.map`` below with ``batched=True``.
    """
    texts = example["Text"]
    return tokenizer(texts, max_length=128, truncation=True, padding="max_length")

# Apply tokenization to every split, then expose only the listed columns as torch tensors
tokenized_datasets = hf_dataset.map(tokenize_function, batched=True)
tensor_columns = ['input_ids', 'attention_mask', 'label']
tokenized_datasets.set_format(type='torch', columns=tensor_columns)

Map:   0%|          | 0/26508 [00:00<?, ? examples/s]

Map:   0%|          | 0/6628 [00:00<?, ? examples/s]

In [None]:
# Load Model
# The number of classes and the id<->label tables are derived from label_map so
# the classification head cannot drift from the label encoding, and the saved
# config carries the sentiment names alongside the class indices.
id2label = {index: name for name, index in label_map.items()}
model = BertForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_map),
    id2label=id2label,
    label2id=dict(label_map),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration, grouped by concern and then unpacked into TrainingArguments.
training_config = {
    # where checkpoints and logs go; keep at most two checkpoints
    "output_dir": "./bert_sentiment_model_fixed",
    "logging_dir": "./logs_fixed",
    "save_total_limit": 2,
    # evaluate and save once per epoch so the best checkpoint can be restored
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "accuracy",
    # optimisation hyper-parameters
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "num_train_epochs": 7,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    # no external experiment tracker
    "report_to": "none",
}
training_args = TrainingArguments(**training_config)

In [None]:
from transformers import Trainer

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the notebook-level ``class_weights`` tensor."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: the model (or a wrapper around it) to run the forward pass with.
            inputs: batch dict; its "labels" entry is removed before the forward
                pass so the model does not compute its own unweighted loss.
            return_outputs: when true, return ``(loss, outputs)`` instead of just the loss.
            **kwargs: extra arguments passed by the Trainer; accepted and ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Take device and dtype from the logits instead of `model.device`: `model`
        # may be a wrapper (e.g. torch.nn.DataParallel) without a `.device` attribute.
        weights = class_weights.to(device=logits.device, dtype=logits.dtype)
        loss = torch.nn.functional.cross_entropy(
            logits.view(-1, logits.size(-1)),
            labels.view(-1),
            weight=weights,
        )
        return (loss, outputs) if return_outputs else loss

In [None]:
# Metrics
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a ``(logits, labels)`` evaluation pair.

    The predicted class of each row is the argmax over the last axis of the logits.
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["f1"] = f1_score(gold, predicted, average='weighted')
    return metrics

In [None]:
# Trainer
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
# NOTE(review): eval_dataset is the "test" split, so early stopping and
# best-checkpoint selection are driven by the same data that is later reported
# as the model's accuracy — consider a separate validation split.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    # stop when the monitored metric has not improved for 2 consecutive evaluations
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

  trainer = WeightedTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4985,0.443846,0.851388,0.857238
2,0.351,0.572673,0.851086,0.853802
3,0.2537,0.721859,0.858177,0.858552
4,0.159,0.929312,0.874623,0.871449
5,0.0954,1.026483,0.86029,0.860979
6,0.0661,1.110102,0.870549,0.86873


TrainOutput(global_step=9942, training_loss=0.24777503407857615, metrics={'train_runtime': 5332.8792, 'train_samples_per_second': 34.795, 'train_steps_per_second': 2.175, 'total_flos': 1.0461915715700736e+16, 'train_loss': 0.24777503407857615, 'epoch': 6.0})

In [None]:
# Evaluate on the trainer's eval_dataset and report accuracy as a percentage.
eval_results = trainer.evaluate()
accuracy_percent = 100 * eval_results['eval_accuracy']
print(f"BERT Model Accuracy: {accuracy_percent:.2f}%")

BERT Model Accuracy: 87.46%


In [None]:
# Save model and tokenizer side by side in the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./bert_sentiment_model")

print("\nFine-tuned BERT model saved to ./bert_sentiment_model")


Fine-tuned BERT model saved to ./bert_sentiment_model


In [None]:
import shutil

# Zip the saved model directory into bert_sentiment_model.zip in the working directory.
shutil.make_archive(
    base_name='bert_sentiment_model',
    format='zip',
    root_dir='bert_sentiment_model',
)
print("\nFine-tuned BERT model saved and zipped as 'bert_sentiment_model.zip'")


Fine-tuned BERT model saved and zipped as 'bert_sentiment_model.zip'


# Sentiment Score

In [None]:
from sklearn.preprocessing import LabelEncoder

# The fitted encoder is kept available under its original name, but the integer
# labels are taken from label_map — the encoding the model was trained with.
# LabelEncoder numbers classes in sorted order, which only matches label_map
# because the sentiment names happen to sort the same way.
label_encoder = LabelEncoder()
label_encoder.fit(df['Sentiment'])
df['label'] = df['Sentiment'].map(label_map)

In [None]:
# Tokenize every text in the DataFrame (truncate at 128 tokens, pad to the longest sequence)
all_texts = df['Text'].tolist()
full_encodings = tokenizer(all_texts, max_length=128, truncation=True, padding=True)

In [None]:
# Custom dataset for prediction
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    The base class is spelled out as ``torch.utils.data.Dataset`` because the
    bare name ``Dataset`` is rebound to ``datasets.Dataset`` by an earlier cell.
    """

    def __init__(self, encodings, labels):
        """Store the encodings (mapping of field name -> per-sample sequences) and the labels."""
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the label under ``'labels'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the label list."""
        return len(self.labels)

In [None]:
# Imported under an explicit alias so this cell does not depend on which
# `Dataset` (torch's or datasets') the bare name currently refers to.
from datasets import Dataset as HFDataset

# Build the prediction dataset for the whole DataFrame straight from the
# tokenizer output; no intermediate wrapper object is needed.
full_dataset_for_prediction = HFDataset.from_dict({
    'input_ids': full_encodings['input_ids'],
    'attention_mask': full_encodings['attention_mask'],
    'labels': df['label'].tolist(),
})

# Run the fine-tuned model over every row.
# NOTE(review): df contains the rows the model was trained on, so their scores
# are likely more confident than scores for unseen text.
predictions = trainer.predict(full_dataset_for_prediction)
logits = predictions.predictions

# Convert logits to probabilities using softmax
probabilities = torch.softmax(torch.tensor(logits), dim=1).numpy()

# Polarity score: probability-weighted average of one score per class, using
# the class order of label_map (0 = Negative, 1 = Neutral, 2 = Positive).
NEGATIVE_SCORE, NEUTRAL_SCORE, POSITIVE_SCORE = 1, 3, 5
bert_scores = (
    probabilities[:, 0] * NEGATIVE_SCORE
    + probabilities[:, 1] * NEUTRAL_SCORE
    + probabilities[:, 2] * POSITIVE_SCORE
)

# The scores are attached positionally, so there must be exactly one per row.
assert len(bert_scores) == len(df), "prediction count does not match DataFrame length"

# Add the score to the DataFrame
df['sentiment_score'] = bert_scores

In [None]:
# List the DataFrame's columns to confirm that sentiment_score was added.
print(df.columns)


Index(['title', 'username', 'Text', 'Sentiment', 'label', 'sentiment_score'], dtype='object')


In [None]:
# Preview the review columns together with the computed sentiment_score.
df[['title', 'username', 'Text', 'Sentiment', 'sentiment_score']]

Unnamed: 0,title,username,Text,Sentiment,sentiment_score
0,Cargo,moviemenfes,this mvs is very sad nangiiis in this scene th...,Positive,4.976672
1,Layar,bicaraboxoffice,if for example there is one film that aired on...,Neutral,2.998256
2,What If,moviemenfes,mvs what if today? how come it s not in disney...,Neutral,1.004498
3,Layar,WatchmenID,for those who miss seeing arini on the big screen,Neutral,2.999633
4,Lightyear,ErikDavis,new trailer chris evans stars as buzz in a new...,Neutral,2.999340
...,...,...,...,...,...
33133,Rise,zavvi,the droids hold a special place in my heart an...,Positive,4.996857
33134,Rise,zavvi,okay campers rise and shine! this sunday at 7p...,Positive,4.996006
33135,Rise,zavvi,star wars episode ix the rise of skywalker zav...,Neutral,2.999873
33136,Seasons,zavvi,season 1 6 blu ray boxset only 69 99 in our !,Neutral,3.000320


In [None]:
# Write the scored reviews to CSV without the pandas index column.
export_columns = ['title', 'username', 'Text', 'Sentiment', 'sentiment_score']
df[export_columns].to_csv('bert_sentiment_scores.csv', index=False)

In [None]:
# Presumably only importable inside Google Colab — this cell will fail in other
# Jupyter environments (TODO confirm / guard if the notebook is run elsewhere).
from google.colab import files

# Hand the exported CSV to the browser as a file download.
files.download('bert_sentiment_scores.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>