# Distilation BERT (DISTILBERT)

#### Configure NVIDIA GPU as Default

In [1]:
import torch

print("Number of GPU: ", torch.cuda.device_count())
print("GPU Name: ", torch.cuda.get_device_name())

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")

Number of GPU:  1
GPU Name:  NVIDIA GeForce GTX 1650
GPU: NVIDIA GeForce GTX 1650 is available.


In [2]:
#Configure device: Set the device configuration to GPU using the torch.device class in PyTorch:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Selected Device:", device)

Selected Device: cuda


### Importing Library

In [3]:
import pandas as pd
import numpy as np

### Load Data

In [4]:
df = pd.read_csv('bca_preprocessed_data.csv', delimiter=',')
df.head(5)

Unnamed: 0,content,score,sentiment
0,mohon donk kembalikan tanda biru hitam saat li...,1,0
1,kenapa sekarang untuk transfer ke blu bca ada ...,2,0
2,oke terimakasih,5,2
3,tolong pihak developer untuk update hp saya ga...,5,2
4,ok,5,2


In [5]:
df.shape

(14000, 3)

### Find and clean missing data pt2

In [6]:
df.isnull().sum()

content      61
score         0
sentiment     0
dtype: int64

In [7]:
bca_clean_again = df.dropna()

In [8]:
bca_clean_again.isnull().sum()

content      0
score        0
sentiment    0
dtype: int64

In [9]:
bca_clean_again.shape

(13939, 3)

### Check Data Balance when Splitting🪓

In [10]:
#Split features (content) and labels (sentiment)

X = bca_clean_again['content']
y = bca_clean_again['sentiment']

In [11]:
# Check dataset balance and apply oversampling for "Neutral" class if needed
print("Before balancing:")
print(y.value_counts())

Before balancing:
sentiment
0    7443
2    5429
1    1067
Name: count, dtype: int64


##### Melakukan balancing data karena kelas 0 mempunyai values yang sedikit dan dapat berpengaruh kepada performa yang tidak baik menggunakan Oversampling: duplikasi class minoritas untuk menyeimbangkan dataset.

In [12]:
# Oversampling technique
from sklearn.utils import resample

bca_clean_again = pd.concat([X, y], axis=1)

#separating each class

class_0 = bca_clean_again[bca_clean_again['sentiment'] == 0]
class_1 = bca_clean_again[bca_clean_again['sentiment'] == 1]
class_2 = bca_clean_again[bca_clean_again['sentiment'] == 2]

#oversampling neutral class 1 with majority class 2

class_1_oversampled = resample(class_1, replace=True, n_samples=len(class_2), random_state=42)

#combine all class
bca_clean_again = pd.concat([class_0, class_1_oversampled, class_2])

#shuffling the data
bca_clean_again = bca_clean_again.sample(frac=1, random_state=42)

X = bca_clean_again['content']
y = bca_clean_again['sentiment']

In [13]:
# Check dataset balance and apply oversampling for "Neutral" class if needed
print("After balancing:")
print(y.value_counts())

After balancing:
sentiment
0    7443
1    5429
2    5429
Name: count, dtype: int64


### Splitting Data🪓🪓🪓 , 80:20 

In [65]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
#                                                     random_state=42, stratify=y)

# #split 20% test+validation into 10%test dan 10%validation
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, 
#                                                test_size=0.2, random_state=42)

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, 
                                                    random_state=42, stratify=y)

#split 20% test+validation into 10%test dan 10%validation
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, 
                                               test_size=0.5, random_state=42,stratify=y_temp)

In [15]:
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

Training set size: 14640
Validation set size: 1830
Test set size: 1831


#### Build Tokenizer with DistilBert-base-Multilingual-uncased

In [16]:
# import transformer library
from transformers import AutoTokenizer, DistilBertModel, DistilBertForSequenceClassification, DistilBertTokenizer, AutoModelForSequenceClassification

In [17]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-multilingual-cased")

#### Create tokenizer function

In [18]:
def tokenize_data(tokenizer, texts, max_len=128):
    return tokenizer(
        list(texts),
        max_length=max_len,
        padding="max_length",
        truncation = True,
        return_tensors = 'pt'
    )

#### Create Pytorch dataset

In [19]:
from torch.utils.data import Dataset

class DistilSentiment_dataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokens = tokenize_data(self.tokenizer, [text], self.max_len)
        return{
            "input_ids": tokens["input_ids"].squeeze(0),
            "attention_mask": tokens['attention_mask'].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

#### Create DataLoaders

In [20]:
from torch.utils.data import DataLoader

#create datasets
train_dataset = DistilSentiment_dataset(X_train.values, y_train.values, tokenizer)
validation_dataset = DistilSentiment_dataset(X_val.values, y_val.values, tokenizer)
test_dataset = DistilSentiment_dataset(X_test.values, y_test.values, tokenizer)

#create dataloader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(validation_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

#### Build Sentiment Classifier

In [21]:
import torch.nn as nn

class Sentimentclassifier(nn.Module):
    # construct class
    def __init__(self, num_classes=3):
        super(Sentimentclassifier, self).__init__()
        self.distilbert = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-multilingual-cased",
                                                          num_labels=num_classes,
                                                          id2label={0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"},
                                                          label2id={"NEGATIVE": 0, "NEUTRAL": 1, "POSITIVE": 2}
        )
    
    #forward propagation class
    def forward(self, input_ids, attention_mask):
        return self.distilbert(input_ids=input_ids, attention_mask=attention_mask)

#### Train Model

In [22]:
from torch.optim import AdamW
from tqdm import tqdm

#setup the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Sentimentclassifier().to(device)

#set optimizer and loss function
optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01, betas=(0.9, 0.999))
loss_fn = nn.CrossEntropyLoss()

# training loop
def train_model(model, train_loader, val_loader, optimizer, loss_fn, epochs=3):
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        correct = 0
        total = 0

        #training
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            logits = outputs.logits
            loss = loss_fn(logits, labels)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, preds = torch.max(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

        train_acc = correct / total
        print(f"Epoch {epoch+1} - Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")

        #validation
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask)
                logits = outputs.logits
                loss = loss_fn(logits, labels)

                val_loss += loss.item()
                _, preds = torch.max(logits, dim=1)
                val_correct += (preds == labels).sum().item()
                val_total += labels.size(0)
        
        val_acc = val_correct / val_total
        print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")

train_model(model, train_loader,val_loader, optimizer, loss_fn, epochs=5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 1830/1830 [08:06<00:00,  3.76it/s]


Epoch 1 - Loss: 1428.3848, Accuracy: 0.6423
Validation Loss: 139.2475, Validation Accuracy: 0.7497


Training Epoch 2: 100%|██████████| 1830/1830 [08:08<00:00,  3.75it/s]


Epoch 2 - Loss: 912.1678, Accuracy: 0.8130
Validation Loss: 102.0414, Validation Accuracy: 0.8311


Training Epoch 3: 100%|██████████| 1830/1830 [08:05<00:00,  3.77it/s]


Epoch 3 - Loss: 552.3362, Accuracy: 0.8915
Validation Loss: 92.1715, Validation Accuracy: 0.8579


Training Epoch 4: 100%|██████████| 1830/1830 [08:17<00:00,  3.68it/s]


Epoch 4 - Loss: 404.6836, Accuracy: 0.9227
Validation Loss: 80.9454, Validation Accuracy: 0.8694


Training Epoch 5: 100%|██████████| 1830/1830 [08:16<00:00,  3.69it/s]


Epoch 5 - Loss: 293.3378, Accuracy: 0.9438
Validation Loss: 88.0576, Validation Accuracy: 0.8770


### Evaluate Model

In [23]:
from sklearn.metrics import classification_report
import torch

def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
        
        print(classification_report(all_labels, all_preds))

evaluate_model(model, test_loader)

              precision    recall  f1-score   support

           0       0.83      0.87      0.85       745
           1       0.93      0.96      0.94       543
           2       0.85      0.77      0.81       543

    accuracy                           0.87      1831
   macro avg       0.87      0.87      0.87      1831
weighted avg       0.87      0.87      0.87      1831



In [24]:
model_save = "distilepoch3.pt"
torch.save(model.state_dict(), model_save)

In [25]:
#save tokenizer
tokenizer_save = "tokenizer"
tokenizer.save_pretrained(tokenizer_save)

('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\vocab.txt',
 'tokenizer\\added_tokens.json',
 'tokenizer\\tokenizer.json')

### inference function

In [26]:
from transformers import DistilBertForSequenceClassification, AutoTokenizer

# Paths where the model and tokenizer were saved
model_path = "/d:/SEMESTER 8/thesis-v2/distilepoch3.pt"
tokenizer_path = "/d:/SEMESTER 8/thesis-v2/tokenizer"

# Load the model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Move the model to the appropriate device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


OSError: Incorrect path_or_model_id: '/d:/SEMESTER 8/thesis-v2/distilepoch3.pt'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [27]:
def predict_sentiment(model, tokenizer, text):
    model.eval()

    tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)

    #perform inference
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        logits = outputs.logits
        probabilites = torch.softmax(logits, dim=1)


    max_prob, prediction = torch.max(probabilites, dim=1)

    label_map={0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"}
    result = {
        "label": label_map[prediction.item()],
        "score": max_prob.item()
    }
    # return label_map[prediction.item()]
    return result

In [28]:
def predict_with_threshold(model, tokenizer, text, neutral_threshold=0.4):
    tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)

    # Custom neutral threshold
    max_prob, prediction = torch.max(probabilities, dim=1)
    if max_prob < neutral_threshold:
        return {"label": "NEUTRAL", "score": max_prob.item()}

    label_map = {0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"}
    return {"label": label_map[prediction.item()], "score": max_prob.item()}

In [32]:
text = "Transfer dana berjalan lancar, tetapi menambahkan rekening baru agak rumit"
predicted_sentiment = predict_sentiment(model, tokenizer, text)

print(f"Predicted Sentiment: {predicted_sentiment['label']}, 'Score': {predicted_sentiment['score']}")

Predicted Sentiment: POSITIVE, 'Score': 0.6531272530555725
