# Intent Classifier Notebook

## Import Library

In [23]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from glob import glob
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from transformers import DataCollatorWithPadding
from datasets import Dataset as HFDataset
from tqdm import tqdm

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load Dataset

In [None]:
folder_path = "../dataset/intent_dataset/"

all_data = []

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)

    if filename.endswith(".csv"):
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line or "," not in line:
                    continue 

                intent, text = line.split(",", 1)
                all_data.append({"label": intent.strip(), "text": text.strip()})
                
df = pd.DataFrame(all_data)
print(df.head())

               label                                               text
0             intent                                               text
1  ask_service_price  Berapa biaya servis rutin untuk mobil di bengk...
2  ask_service_price        Apa harga ganti oli motor di tempat kalian?
3  ask_service_price         Bisa kasih tau biaya cek rem mobil berapa?
4  ask_service_price                Harga servis mesin motor berapa ya?


## Pre Process Data

### Data Augmentation

In [16]:
tokenizer = AutoTokenizer.from_pretrained("Wikidepia/IndoT5-base-paraphrase")
model = AutoModelForSeq2SeqLM.from_pretrained("Wikidepia/IndoT5-base-paraphrase")
model = model.to(device)
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [19]:
def batch_paraphrase(text_list, max_length=256, num_return_sequences=1, num_beams=5):
    encodings = tokenizer(
        text_list,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=encodings["input_ids"],
            attention_mask=encodings["attention_mask"],
            max_length=max_length,
            num_beams=num_beams,
            num_return_sequences=num_return_sequences,
            no_repeat_ngram_size=2,
            early_stopping=True
        )

    results = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return results

In [24]:
batch_size = 16
num_return_sequences = 2

augmented_rows = []
for i in tqdm(range(0, len(df), batch_size)):
    batch_texts = df["text"].iloc[i:i+batch_size].tolist()
    batch_labels = df["label"].iloc[i:i+batch_size].tolist()

    augmented = batch_paraphrase(
        batch_texts,
        num_return_sequences=num_return_sequences
    )

    for j in range(len(batch_texts)):
        for k in range(num_return_sequences):
            index = j * num_return_sequences + k
            augmented_rows.append({
                "text": augmented[index],
                "label": batch_labels[j]
            })

df_augmented = pd.DataFrame(augmented_rows)
df_combined = pd.concat([df, df_augmented]).reset_index(drop=True)

  0%|          | 0/509 [00:00<?, ?it/s]

100%|██████████| 509/509 [08:55<00:00,  1.05s/it]


### Label Encoding

In [30]:
le = LabelEncoder()
df_combined["label_id"] = le.fit_transform(df_combined["label"])
label2id = {str(label): int(id_) for label, id_ in zip(le.classes_, le.transform(le.classes_))}
id2label = {int(id_): str(label) for id_, label in zip(le.transform(le.classes_), le.classes_)}

### Indobert Tokenization

In [31]:
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
dataset = HFDataset.from_pandas(df_combined[['text', 'label_id']])


def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label_id", "labels")

Map:   0%|          | 0/24411 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 24411/24411 [00:05<00:00, 4190.85 examples/s]


In [32]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_test = tokenized_dataset.train_test_split(test_size=0.2)
train_loader = DataLoader(train_test['train'], shuffle=True, batch_size=8, collate_fn=data_collator)
eval_loader = DataLoader(train_test['test'], batch_size=8, collate_fn=data_collator)


In [33]:
model = BertForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
).to(device)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
