# **PLACE SEARCH ENGINE WITH TRANSFORMER INDOBERT BASE P1**

* COMBINING SEMANTIC SEARCH WITH A CLASSIFICATION MODEL FOR BETTER SEMANTIC SEARCH AND QUERY UNDERSTANDING

* FINE-TUNING THE LAST 4 LAYERS OF INDOBERT BASE P1 AND ADDING A CUSTOM CLASSIFICATION HEAD FOR 75 CLASSES

#IMPORTING LIBRARY

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, get_scheduler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch.optim import AdamW
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Mount Google Drive (Colab only) so the files under /content/drive are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Use the GPU when one is available; every model/tensor below is moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#CREATING THE MODEL

###DATASET PREPARATIONS

In [4]:
# Load the labelled query corpus and keep only rows that have both a query and a label.
corpus_path = "/content/drive/MyDrive/BPS_PROJECT/PRE-PROCESSED/corpus_se.xlsx"
df = pd.read_excel(corpus_path).dropna(subset=["query", "label"])

In [5]:
# Inspect column dtypes and non-null counts of the loaded corpus.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   query   756 non-null    object
 1   label   756 non-null    object
dtypes: object(2)
memory usage: 11.9+ KB


In [6]:
# Preview the first few (query, label) pairs.
df.head()

Unnamed: 0,query,label
0,balai penyemaian tanaman,agriculture
1,dinas pertanian,agriculture
2,lokasi balai benih hortikultura,agriculture
3,pusat pelatihan pertanian,agriculture
4,balai penelitian padi dan tanaman pangan pku,agriculture


LABEL ENCODER

In [7]:
# Number of example queries per label (class balance check).
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
food,35
convenience_store,30
grocery_or_supermarket,25
cafe,22
supermarket,20
...,...
electronic_repair,5
tailor,5
airport,5
bar,5


In [8]:
# Fit the encoder on the string labels, then store the integer codes next to them.
label_encoder = LabelEncoder().fit(df['label'])
df['label_enc'] = label_encoder.transform(df['label'])
# One output unit per distinct label.
num_labels = label_encoder.classes_.shape[0]

SPLITTING THE DATASET

In [9]:
# Stratified 80/20 train/validation split with a fixed seed.
queries = df['query'].tolist()
encoded_labels = df['label_enc'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    queries,
    encoded_labels,
    test_size=0.2,
    random_state=42,
    stratify=encoded_labels,
)

TOKENIZER FROM INDOBERT BASE P1

In [10]:
# Tokenizer: loaded from the same IndoBERT checkpoint name that the classifier's
# encoder is loaded from, so both refer to the same pretrained model.
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

PROCESSING THE DATASET

In [11]:
class PlaceDataset(Dataset):
    """Torch dataset that tokenizes one query per item, on demand.

    Each item is a dict holding ``input_ids`` and ``attention_mask`` (the
    tokenizer output padded/truncated to ``max_len``, with the leading batch
    dimension removed) and ``label`` as a ``torch.long`` tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        # Drop the leading batch dimension so the DataLoader can stack items itself.
        sample = {key: encoded[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

AUGMENTS THE DATASET

In [None]:
def get_augmented_dataset(texts, labels, tokenizer, augment=True, augment_frac=0.15, seed=None):
    """Build a ``PlaceDataset``, optionally extended with noisy copies of some queries.

    A random ``augment_frac`` share of the queries is sampled without
    replacement. Each sampled query with more than three words gets one extra
    copy in which a random word is either dropped or duplicated; shorter
    queries are skipped. The originals are always kept.

    Args:
        texts: list of query strings.
        labels: list of encoded labels, aligned with ``texts``.
        tokenizer: tokenizer forwarded to ``PlaceDataset``.
        augment: when False, the data is wrapped unchanged.
        augment_frac: fraction of queries to sample, in [0, 1]. Defaults to
            the previously hard-coded 0.15.
        seed: optional int for reproducible augmentation. ``None`` keeps the
            original behaviour of drawing from NumPy's global random state.

    Returns:
        PlaceDataset over the original plus augmented samples.

    Raises:
        ValueError: if ``augment_frac`` is outside [0, 1].
    """
    if not augment:
        return PlaceDataset(texts, labels, tokenizer)

    if not 0.0 <= augment_frac <= 1.0:
        raise ValueError(f"augment_frac must be in [0, 1], got {augment_frac}")

    # RandomState exposes the same choice/randint API as the np.random module, so the
    # draws (and their order) match the unseeded path exactly.
    rng = np.random if seed is None else np.random.RandomState(seed)

    augmented_texts = list(texts)
    augmented_labels = list(labels)
    num_to_augment = int(len(texts) * augment_frac)

    # Skip sampling entirely when there is nothing to draw (e.g. empty input).
    indices = rng.choice(len(texts), num_to_augment, replace=False) if num_to_augment > 0 else []

    for idx in indices:
        words = texts[idx].split()
        if len(words) <= 3:
            # Too short to perturb without changing the meaning too much.
            continue

        aug_type = rng.choice(['drop', 'duplicate'])
        if aug_type == 'drop':
            drop_idx = rng.randint(0, len(words))
            words.pop(drop_idx)
        else:
            dup_idx = rng.randint(0, len(words))
            words.insert(dup_idx, words[dup_idx])

        augmented_texts.append(' '.join(words))
        augmented_labels.append(labels[idx])

    return PlaceDataset(augmented_texts, augmented_labels, tokenizer)

CREATING DATASET AND DATALOADER

In [None]:
train_dataset = get_augmented_dataset(train_texts, train_labels, tokenizer, augment=True)
val_dataset = PlaceDataset(val_texts, val_labels, tokenizer)

batch_size = 16
# The classifier uses BatchNorm1d layers, which raise in training mode when a batch
# contains a single sample. The augmented dataset size is random, so drop the
# trailing batch only in the one case where it would hold exactly one sample.
drop_trailing_singleton = len(train_dataset) % batch_size == 1
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=drop_trailing_singleton,
)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

###MODEL INITIALIZATION

USING **INDOBERT BASE P1** AS THE MODEL, WITH TRANSFER LEARNING: THE ENCODER IS FROZEN AND ONLY ITS LAST 4 LAYERS (PLUS THE NEW CLASSIFICATION HEAD) ARE TRAINED DURING FINE-TUNING

In [None]:
class ImprovedPlaceClassifier(nn.Module):
    """IndoBERT encoder followed by a small MLP classification head.

    The whole encoder is frozen except its last four transformer layers. The
    pooled sentence vector goes through BatchNorm and dropout, then through a
    hidden_size -> 512 -> 256 -> ``num_labels`` MLP that returns raw logits.

    Args:
        num_labels: number of output classes.
        dropout_rate: dropout probability used before and inside the head.
        pooling_type: how token states are reduced to one vector:
            ``"cls"`` (first token, default), ``"mean"`` (mask-aware mean) or
            ``"max"`` (mask-aware max).

    Raises:
        ValueError: if ``pooling_type`` is not one of the supported values.
    """

    _POOLING_TYPES = ("cls", "mean", "max")

    def __init__(self, num_labels, dropout_rate=0.2, pooling_type="cls"):
        super().__init__()
        if pooling_type not in self._POOLING_TYPES:
            raise ValueError(
                f"pooling_type must be one of {self._POOLING_TYPES}, got {pooling_type!r}"
            )

        self.bert = BertModel.from_pretrained("indobenchmark/indobert-base-p1")

        # Freeze every encoder parameter, then re-enable only the last four layers.
        for param in self.bert.parameters():
            param.requires_grad = False
        for layer in self.bert.encoder.layer[-4:]:
            for param in layer.parameters():
                param.requires_grad = True

        self.pooling_type = pooling_type
        hidden_size = self.bert.config.hidden_size
        self.drop = nn.Dropout(dropout_rate)
        self.norm = nn.BatchNorm1d(hidden_size)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(dropout_rate),
            nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, attention_mask, **kwargs):
        """Return class logits for a batch of tokenized queries.

        Extra keyword arguments (e.g. ``token_type_ids``) are forwarded to the encoder.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        hidden = outputs.last_hidden_state

        if self.pooling_type == "cls":
            pooled_output = hidden[:, 0, :]
        else:
            token_mask = attention_mask.unsqueeze(-1)
            if self.pooling_type == "mean":
                summed = (hidden * token_mask).sum(dim=1)
                # clamp guards against division by zero for an all-padding row.
                token_counts = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
                pooled_output = summed / token_counts
            else:
                # Padded positions must never win the max. Multiplying them by 0 (the
                # previous approach) lets padding win whenever real activations are
                # negative, so fill them with the smallest representable value instead.
                fill_value = torch.finfo(hidden.dtype).min
                pooled_output = hidden.masked_fill(token_mask == 0, fill_value).max(dim=1)[0]

        normalized = self.norm(pooled_output)
        dropped = self.drop(normalized)
        return self.classifier(dropped)

In [15]:
# Instantiate the classifier with one output per encoded label and move it to the target device.
model = ImprovedPlaceClassifier(num_labels=num_labels).to(device)

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

FULL MODEL ARCHITECTURE

In [16]:
# Show the full module tree (encoder plus classification head).
print(model)

ImprovedPlaceClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

LOSS

In [17]:
# Cross-entropy with label smoothing as the training loss.
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

# Hyper-parameters consumed by the weight-decay groups and the AdamW optimizer below.
weight_decay = 0.01
lr = 5e-5

# NOTE: no optimizer is created here. The one used for training is built in the
# optimizer cell from `optimizer_grouped_parameters`; the extra
# AdamW(model.parameters(), ...) that used to sit here was overwritten before use.

WEIGHT DECAYING

In [18]:
# Parameters whose name contains one of these markers are exempt from weight decay.
no_decay = ["bias", "LayerNorm.weight"]

# Partition the model parameters into "decayed" and "exempt" in a single pass.
decay_params, no_decay_params = [], []
for param_name, param in model.named_parameters():
    is_exempt = any(marker in param_name for marker in no_decay)
    (no_decay_params if is_exempt else decay_params).append(param)

optimizer_grouped_parameters = [
    {"params": decay_params, "weight_decay": weight_decay},
    {"params": no_decay_params, "weight_decay": 0.0},
]

OPTIMIZER (ADAMW)

In [19]:
# AdamW over the two parameter groups (with / without weight decay) defined above.
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)

# Learning rate scheduler with warmup
num_epochs = 15
# One scheduler step is taken per training batch, so the total is batches * epochs.
num_training_steps = len(train_loader) * num_epochs
# Warm up over the first 10% of all training steps.
num_warmup_steps = int(0.1 * num_training_steps)

LEARNING RATE SCHEDULER

In [20]:
# "linear" schedule from transformers.get_scheduler, configured with the warmup and
# total step counts computed above.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

EARLY STOPPING CALLBACKS

In [None]:
# Early stopping 
class EarlyStopping:
    """Stop training once the validation score stops improving.

    A score counts as an improvement only if it exceeds the best score so far
    by at least ``min_delta``. After ``patience`` consecutive non-improving
    calls, ``early_stop`` becomes True. The weights from the best-scoring call
    are kept in ``best_model_state``.

    Args:
        patience: number of consecutive non-improving evaluations tolerated.
        min_delta: minimum increase over the best score that counts as progress.
    """

    def __init__(self, patience=3, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.best_model_state = None

    @staticmethod
    def _snapshot(model):
        """Return an independent copy of the model's state dict.

        ``state_dict().copy()`` is only a shallow copy: its tensors share
        storage with the live parameters, so later optimizer steps would
        overwrite the "best" weights. Cloning every tensor freezes them.
        """
        return {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

    def __call__(self, val_score, model):
        """Record ``val_score`` (higher is better) and return True when training should stop."""
        if self.best_score is None:
            self.best_score = val_score
            self.best_model_state = self._snapshot(model)
        elif val_score < self.best_score + self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = val_score
            self.best_model_state = self._snapshot(model)
            self.counter = 0

        return self.early_stop

###TRAINING AND EVALUATION FUNCTION

In [None]:
def train_model(model, train_loader, val_loader, optimizer, criterion, scheduler):
    """Fine-tune ``model`` with per-epoch validation and early stopping.

    Runs up to ``num_epochs`` epochs (module-level variable) on the module-level
    ``device``. After each epoch the model is scored with ``evaluate_model`` and
    the validation accuracy is passed to an ``EarlyStopping(patience=3)``
    tracker.

    Args:
        model: classifier called as ``model(input_ids=..., attention_mask=...)``.
        train_loader: iterable of batches with ``input_ids``, ``attention_mask``, ``label``.
        val_loader: validation batches, forwarded to ``evaluate_model``.
        optimizer: optimizer stepped once per batch.
        criterion: loss taking ``(logits, labels)``.
        scheduler: learning-rate scheduler stepped once per batch.

    Returns:
        ``(model, history)`` where ``history`` holds per-epoch ``train_loss`` and
        ``val_acc`` lists. The model carries the weights recorded by the
        early-stopping tracker for its best epoch, whether or not training was
        actually stopped early.
    """
    early_stopping = EarlyStopping(patience=3)
    history = {'train_loss': [], 'val_acc': []}

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()

            # Clip the global gradient norm before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        history['train_loss'].append(avg_train_loss)

        val_acc, val_report = evaluate_model(model, val_loader)
        history['val_acc'].append(val_acc)

        print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Accuracy: {val_acc:.4f}")
        print(f"Classification Report:\n{val_report}")

        # Check for early stopping
        if early_stopping(val_acc, model):
            print(f"Early stopping triggered after epoch {epoch+1}")
            break

    # Restore the best recorded weights even when the loop ran to completion;
    # previously this happened only on the early-stop path, so a full-length run
    # returned the last epoch's weights regardless of which epoch scored best.
    if early_stopping.best_model_state is not None:
        model.load_state_dict(early_stopping.best_model_state)

    return model, history

In [23]:
def evaluate_model(model, val_loader):
    """Compute accuracy and a per-class report for ``model`` on ``val_loader``.

    Uses the module-level ``device`` and ``label_encoder``.

    Args:
        model: classifier called as ``model(input_ids=..., attention_mask=...)``.
        val_loader: iterable of batches with ``input_ids``, ``attention_mask``, ``label``.

    Returns:
        ``(accuracy, report)``: accuracy as a float in [0, 1] and the text
        produced by ``sklearn.metrics.classification_report``.

    Raises:
        ValueError: if ``val_loader`` yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            correct += (preds == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        raise ValueError("val_loader yielded no samples; cannot compute accuracy")

    acc = correct / total
    # Pass the full label index list explicitly: without it, classification_report
    # raises when a class is missing from both the targets and the predictions,
    # because target_names would no longer match the labels it infers.
    report = classification_report(
        all_labels,
        all_preds,
        labels=list(range(len(label_encoder.classes_))),
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    return acc, report

###TRAINING

In [24]:
# Run fine-tuning; `trained_model` is the same object as `model` (train_model returns its argument).
trained_model, history = train_model(model, train_loader, val_loader, optimizer, criterion, lr_scheduler)

Epoch 1:   0%|          | 0/40 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 40/40 [00:09<00:00,  4.43it/s]


Epoch 1 | Train Loss: 4.5221 | Val Accuracy: 0.0526
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       0.00      0.00      0.00         1
                    agriculture       0.00      0.00      0.00         3
                        airport       0.00      0.00      0.00         1
              architecture_firm       0.00      0.00      0.00         3
                    art_gallery       0.00      0.00      0.00         1
                            atm       0.00      0.00      0.00         1
                           bank       0.00      0.00      0.00         3
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       0.00      0.00      0.00         1
                           cafe       0.00      0.00      0.00         5
                     car_dealer       0.50      

Epoch 2: 100%|██████████| 40/40 [00:06<00:00,  5.87it/s]


Epoch 2 | Train Loss: 4.0920 | Val Accuracy: 0.3289
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       0.00      0.00      0.00         1
                    agriculture       0.00      0.00      0.00         3
                        airport       0.00      0.00      0.00         1
              architecture_firm       0.50      0.67      0.57         3
                    art_gallery       0.00      0.00      0.00         1
                            atm       0.00      0.00      0.00         1
                           bank       0.75      1.00      0.86         3
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       0.25      1.00      0.40         1
                           cafe       0.33      0.20      0.25         5
                     car_dealer       0.33      

Epoch 3: 100%|██████████| 40/40 [00:06<00:00,  5.79it/s]


Epoch 3 | Train Loss: 3.4306 | Val Accuracy: 0.4474
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       0.00      0.00      0.00         1
                    agriculture       0.33      0.33      0.33         3
                        airport       1.00      1.00      1.00         1
              architecture_firm       0.38      1.00      0.55         3
                    art_gallery       0.00      0.00      0.00         1
                            atm       0.00      0.00      0.00         1
                           bank       0.75      1.00      0.86         3
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       0.50      1.00      0.67         1
                           cafe       0.38      0.60      0.46         5
                     car_dealer       0.67      

Epoch 4: 100%|██████████| 40/40 [00:06<00:00,  5.74it/s]


Epoch 4 | Train Loss: 2.9400 | Val Accuracy: 0.5197
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       0.25      0.33      0.29         3
                        airport       1.00      1.00      1.00         1
              architecture_firm       0.50      1.00      0.67         3
                    art_gallery       0.00      0.00      0.00         1
                            atm       0.00      0.00      0.00         1
                           bank       0.60      1.00      0.75         3
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       0.45      1.00      0.62         5
                     car_dealer       0.67      

Epoch 5: 100%|██████████| 40/40 [00:07<00:00,  5.66it/s]


Epoch 5 | Train Loss: 2.5757 | Val Accuracy: 0.6579
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       0.67      0.67      0.67         3
                        airport       1.00      1.00      1.00         1
              architecture_firm       0.50      1.00      0.67         3
                    art_gallery       0.00      0.00      0.00         1
                            atm       0.00      0.00      0.00         1
                           bank       0.60      1.00      0.75         3
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       0.83      1.00      0.91         5
                     car_dealer       1.00      

Epoch 6: 100%|██████████| 40/40 [00:07<00:00,  5.58it/s]


Epoch 6 | Train Loss: 2.2953 | Val Accuracy: 0.6645
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       0.67      0.67      0.67         3
                        airport       1.00      1.00      1.00         1
              architecture_firm       0.50      1.00      0.67         3
                    art_gallery       0.00      0.00      0.00         1
                            atm       0.00      0.00      0.00         1
                           bank       0.60      1.00      0.75         3
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       0.83      1.00      0.91         5
                     car_dealer       1.00      

Epoch 7: 100%|██████████| 40/40 [00:07<00:00,  5.52it/s]


Epoch 7 | Train Loss: 2.0839 | Val Accuracy: 0.7039
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       0.67      0.67      0.67         3
                        airport       1.00      1.00      1.00         1
              architecture_firm       0.60      1.00      0.75         3
                    art_gallery       0.00      0.00      0.00         1
                            atm       0.00      0.00      0.00         1
                           bank       0.60      1.00      0.75         3
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       0.71      1.00      0.83         5
                     car_dealer       1.00      

Epoch 8: 100%|██████████| 40/40 [00:07<00:00,  5.45it/s]


Epoch 8 | Train Loss: 1.8981 | Val Accuracy: 0.7500
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       0.67      0.67      0.67         3
                        airport       1.00      1.00      1.00         1
              architecture_firm       1.00      1.00      1.00         3
                    art_gallery       1.00      1.00      1.00         1
                            atm       0.00      0.00      0.00         1
                           bank       0.60      1.00      0.75         3
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       0.83      1.00      0.91         5
                     car_dealer       1.00      

Epoch 9: 100%|██████████| 40/40 [00:07<00:00,  5.32it/s]


Epoch 9 | Train Loss: 1.7542 | Val Accuracy: 0.7632
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       0.67      0.67      0.67         3
                        airport       1.00      1.00      1.00         1
              architecture_firm       1.00      1.00      1.00         3
                    art_gallery       0.00      0.00      0.00         1
                            atm       0.00      0.00      0.00         1
                           bank       0.75      1.00      0.86         3
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       0.83      1.00      0.91         5
                     car_dealer       1.00      

Epoch 10: 100%|██████████| 40/40 [00:07<00:00,  5.25it/s]


Epoch 10 | Train Loss: 1.6737 | Val Accuracy: 0.7829
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       0.67      0.67      0.67         3
                        airport       1.00      1.00      1.00         1
              architecture_firm       0.75      1.00      0.86         3
                    art_gallery       1.00      1.00      1.00         1
                            atm       0.00      0.00      0.00         1
                           bank       0.75      1.00      0.86         3
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       0.83      1.00      0.91         5
                     car_dealer       1.00     

Epoch 11: 100%|██████████| 40/40 [00:07<00:00,  5.11it/s]


Epoch 11 | Train Loss: 1.5915 | Val Accuracy: 0.7632
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       0.67      0.67      0.67         3
                        airport       1.00      1.00      1.00         1
              architecture_firm       0.75      1.00      0.86         3
                    art_gallery       1.00      1.00      1.00         1
                            atm       0.00      0.00      0.00         1
                           bank       0.67      0.67      0.67         3
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       0.83      1.00      0.91         5
                     car_dealer       1.00     

Epoch 12: 100%|██████████| 40/40 [00:07<00:00,  5.02it/s]


Epoch 12 | Train Loss: 1.5109 | Val Accuracy: 0.7895
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       0.67      0.67      0.67         3
                        airport       1.00      1.00      1.00         1
              architecture_firm       1.00      1.00      1.00         3
                    art_gallery       1.00      1.00      1.00         1
                            atm       0.00      0.00      0.00         1
                           bank       0.75      1.00      0.86         3
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       0.83      1.00      0.91         5
                     car_dealer       1.00     

Epoch 13: 100%|██████████| 40/40 [00:08<00:00,  4.97it/s]


Epoch 13 | Train Loss: 1.4878 | Val Accuracy: 0.7829
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       0.67      0.67      0.67         3
                        airport       1.00      1.00      1.00         1
              architecture_firm       1.00      1.00      1.00         3
                    art_gallery       1.00      1.00      1.00         1
                            atm       0.00      0.00      0.00         1
                           bank       0.67      0.67      0.67         3
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       0.83      1.00      0.91         5
                     car_dealer       1.00     

Epoch 14: 100%|██████████| 40/40 [00:07<00:00,  5.05it/s]


Epoch 14 | Train Loss: 1.4604 | Val Accuracy: 0.7895
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       0.67      0.67      0.67         3
                        airport       1.00      1.00      1.00         1
              architecture_firm       1.00      1.00      1.00         3
                    art_gallery       1.00      1.00      1.00         1
                            atm       0.00      0.00      0.00         1
                           bank       0.67      0.67      0.67         3
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       0.83      1.00      0.91         5
                     car_dealer       1.00     

Epoch 15: 100%|██████████| 40/40 [00:07<00:00,  5.12it/s]


Epoch 15 | Train Loss: 1.4286 | Val Accuracy: 0.7961
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       0.67      0.67      0.67         3
                        airport       1.00      1.00      1.00         1
              architecture_firm       1.00      1.00      1.00         3
                    art_gallery       1.00      1.00      1.00         1
                            atm       0.00      0.00      0.00         1
                           bank       0.67      0.67      0.67         3
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       0.83      1.00      0.91         5
                     car_dealer       1.00     

###SAVING THE MODEL

In [None]:
def save_model(model, tokenizer, label_encoder, output_dir="./saved_model"):
    """Persist everything needed to reload the classifier for inference.

    Files written to ``output_dir``:
      * ``model_weights.h5`` - the model ``state_dict``.
      * ``model_full.h5``    - a checkpoint dict with model weights, optimizer
        state, ``num_labels`` and the encoder config.
      * tokenizer files written by ``tokenizer.save_pretrained``.
      * ``label_encoder.pkl`` - the fitted label encoder, pickled.

    The two ``.h5`` files keep their historical names but are written with
    ``torch.save``; they are not HDF5 files, so load them with ``torch.load``.

    The optimizer state and ``num_labels`` are read from the module-level
    ``optimizer`` and ``num_labels`` variables.

    Args:
        model: trained classifier.
        tokenizer: tokenizer exposing ``save_pretrained``.
        label_encoder: fitted encoder mapping class indices back to label names.
        output_dir: target directory, created if missing.
    """
    import os
    import pickle

    os.makedirs(output_dir, exist_ok=True)

    # Weights only.
    torch.save(model.state_dict(), f"{output_dir}/model_weights.h5")

    # Full checkpoint (weights + optimizer state + metadata).
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'num_labels': num_labels,
        'model_config': model.bert.config.to_dict() if hasattr(model, 'bert') else None,
    }, f"{output_dir}/model_full.h5")

    # Save tokenizer
    tokenizer.save_pretrained(output_dir)

    # Save the label encoder: without it the predicted class indices cannot be
    # mapped back to place-type names after reloading. (It was previously accepted
    # as an argument but never written.) Only unpickle this file from a trusted source.
    with open(os.path.join(output_dir, "label_encoder.pkl"), "wb") as encoder_file:
        pickle.dump(label_encoder, encoder_file)


In [26]:
# Write the trained model and tokenizer to the default ./saved_model directory.
save_model(trained_model, tokenizer, label_encoder)

#SEARCH ENGINE FUNCTION

###PREDICT WITH TEST SET

In [27]:
def predict(text, model, tokenizer):
    """Classify a single query string.

    Tokenizes ``text`` padded/truncated to 128 tokens, runs the model in eval
    mode without gradients on the module-level ``device``, and decodes the
    arg-max class through the module-level ``label_encoder``.

    Returns:
        ``(predicted_label, confidence)`` - the label name and its softmax probability.
    """
    model.eval()
    encoded = tokenizer(
        text,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )

    with torch.no_grad():
        logits = model(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )
        class_probs = torch.nn.functional.softmax(logits, dim=1)
        best_idx = torch.argmax(class_probs, dim=1).item()

    return label_encoder.classes_[best_idx], class_probs[0, best_idx].item()

TESTING

In [28]:
query = "restoran western sekitar"
# Reuse predict() instead of repeating the inference steps inline: it tokenizes with
# an explicit max_length (avoiding the "no maximum length is provided" warning),
# pads the same way as the training dataset, and runs under torch.no_grad().
pred_place_type, pred_confidence = predict(query, model, tokenizer)
print(f"Detected place type: {pred_place_type}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Detected place type: restaurant


###SEARCH ENGINE FUNCTION FULL TO THE DATASET

In [29]:
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two latitude/longitude points.

    Inputs are in degrees and may be scalars or array-likes accepted by NumPy
    (they are combined with NumPy broadcasting).
    """
    earth_radius_km = 6371

    phi1 = np.radians(lat1)
    lam1 = np.radians(lon1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lon2)

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine term of the central angle between the two points.
    hav = np.sin(half_dphi)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam)**2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(hav))

In [None]:
from difflib import SequenceMatcher

def string_similarity(a, b):
    """Case-insensitive ``difflib.SequenceMatcher`` ratio between two strings (0.0 to 1.0)."""
    matcher = SequenceMatcher(None, a.lower(), b.lower())
    return matcher.ratio()

def keyword_match_score(place_name, keywords):
    """Fraction of ``keywords`` that occur as substrings of the lower-cased place name.

    Returns 0 when ``keywords`` is empty.
    """
    lowered_name = place_name.lower()
    hits = 0
    for keyword in keywords:
        if keyword in lowered_name:
            hits += 1
    if not keywords:
        return 0
    return hits / len(keywords)

def filter_places_by_type_and_distance(user_query, user_lat, user_lon, model, tokenizer, label_encoder, places_df, radius_km=5, top_k=5):
    """Classify a query into a place type, then rank nearby places of that type.

    Steps:
      1. Tokenize ``user_query``, run ``model`` and decode the arg-max class
         with ``label_encoder`` to obtain the place type.
      2. Keep rows of ``places_df`` whose ``placeTypes`` text contains that
         type (case-insensitive, literal substring match).
      3. Keep rows within ``radius_km`` of (``user_lat``, ``user_lon``).
      4. Score each remaining ``placeName`` as the mean of its string
         similarity to the query and its keyword-match fraction, and return
         the best ``top_k`` rows (ties broken by shorter distance).

    Args:
        user_query: Free-text search query.
        user_lat, user_lon: User position in degrees.
        model: Classifier called as ``model(**tokenizer_output)``.
        tokenizer: Tokenizer producing PyTorch tensors.
        label_encoder: Fitted encoder providing ``inverse_transform``.
        places_df: DataFrame with ``placeName``, ``placeTypes``,
            ``placeLatitude`` and ``placeLongitude`` columns.
        radius_km: Search radius in kilometres.
        top_k: Maximum number of rows to return.

    Returns:
        ``(place_type, nearby)`` where ``nearby`` is a copy of the matching
        rows with ``distance_km``, ``similarity_score``, ``keyword_score``
        and ``total_score`` columns added.
    """
    model.eval()
    inputs = tokenizer(
        user_query,
        return_tensors="pt",
        truncation=True,
        max_length=128,  # explicit limit; truncation is a no-op without one
        padding=True,
    ).to(device)
    # Inference only: no need to build an autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    pred_label = torch.argmax(outputs, dim=1).item()
    place_type = label_encoder.inverse_transform([pred_label])[0]

    keywords = user_query.lower().split()
    # regex=False: the label is a literal string, not a pattern.
    match_type = places_df['placeTypes'].str.lower().str.contains(place_type.lower(), na=False, regex=False)
    df_type = places_df[match_type].copy()

    # Coerce coordinates so stray non-numeric cells become NaN (and are then
    # dropped by the radius filter) instead of raising inside haversine.
    latitudes = pd.to_numeric(df_type['placeLatitude'], errors='coerce')
    longitudes = pd.to_numeric(df_type['placeLongitude'], errors='coerce')
    df_type['distance_km'] = haversine(user_lat, user_lon, latitudes, longitudes)
    # .copy() so the column assignments below do not write into a slice.
    df_type = df_type[df_type['distance_km'] <= radius_km].copy()

    # Missing names are scored as empty strings rather than crashing on .lower().
    names = df_type['placeName'].fillna('').astype(str)
    df_type['similarity_score'] = names.apply(lambda name: string_similarity(user_query, name)).astype(float)
    df_type['keyword_score'] = names.apply(lambda name: keyword_match_score(name, keywords)).astype(float)

    df_type['total_score'] = 0.5 * df_type['similarity_score'] + 0.5 * df_type['keyword_score']

    nearby = df_type.sort_values(by=['total_score', 'distance_km'], ascending=[False, True]).head(top_k)

    return place_type, nearby

TESTING WITH FULL DATASET

In [32]:
# Load the places table from Drive. The path uses a single leading slash,
# like the other read_excel call in this notebook ("//" is implementation-defined on POSIX).
places_df = pd.read_excel("/content/drive/MyDrive/BPS_PROJECT/PROCESSED/klbi_classification/PREDICTED_ALL_82ACC_cnn.xlsx")

user_query = "ampera dekat saya"
user_lat, user_lon = 0.509954, 101.454620

# Classify the query, then rank matching places within 2 km of the user.
predicted_type, results = filter_places_by_type_and_distance(
    user_query, user_lat, user_lon,
    model, tokenizer, label_encoder, places_df,
    radius_km=2
)

print(f"Detected type: {predicted_type}")
print(results[['placeName', 'distance_km']])


Detected type: food
                  placeName  distance_km
38481       Ampera Kak Baya     0.650941
19778  Ampera Empat Saudara     1.368809
23579          Ampera Sarah     1.028596
32839          Ampera ayang     1.761982
39475          Ampera Da In     1.844647


In [33]:
# Same search flow for the query "kosan murah", 2 km radius.
user_query = "kosan murah"
user_lat = 0.509954
user_lon = 101.454620

predicted_type, results = filter_places_by_type_and_distance(
    user_query=user_query,
    user_lat=user_lat,
    user_lon=user_lon,
    model=model,
    tokenizer=tokenizer,
    label_encoder=label_encoder,
    places_df=places_df,
    radius_km=2,
)

print(f"Detected type: {predicted_type}")
print(results[['placeName', 'distance_km']])


Detected type: lodging
                           placeName  distance_km
15899                    Kosan Putra     1.895135
15958                Kosan nurazizah     1.747937
30638                Kos-kosan putra     0.552331
19403                   Kos Kosan DS     1.724713
32352  Kosan Muslimah Plus-Plus Nila     0.917135


In [34]:
# Same search flow for the query "tempat ngopi nyaman", 2 km radius.
user_query = "tempat ngopi nyaman"
user_lat = 0.509954
user_lon = 101.454620

predicted_type, results = filter_places_by_type_and_distance(
    user_query=user_query,
    user_lat=user_lat,
    user_lon=user_lon,
    model=model,
    tokenizer=tokenizer,
    label_encoder=label_encoder,
    places_df=places_df,
    radius_km=2,
)

print(f"Detected type: {predicted_type}")
print(results[['placeName', 'distance_km']])


Detected type: cafe
                  placeName  distance_km
30687             Ngopi moh     0.122590
30566         Pojokan Ngopi     0.297107
23376     Warung Kopi Arkan     1.638149
32893   Kedai Kopi Mangga 2     1.651765
37580  Cafe Teja Tepi Jalan     1.165483


In [35]:
# Same search flow for the query "cafe nyaman", 2 km radius.
user_query = "cafe nyaman"
user_lat = 0.509954
user_lon = 101.454620

predicted_type, results = filter_places_by_type_and_distance(
    user_query=user_query,
    user_lat=user_lat,
    user_lon=user_lon,
    model=model,
    tokenizer=tokenizer,
    label_encoder=label_encoder,
    places_df=places_df,
    radius_km=2,
)

print(f"Detected type: {predicted_type}")
print(results[['placeName', 'distance_km']])


Detected type: cafe
                  placeName  distance_km
29679             CAFE ARMY     1.992351
31180         Cafe JengJeng     1.732157
19941        Cafe Mas Jarwo     1.511247
37580  Cafe Teja Tepi Jalan     1.165483
32939     Cafe Green Corner     1.931540


In [36]:
# Same search flow for the query "restoran pizza", 2 km radius.
user_query = "restoran pizza"
user_lat = 0.509954
user_lon = 101.454620

predicted_type, results = filter_places_by_type_and_distance(
    user_query=user_query,
    user_lat=user_lat,
    user_lon=user_lon,
    model=model,
    tokenizer=tokenizer,
    label_encoder=label_encoder,
    places_df=places_df,
    radius_km=2,
)

print(f"Detected type: {predicted_type}")
print(results[['placeName', 'distance_km']])


Detected type: restaurant
                                placeName  distance_km
16018                  Pizza Hut Restoran     1.237399
16003    Rumah Makan Dan Restoran Bahagia     1.210468
23705  Restoran Mixue Jalan Sudirman Riau     0.582906
37999                           Pizza Hut     1.753671
32737              Doough Pizza Pekanbaru     1.735025


In [37]:
# Same search flow for the query "bengkel motor terdekat", 2 km radius.
user_query = "bengkel motor terdekat"
user_lat = 0.509954
user_lon = 101.454620

predicted_type, results = filter_places_by_type_and_distance(
    user_query=user_query,
    user_lat=user_lat,
    user_lon=user_lon,
    model=model,
    tokenizer=tokenizer,
    label_encoder=label_encoder,
    places_df=places_df,
    radius_km=2,
)

print(f"Detected type: {predicted_type}")
print(results[['placeName', 'distance_km']])


Detected type: car_repair
                                  placeName  distance_km
23329                     RJM Bengkel Motor     1.102699
19380                 Bengkel Meranti Motor     1.832968
15972  Bengkel Cat Mobil dan Motor Mak itam     1.545504
15977               Bengkel Iwan Bersaudara     1.526077
23516                    Selamat Motor Toko     1.522047


## SIMPLE SEARCH ENGINE (WITHOUT MACHINE LEARNING)

In [70]:
def simple_search(query, user_lat, user_lon, radius_km=2, places=None):
    """Keyword-only baseline search: no model, just substring matching.

    A place matches when its lower-cased ``placeName`` contains at least one
    whitespace-separated word of the lower-cased query. Matches with valid
    numeric coordinates that lie within ``radius_km`` of the user are
    returned, nearest first.

    Args:
        query: Free-text search query.
        user_lat, user_lon: User position in degrees.
        radius_km: Search radius in kilometres.
        places: Optional DataFrame with ``placeName``, ``placeLatitude`` and
            ``placeLongitude`` columns. Defaults to the notebook-level
            ``places_df``.

    Returns:
        DataFrame with ``placeName`` and ``distance_km`` columns, sorted by
        ascending distance (empty when nothing matches).
    """
    source_df = places_df if places is None else places
    keywords = query.lower().split()

    # Missing names become empty strings so the substring test cannot hit a NaN.
    names = source_df['placeName'].fillna('').astype(str).str.lower()
    mask = names.apply(lambda name: any(word in name for word in keywords)).astype(bool)

    results = source_df[mask].copy()
    results['placeLatitude'] = pd.to_numeric(results['placeLatitude'], errors='coerce')
    results['placeLongitude'] = pd.to_numeric(results['placeLongitude'], errors='coerce')
    results = results.dropna(subset=['placeLatitude', 'placeLongitude'])

    # haversine is element-wise, so pass whole columns: faster than a row-wise
    # apply, and it still yields a Series when there are zero matches.
    results = results.assign(
        distance_km=haversine(user_lat, user_lon, results['placeLatitude'], results['placeLongitude'])
    )
    nearby_results = results[results['distance_km'] <= radius_km].sort_values(by='distance_km')
    return nearby_results[['placeName', 'distance_km']]

In [71]:
# Baseline run; user_lat / user_lon are the coordinates set in the cells above.
result = simple_search(
    query="ampera dekat saya",
    user_lat=user_lat,
    user_lon=user_lon,
    radius_km=2,
)
print(result)

                                               placeName  distance_km
30524                                      AMPERA PESONA     0.133576
30594                            Rumah Makan Raja Ampera     0.385808
30717                                 Ampera Ninjau Raya     0.469585
30656                     Rumah makan Ampera Family Jaya     0.479765
38481                                    Ampera Kak Baya     0.650941
23649  RUMAH RAUDHOH SUDIRMAN TOKO OLEH-OLEH HAJI UMR...     0.679989
30635                                   AMPERA PATTIMURA     0.735321
30626                                Ampera Surya Minang     0.742202
31317                                      AMPERA RAHMAT     0.801239
16616                                 Ampera Sinar Murni     0.893829
30885   YAYASAN ANAK YATIM 8 ASNAF KESAYANGANKU CABANG 2     0.925155
32355               PONDOK CAT BENGKEL CAT BODY TERDEKAT     0.933000
32397                                   RM Seribu Sayang     0.951149
30999               

In [73]:
# Baseline run; user_lat / user_lon are the coordinates set in the cells above.
result = simple_search(
    query="kosan murah",
    user_lat=user_lat,
    user_lon=user_lon,
    radius_km=2,
)
print(result)

                                               placeName  distance_km
30587                                       Kosan Bu Ros     0.324414
30638                                    Kos-kosan putra     0.552331
32352                      Kosan Muslimah Plus-Plus Nila     0.917135
16546                                    Kos kosan Putri     1.050236
23345                                    Kos-kosan cowok     1.129407
31374  HUMAIRA THE SPIRITUAL LINE DISTRIBUTOR MUKENA ...     1.151578
32775                                   Agen kuota murah     1.275690
19666                         Kaos polos murah pekanbaru     1.694026
31127                           Papan Bunga PKU Termurah     1.694549
19403                                       Kos Kosan DS     1.724713
15958                                    Kosan nurazizah     1.747937
31719                    Stempel Cepat Cetak Lebih Murah     1.865806
33003                                   Kosanselinditno5     1.891654
15899               

In [74]:
# Baseline run; user_lat / user_lon are the coordinates set in the cells above.
result = simple_search(
    query="tempat ngopi nyaman",
    user_lat=user_lat,
    user_lon=user_lon,
    radius_km=2,
)
print(result)

                  placeName  distance_km
30687             Ngopi moh     0.122590
30566         Pojokan Ngopi     0.297107
30964    Tempat Mengaji Umi     1.064556
16557        Tempat tinggal     1.075660
32542  Rumah tempat tinggal     1.582682
32764          Tempat tidur     1.656713


In [75]:
# Baseline run; user_lat / user_lon are the coordinates set in the cells above.
result = simple_search(
    query="cafe nyaman",
    user_lat=user_lat,
    user_lon=user_lon,
    radius_km=2,
)
print(result)

                                        placeName  distance_km
30742                      Temok Cafe Bakso Temok     0.380674
30534                                     FamCafe     0.392676
31314                         Cafe Shanum Kitchen     0.776833
32478                             Crown cybercafe     0.824153
30990                            Gutji Cafe Resto     0.874852
30912                             Chatterbox Cafe     1.119594
37580                        Cafe Teja Tepi Jalan     1.165483
32774              Celebrity Cafe Resto Pekanbaru     1.228494
31261                             Cafe nilam sari     1.315178
31391                                  Ovela Cafe     1.382459
16313                  Forty One Cafe Coffee Shop     1.415465
19755                          Bestie Cafe Reborn     1.436508
32547                                Stasiun Cafe     1.461044
37663                              Boardgame Cafe     1.483094
19941                              Cafe Mas Jarwo     1

In [76]:
# Baseline run; user_lat / user_lon are the coordinates set in the cells above.
result = simple_search(
    query="restoran pizza",
    user_lat=user_lat,
    user_lon=user_lon,
    radius_km=2,
)
print(result)

                                placeName  distance_km
23705  Restoran Mixue Jalan Sudirman Riau     0.582906
38105                  Pizza Hut Delivery     0.707805
16003    Rumah Makan Dan Restoran Bahagia     1.210468
16228                    Pizza Rakyat PKU     1.211499
16018                  Pizza Hut Restoran     1.237399
16299    Pizza Paradise Kavling/Samarinda     1.366759
16098                        lariza pizza     1.698065
32737              Doough Pizza Pekanbaru     1.735025
37999                           Pizza Hut     1.753671
32851        Pide Turkish Pizza Pekanbaru     1.895118


In [77]:
# Baseline run; user_lat / user_lon are the coordinates set in the cells above.
result = simple_search(
    query="bengkel motor terdekat",
    user_lat=user_lat,
    user_lon=user_lon,
    radius_km=2,
)
print(result)

                                placeName  distance_km
30571                  Bengkel Motor Ucok     0.373807
30531               Bengkelgitarpekanbaru     0.492231
23647                      Bengkel tas 55     0.665892
23641               Surogentho Motorcycle     0.688596
38480   Cucian Motor dan Mobil Abdul Muis     0.712507
...                                   ...          ...
31718                        CUCIAN MOTOR     1.956752
33156          Cucian motor mobil 3 putri     1.966048
33015                        Oskana Motor     1.971234
15853         Bengkel Las Panorama Teknik     1.973517
19573  ssindikat pencucian helm dan motor     1.993565

[90 rows x 2 columns]
