#**PLACE SEARCH ENGINE WITH TRANSFORMER INDOBERT BASE P1**

* COMBINING SEMANTIC SEACRH WITH CLASSIFICATION MODEL FOR BETTER SEMANTIC SEARCH AND QUERY UNDERSTANDING

* FINETUNING THE LAST 4 LAYERS OF INDOBERT BASE P1 AND ADDING CUSTOM LAYERS OF 75 CLASS

#IMPORTING LIBRARY

In [22]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, get_scheduler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch.optim import AdamW
from sklearn.metrics import classification_report, accuracy_score

In [23]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#CREATING THE MODEL

###DATASET PREPARATIONS

In [25]:
df = pd.read_excel("/content/drive/MyDrive/corpus_se.xlsx")
df = df.dropna(subset=["query", "label"])

LABEL ENCODER

In [26]:
label_encoder = LabelEncoder()
df['label_enc'] = label_encoder.fit_transform(df['label'])
num_labels = len(label_encoder.classes_)

SPLITTING THE DATASET

In [27]:
# Train/Val Split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['query'].tolist(), df['label_enc'].tolist(), test_size=0.2, random_state=42, stratify=df['label_enc']
)

TOKENIZER FROM INDOBERT BASE P1

In [28]:
# Tokenizer
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

PROCESSING THE DATASET

In [29]:
class PlaceDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.texts[idx], max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt")
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

AUGMENTS THE DATASET

In [30]:
# Create dataset with data augmentation for training
def get_augmented_dataset(texts, labels, tokenizer, augment=True):
    if augment:
        # Simple augmentation: add some noise to sequences (randomly mask tokens)
        augmented_texts = texts.copy()
        augmented_labels = labels.copy()

        # Add slight random noise to some examples (15% of data)
        num_to_augment = int(len(texts) * 0.15)
        indices = np.random.choice(len(texts), num_to_augment, replace=False)

        for idx in indices:
            text = texts[idx]
            words = text.split()
            # Randomly drop or duplicate some words
            if len(words) > 3:  # Only if we have enough words
                # Randomly choose augmentation type
                aug_type = np.random.choice(['drop', 'duplicate'])
                if aug_type == 'drop':
                    # Drop a random word
                    drop_idx = np.random.randint(0, len(words))
                    words.pop(drop_idx)
                else:
                    # Duplicate a random word
                    dup_idx = np.random.randint(0, len(words))
                    words.insert(dup_idx, words[dup_idx])

                augmented_text = ' '.join(words)
                augmented_texts.append(augmented_text)
                augmented_labels.append(labels[idx])

        return PlaceDataset(augmented_texts, augmented_labels, tokenizer)
    else:
        return PlaceDataset(texts, labels, tokenizer)

CREATING DATASET AND DATALOADER

In [31]:
# Create datasets with potential augmentation
train_dataset = get_augmented_dataset(train_texts, train_labels, tokenizer, augment=True)
val_dataset = PlaceDataset(val_texts, val_labels, tokenizer)

# Create data loaders with appropriate batch sizes
batch_size = 16  # Increased batch size for better generalization
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

###MODEL INITIALIZATION

USING **INDOBERT BASE P1** AS THE MODEL AND TRANSFER LEARNING BY FREEZING THE LAYER AND ONLY TRAINING ONLY 4 LAYERS FOR FINETUNING

In [32]:
class ImprovedPlaceClassifier(nn.Module):
    def __init__(self, num_labels, dropout_rate=0.2):
        super(ImprovedPlaceClassifier, self).__init__()
        self.bert = BertModel.from_pretrained("indobenchmark/indobert-base-p1")

        # Effective fine-tuning technique: gradual unfreezing
        # First freeze all BERT layers
        for param in self.bert.parameters():
            param.requires_grad = False

        # Then unfreeze last 4 layers for fine-tuning
        for i in range(8, 12):
            for param in self.bert.encoder.layer[i].parameters():
                param.requires_grad = True

        # Pooling strategy
        self.pooling_type = "cls"  # Options: cls, mean, max

        # Improved classifier head with batch normalization
        self.drop = nn.Dropout(dropout_rate)
        self.norm = nn.BatchNorm1d(self.bert.config.hidden_size)

        # Wider then narrower architecture
        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(dropout_rate),
            nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, attention_mask, **kwargs):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, **kwargs)

        # Different pooling strategies
        if self.pooling_type == "cls":
            pooled_output = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        elif self.pooling_type == "mean":
            # Mean of all tokens (excluding padding)
            masked = outputs.last_hidden_state * attention_mask.unsqueeze(-1)
            pooled_output = masked.sum(dim=1) / attention_mask.sum(dim=1, keepdim=True)
        else:  # max pooling
            # Max over all tokens (excluding padding)
            masked = outputs.last_hidden_state * attention_mask.unsqueeze(-1)
            pooled_output = torch.max(masked, dim=1)[0]

        normalized = self.norm(pooled_output)
        dropped = self.drop(normalized)
        return self.classifier(dropped)

In [33]:
model = ImprovedPlaceClassifier(num_labels=num_labels).to(device)

FULL MODEL ARCHITECTURE

In [34]:
print(model)

ImprovedPlaceClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

LOSS

In [35]:
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
weight_decay = 0.01
lr = 5e-5
# optimizer = AdamW(model.parameters(), lr=5e-5)

WEIGHT DECAYING

In [36]:
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]

OPTIMIZER (ADAM)

In [37]:
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)

# Learning rate scheduler with warmup
num_epochs = 15  # Reduced number of epochs
num_training_steps = len(train_loader) * num_epochs
num_warmup_steps = int(0.1 * num_training_steps)  # 10% warmup

LEARNING RATE SCHEDULER

In [38]:
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

EARLY STOPPING CALLBACKS

In [39]:
# Early stopping implementation
class EarlyStopping:
    def __init__(self, patience=3, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.best_model_state = None

    def __call__(self, val_score, model):
        if self.best_score is None:
            self.best_score = val_score
            self.best_model_state = model.state_dict().copy()
        elif val_score < self.best_score + self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = val_score
            self.best_model_state = model.state_dict().copy()
            self.counter = 0

        return self.early_stop

###TRAINING AND EVALUATION FUNCTION

In [40]:
def train_model(model, train_loader, val_loader, optimizer, criterion, scheduler):
    early_stopping = EarlyStopping(patience=3)
    history = {'train_loss': [], 'val_acc': []}

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()

            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        history['train_loss'].append(avg_train_loss)

        # Validation phase
        val_acc, val_report = evaluate_model(model, val_loader)
        history['val_acc'].append(val_acc)

        print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Accuracy: {val_acc:.4f}")
        print(f"Classification Report:\n{val_report}")

        # Check for early stopping
        if early_stopping(val_acc, model):
            print(f"Early stopping triggered after epoch {epoch+1}")
            # Load the best model
            model.load_state_dict(early_stopping.best_model_state)
            break

    return model, history

In [41]:
def evaluate_model(model, val_loader):
    model.eval()
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            correct += (preds == labels).sum().item()
            total += labels.size(0)

    acc = correct / total
    report = classification_report(all_labels, all_preds, target_names=label_encoder.classes_, zero_division=0)

    return acc, report

###TRAINING

In [42]:
trained_model, history = train_model(model, train_loader, val_loader, optimizer, criterion, lr_scheduler)

Epoch 1: 100%|██████████| 21/21 [00:03<00:00,  5.44it/s]


Epoch 1 | Train Loss: 4.4761 | Val Accuracy: 0.0400
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       0.00      0.00      0.00         1
                    agriculture       0.00      0.00      0.00         1
                        airport       0.00      0.00      0.00         1
              architecture_firm       0.00      0.00      0.00         1
                    art_gallery       0.00      0.00      0.00         1
                            atm       0.00      0.00      0.00         1
                           bank       0.00      0.00      0.00         1
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       0.00      0.00      0.00         1
                           cafe       0.00      0.00      0.00         1
                     car_dealer       0.00      

Epoch 2: 100%|██████████| 21/21 [00:03<00:00,  5.45it/s]


Epoch 2 | Train Loss: 4.2492 | Val Accuracy: 0.1067
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       0.00      0.00      0.00         1
                    agriculture       0.00      0.00      0.00         1
                        airport       0.25      1.00      0.40         1
              architecture_firm       0.00      0.00      0.00         1
                    art_gallery       0.00      0.00      0.00         1
                            atm       0.00      0.00      0.00         1
                           bank       1.00      1.00      1.00         1
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       0.00      0.00      0.00         1
                     car_dealer       0.00      

Epoch 3: 100%|██████████| 21/21 [00:03<00:00,  5.42it/s]


Epoch 3 | Train Loss: 3.9475 | Val Accuracy: 0.1867
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       0.00      0.00      0.00         1
                    agriculture       0.00      0.00      0.00         1
                        airport       0.20      1.00      0.33         1
              architecture_firm       0.00      0.00      0.00         1
                    art_gallery       0.00      0.00      0.00         1
                            atm       0.00      0.00      0.00         1
                           bank       0.50      1.00      0.67         1
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       0.25      1.00      0.40         1
                           cafe       0.00      0.00      0.00         1
                     car_dealer       0.00      

Epoch 4: 100%|██████████| 21/21 [00:03<00:00,  5.32it/s]


Epoch 4 | Train Loss: 3.5938 | Val Accuracy: 0.3333
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       0.00      0.00      0.00         1
                    agriculture       0.33      1.00      0.50         1
                        airport       1.00      1.00      1.00         1
              architecture_firm       0.00      0.00      0.00         1
                    art_gallery       0.00      0.00      0.00         1
                            atm       0.00      0.00      0.00         1
                           bank       1.00      1.00      1.00         1
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       0.14      1.00      0.25         1
                           cafe       0.00      0.00      0.00         1
                     car_dealer       0.00      

Epoch 5: 100%|██████████| 21/21 [00:04<00:00,  5.22it/s]


Epoch 5 | Train Loss: 3.4034 | Val Accuracy: 0.4400
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       0.00      0.00      0.00         1
                    agriculture       0.50      1.00      0.67         1
                        airport       0.50      1.00      0.67         1
              architecture_firm       0.00      0.00      0.00         1
                    art_gallery       1.00      1.00      1.00         1
                            atm       1.00      1.00      1.00         1
                           bank       0.50      1.00      0.67         1
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       0.33      1.00      0.50         1
                           cafe       0.00      0.00      0.00         1
                     car_dealer       0.00      

Epoch 6: 100%|██████████| 21/21 [00:04<00:00,  5.22it/s]


Epoch 6 | Train Loss: 3.1040 | Val Accuracy: 0.5333
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       0.00      0.00      0.00         1
                    agriculture       0.50      1.00      0.67         1
                        airport       0.50      1.00      0.67         1
              architecture_firm       0.00      0.00      0.00         1
                    art_gallery       1.00      1.00      1.00         1
                            atm       1.00      1.00      1.00         1
                           bank       0.50      1.00      0.67         1
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       1.00      1.00      1.00         1
                     car_dealer       0.00      

Epoch 7: 100%|██████████| 21/21 [00:04<00:00,  5.14it/s]


Epoch 7 | Train Loss: 2.8743 | Val Accuracy: 0.6400
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       0.50      1.00      0.67         1
                        airport       1.00      1.00      1.00         1
              architecture_firm       0.00      0.00      0.00         1
                    art_gallery       1.00      1.00      1.00         1
                            atm       1.00      1.00      1.00         1
                           bank       0.50      1.00      0.67         1
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       1.00      1.00      1.00         1
                     car_dealer       0.00      

Epoch 8: 100%|██████████| 21/21 [00:04<00:00,  5.16it/s]


Epoch 8 | Train Loss: 2.7074 | Val Accuracy: 0.7200
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       0.50      1.00      0.67         1
                        airport       1.00      1.00      1.00         1
              architecture_firm       0.00      0.00      0.00         1
                    art_gallery       1.00      1.00      1.00         1
                            atm       1.00      1.00      1.00         1
                           bank       1.00      1.00      1.00         1
                            bar       0.00      0.00      0.00         1
                   beauty_salon       1.00      1.00      1.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       1.00      1.00      1.00         1
                     car_dealer       0.00      

Epoch 9: 100%|██████████| 21/21 [00:04<00:00,  5.24it/s]


Epoch 9 | Train Loss: 2.5404 | Val Accuracy: 0.7333
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       0.50      1.00      0.67         1
                        airport       1.00      1.00      1.00         1
              architecture_firm       0.00      0.00      0.00         1
                    art_gallery       1.00      1.00      1.00         1
                            atm       1.00      1.00      1.00         1
                           bank       1.00      1.00      1.00         1
                            bar       0.00      0.00      0.00         1
                   beauty_salon       0.00      0.00      0.00         1
                          bureu       0.50      1.00      0.67         1
                           cafe       1.00      1.00      1.00         1
                     car_dealer       1.00      

Epoch 10: 100%|██████████| 21/21 [00:03<00:00,  5.26it/s]


Epoch 10 | Train Loss: 2.4839 | Val Accuracy: 0.7467
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       1.00      1.00      1.00         1
                        airport       1.00      1.00      1.00         1
              architecture_firm       0.00      0.00      0.00         1
                    art_gallery       1.00      1.00      1.00         1
                            atm       1.00      1.00      1.00         1
                           bank       0.50      1.00      0.67         1
                            bar       0.00      0.00      0.00         1
                   beauty_salon       1.00      1.00      1.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       1.00      1.00      1.00         1
                     car_dealer       1.00     

Epoch 11: 100%|██████████| 21/21 [00:03<00:00,  5.38it/s]


Epoch 11 | Train Loss: 2.3903 | Val Accuracy: 0.7867
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       1.00      1.00      1.00         1
                        airport       1.00      1.00      1.00         1
              architecture_firm       1.00      1.00      1.00         1
                    art_gallery       1.00      1.00      1.00         1
                            atm       1.00      1.00      1.00         1
                           bank       1.00      1.00      1.00         1
                            bar       0.00      0.00      0.00         1
                   beauty_salon       1.00      1.00      1.00         1
                          bureu       0.50      1.00      0.67         1
                           cafe       1.00      1.00      1.00         1
                     car_dealer       1.00     

Epoch 12: 100%|██████████| 21/21 [00:03<00:00,  5.40it/s]


Epoch 12 | Train Loss: 2.2942 | Val Accuracy: 0.8133
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       1.00      1.00      1.00         1
                        airport       1.00      1.00      1.00         1
              architecture_firm       1.00      1.00      1.00         1
                    art_gallery       1.00      1.00      1.00         1
                            atm       1.00      1.00      1.00         1
                           bank       1.00      1.00      1.00         1
                            bar       0.00      0.00      0.00         1
                   beauty_salon       1.00      1.00      1.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       1.00      1.00      1.00         1
                     car_dealer       1.00     

Epoch 13: 100%|██████████| 21/21 [00:03<00:00,  5.35it/s]


Epoch 13 | Train Loss: 2.2268 | Val Accuracy: 0.8133
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       0.50      1.00      0.67         1
                        airport       1.00      1.00      1.00         1
              architecture_firm       1.00      1.00      1.00         1
                    art_gallery       1.00      1.00      1.00         1
                            atm       1.00      1.00      1.00         1
                           bank       1.00      1.00      1.00         1
                            bar       1.00      1.00      1.00         1
                   beauty_salon       1.00      1.00      1.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       1.00      1.00      1.00         1
                     car_dealer       1.00     

Epoch 14: 100%|██████████| 21/21 [00:03<00:00,  5.47it/s]


Epoch 14 | Train Loss: 2.2032 | Val Accuracy: 0.8133
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       0.50      1.00      0.67         1
                        airport       1.00      1.00      1.00         1
              architecture_firm       1.00      1.00      1.00         1
                    art_gallery       1.00      1.00      1.00         1
                            atm       1.00      1.00      1.00         1
                           bank       1.00      1.00      1.00         1
                            bar       1.00      1.00      1.00         1
                   beauty_salon       1.00      1.00      1.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       1.00      1.00      1.00         1
                     car_dealer       1.00     

Epoch 15: 100%|██████████| 21/21 [00:03<00:00,  5.49it/s]


Epoch 15 | Train Loss: 2.1603 | Val Accuracy: 0.8133
Classification Report:
                                 precision    recall  f1-score   support

                     accounting       1.00      1.00      1.00         1
                    agriculture       0.50      1.00      0.67         1
                        airport       1.00      1.00      1.00         1
              architecture_firm       1.00      1.00      1.00         1
                    art_gallery       1.00      1.00      1.00         1
                            atm       1.00      1.00      1.00         1
                           bank       1.00      1.00      1.00         1
                            bar       0.00      0.00      0.00         1
                   beauty_salon       1.00      1.00      1.00         1
                          bureu       1.00      1.00      1.00         1
                           cafe       1.00      1.00      1.00         1
                     car_dealer       1.00     

###SAVING THE MODEL

In [51]:
def save_model(model, tokenizer, label_encoder, output_dir="./saved_model"):
    import os
    import pickle
    import h5py

    # Create directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Save model state dictionary using H5 format
    torch.save(model.state_dict(), f"{output_dir}/model_weights.h5")

    # Save model configuration (architecture) using H5 format
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'num_labels': num_labels,
        'model_config': model.bert.config.to_dict() if hasattr(model, 'bert') else None,
    }, f"{output_dir}/model_full.h5")

    # Save tokenizer
    tokenizer.save_pretrained(output_dir)


In [52]:
save_model(trained_model, tokenizer, label_encoder)

#SEARCH ENGINE FUNCTION

###PREDICT WITH TEST SET

In [43]:
def predict(text, model, tokenizer):
    model.eval()
    inputs = tokenizer(
        text,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.nn.functional.softmax(outputs, dim=1)
        pred_class = torch.argmax(probs, dim=1).item()

    predicted_label = label_encoder.classes_[pred_class]
    confidence = probs[0, pred_class].item()

    return predicted_label, confidence

TESTING

In [44]:
query = "restoran western sekitar"
model.eval()
inputs = tokenizer(query, return_tensors="pt", truncation=True, padding=True).to(device)
output = model(**inputs)
pred_label = torch.argmax(output, dim=1).item()
pred_place_type = label_encoder.inverse_transform([pred_label])[0]
print(f"Detected place type: {pred_place_type}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Detected place type: restaurant


###SEARCH ENGINE FUNCTION FULL TO THE DATASET

In [45]:
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in KM
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    return 2 * R * np.arcsin(np.sqrt(a))

In [46]:
def filter_places_by_type_and_distance(user_query, user_lat, user_lon, model, tokenizer, label_encoder, places_df, radius_km=5):
    # 1. Predict place type
    model.eval()
    inputs = tokenizer(user_query, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model(**inputs)
    pred_label = torch.argmax(outputs, dim=1).item()
    place_type = label_encoder.inverse_transform([pred_label])[0]

    # 2. Keyword extraction
    keywords = user_query.lower().split()

    # 3. Exact match ke nama tempat (berbasis keyword)
    exact_match = places_df[
        places_df['placeName'].str.lower().apply(lambda name: any(kw in name for kw in keywords))
    ].copy()

    # 4. Hitung jarak jika exact match ada
    if not exact_match.empty:
        exact_match['distance_km'] = haversine(user_lat, user_lon, exact_match['placeLatitude'], exact_match['placeLongitude'])
        nearby = exact_match[exact_match['distance_km'] <= radius_km].sort_values(by='distance_km').head(5)
        return place_type, nearby

    # 5. Jika tidak ada exact match, fallback ke filter by place_type
    match_type = places_df['placeTypes'].str.lower().str.contains(place_type.lower(), na=False)
    fallback = places_df[match_type].copy()

    fallback['distance_km'] = haversine(user_lat, user_lon, fallback['placeLatitude'], fallback['placeLongitude'])
    nearby = fallback[fallback['distance_km'] <= radius_km].sort_values(by='distance_km').head(5)

    return place_type, nearby


TESTING WITH FULL DATASET

In [47]:
places_df = pd.read_excel("/content/drive/MyDrive/final_banget_pc_70_amen (1).xlsx")

user_query = "ampera dekat saya"
user_lat, user_lon = 0.509954, 101.454620

predicted_type, results = filter_places_by_type_and_distance(
    user_query, user_lat, user_lon,
    model, tokenizer, label_encoder, places_df,
    radius_km=2
)

print(f"Detected type: {predicted_type}")
print(results[['placeName', 'distance_km']])


Detected type: food
                            placeName  distance_km
31655                   AMPERA PESONA     0.133576
31729         Rumah Makan Raja Ampera     0.385808
31858              Ampera Ninjau Raya     0.469585
31792  Rumah makan Ampera Family Jaya     0.479765
40120                 Ampera Kak Baya     0.650941
