# Inference

In [1]:
import numpy as np
import torch
import pandas as pd
import json

from transformers import XLMRobertaTokenizerFast, XLMRobertaForTokenClassification, pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [2]:
test_df = pd.read_csv("test.csv")
test_df

Unnamed: 0,id,raw_address
0,0,s. par 53 sidanegara 4 cilacap tengah
1,1,"angg per, baloi indah kel. lubuk baja"
2,2,"asma laun, mand imog,"
3,3,"ud agung rej, raya nga sri wedari karanganyar"
4,4,"cut mutia, 35 baiturrahman"
...,...,...
49995,49995,toko mbak farid semboro semboro
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi"
49997,49997,"mart dan roti bakar malabar, nasio,"
49998,49998,graha indah pamulang jl. mujair raya bambu apu...


## Test with 1 sample

In [3]:
sample = test_df.iloc[1]["raw_address"]
sample

'angg per, baloi indah kel. lubuk baja'

Load `id2tag` to decode model output

In [6]:
with open("tag2id_run5.json", "r", encoding="utf-8") as f:
    file = json.load(f)
    tag2id = file["tag2id"]
    id2tag = file["id2tag"]

id2tag = {int(k): v for k,v  in id2tag.items()}
id2tag

{0: 'street', 1: 'POI', 2: 'O'}

Use `pipeline` to load the model

In [7]:
address_extract = pipeline(
    task="ner", 
    model="./results_run5/checkpoint-6000", 
    tokenizer="./results_run5/checkpoint-6000", 
    grouped_entities=True,
    ignore_labels=[f"LABEL_{tag2id['O']}"]
)

Some weights of BertModel were not initialized from the model checkpoint at ./model_indobert and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
result = address_extract(sample)
pd.DataFrame(result).T

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unnamed: 0,0
entity_group,LABEL_0
score,0.903341
word,angg per
start,0
end,8


## Extract POI and street from model output

In [11]:
tokenizer = XLMRobertaTokenizerFast.from_pretrained("./results_run5/checkpoint-6000")

In [12]:
id2tag

{0: 'street', 1: 'POI', 2: 'O'}

Choose the group with highest score

In [13]:
def post_process(result, tokenizer, id2tag):
    types = ["POI", "street"]
    
    elements = {}
    for x in types:
        elements[x] = {
            "word": "",
            "score": 0
        }
    
    for entity in result:
        # check for empty word
        if not entity["word"]:
            continue
        
        # convert to label
        tag_id = int(entity["entity_group"][-1])
        label = id2tag[tag_id]
        if label == "O":
            continue
    
        if entity["score"] > elements[label]["score"]:
            elements[label]["word"] = entity["word"]
            elements[label]["score"] = entity["score"]
    
    return elements

for x in results:
    print(post_process(x, tokenizer, id2tag))

{'POI': {'word': '', 'score': 0}, 'street': {'word': 'perum nua suka', 'score': 0.9003584682941437}}
{'POI': {'word': 'kios sury b', 'score': 0.7671713382005692}, 'street': {'word': 'may sut', 'score': 0.7526836693286896}}
{'POI': {'word': 'apartemen sauthgate stasiun', 'score': 0.874185836315155}, 'street': {'word': '', 'score': 0}}
{'POI': {'word': '', 'score': 0}, 'street': {'word': 'taman ubud indah viii', 'score': 0.919511079788208}}
{'POI': {'word': '', 'score': 0}, 'street': {'word': 'mah', 'score': 0.8990364074707031}}
{'POI': {'word': '', 'score': 0}, 'street': {'word': 'r. e. mar', 'score': 0.9122492432594299}}
{'POI': {'word': '', 'score': 0}, 'street': {'word': 'bala vi', 'score': 0.9063057899475098}}
{'POI': {'word': '', 'score': 0}, 'street': {'word': 'kerja bakti iv', 'score': 0.9109340707461039}}
{'POI': {'word': 'plaza marein', 'score': 0.9202009240786234}, 'street': {'word': 'jl jendral sudirman', 'score': 0.9211373130480448}}
{'POI': {'word': 'char seluler', 'score':

## Inference on all test set

In [14]:
test_texts = test_df["raw_address"].to_list()

address_extract = pipeline(
    task="ner", 
    model="./results_run5/checkpoint-6000", 
    tokenizer="./results_run5/checkpoint-6000",
    grouped_entities=True,
    ignore_labels=[f"LABEL_{tag2id['O']}"],
    device=1
)

Some weights of BertModel were not initialized from the model checkpoint at ./model_indobert and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Run inference, will take a while

In [15]:
preds = address_extract(test_texts)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [16]:
preds[0]

[{'entity_group': 'LABEL_0',
  'score': 0.9103303949038187,
  'word': 's. par',
  'start': 0,
  'end': 6}]

Apply post processing

In [17]:
post_process(preds[0], tokenizer, id2tag)

{'POI': {'word': '', 'score': 0},
 'street': {'word': 's. par', 'score': 0.9103303949038187}}

Save results to required format

In [18]:
results = []
for i in range(len(test_texts)):
    result = post_process(preds[i], tokenizer, id2tag)
    results.append(f"{result['POI']['word']}/{result['street']['word']}")

In [19]:
results[:10]

['/s. par',
 '/angg per',
 'asma laun/mand imog',
 'ud agung rej/raya nga',
 '/cut mutia',
 'pem dos dapur ala/perum gar',
 'tb. mara/',
 'pura taman beji tista/',
 'tk/',
 '/raya won']

## Export results for submission

In [20]:
test_df["POI/street"] = results
test_df

Unnamed: 0,id,raw_address,POI/street
0,0,s. par 53 sidanegara 4 cilacap tengah,/s. par
1,1,"angg per, baloi indah kel. lubuk baja",/angg per
2,2,"asma laun, mand imog,",asma laun/mand imog
3,3,"ud agung rej, raya nga sri wedari karanganyar",ud agung rej/raya nga
4,4,"cut mutia, 35 baiturrahman",/cut mutia
...,...,...,...
49995,49995,toko mbak farid semboro semboro,toko mbak farid/
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi",vie - tk. ridho kids/vete 3 cari
49997,49997,"mart dan roti bakar malabar, nasio,",mart dan roti bakar malabar/nasio
49998,49998,graha indah pamulang jl. mujair raya bambu apu...,graha indah/jl. mujair raya


In [20]:
test_df[["id", "POI/street"]].to_csv("submission.csv", index=False)