# Process data

In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from sacremoses import MosesTokenizer

In [2]:
# Load the training addresses (expects train.csv in the working directory).
train_csv = "train.csv"
train_df = pd.read_csv(train_csv)
train_df

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru
...,...,...,...
299995,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani
299996,299996,"raya cila kko, cilandak timur kel.",/raya cila kko
299997,299997,tanjung gusta jl. yaya 2 no 17,/
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/


Take only 10,000 samples so the notebook runs quickly while testing

In [3]:
# Fixed seed so the same 10,000 rows are drawn on every run of the notebook.
train_df = train_df.sample(10000, random_state=42)
train_df

Unnamed: 0,id,raw_address,POI/street
220978,220978,jati kampung baru senapelan,/jati
93521,93521,jend soedi purwokerto wetan purwokerto timur,/jend soedi
85325,85325,"tpa qurr ayun kh abdul wahid,",/kh abdul wahid
295540,295540,lubang buaya jl. monumen panca sak gang sadar ...,/jl. monumen panca sak gang sadar 9
5451,5451,warna warni jaya jogo banyuwangi,warna warni jaya/jogo
...,...,...,...
152846,152846,taman bunga sukamukti blok h1 no.18 depan lapa...,taman bunga/
78724,78724,dataran tinggi soeka hatta 134 20351 binjai timur,/soeka hatta
234289,234289,kiar simp babakan surabaya kiaracondong,/kiar simp
287198,287198,"cuci motor sadi, rinj sidanegara cilacap tengah",cuci motor sadirman/rinj


Split POI and street into separate columns

In [4]:
# Each "POI/street" value has the form "<POI>/<street>"; either side may be empty.
# Split on the first "/" only (n=1) so that a value containing further slashes
# still produces exactly two columns instead of a ragged result that cannot be
# assigned to ["POI", "street"].
train_df[["POI", "street"]] = train_df["POI/street"].str.split("/", n=1, expand=True)
train_df

Unnamed: 0,id,raw_address,POI/street,POI,street
220978,220978,jati kampung baru senapelan,/jati,,jati
93521,93521,jend soedi purwokerto wetan purwokerto timur,/jend soedi,,jend soedi
85325,85325,"tpa qurr ayun kh abdul wahid,",/kh abdul wahid,,kh abdul wahid
295540,295540,lubang buaya jl. monumen panca sak gang sadar ...,/jl. monumen panca sak gang sadar 9,,jl. monumen panca sak gang sadar 9
5451,5451,warna warni jaya jogo banyuwangi,warna warni jaya/jogo,warna warni jaya,jogo
...,...,...,...,...,...
152846,152846,taman bunga sukamukti blok h1 no.18 depan lapa...,taman bunga/,taman bunga,
78724,78724,dataran tinggi soeka hatta 134 20351 binjai timur,/soeka hatta,,soeka hatta
234289,234289,kiar simp babakan surabaya kiaracondong,/kiar simp,,kiar simp
287198,287198,"cuci motor sadi, rinj sidanegara cilacap tengah",cuci motor sadirman/rinj,cuci motor sadirman,rinj


Use the Moses tokenizer to tokenize the raw address, POI, and street

In [5]:
# One shared tokenizer instance, reused for every column that gets tokenized.
# (MosesTokenizer is imported in the import cell at the top of the notebook.)
mt = MosesTokenizer()

In [6]:
# Tokenize each text column into a sibling column holding its token list.
# NOTE(review): MosesTokenizer.tokenize escapes XML special characters by
# default (escape=True) — confirm that is wanted for these addresses.
token_columns = {
    "raw_address": "tokens",
    "POI": "POI_tokens",
    "street": "street_tokens",
}
for text_col, tokens_col in token_columns.items():
    train_df[tokens_col] = train_df[text_col].apply(mt.tokenize)
train_df

Unnamed: 0,id,raw_address,POI/street,POI,street,tokens,POI_tokens,street_tokens
220978,220978,jati kampung baru senapelan,/jati,,jati,"[jati, kampung, baru, senapelan]",[],[jati]
93521,93521,jend soedi purwokerto wetan purwokerto timur,/jend soedi,,jend soedi,"[jend, soedi, purwokerto, wetan, purwokerto, t...",[],"[jend, soedi]"
85325,85325,"tpa qurr ayun kh abdul wahid,",/kh abdul wahid,,kh abdul wahid,"[tpa, qurr, ayun, kh, abdul, wahid, ,]",[],"[kh, abdul, wahid]"
295540,295540,lubang buaya jl. monumen panca sak gang sadar ...,/jl. monumen panca sak gang sadar 9,,jl. monumen panca sak gang sadar 9,"[lubang, buaya, jl., monumen, panca, sak, gang...",[],"[jl., monumen, panca, sak, gang, sadar, 9]"
5451,5451,warna warni jaya jogo banyuwangi,warna warni jaya/jogo,warna warni jaya,jogo,"[warna, warni, jaya, jogo, banyuwangi]","[warna, warni, jaya]",[jogo]
...,...,...,...,...,...,...,...,...
152846,152846,taman bunga sukamukti blok h1 no.18 depan lapa...,taman bunga/,taman bunga,,"[taman, bunga, sukamukti, blok, h1, no.18, dep...","[taman, bunga]",[]
78724,78724,dataran tinggi soeka hatta 134 20351 binjai timur,/soeka hatta,,soeka hatta,"[dataran, tinggi, soeka, hatta, 134, 20351, bi...",[],"[soeka, hatta]"
234289,234289,kiar simp babakan surabaya kiaracondong,/kiar simp,,kiar simp,"[kiar, simp, babakan, surabaya, kiaracondong]",[],"[kiar, simp]"
287198,287198,"cuci motor sadi, rinj sidanegara cilacap tengah",cuci motor sadirman/rinj,cuci motor sadirman,rinj,"[cuci, motor, sadi, ,, rinj, sidanegara, cilac...","[cuci, motor, sadirman]",[rinj]


Label the tokens by matching the POI and street against the raw address
- Use fuzzy matching because the POI and street are not always written out in full in the raw address

In [7]:
def label_tokens(row, entity_types=("POI", "street")):
    """Assign an entity label to every token of a raw address.

    For each entity type, a window with as many tokens as the entity is slid
    over the address tokens. The window most similar to the entity according
    to ``fuzz.ratio`` is labelled with the entity name; every other token
    keeps the label ``"O"``.

    Parameters
    ----------
    row : mapping
        Must provide ``row["tokens"]`` (the address tokens) and
        ``row[f"{entity}_tokens"]`` for each entity type. Tokens are expected
        to be strings.
    entity_types : sequence of str
        Entity names, processed in order. Where two entity spans overlap, the
        later entity overwrites the earlier one.

    Returns
    -------
    list of str
        One label per address token (same length as ``row["tokens"]``).
    """
    tokens = row["tokens"]
    n_tokens = len(tokens)
    labels = ["O"] * n_tokens

    for entity in entity_types:
        entity_tokens = row[f"{entity}_tokens"]
        # Never label more tokens than the address has, so the result always
        # stays aligned one-to-one with ``tokens``.
        span = min(len(entity_tokens), n_tokens)
        if span == 0:
            continue

        # Compare space-joined strings: fuzz.ratio is a string scorer, and
        # character-level comparison lets a truncated token (e.g. "sadi")
        # still score well against its full form (e.g. "sadirman").
        target = " ".join(entity_tokens)
        best_start, best_score = 0, -1

        # "+ 1" so the window that ends on the last token is tried as well.
        for start in range(n_tokens - span + 1):
            candidate = " ".join(tokens[start:start + span])
            score = fuzz.ratio(candidate, target)
            if score > best_score:
                # Remember where the best window starts, not only its score.
                best_start, best_score = start, score
                if score == 100:
                    break  # exact match, nothing can beat it

        labels[best_start:best_start + span] = [entity] * span

    return labels

# Label every row; axis=1 hands each row to label_tokens one at a time.
row_labels = train_df.apply(label_tokens, axis=1)
train_df["label"] = row_labels
train_df

Unnamed: 0,id,raw_address,POI/street,POI,street,tokens,POI_tokens,street_tokens,label
220978,220978,jati kampung baru senapelan,/jati,,jati,"[jati, kampung, baru, senapelan]",[],[jati],"[street, O, O, O]"
93521,93521,jend soedi purwokerto wetan purwokerto timur,/jend soedi,,jend soedi,"[jend, soedi, purwokerto, wetan, purwokerto, t...",[],"[jend, soedi]","[street, street, O, O, O, O]"
85325,85325,"tpa qurr ayun kh abdul wahid,",/kh abdul wahid,,kh abdul wahid,"[tpa, qurr, ayun, kh, abdul, wahid, ,]",[],"[kh, abdul, wahid]","[O, O, O, street, street, street, O]"
295540,295540,lubang buaya jl. monumen panca sak gang sadar ...,/jl. monumen panca sak gang sadar 9,,jl. monumen panca sak gang sadar 9,"[lubang, buaya, jl., monumen, panca, sak, gang...",[],"[jl., monumen, panca, sak, gang, sadar, 9]","[O, O, street, street, street, street, street,..."
5451,5451,warna warni jaya jogo banyuwangi,warna warni jaya/jogo,warna warni jaya,jogo,"[warna, warni, jaya, jogo, banyuwangi]","[warna, warni, jaya]",[jogo],"[POI, POI, POI, street, O]"
...,...,...,...,...,...,...,...,...,...
152846,152846,taman bunga sukamukti blok h1 no.18 depan lapa...,taman bunga/,taman bunga,,"[taman, bunga, sukamukti, blok, h1, no.18, dep...","[taman, bunga]",[],"[POI, POI, O, O, O, O, O, O, O]"
78724,78724,dataran tinggi soeka hatta 134 20351 binjai timur,/soeka hatta,,soeka hatta,"[dataran, tinggi, soeka, hatta, 134, 20351, bi...",[],"[soeka, hatta]","[O, O, street, street, O, O, O, O]"
234289,234289,kiar simp babakan surabaya kiaracondong,/kiar simp,,kiar simp,"[kiar, simp, babakan, surabaya, kiaracondong]",[],"[kiar, simp]","[street, street, O, O, O]"
287198,287198,"cuci motor sadi, rinj sidanegara cilacap tengah",cuci motor sadirman/rinj,cuci motor sadirman,rinj,"[cuci, motor, sadi, ,, rinj, sidanegara, cilac...","[cuci, motor, sadirman]",[rinj],"[POI, POI, POI, O, street, O, O, O]"
