# Packages

In [1]:
import pandas as pd
import sys

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

In [3]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits under ../src are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Add the parent directory to the import path so the `src` package resolves.
# NOTE(review): assumes the kernel's working directory is one level below the
# project root — confirm before running from elsewhere.
sys.path.append('..')

# Project-local helpers: token/tag conversion utilities and the model wrapper
# used below.
from src.data import raw_to_tokens, tokens_to_tagged, tagged_to_tokens, tagged_to_poi_street
from src.models.nltk_approach import NltkApproach

# Import Data

In [4]:
# Load the labelled training set and key it by the `id` column.
df = pd.read_csv('../data/train.csv').set_index('id')

# Each label has the form "<poi>/<street>". Split on the FIRST '/' only, so a
# label that contains additional slashes keeps its full remainder in `street`.
# (Transposing the split lists with zip(*...) truncates every row to the
# shortest split and silently drops any extra pieces.)
label_parts = df['POI/street'].str.split('/', n=1, expand=True)

# Fail loudly on missing labels or labels without a '/' separator instead of
# letting None values flow into the tokenizer.
if label_parts.shape[1] != 2 or label_parts[1].isna().any():
    raise ValueError("every 'POI/street' label must contain a '/' separator")

df['poi'] = label_parts[0]
df['street'] = label_parts[1]

# Tokenise the raw address and both label components with the same helper.
df['raw_tokens'] = df['raw_address'].apply(raw_to_tokens)
df['poi_tokens'] = df['poi'].apply(raw_to_tokens)
df['street_tokens'] = df['street'].apply(raw_to_tokens)

# Pair every raw token with a tag (the preview below shows POI / STREET / OTHER).
# A comprehension over the three columns avoids the per-row Series construction
# of DataFrame.apply(axis=1) and always yields one list object per row.
df['tagged_tokens'] = pd.Series(
    [
        tokens_to_tagged(raw, poi, street)
        for raw, poi, street in zip(df['raw_tokens'], df['poi_tokens'], df['street_tokens'])
    ],
    index=df.index,
)

In [5]:
# Preview ten rows of the prepared frame. The seed is fixed (matching the
# random_state used for the split and the training sample) so a fresh
# Restart & Run All shows the same rows.
df.sample(n=10, random_state=42)

Unnamed: 0_level_0,raw_address,POI/street,poi,street,raw_tokens,poi_tokens,street_tokens,tagged_tokens
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
165863,"sony sugema college, engku putri",sony sugema college/engku putri,sony sugema college,engku putri,"[sony, sugema, college,, engku, putri]","[sony, sugema, college]","[engku, putri]","[(sony, POI), (sugema, POI), (college,, OTHER)..."
6509,raya lungsi 14 kedewatan ubud,/,,,"[raya, lungsi, 14, kedewatan, ubud]",[],[],"[(raya, OTHER), (lungsi, OTHER), (14, OTHER), ..."
259425,gg. subi 115 campurejo bojonegoro,/gg. subingan,,gg. subingan,"[gg., subi, 115, campurejo, bojonegoro]",[],"[gg., subingan]","[(gg., STREET), (subi, OTHER), (115, OTHER), (..."
165928,kranji lapa bola 2 no 67-71 bekasi barat,/lapa bola 2,,lapa bola 2,"[kranji, lapa, bola, 2, no, 67-71, bekasi, barat]",[],"[lapa, bola, 2]","[(kranji, OTHER), (lapa, STREET), (bola, STREE..."
10095,jati pad raya no 6h 4 12540,/jati pad raya,,jati pad raya,"[jati, pad, raya, no, 6h, 4, 12540]",[],"[jati, pad, raya]","[(jati, STREET), (pad, STREET), (raya, STREET)..."
158403,raya kalima 3 haurkuning darma,/raya kalimalang,,raya kalimalang,"[raya, kalima, 3, haurkuning, darma]",[],"[raya, kalimalang]","[(raya, STREET), (kalima, OTHER), (3, OTHER), ..."
4707,rangkapan jaya baru rd. suka 89 16434 pancoran...,/rd. suka,,rd. suka,"[rangkapan, jaya, baru, rd., suka, 89, 16434, ...",[],"[rd., suka]","[(rangkapan, OTHER), (jaya, OTHER), (baru, OTH..."
168154,"yasmina cassavina cakes, taman cima kedung war...",yasmina cassavina cakes/taman cima,yasmina cassavina cakes,taman cima,"[yasmina, cassavina, cakes,, taman, cima, kedu...","[yasmina, cassavina, cakes]","[taman, cima]","[(yasmina, POI), (cassavina, POI), (cakes,, OT..."
108948,"poncokusumo poncokusumo - malang, poncokusumo",/,,,"[poncokusumo, poncokusumo, -, malang,, poncoku...",[],[],"[(poncokusumo, OTHER), (poncokusumo, OTHER), (..."
108964,pamekaran teru sore - cipa 75 soreang,/,,,"[pamekaran, teru, sore, -, cipa, 75, soreang]",[],[],"[(pamekaran, OTHER), (teru, OTHER), (sore, OTH..."


# Train-Valid Split

In [6]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
split_frames = train_test_split(df, random_state=42, test_size=0.2)
train_df, valid_df = split_frames

# Show the (rows, columns) shape of each partition.
tuple(part.shape for part in split_frames)

((240000, 8), (60000, 8))

# Train 

In [7]:
# Instantiate the project's NltkApproach model (from src.models.nltk_approach)
# with its default constructor arguments.
mdl = NltkApproach()

In [9]:
# Hand the model a reproducible 10,000-row sample of the tagged training tokens.
train_sample = train_df['tagged_tokens'].sample(n=10000, random_state=42)
mdl.transform(train_sample)

In [10]:
# Train the model. fit() is called with no arguments.
# NOTE(review): presumably it trains on whatever the earlier mdl.transform(...)
# call stored on the instance — confirm in src/models/nltk_approach.py, since
# running this cell without that call first may fail or fit on stale data.
mdl.fit()

# Evaluate

In [12]:
# Predict tags for the validation addresses, then attach them to a fresh copy
# of the validation frame so valid_df itself stays untouched.
predicted_tags = mdl.predict(valid_df['raw_tokens'])
pred_df = valid_df.assign(pred_tagged=predicted_tags)

# Collapse each tagged sequence back into the "POI/street" label format so it
# can be compared directly with the ground-truth column.
pred_df['pred_POI/street'] = pred_df['pred_tagged'].apply(tagged_to_poi_street)

In [22]:
# Spot-check predictions against the ground truth. The seed is fixed so the
# same five rows appear on every re-run.
pred_df[['raw_address', 'POI/street', 'pred_POI/street']].sample(n=5, random_state=42)

Unnamed: 0_level_0,raw_address,POI/street,pred_POI/street
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
200309,"alfa r406 andi kam merd, salekoe",alfamrt r406 andi kam/merd,alfa/kam
202262,pon indah vii cireundeu rt 4 2 ciputat timur,/pon indah vii,/vii
26253,"tk pgri karang, kalidawir",tk pgri karangtalun/,"tk karang,/"
240811,"jl. raya lembang km 12,3 no.35 (rm. saung peng...",saung pengkolan 1/jl. raya lembang,lembang/jl. raya
171332,"kalib, 5 kalibagor kalibagor",/kalib,"/kalib,"


In [15]:
# Exact-match accuracy: a prediction counts only when the whole "POI/street"
# string equals the label.
accuracy_score(
    y_true=pred_df['POI/street'],
    y_pred=pred_df['pred_POI/street'],
)

0.14806666666666668

In [21]:
# Look at a few rows the model got exactly right. One .loc call selects rows
# and columns together (no chained indexing), and the seed is fixed so the
# displayed rows are reproducible.
is_exact_match = pred_df['POI/street'] == pred_df['pred_POI/street']
pred_df.loc[is_exact_match, ['raw_address', 'POI/street', 'pred_POI/street']].sample(n=5, random_state=42)

Unnamed: 0_level_0,raw_address,POI/street,pred_POI/street
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
216903,lon bawah 10 rt 4 12 kebon melati tanah abang,/lon bawah,/lon bawah
25048,dokter subagio cemp karangmanyar,dokter subagio/cemp,dokter subagio/cemp
80906,pem 76 klaten,/pem,/pem
244873,pem 79 selat dalam,/pem,/pem
43356,amplas gg. mel 20371 percut sei tuan,/gg. mel,/gg. mel
