In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore") # remove warning

import tensorflow as tf
tf.get_logger().setLevel('INFO')

In [2]:
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [3]:
print(train_df.shape)
print(test_df.shape)

(300000, 3)
(50000, 2)


In [4]:
train_df.head()

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru


In [5]:
train_df['POI'] , train_df['street'] = zip(*train_df['POI/street'].str.split(pat='/'))

In [6]:
train_df.head()

Unnamed: 0,id,raw_address,POI/street,POI,street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/,,
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung
3,3,"toko dita, kertosono",toko dita/,toko dita,
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru


In [7]:
train_df[train_df['POI']==''].id.count()

178509

In [8]:
train_df[train_df['street']==''].id.count()

70143

## EDA

### Word One Hot Encoding

In [9]:
import nltk
from nltk.tokenize import word_tokenize

In [10]:
# tokenize word. replace number with token NUMB since the number is not important in this analysis
unique_word_list = set()
for col in ['raw_address','POI', 'street']:
    train_df[col+'_numb_removed'] = train_df[col].str.replace('\d+',' NUMB ')
    corpus = train_df[col+'_numb_removed'].values
    unique_word_list=unique_word_list.union( set(word_tokenize(" ".join(corpus))) ) #join two sets

In [11]:
# unique word
len(unique_word_list)

87541

In [12]:
# create dict of token
token_dict={}

i=1
for token in unique_word_list:
    token_dict[token] = i
    i += 1

In [13]:
def tokenize_word(text):
    try:
        tokens = word_tokenize(text)
        return [token_dict[token] for token in tokens]
    except:
        pass

In [28]:
for col in ['raw_address','POI', 'street']:
    train_df[col+'_tokenized'] = train_df[col+'_numb_removed'].apply(tokenize_word)

In [29]:
train_df.sample(10)

Unnamed: 0,id,raw_address,POI/street,POI,street,raw_address_numb_removed,POI_numb_removed,street_numb_removed,raw_address_tokenized,POI_tokenized,street_tokenized
239686,239686,cilandak barat cila i ujung 38 rt 2 1 12430 ci...,/cila i ujung,,cila i ujung,cilandak barat cila i ujung NUMB rt NUMB ...,,cila i ujung,"[83052, 74694, 77057, 45206, 84491, 5180, 1620...",[],"[77057, 45206, 84491]"
293348,293348,"kut, kuta blang banda sakti",/,,,"kut, kuta blang banda sakti",,,"[54504, 63836, 37745, 50476, 70696, 50913]",[],[]
88355,88355,"re martadinata vii, ternate baru kel.",/,,,"re martadinata vii, ternate baru kel.",,,"[75878, 18280, 69747, 63836, 77068, 43066, 426...",[],[]
254495,254495,"bak dan jag bakar mas salim, raya kled,",bakwan dan jagung bakar mas salim/raya kled,bakwan dan jagung bakar mas salim,raya kled,"bak dan jag bakar mas salim, raya kled,",bakwan dan jagung bakar mas salim,raya kled,"[35038, 25997, 2223, 37414, 35462, 31777, 6383...","[75850, 25997, 79887, 37414, 35462, 31777]","[49443, 39105]"
228536,228536,gg. buntu tritih kulon cilacap utara,/gg. buntu,,gg. buntu,gg. buntu tritih kulon cilacap utara,,gg. buntu,"[26812, 78385, 69751, 21364, 60759, 3760, 8647]",[],"[26812, 78385, 69751]"
276656,276656,"gube ha bast, seberang ulu i",/gube ha bast,,gube ha bast,"gube ha bast, seberang ulu i",,gube ha bast,"[84675, 21053, 74451, 63836, 61466, 44092, 45206]",[],"[84675, 21053, 74451]"
284138,284138,kelapa gading barat gad kir timur iv 13 rt 4 8...,/,,,kelapa gading barat gad kir timur iv NUMB rt...,,,"[436, 25412, 74694, 44724, 85858, 32754, 51325...",[],[]
59338,59338,raya pawi 40,/raya pawi,,raya pawi,raya pawi NUMB,,raya pawi,"[49443, 81521, 5180]",[],"[49443, 81521]"
119535,119535,karang anyar b kar anyar no 17 rt 8 4 sawah besar,/b kar anyar,,b kar anyar,karang anyar b kar anyar no NUMB rt NUMB ...,,b kar anyar,"[28865, 45539, 47533, 20964, 45539, 21989, 518...",[],"[47533, 20964, 45539]"
3652,3652,"meut multi konsultan, manggala",/,,,"meut multi konsultan, manggala",,,"[51365, 83227, 824, 63836, 4645]",[],[]


In [30]:
for col in ['POI_tokenized', 'street_tokenized']:
    train_df.loc[train_df[col].str.len() == 0, col] = 0 # impute empty output with 0

In [41]:
train_df.head()

Unnamed: 0,id,raw_address,POI/street,POI,street,raw_address_numb_removed,POI_numb_removed,street_numb_removed,raw_address_tokenized,POI_tokenized,street_tokenized
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika,jl kapuk timur delta sili iii lippo cika NUMB...,,jl kapuk timur delta sili iii lippo cika,"[40092, 84419, 32754, 11100, 68752, 49783, 579...",0,"[40092, 84419, 32754, 11100, 68752, 49783, 579..."
1,1,"aye, jati sampurna",/,,,"aye, jati sampurna",,,"[50685, 63836, 63735, 48193]",0,0
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung,setu siung NUMB rt NUMB NUMB NUMB cipa...,,siung,"[69481, 28653, 5180, 16208, 5180, 5180, 5180, ...",0,[28653]
3,3,"toko dita, kertosono",toko dita/,toko dita,,"toko dita, kertosono",toko dita,,"[53037, 62544, 63836, 63005]","[53037, 62544]",0
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru,jl. orde baru,,jl. orde baru,"[40092, 78385, 34890, 43066]",0,"[40092, 78385, 34890, 43066]"


In [58]:
# Check if output is exact subset of raw_address. if any of the output is coming from raw_address, return False
def check_subset(row, output_col):
    try:
        raw_address = row['raw_address_tokenized']
        res = True
        for token in row[output_col]:
            if token not in raw_address:
                res = False
        return res
    except:
        pass

for col in ['POI_tokenized', 'street_tokenized']:
    train_df.loc[train_df[col]!=0, col+'_subset'] = train_df.apply(lambda x: check_subset(x, col), axis=1)

In [67]:
print(train_df.POI_tokenized_subset.value_counts())
print(train_df.street_tokenized_subset.value_counts())

True     75381
False    46110
Name: POI_tokenized_subset, dtype: int64
True     213092
False     16764
Name: street_tokenized_subset, dtype: int64


Many POIs are not exact subset of raw_address. Hence, they might be written in the form of abbreviation in raw_address.

## Split train-valid

In [17]:
from sklearn.model_selection import train_test_split

train, valid = train_test_split(train_df, test_size=0.2, random_state=42)

## Simple model

In [None]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

num_encoder_tokens = len(unique_word_list)
num_decoder_tokens = len(unique_word_list)

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the 
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)