In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore") # remove warning

import tensorflow as tf
tf.get_logger().setLevel('INFO')

import sys
sys.path.append('..')
from src import CustomTokenizer

In [2]:
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [3]:
print(train_df.shape)
print(test_df.shape)

(300000, 3)
(50000, 2)


In [4]:
train_df.head()

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru


In [5]:
train_df['POI'] , train_df['street'] = zip(*train_df['POI/street'].str.split(pat='/'))

In [6]:
train_df.head()

Unnamed: 0,id,raw_address,POI/street,POI,street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/,,
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung
3,3,"toko dita, kertosono",toko dita/,toko dita,
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru


In [7]:
train_df[train_df['POI']==''].id.count()

178509

In [8]:
train_df[train_df['street']==''].id.count()

70143

## EDA

### Word One Hot Encoding

In [9]:
# import nltk
# from nltk.tokenize import word_tokenize

In [10]:
# # tokenize word. replace number with token NUMB since the number is not important in this analysis
# unique_word_list = set()
# for col in ['raw_address','POI', 'street']:
#     train_df[col+'_numb_removed'] = train_df[col].str.replace('\d+',' NUMB ')
#     corpus = train_df[col+'_numb_removed'].values
#     unique_word_list=unique_word_list.union( set(word_tokenize(" ".join(corpus))) ) #join two sets

In [11]:
# # unique word
# len(unique_word_list)

In [12]:
# # create dict of token
# token_dict={}

# token_dict[''] = 0 # first index for empty string
# i=1
# for token in unique_word_list:
#     token_dict[token] = i
#     i += 1

In [13]:
# def tokenize_word(text):
#     try:
#         tokens = word_tokenize(text)
#         return np.array([token_dict[token] for token in tokens])
#     except:
#         pass

In [14]:
# for col in ['raw_address','POI', 'street']:
#     train_df[col+'_tokenized'] = train_df[col+'_numb_removed'].apply(tokenize_word)

In [15]:
# for col in ['POI_tokenized', 'street_tokenized']:
#     train_df.loc[train_df[col].str.len() == 0, col] = 0 # impute empty output with 0

In [16]:
# # Check if output is exact subset of raw_address. if any of the output is coming from raw_address, return False
# def check_subset(row, output_col):
#     try:
#         raw_address = row['raw_address_tokenized']
#         res = True
#         for token in row[output_col]:
#             if token not in raw_address:
#                 res = False
#         return res
#     except:
#         pass

# for col in ['POI_tokenized', 'street_tokenized']:
#     train_df.loc[train_df[col]!=0, col+'_subset'] = train_df.apply(lambda x: check_subset(x, col), axis=1)

In [17]:
# print(train_df.POI_tokenized_subset.value_counts())
# print(train_df.street_tokenized_subset.value_counts())

Many POIs are not exact subset of raw_address. Hence, they might be written in the form of abbreviation in raw_address.

## Split train-valid

In [18]:
from sklearn.model_selection import train_test_split

train, valid = train_test_split(train_df, test_size=0.2, random_state=42)

## Tokenize data

In [19]:
texts = train_df.raw_address.tolist() + train_df['POI/street'].tolist()

In [20]:
len(texts)

600000

In [21]:
tokenizer = CustomTokenizer.CustomTokenizer(train_texts = texts)
# fit o the train
tokenizer.train_tokenize()
tokenized_X_train = tokenizer.vectorize_input(train['raw_address'])
tokenized_y_train = tokenizer.vectorize_input(train['POI/street'])
tokenized_X_valid = tokenizer.vectorize_input(valid['raw_address'])
tokenized_y_valid = tokenizer.vectorize_input(valid['POI/street'])

### Word Embedding

In [22]:
from gensim.models import Word2Vec
word2vec_model = Word2Vec.load('../data/id/id.bin')

EMBEDDING_VECTOR_LENGTH=300
MAX_SEQUENCE_LENGTH=50

def getVector(str):
    if str in word2vec_model:
        return word2vec_model[str]
    else:
        return None;
def isInModel(str):
     return str in word2vec_model

In [23]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((len(tokenizer.tokenizer.word_index)+1, EMBEDDING_VECTOR_LENGTH))
for word, i in tokenizer.tokenizer.word_index.items():
    embedding_vector = getVector(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [24]:
from keras.initializers import Constant
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dropout, Dense
from keras.optimizers import Adam

model=Sequential()

embedding=Embedding(len(tokenizer.tokenizer.word_index)+1, # number of unique tokens
                    EMBEDDING_VECTOR_LENGTH, #number of features
                    embeddings_initializer=Constant(embedding_matrix), # initialize 
                    input_length=MAX_SEQUENCE_LENGTH, 
                    trainable=False)

model.add(embedding)
model.add(LSTM(64, dropout=0.5))
model.add(Dense(MAX_SEQUENCE_LENGTH, activation='softmax'))
# compile the model

model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['acc'])

# fit the model
history = model.fit(tokenized_X_train, tokenized_y_train, 
                    batch_size=32, 
                    epochs=5, 
                    validation_data=(tokenized_X_valid,tokenized_y_valid), 
                    verbose=2)

Epoch 1/5
7500/7500 - 255s - loss: -2.3332e+04 - acc: 0.4970 - val_loss: -4.5687e+04 - val_acc: 0.4999
Epoch 2/5
7500/7500 - 253s - loss: -6.8577e+04 - acc: 0.4983 - val_loss: -9.0586e+04 - val_acc: 0.4999
Epoch 3/5
7500/7500 - 225s - loss: -1.1383e+05 - acc: 0.4983 - val_loss: -1.3548e+05 - val_acc: 0.4999
Epoch 4/5
7500/7500 - 128s - loss: -1.5908e+05 - acc: 0.4983 - val_loss: -1.8045e+05 - val_acc: 0.4999
Epoch 5/5
7500/7500 - 128s - loss: -2.0436e+05 - acc: 0.4983 - val_loss: -2.2542e+05 - val_acc: 0.4999


In [25]:
loss, accuracy = model.evaluate(tokenized_X_valid, tokenized_y_valid, verbose=0)
print(loss)
print(accuracy)

-225417.453125
0.49994999170303345


## Simple model