In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore") # remove warning

import tensorflow as tf
tf.get_logger().setLevel('INFO')

import sys
sys.path.append('..')
from src import CustomTokenizer

In [2]:
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [3]:
print(train_df.shape)
print(test_df.shape)

(300000, 3)
(50000, 2)


In [4]:
train_df.head()

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru


In [5]:
train_df['POI'] , train_df['street'] = zip(*train_df['POI/street'].str.split(pat='/'))

In [6]:
train_df.head()

Unnamed: 0,id,raw_address,POI/street,POI,street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/,,
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung
3,3,"toko dita, kertosono",toko dita/,toko dita,
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru


In [7]:
train_df[train_df['POI']==''].id.count()

178509

In [8]:
train_df[train_df['street']==''].id.count()

70143

## Split train-valid

In [109]:
from sklearn.model_selection import train_test_split

train, valid = train_test_split(train_df.head(2000), test_size=0.2, random_state=42)

## Tokenize data

In [137]:
texts = train['raw_address'].tolist() + train['POI/street'].tolist() +  valid['raw_address'].tolist() + valid['POI/street'].tolist()

In [138]:
len(texts)

4000

In [139]:
tokenizer = CustomTokenizer.CustomTokenizer(train_texts = texts)
tokenizer.train_tokenize()

In [140]:
tokenized_X_train = tokenizer.vectorize_input(train['raw_address'])
tokenized_y_train = tokenizer.vectorize_input(train['POI/street'])
tokenized_X_valid = tokenizer.vectorize_input(valid['raw_address'])
tokenized_y_valid = tokenizer.vectorize_input(valid['POI/street'])

## Word Embedding

In [141]:
EMBEDDING_VECTOR_LENGTH=300
MAX_SEQUENCE_LENGTH=30
VOCAB_SIZE=len(tokenizer.tokenizer.word_index)+1
latent_dim = 256

In [142]:
print(VOCAB_SIZE)

5046


In [126]:
from gensim.models import Word2Vec
word2vec_model = Word2Vec.load('../data/id/id.bin')

def getVector(str):
    if str in word2vec_model:
        return word2vec_model[str]
    else:
        return None;
def isInModel(str):
     return str in word2vec_model

In [127]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_VECTOR_LENGTH))
for word, i in tokenizer.tokenizer.word_index.items():
    embedding_vector = getVector(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [150]:
from keras.layers import Embedding
from keras.initializers import Constant

embedding_layer =Embedding(len(tokenizer.tokenizer.word_index)+1, # number of unique tokens
                    EMBEDDING_VECTOR_LENGTH, #number of features
                    embeddings_initializer=Constant(embedding_matrix), # initialize 
                    input_length=MAX_SEQUENCE_LENGTH, 
                    trainable=False)

encoder_input_data = embedding_layer(tokenized_X_train)
decoder_input_data = embedding_layer(tokenized_y_train)
decoder_target_data=to_categorical(tokenized_y_train, num_classes=VOCAB_SIZE)
# decoder_target_data = np.zeros(shape=decoder_input_data.shape)
# for t in range(MAX_SEQUENCE_LENGTH-1):
#     decoder_target_data[:, t, :] = decoder_input_data[:, t + 1, :]


In [151]:
from keras.models import Model
from keras.layers import LSTM, Input, Dense, Embedding
from keras.optimizers import Adam

encoder_inputs = Input(shape=(None, EMBEDDING_VECTOR_LENGTH), name='encoder_inputs')
x, state_h, state_c = LSTM(latent_dim, return_state=True, name='encoder_lstm')(encoder_inputs)
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, EMBEDDING_VECTOR_LENGTH), name='decoder_inputs')
decoder_outputs,_,_ = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')(decoder_inputs, initial_state=encoder_states)
decoder_outputs = Dense(VOCAB_SIZE, activation='softmax')(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile & run training
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=32,
          epochs=3,
          validation_split=0.2, verbose=2)

Epoch 1/3
40/40 - 10s - loss: 2.9679 - val_loss: 0.8789
Epoch 2/3
40/40 - 8s - loss: 0.8160 - val_loss: 0.7899
Epoch 3/3
40/40 - 7s - loss: 0.7397 - val_loss: 0.7668


<tensorflow.python.keras.callbacks.History at 0x148090c4400>

In [67]:
# from keras.models import Model
# from keras.layers import LSTM, Input, Dense
# from keras.optimizers import Adam

# # Define an input sequence and process it.
# encoder_inputs = Input(shape=(None, EMBEDDING_VECTOR_LENGTH), name='encoder_inputs')

# encoder = LSTM(latent_dim, return_state=True, name='encoder_lstm')
# encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# encoder_states = [state_h, state_c]

# # Set up the decoder, using `encoder_states` as initial state.
# decoder_inputs = Input(shape=(None, EMBEDDING_VECTOR_LENGTH), name='decoder_inputs')
# decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
# decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
#                                      initial_state=encoder_states)
# decoder_outputs = Dense(EMBEDDING_VECTOR_LENGTH, activation='linear')(decoder_outputs)

# model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# # Run training
# model.compile(optimizer='adam', loss='cosine_similarity')
# model.fit([encoder_input_data, decoder_input_data], decoder_input_data,
#           batch_size=32,
#           epochs=3,
#           validation_split=0.2, 
#                     verbose=2)

In [152]:
# Inference Model
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

TypeError: 'list' object is not callable

In [106]:
input_seq = embedding_layer(tokenized_X_valid[2:3])

In [103]:
input_seq.shape

TensorShape([1, 30, 300])

In [107]:
states_value=encoder_model.predict(input_seq)
output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)
output_tokens

array([[[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, 

In [None]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, EMBEDDING_VECTOR_LENGTH))
#     # Populate the first character of target sequence with the start character.
#     target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [25]:
# model.save('src/models/seq_to_seq')



INFO:tensorflow:Assets written to: src/models/seq_to_seq\assets


INFO:tensorflow:Assets written to: src/models/seq_to_seq\assets


In [36]:
reverse_word_map = dict(map(reversed, tokenizer.tokenizer.word_index.items()))