In [1]:
%matplotlib inline

In [2]:
from gensim.models import word2vec

Using TensorFlow backend.


## prepare: extract question pairs and tokenize

In [20]:
import re
import pickle
import collections

In [5]:
# English side: a line that starts with a "Wh..." word (What/Who/Where/...) and
# runs up to and including its first '?', with no '.' or '!' before that '?'.
# Raw strings keep '\?' a regex escape rather than an invalid Python string escape.
re_eq = re.compile(r'^(Wh[^?.!]+\?)')
# French side: same shape, but without any constraint on the first word.
re_fq = re.compile(r'^([^?.!]+\?)')

In [6]:
en_fname = 'giga-fren.release2.fixed.en'
fr_fname = 'giga-fren.release2.fixed.fr'


def _question_matches(en_path, fr_path):
    """Lazily yield ``(en_match, fr_match)`` for each aligned pair of lines.

    Each element is the result of ``re_eq.search`` / ``re_fq.search`` on the
    corresponding line, so either member is ``None`` when that line is not a
    question. The two files are read in lock-step and are closed as soon as
    the generator is exhausted (or garbage-collected), instead of leaking the
    handles the way bare ``open()`` calls do.
    """
    # Explicit UTF-8: the corpus contains characters such as ’ and è, and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(en_path, encoding='utf-8') as en_file, \
            open(fr_path, encoding='utf-8') as fr_file:
        for eq, fq in zip(en_file, fr_file):
            yield re_eq.search(eq), re_fq.search(fq)


# Still a one-shot lazy iterable, exactly like the original generator expression.
lines = _question_matches(en_fname, fr_fname)

In [7]:
# Keep only the line pairs where BOTH sides matched their question regex;
# .group() is the matched question text. This consumes the lazy `lines` iterable,
# so the cell that builds `lines` must be re-run before re-running this one.
qs = [(e.group(), f.group()) for e,f in lines if e and f]
len(qs)

52331

In [8]:
qs[:6]

[('What is light ?', 'Qu’est-ce que la lumière?'),
 ('Who are we?', 'Où sommes-nous?'),
 ('Where did we come from?', "D'où venons-nous?"),
 ('What would we do without it?', 'Que ferions-nous sans elle ?'),
 ('What is the absolute location (latitude and longitude) of Badger, Newfoundland and Labrador?',
  'Quelle sont les coordonnées (latitude et longitude) de Badger, à Terre-Neuve-etLabrador?'),
 ('What is the major aboriginal group on Vancouver Island?',
  'Quel est le groupe autochtone principal sur l’île de Vancouver?')]

In [11]:
# Cache the extracted question pairs so the corpus scan can be skipped next time.
# The context manager guarantees the file is flushed and closed.
with open('qs.pkl', 'wb') as qs_file:
    pickle.dump(qs, qs_file)

In [None]:
# Reload the cached question pairs.
# NOTE: unpickling can execute arbitrary code — only load a qs.pkl you created yourself.
with open('qs.pkl', 'rb') as qs_file:
    qs = pickle.load(qs_file)

In [12]:
# Transpose the list of (english, french) pairs into two parallel tuples.
en_qs, fr_qs = zip(*qs)

In [13]:
# One or more consecutive spaces (a space followed by zero or more spaces).
re_mult_space = re.compile(r"  *")
# A word character + apostrophe (curly or straight) directly followed by another
# word character, e.g. "qu’est" or "d'où"; the two groups let a space be inserted.
re_mw_punc = re.compile(r"(\w[’'])(\w)")
# Any single punctuation character that should become its own token.
re_punc = re.compile("([\"().,;:/_?!—])")
# A straight-apostrophe "'s" at the end of a word, e.g. "Rachel's".
re_apos = re.compile(r"(\w)'s\b")

In [14]:
def simple_toks(sent):
    """Split a sentence into lower-cased tokens using the module-level regexes.

    Steps, in order: detach a trailing ``'s`` from the word before it, insert a
    space after an in-word apostrophe (``qu’est`` -> ``qu’ est``), surround
    punctuation characters with spaces, turn hyphens into spaces, collapse runs
    of spaces, then lower-case and split on whitespace.
    """
    possessive_split = re_apos.sub(r"\1 's", sent)
    elision_split = re_mw_punc.sub(r"\1 \2", possessive_split)
    punct_spaced = re_punc.sub(r" \1 ", elision_split)
    dehyphenated = punct_spaced.replace('-', ' ')
    collapsed = re_mult_space.sub(' ', dehyphenated)
    return collapsed.lower().split()

In [15]:
# Tokenize every French question; the trailing expression previews the first four.
fr_qtoks = [simple_toks(q) for q in fr_qs]; fr_qtoks[:4]

[['qu’', 'est', 'ce', 'que', 'la', 'lumière', '?'],
 ['où', 'sommes', 'nous', '?'],
 ["d'", 'où', 'venons', 'nous', '?'],
 ['que', 'ferions', 'nous', 'sans', 'elle', '?']]

In [16]:
# Tokenize every English question; the trailing expression previews the first four.
en_qtoks = [simple_toks(q) for q in en_qs]; en_qtoks[:4]

[['what', 'is', 'light', '?'],
 ['who', 'are', 'we', '?'],
 ['where', 'did', 'we', 'come', 'from', '?'],
 ['what', 'would', 'we', 'do', 'without', 'it', '?']]

In [17]:
simple_toks("Rachel's baby is cuter than other's.")

['rachel', "'s", 'baby', 'is', 'cuter', 'than', 'other', "'s", '.']

In [18]:
def toks2ids(sents):
    """Build a frequency-ranked vocabulary and encode tokenized sentences as ids.

    Args:
        sents: sequence of sentences, each a sequence of string tokens.

    Returns:
        A 4-tuple ``(ids, vocab, w2id, voc_cnt)``:
        ``ids`` is the sentences with each token replaced by its integer id,
        ``vocab`` is the list of tokens ordered from most to least frequent with
        ``"<PAD>"`` inserted at index 0, ``w2id`` maps token -> index in
        ``vocab``, and ``voc_cnt`` is the ``Counter`` of token frequencies.
    """
    voc_cnt = collections.Counter()
    for sent in sents:
        for tok in sent:
            voc_cnt[tok] += 1

    # most_common() sorts by descending count and keeps first-seen order for ties.
    ranked = [tok for tok, _ in voc_cnt.most_common()]
    vocab = ["<PAD>"] + ranked

    w2id = dict(zip(vocab, range(len(vocab))))
    ids = [list(map(w2id.__getitem__, sent)) for sent in sents]
    return ids, vocab, w2id, voc_cnt

In [21]:
# Build a separate vocabulary (and id-encoded sentences) for each language.
fr_ids, fr_vocab, fr_w2id, fr_counts = toks2ids(fr_qtoks)
en_ids, en_vocab, en_w2id, en_counts = toks2ids(en_qtoks)
# Vocabulary sizes, including the "<PAD>" entry at index 0.
len(en_vocab), len(fr_vocab)

(19548, 26708)

## word vectors
Pre-trained English GloVe vectors (6B, 100-dimensional) can be downloaded from http://files.fast.ai/models/glove/6B.100d.tgz

In [30]:
import bcolz
import numpy as np

In [24]:
# Load the whole on-disk bcolz array into memory ([:] materializes it).
en_vecs = bcolz.open('6B.100d.dat')[:]
# The word list and word->row-index pickles are read with encoding='latin1';
# context managers make sure both file handles are closed.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
with open('6B.100d_words.pkl', 'rb') as words_file:
    en_wv_word = pickle.load(words_file, encoding='latin1')
with open('6B.100d_idx.pkl', 'rb') as idx_file:
    en_wv_idx = pickle.load(idx_file, encoding='latin1')

In [25]:
# Word -> vector lookup: en_wv_idx gives each word's row in en_vecs.
en_w2v = {w: en_vecs[en_wv_idx[w]] for w in en_wv_word}

In [26]:
# Number of English vectors and their dimensionality, read off the loaded matrix.
n_en_vec, dim_en_vec = en_vecs.shape
# Dimensionality used for the French embedding matrix; it has to agree with the
# French word2vec model loaded in the next cell.
dim_fr_vec = 200

In [27]:
# Load pre-trained French vectors stored in binary word2vec format.
fr_model = word2vec.KeyedVectors.load_word2vec_format('frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin', binary=True)
# NOTE(review): fr_voc is not referenced again in the visible cells — confirm it is still needed.
fr_voc = fr_model.vocab

In [32]:
def create_emb(w2v, targ_vocab, dim_vec, rng=None, scale=0.6):
    """Build an embedding matrix for ``targ_vocab`` from pre-trained vectors.

    Args:
        w2v: mapping-like object; ``w2v[word]`` must return a vector of length
            ``dim_vec`` or raise ``KeyError`` for an unknown word.
        targ_vocab: sequence of words; row ``i`` of the result belongs to
            ``targ_vocab[i]``.
        dim_vec: dimensionality of the vectors.
        rng: optional random source exposing ``normal(scale=..., size=...)``
            (e.g. ``np.random.RandomState(seed)``). Pass one to make the
            initialization of unknown words reproducible. Defaults to NumPy's
            global random state, which is what this function always used.
        scale: standard deviation of the normal distribution used for words
            missing from ``w2v``.

    Returns:
        ``np.ndarray`` of shape ``(len(targ_vocab), dim_vec)``.

    Note:
        Every word missing from ``w2v`` — including a padding token, if the
        vocabulary contains one that ``w2v`` does not know — gets a random row.
    """
    if rng is None:
        rng = np.random

    emb = np.zeros((len(targ_vocab), dim_vec))
    for i, word in enumerate(targ_vocab):
        try:
            emb[i] = w2v[word]
        except KeyError:
            # If we can't find the word, randomly initialize
            emb[i] = rng.normal(scale=scale, size=(dim_vec,))

    return emb

In [33]:
# English embedding matrix aligned with en_vocab (row i <-> en_vocab[i]).
en_embs = create_emb(en_w2v, en_vocab, dim_en_vec); en_embs.shape

(19548, 100)

In [34]:
# French embedding matrix aligned with fr_vocab (row i <-> fr_vocab[i]).
fr_embs = create_emb(fr_model, fr_vocab, dim_fr_vec); fr_embs.shape

(26708, 200)

## prep data: pad sequences and split train/test

In [41]:
import collections
from keras.preprocessing.sequence import pad_sequences

In [36]:
# Histogram of English sentence lengths (in tokens): length -> number of sentences.
en_lengths = collections.Counter(map(len, en_ids))

In [37]:
# Fixed sequence length (in tokens) used for padding/truncating both languages.
maxlen = 30

In [38]:
# (English sentences longer than maxlen, English sentences that fit)
sum(1 for s in en_ids if len(s) > maxlen), sum(
    1 for s in en_ids if len(s) <= maxlen)

(1726, 50605)

In [39]:
# (French sentences longer than maxlen, French sentences that fit)
sum(1 for s in fr_ids if len(s) > maxlen), sum(
    1 for s in fr_ids if len(s) <= maxlen)

(3579, 48752)

In [42]:
# Force every English sentence to exactly maxlen ids: shorter ones are padded at
# the end, longer ones are cut at the end (padding/truncating="post").
en_padded = pad_sequences(en_ids, maxlen, padding="post", truncating="post")

In [43]:
# Same treatment for the French sentences, so inputs and targets share one length.
fr_padded = pad_sequences(fr_ids, maxlen, padding="post", truncating="post")

In [44]:
en_padded.shape, fr_padded.shape, en_embs.shape


((52331, 30), (52331, 30), (19548, 100))

In [45]:
# 90/10 train/test split over a single shared permutation, so English and French
# rows stay aligned.
n = int(len(en_ids)*0.9)
# A dedicated, seeded generator makes the split identical on every run; otherwise
# the "test" rows change between sessions and can overlap the rows a previously
# saved model was trained on.
idxs = np.random.RandomState(42).permutation(len(en_ids))
# Slice the index array first so each padded array is fancy-indexed (copied) only
# once per subset instead of being fully shuffled twice.
train_idxs, test_idxs = idxs[:n], idxs[n:]
fr_train, fr_test = fr_padded[train_idxs], fr_padded[test_idxs]
en_train, en_test = en_padded[train_idxs], en_padded[test_idxs]

## model

In [52]:
from keras.layers import *
from keras.models import Model

In [46]:
en_train.shape

(47097, 30)

In [47]:
# Initial weights for the output Dense layer: kernel = transposed French embedding
# matrix (dim_fr_vec x vocab), bias = zeros with one entry per French vocab word.
# The bias shape is written as a proper 1-tuple; the comma used to sit inside len().
fr_wgts = [fr_embs.T, np.zeros((len(fr_vocab),))]

In [50]:
# Input: one padded English sentence of maxlen token ids.
inp = Input((maxlen,))
# Look up the pre-trained English vectors; the embedding is frozen (trainable=False).
x = Embedding(len(en_vocab), dim_en_vec, input_length=maxlen,
              weights=[en_embs], trainable=False)(inp)
# Recurrent stack; return_sequences=True keeps one output per time step throughout.
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = LSTM(128, return_sequences=True)(x)
# Per-time-step projection to the French embedding dimensionality ...
x = TimeDistributed(Dense(dim_fr_vec))(x)
# ... followed by a per-time-step layer over the French vocabulary, initialized from fr_wgts.
x = TimeDistributed(Dense(len(fr_vocab), weights=fr_wgts))(x)
# Softmax over the French vocabulary at every position.
x = Activation('softmax')(x)

In [53]:
# Wrap the graph into a model; targets are integer word ids, hence the sparse loss.
model = Model(inp, x)
model.compile('adam', 'sparse_categorical_crossentropy')

In [54]:
# Set the optimizer's learning rate explicitly.
# NOTE(review): K is not imported by name in the visible cells; presumably it comes
# in through `from keras.layers import *` — confirm, or import the backend explicitly.
K.set_value(model.optimizer.lr, 1e-3)

In [None]:
# np.expand_dims(..., -1) adds a trailing axis to the integer French targets.
# validation_data is passed as an (inputs, targets) tuple, the form documented by Keras.
hist=model.fit(en_train, np.expand_dims(fr_train,-1), batch_size=64, epochs=20, verbose=1,
               validation_data=(en_test, np.expand_dims(fr_test,-1)))

Train on 47097 samples, validate on 5234 samples
Epoch 1/20
Epoch 2/20

In [None]:
def plot_train(hist):
    """Plot the training curve (and validation curve, if recorded) of a fit run.

    Args:
        hist: object with a ``history`` dict mapping metric names to per-epoch
            value lists, as returned by ``model.fit``.

    Accuracy is plotted when the history contains it (under either ``'acc'`` or
    ``'accuracy'``); otherwise the loss is plotted. The matching ``'val_...'``
    series is drawn only when it is present.
    """
    # Local import: no cell of the notebook imports pyplot, so `plt` would
    # otherwise be undefined on a fresh kernel.
    import matplotlib.pyplot as plt

    h = hist.history
    meas = next((m for m in ('acc', 'accuracy') if m in h), 'loss')
    # Accuracy curves rise, loss curves fall: keep the legend out of the way.
    loc = 'upper right' if meas == 'loss' else 'lower right'

    plt.plot(h[meas])
    labels = ['train']
    val_key = 'val_' + meas
    if val_key in h:
        plt.plot(h[val_key])
        labels.append('validation')

    plt.title('model '+meas)
    plt.ylabel(meas)
    plt.xlabel('epoch')
    plt.legend(labels, loc=loc)
plot_train(hist)

In [None]:
model.save_weights('trans.h5')

In [None]:
model.load_weights('trans.h5')

## testing

In [None]:
def sent2ids(sent):
    """Encode one raw English sentence as a padded ``(1, maxlen)`` id array.

    The sentence is tokenized with ``simple_toks`` and each token is looked up
    in ``en_w2id``. The vocabulary has no unknown-word entry, so tokens that are
    not in it cannot be encoded: they are dropped and a warning lists them,
    instead of the whole call failing with a bare ``KeyError``.
    """
    import warnings

    toks = simple_toks(sent)
    unknown = [t for t in toks if t not in en_w2id]
    if unknown:
        warnings.warn("dropping out-of-vocabulary tokens: %r" % (unknown,))
    ids = [en_w2id[t] for t in toks if t in en_w2id]
    # Same padding/truncation settings as the training data.
    return pad_sequences([ids], maxlen, padding="post", truncating="post")

In [None]:
def en2fr(sent):
    """Translate an English sentence by greedy per-position decoding.

    The sentence is encoded with ``sent2ids``, the model predicts a
    distribution over the French vocabulary for every position, and the most
    probable word id at each position is mapped back through ``fr_vocab``.
    Positions whose best id is 0 (the ``<PAD>`` slot) are left out.
    """
    encoded = sent2ids(sent)
    probs = model.predict(encoded)
    best_ids = np.argmax(probs, axis=-1)[0]
    words = [fr_vocab[i] for i in best_ids if i > 0]
    return ' '.join(words)

In [None]:
en2fr("what is the size of canada?")