In [None]:
import subprocess
subprocess.call("python lstm.py")

In [77]:
import Levenshtein
import numpy as np
from sentence_getter import SentenceGetter
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import pickle
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras import optimizers
import kenlm



def get_model(max_len, n_words, n_tags, embedding_mat):
    input = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words, weights=[embedding_mat], output_dim=50, input_length=max_len)(input)
    model = Dropout(0.1)(model)
    model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
    out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  # softmax output layer
    model = Model(input, out)
    return model


def get_embedding_matrix(embeddings_path, word2idx):
    embedding_vectors = {}
    with open(embeddings_path, 'r') as f:
        for line in f:
            line_split = line.strip().split(" ")
            vec = np.array(line_split[1:], dtype=float)
            char = line_split[0]
            embedding_vectors[char] = vec

    embedding_matrix = np.zeros((len(word2idx), 50))
    for char in word2idx:
        embedding_vector = embedding_vectors.get(char)
        if embedding_vector is not None:
            embedding_matrix[word2idx[char]] = embedding_vector
    return embedding_matrix


def get_word(X, y, words, tags):
    ans = ""
    for i, ch in enumerate(X):
        if tags[y[i]] == "C":
            ans += words[ch]
    return ans

def get_word2(word,tag_seq, words):
    ans = ""
    for i in range(len(word)):
        if tag_seq[i]=='C':
            ans+=words[word[i]]
    return ans

In [4]:
data = pickle.load(open("./data/df_lstm.pkl", "rb"))
embeddings_path = "./data/pretrained_char_emb.txt"

words = list(set(data["Word"].values))
words.append("$")
n_words = len(words)
tags = list(set(data["Tag"].values))
tags.append("O")
n_tags = len(tags)
getter = SentenceGetter(data)
sentences = getter.sentences
max_len = 30
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

embedding_mat = get_embedding_matrix(embeddings_path, word2idx)

X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words - 1)
y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])
y = [to_categorical(i, num_classes=n_tags) for i in y]
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)

# 1.74 ,1.37 

In [None]:
X_train = np.array([*X_tr,*X_tr,*X_tr,*X_tr])
Y_train = [*y_tr,*y_tr,*y_tr,*y_tr]

In [182]:
model = get_model(max_len, n_words, n_tags, embedding_mat)
model.compile(optimizer="rmsprop", loss="mse", metrics=["accuracy"])
history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=5, validation_split=0.1, verbose=1)
lmodel = kenlm.Model('./data/wordlist_english_filtered_threshold100-kenlm.arpa')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1194 samples, validate on 133 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [178]:
import kenlm
mod = kenlm.LanguageModel('./data/wordlist_english_filtered_threshold100-kenlm.arpa')
mod.score("a s d e f e")/6**0.5
mod.score("a p p l e")/5**0.5

-1.568528618900828

In [198]:
preds = []
true = []
for i, test in enumerate(X_te):
    p = model.predict(np.array([X_te[i]]))
    t = y_te[i]
    predictions = getTopk(p[0],10)
    candidates = [get_word2(X_te[i],d[1], words) for d in predictions]
    m_scores=[lmodel.score(" ".join(c))/(float(len(" ".join(c)))) for c in candidates]
    for j in range(len(m_scores)):
        m_scores[j]= m_scores[j]/6 +  predictions[j][0]
#     max_idx =
#     print(m_scores)
    max_idx=-1
    max_val = -99999
    for ele in enumerate(m_scores):
        if ele[1]>max_val:
            max_val = ele[1]
            max_idx = ele[0]
#     max_idx=0
    preds.append(candidates[max_idx])
#     p = np.argmax(p, axis=-1)
    t = np.argmax(t, axis=-1)
#     preds.append(get_word(X_te[i], p[0], words, tags))
    true.append(get_word(X_te[i], t, words, tags))

distance = 0
for i,word in enumerate(true):
    distance += Levenshtein.distance(word, preds[i])
print(distance / len(preds))


1.3581081081081081


In [61]:
import heapq
from collections import defaultdict


            
def updatestr(s,i,ch):  
    list1 = list(s)
    list1[i] = ch
    str1 = ''.join(list1)
    return str1

def getTopk(m,k):
    mapping = {0:'D',1:'C',2:'O'}
    r_mapping = {'D':0, 'C':1, 'O':2}
#     prob_best = defaultdict(float)
#     prob_second = defaultdict(float)
#     assign(m,prob_best,prob_second)
#     padding = 29
#     results = []
#     while prob_best[padding][0] == 'O':
#         padding-=1
#     seq_len = padding+1
#     print("len ",seq_len)
#     if padding>21:
#         print("this one is gonna take time! Length = ", seq_len)
#     best_seq = ""
#     best_prob = 1.0
#     for ele in prob_best:
#         best_seq += prob_best[ele][0]
#         best_prob *= prob_best[ele][1]
    best_seq = ""
    best_prob = 1.0
    best_idx = np.argmax(m, axis=-1)
    for i in range(30):
        best_seq += mapping[best_idx[i]]
        best_prob *= m[i][best_idx[i]]
    heap = [(-1*best_prob,best_seq)]
    heapq.heapify(heap)
    
    result = []
    added = set()
    while k>0:
        top = heapq.heappop(heap)
        result += [(top[0]*-1,top[1])]
        added.add(top[1])
        k-=1
        prob = -1*top[0]
        seq = top[1]
        curr_prob = prob
        curr_seq = seq
        for i in range(30):
            for j in range(3):
                curr_seq = updatestr(curr_seq,i,mapping[j])
                if curr_seq in added:
                    continue
                curr_prob = prob*m[i][j]/m[i][r_mapping[seq[i]]]
                heapq.heappush(heap,(-1*curr_prob,curr_seq))
                curr_seq = seq
#     num = 1<<seq_len
#     for i in range(num):
#         seq = format(i, '040b')[-seq_len:]
#         curr_prob = 1.0
#         idx=0
#         out_str = ""
#         for ele in seq:
#             if ele=='1':
#                 curr_prob *= prob_best[idx][1]
#                 out_str = out_str + prob_best[idx][0]
#             else:
#                 curr_prob *= prob_second[idx][1]
#                 out_str = out_str + prob_second[idx][0]
#             idx+=1
        
#         results += [(curr_prob, out_str.ljust(30, 'O'))]
    return result
    

In [58]:
getTop10(grr,10)

[(0.04165565675043418, 'CCCCDDDDDCCCCCOOOOOOOOOOOOOOOO'),
 (0.030473228490919164, 'CCCDDDDDDCCCCCOOOOOOOOOOOOOOOO'),
 (0.023312213915672144, 'CCCCCDDDDCCCCCOOOOOOOOOOOOOOOO'),
 (0.019885439599809495, 'CCCCDDDDCCCCCCOOOOOOOOOOOOOOOO'),
 (0.017054068443514772, 'CCCDCDDDDCCCCCOOOOOOOOOOOOOOOO'),
 (0.017054068443514772, 'CCCDCDDDDCCCCCOOOOOOOOOOOOOOOO'),
 (0.014683610774561463, 'CCCCDDDDDDCCCCOOOOOOOOOOOOOOOO'),
 (0.014547208994875608, 'CCCDDDDDCCCCCCOOOOOOOOOOOOOOOO'),
 (0.014547208994875606, 'CCCDDDDDCCCCCCOOOOOOOOOOOOOOOO'),
 (0.012358366784220696, 'CCDCDDDDDCCCCCOOOOOOOOOOOOOOOO')]

In [22]:
grr

array([[1.0527074e-01, 8.9458370e-01, 1.4553506e-04],
       [1.5765575e-01, 8.4221870e-01, 1.2547441e-04],
       [2.2877818e-01, 7.7112985e-01, 9.1942944e-05],
       [4.2245165e-01, 5.7747412e-01, 7.4239564e-05],
       [6.4114231e-01, 3.5880953e-01, 4.8188060e-05],
       [8.8727862e-01, 1.1269377e-01, 2.7648372e-05],
       [9.9155384e-01, 8.4371623e-03, 8.9927480e-06],
       [8.5765886e-01, 1.4226426e-01, 7.6855402e-05],
       [6.7675102e-01, 3.2306516e-01, 1.8384511e-04],
       [2.6054394e-01, 7.3913217e-01, 3.2386402e-04],
       [2.1751241e-01, 7.8128552e-01, 1.2020216e-03],
       [1.2100350e-01, 8.7761891e-01, 1.3776020e-03],
       [1.0943196e-01, 8.7928015e-01, 1.1287919e-02],
       [6.5070033e-02, 9.0963250e-01, 2.5297526e-02],
       [9.3023311e-03, 3.7080593e-02, 9.5361710e-01],
       [1.3950175e-03, 5.8074510e-03, 9.9279755e-01],
       [3.6855845e-04, 1.9973603e-03, 9.9763405e-01],
       [1.3010575e-04, 1.0026451e-03, 9.9886727e-01],
       [5.8977057e-05, 6.148