In [236]:
import Levenshtein
import numpy as np
from sentence_getter import SentenceGetter
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import pickle
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras import optimizers
import kenlm
import pandas as pd
import seaborn as sns
import pylab as pl
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import r2_score
import heapq
from collections import defaultdict 

def get_model(max_len, n_words, n_tags, embedding_mat):
    input = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words, weights=[embedding_mat], output_dim=50, input_length=max_len)(input)
    model = Dropout(0.1)(model)
    model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
    out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  # softmax output layer
    model = Model(input, out)
    return model


def get_embedding_matrix(embeddings_path, word2idx):
    embedding_vectors = {}
    with open(embeddings_path, 'r') as f:
        for line in f:
            line_split = line.strip().split(" ")
            vec = np.array(line_split[1:], dtype=float)
            char = line_split[0]
            embedding_vectors[char] = vec

    embedding_matrix = np.zeros((len(word2idx), 50))
    for char in word2idx:
        embedding_vector = embedding_vectors.get(char)
        if embedding_vector is not None:
            embedding_matrix[word2idx[char]] = embedding_vector
    return embedding_matrix


def get_word(X, y, words, tags):
    ans = ""
    for i, ch in enumerate(X):
        if tags[y[i]] == "C":
            ans += words[ch]
    return ans

def get_word2(word,tag_seq, words):
    ans = ""
    for i in range(len(word)):
        if tag_seq[i]=='C':
            ans+=words[word[i]]
    return ans

def score_candidate_length(c, example,length_model):
    import scipy.stats
    needed_length, needed_length_std=length_model.predict(np.array([[len(example)]]), return_std=True)
    needed_length=needed_length[0]
    needed_length_std=needed_length_std[0]
    clength=len(c)
    #print "NL:{} CL:{}".format(needed_length, clength)
    return scipy.stats.norm.logpdf(clength, loc=needed_length, scale=needed_length_std)-scipy.stats.norm.logpdf(needed_length, loc=needed_length, scale=needed_length_std)    

In [237]:
     
def updatestr(s,i,ch):  
    list1 = list(s)
    list1[i] = ch
    str1 = ''.join(list1)
    return str1

def getTopk(m,k, r_mapping):
    mapping = {}
    for tag in r_mapping:
        mapping[r_mapping[tag]] = tag
#     mapping = {0:'C',1:'D',2:'O'}
#     r_mapping = {'C':0, 'D':1, 'O':2}
    best_seq = ""
    best_prob = 1.0
    best_idx = np.argmax(m, axis=-1)
    for i in range(30):
        best_seq += mapping[best_idx[i]]
        best_prob *= m[i][best_idx[i]]
    heap = [(-1*best_prob,best_seq)]
    heapq.heapify(heap)
    
    result = []
    added = set()
    while k>0:
        top = heapq.heappop(heap)
        result += [(top[0]*-1,top[1])]
        added.add(top[1])
        k-=1
        prob = -1*top[0]
        seq = top[1]
        curr_prob = prob
        curr_seq = seq
        for i in range(30):
            for j in range(3):
                curr_seq = updatestr(curr_seq,i,mapping[j])
                if curr_seq in added:
                    continue
                curr_prob = prob*m[i][j]/m[i][r_mapping[seq[i]]]
                heapq.heappush(heap,(-1*curr_prob,curr_seq))
                curr_seq = seq
    return result
    

In [238]:
def get_len(row):
    from collections import Counter
    return Counter(list(row))['C']
df = pd.read_csv('./data/components-blends-knight.csv',sep='\t',index_col=0)
df["slen"]=df.source.apply(len)
df["tlen"]=df.target.apply(get_len)
df["ratio"]=df["slen"]/df["tlen"]
len_model = BayesianRidge(verbose=True, compute_score=True)
X=df["slen"].values.reshape(-1,1)
y=df["tlen"].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X, y)
len_model.fit(X_train, y_train)
# r2_score(y_test,len_model.predict(X_test))

Convergence after  2  iterations


BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=True, copy_X=True,
              fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
              normalize=False, tol=0.001, verbose=True)

In [239]:
kfold = 3
X_tr = []
X_te = []
y_tr = [] 
y_te = []

for i in range(kfold):
    data = pickle.load(open("./data/df_lstm.pkl", "rb"))

    embeddings_path = "./data/pretrained_char_emb.txt"

    words = list(set(data["Word"].values))
    words.append("$")
    n_words = len(words)
    tags = list(set(data["Tag"].values))
    tags.append("O")
    n_tags = len(tags)
    getter = SentenceGetter(data)
    sentences = getter.sentences
    max_len = 30
    word2idx = {w: i for i, w in enumerate(words)}
    tag2idx = {t: i for i, t in enumerate(tags)}

    embedding_mat = get_embedding_matrix(embeddings_path, word2idx)

    X = [[word2idx[w[0]] for w in s] for s in sentences]
    X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words - 1)
    y = [[tag2idx[w[1]] for w in s] for s in sentences]
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])
    y = [to_categorical(i, num_classes=n_tags) for i in y]
    X_tr_t, X_te_t, y_tr_t, y_te_t = train_test_split(X, y, test_size=0.1)
    X_tr += [X_tr_t]
    X_te += [X_te_t]
    y_tr += [y_tr_t]
    y_te += [y_te_t]

In [240]:
models = []
lmodel = kenlm.Model('./data/wordlist_english_filtered_threshold100-kenlm.arpa')
for i in range(kfold):
    model = get_model(max_len, n_words, n_tags, embedding_mat)
    model.compile(optimizer="rmsprop", loss="mse", metrics=["accuracy"])
    history = model.fit(X_tr[i], np.array(y_tr[i]), batch_size=32, epochs=10, validation_split=0.1, verbose=0)
    models += [model]
    print("model ",i+1," trained.")
# from keras.models import load_model

# models += [load_model("./keras_jupyper.h5")]


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


model  1  trained.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


model  2  trained.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


model  3  trained.


In [266]:
def getEditDistance(model,idx,l1,l2,l3):
    preds = []
    true = []
    for i, test in enumerate(X_te[idx]):
        p = model.predict(np.array([X_te[idx][i]]))
        t = y_te[idx][i]
        predictions = getTopk(p[0],10, tag2idx)
#         print(predictions)
        candidates = [get_word2(X_te[idx][i],d[1], words) for d in predictions]
        m_scores=[lmodel.score(" ".join(c))/(float(len(" ".join(c))**1)) for c in candidates]
        input_len = [len_model.predict([[p[1].index('O')]])[0] if 'O' in p[1] else 30 for p in predictions]
        lstm_len = [(p[1].index('O') - p[1].count('D')) if 'O' in p[1] else 30 for p in predictions]
        len_score = [1/(1+(abs(i-l))) for l,i in zip(lstm_len,input_len)]
        for j in range(len(m_scores)):
#             print(m_scores[j]/l1,predictions[j][0]*l2,len_score[j]/l3)
            m_scores[j]= m_scores[j]/l1 +  predictions[j][0]*l2 + len_score[j]/l3
        
        max_idx=-1
        max_val = -99999
        for ele in enumerate(m_scores):
            if ele[1]>max_val:
                max_val = ele[1]
                max_idx = ele[0]
        preds.append(candidates[max_idx])
    #     p = np.argmax(p, axis=-1)
        t = np.argmax(t, axis=-1)
    #     preds.append(get_word(X_te[i], p[0], words, tags))
        true.append(get_word(X_te[idx][i], t, words, tags))

    distance = 0
    for i,word in enumerate(true):
        distance += Levenshtein.distance(word, preds[i])
    acc = distance / len(preds)
    return acc

In [263]:
def gridsearch(model,i):
    best_ed = 999
    l1 = np.arange(0,1,0.1)
    l2 = np.arange(0,1,0.1)
    for ele1 in l1:
        for ele2 in l2:
            ed = getEditDistance(model,i,ele1,ele2)
            if ed<best_ed:
                best_ed = ed
    return best_ed
avg_edit = 0
for i in range(kfold):
#     res = getEditDistance(models[i],i)
    res = gridsearch(models[i],i)
    avg_edit += res
    
avg_edit/kfold

In [267]:
getEditDistance(models[0],0,8,1,40)

1.8445945945945945