In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import pickle
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import tensorflow as tf
import keras.backend as K
from keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, Concatenate, TimeDistributed
from keras.models import Model, load_model
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [13]:
# Load the pickled replacement table from the lexical-replacement data directory.
repl_df = pd.read_pickle('data/lexical_repl/repl_df.zip')

In [15]:
# Load the replacement dictionary; later cells look it up by space-joined phrase text.
# NOTE(review): pickle.load can execute arbitrary code -- only load this file from a trusted source.
with open('data/lexical_repl/repl_dict.pkl', 'rb') as f:
    repl_dict = pickle.load(f)

In [14]:
# Preview the first rows of the replacement table.
repl_df.head()

Unnamed: 0,word,orig,repl
0,internal,"(the, bank, 's, still-sloppy, internal, controls)","[the, bank, 's, still-sloppy, inside, controls]"
2,internal,"((, internal, leaf, crowns)","[(, inside, leaf, crowns]"
3,internal,"(one, internal, pocket)","[one, inside, pocket]"
4,internal,"(the, internal, variations)","[the, inside, variations]"
5,internal,"(Yugoslavia, 's, internal, common, market)","[Yugoslavia, 's, inside, common, market]"


In [8]:
# Load the sentence table. The restricted file is the one in use; the
# commented-out line below loads the complete set instead.
#sents_df = pd.read_pickle('data/lexical_repl/sents-df-with-frags.zip') # complete
sents_df = pd.read_pickle('data/lexical_repl/sents-df-restricted.pkl') # smaller, more balanced

In [9]:
# Preview the first rows of the sentence table.
sents_df.head()

Unnamed: 0,sent,source,description,masks,words,phrases
2,"[The, September-October, term, jury, had, been...",brown,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",[investigate],"[[to, investigate, reports, of, possible, ``]]"
6,"[The, grand, jury, commented, on, a, number, o...",brown,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[operated, purchasing]","[[purchasing], [well, operated, and, follow]]"
8,"[However, ,, the, jury, said, it, believes, ``...",brown,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[administration],[[administration]]
12,"[It, urged, that, the, next, Legislature, ``, ...",brown,,"[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...",[provide],"[[that, the, next, Legislature, ``, provide, e..."
18,"[The, jury, also, commented, on, the, Fulton, ...",brown,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[administrators],[[administrators]]


In [30]:
# Show five randomly sampled sentences with every masked token tagged "/1",
# each followed by the stored replacement (if any) for its phrases.
for _, sample_row in sents_df.sample(5).iterrows():
    tokens, flags = sample_row.sent, sample_row.masks
    # Index into the mask by position so a too-short mask still raises, as before.
    tagged = ''.join(
        tok + ('/1 ' if flags[pos] == 1 else ' ')
        for pos, tok in enumerate(tokens)
    )
    print(tagged)
    for phrase in sample_row.phrases:
        key = ' '.join(phrase)  # repl_dict is keyed by the space-joined phrase
        if key in repl_dict:
            print(repl_dict[key])
    print()

The church leaders said : ‘ In our opinion these are merely words — fine words certainly and welcome — but/1 nevertheless/1 only/1 words/1 . 
but in any case only words

First , the Freedom of Information Acts give an/1 individual/1 a right to information possessed/1 by/1 the/1 government/1 about/1 him/1 or/1 her/1 and the government may have to justify non-disclosure in court . 
had by the government about him or her

`` Well , I might not get that far '' , I told them , `` as actually I have no papers to enter Germany and , as a matter of fact , no/1 permit/1 to return to France once I leave '' . 
no pass

This machine , operating/1 at/1 speeds/1 up/1 to/1 350,000/1 revolutions/1 per/1 minute/1 , is believed to/1 provide/1 one/1 of/1 the/1 fastest/1 mechanical operations in industry today . 
working at speeds up to 350,000 revolutions per minute
to give one of the fastest

Branches containing/1 important/1 keywords/1 related/1 to/1 the/1 topic/1 are then placed around the focus . 
ha

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def evaluate_find(X, y):
    """Print sequence-level and label-level metrics for ``find_model`` on (X, y).

    Parameters
    ----------
    X : model input, passed unchanged to ``find_model.predict``.
    y : array indexed as (sample, position, class); the class axis (axis 2)
        is reduced with argmax to obtain the true label per position.

    Returns
    -------
    pred : array of predicted class indices (argmax over axis 2 of the model
        output), one row per sample.

    Notes
    -----
    Label 1 is treated as the positive class and label 0 as the negative
    class. A ratio whose denominator is zero (e.g. precision when nothing is
    predicted positive) is reported as ``nan`` instead of raising
    ZeroDivisionError.

    NOTE(review): every position along axis 1 is scored, which presumably
    includes padding positions -- confirm that this is intended.
    """
    true = np.argmax(y, axis = 2)
    pred = np.argmax(find_model.predict(X, verbose = 1), axis = 2)
    total = float(y.shape[0])

    # Element-wise agreement between true and predicted labels.
    matches = (true == pred)
    total_correct = int(np.count_nonzero(np.all(matches, axis = 1)))
    indiv_wrong = int(np.count_nonzero(~matches))

    # Confusion counts with the same bucket rules as a per-label if/elif chain:
    # anything that is not tn, fp or tp is counted as fn.
    true_is_zero = (true == 0)
    pred_is_one = (pred == 1)
    tn = int(np.count_nonzero(true_is_zero & (pred == 0)))
    fp = int(np.count_nonzero(true_is_zero & pred_is_one))
    tp = int(np.count_nonzero(~true_is_zero & pred_is_one))
    fn = int(true.size) - tn - fp - tp

    print('Absolute accuracy (all correct):\t\t' + str(_safe_ratio(total_correct, total)))
    print('Overall accuracy (individual):\t\t\t' + str(_safe_ratio(tp + tn, tp + fp + tn + fn)))
    print('Precision:\t\t\t\t\t' + str(_safe_ratio(tp, tp + fp)))
    print('Recall:\t\t\t\t\t\t' + str(_safe_ratio(tp, tp + fn)))

    print('Average number of incorrect labels per answer:\t' + str(_safe_ratio(indiv_wrong, total)))

    return pred

# Evaluate on X_test / y_test and keep the returned predicted label indices.
results = evaluate_find(X_test, y_test)

In [None]:
# Evaluate on X_train / y_train as well, for comparison with the X_test figures.
train_results = evaluate_find(X_train, y_train)

In [None]:
def predict_find(X):
    """Print ``find_model``'s predicted label for each token of one sequence.

    Parameters
    ----------
    X : sequence of token indices for a single sentence. Index 0 is the
        padding value, so output stops at the first 0.

    Prints one line of tab-terminated ``<label>/<word>`` items, where the word
    is looked up in ``idx2w``. Returns None.
    """
    maxlen = X_train.shape[1]
    # truncating='post' keeps the FIRST `maxlen` tokens, so prediction i belongs
    # to token X[i]. The Keras default ('pre') drops leading tokens of an
    # over-long input, which would misalign the labels printed below.
    pad_X = pad_sequences([X], value = 0, padding = 'post', truncating = 'post', maxlen = maxlen).astype('int64')
    pred = np.argmax(find_model.predict([pad_X], batch_size = 1), axis = 2)
    # Never index past the prediction width. This replaces a hard-coded 50,
    # which presumably mirrored the training sequence length -- confirm.
    length = min(len(X), pred.shape[1])
    pieces = []
    for i in range(length):
        if X[i] == 0:
            break
        pieces.append(str(pred[0][i]) + '/' + idx2w[X[i]] + '\t')
    print(''.join(pieces))
    
# Try the model on one hand-written sentence.
# seq_to_idx is defined elsewhere; presumably it converts raw text into token
# indices -- confirm against its definition.
text = """
What are your requirements?
"""
predict_find(seq_to_idx(text))

In [None]:
# Print per-token predictions for the first 100 rows of X_test, one row per
# line with a blank line between rows.
for x in X_test[:100]:
    predict_find(x)
    print()