In [175]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import pickle
import json
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from random import sample

import tensorflow as tf
import keras.backend as K
from keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, Concatenate, TimeDistributed
from keras.models import Model, load_model
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

This notebook contains the various tests, qualitative and quantitative, carried out on the formal phrase replacement models, from both the phrase identification and phrase replacement subtasks.

# Load relevant data and models

In [2]:
repl_df = pd.read_pickle('data/lexical_repl/repl_df.zip')

In [3]:
with open('data/lexical_repl/repl_dict.pkl', 'rb') as f:
    repl_dict = pickle.load(f)

In [17]:
with open('data/lexical_repl/list-formal-words-only.pkl', 'rb') as f:
    initial_list = pickle.load(f)

In [4]:
main_df = pd.read_pickle('data/lexical_repl/sents-df-with-frags.zip') # complete
reduced_df = pd.read_pickle('data/lexical_repl/sents-df-restricted.pkl') # smaller, more balanced

In [74]:
ms_df = pd.read_pickle('data/microsoft/microsoft_df.pkl')

In [9]:
main_model = load_model('data/lexical_repl_models/find_model_all_data.h5')
reduced_model = load_model('data/lexical_repl_models/find_model_restricted_data.h5')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


## Sample from original dataset (phrase identification)

Here, I randomly sample sentences from the original dataset to use in the paper.

In [30]:
for idx, row in sents_df.sample(5).iterrows():
    s = ''
    sent = row.sent
    mask = row.masks
    for i in range(len(sent)):
        s += sent[i]
        if mask[i] == 1:
            s += '/1 '
        else:
            s += ' '
    print(s)
    for p in row.phrases:
        if ' '.join(p) in repl_dict:
            print(repl_dict[' '.join(p)])
    print()

The church leaders said : ‘ In our opinion these are merely words — fine words certainly and welcome — but/1 nevertheless/1 only/1 words/1 . 
but in any case only words

First , the Freedom of Information Acts give an/1 individual/1 a right to information possessed/1 by/1 the/1 government/1 about/1 him/1 or/1 her/1 and the government may have to justify non-disclosure in court . 
had by the government about him or her

`` Well , I might not get that far '' , I told them , `` as actually I have no papers to enter Germany and , as a matter of fact , no/1 permit/1 to return to France once I leave '' . 
no pass

This machine , operating/1 at/1 speeds/1 up/1 to/1 350,000/1 revolutions/1 per/1 minute/1 , is believed to/1 provide/1 one/1 of/1 the/1 fastest/1 mechanical operations in industry today . 
working at speeds up to 350,000 revolutions per minute
to give one of the fastest

Branches containing/1 important/1 keywords/1 related/1 to/1 the/1 topic/1 are then placed around the focus . 
ha

# Accuracy metrics on data with true labels (phrase identification)

Here I define some functions for later use (which were also used in the notebook in which I built the models) which can evaluate the models on their training and test sets.

In [6]:
with open('data/lexical_repl/embedding.pkl', 'rb') as f:
    embedding = pickle.load(f)
    
with open('data/lexical_repl/idx2w.pkl', 'rb') as f:
    idx2w = pickle.load(f)
    
with open('data/lexical_repl/w2idx.pkl', 'rb') as f:
    w2idx = pickle.load(f)
    
assert 0 not in idx2w
assert '\t' in w2idx # SOS
assert '\n' in w2idx # EOS
assert '[UNK]' in w2idx

In [5]:
# helper functions

def seq_to_idx(string):
    # turns sequence of tokens to sequence of indices
    seq = word_tokenize(string)
    idx = []
    for word in seq:
        if word in w2idx:
            idx.append(w2idx[word])
        else:
            #print('unknown:\t' + word)
            idx.append(w2idx['[UNK]'])
    return idx

def tok_to_idx(seq):
    # if already tokenized
    idx = []
    for word in seq:
        if word in w2idx:
            idx.append(w2idx[word])
        else:
            #print('unknown:\t' + word)
            idx.append(w2idx['[UNK]'])
    return idx

In [None]:
def evaluate_find(X, y):
    # Takes X and y already vectorized and prepared for model training
    # Provides results for different evaluation metrics
    
    true = np.argmax(y, axis = 2)
    pred = np.argmax(find_model.predict(X, verbose = 1), axis = 2)
    total = float(y.shape[0])
    
    total_correct = 0
    indiv_wrong = 0
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    
    for i in tqdm(range(y.shape[0])):
        if (true[i] == pred[i]).all():
            total_correct += 1
        for j in range(y.shape[1]):
            if true[i][j] != pred[i][j]:
                indiv_wrong += 1
        for idx in range(len(true[i])):
            if true[i][idx] == 0 and pred[i][idx] == 0:
                tn += 1
            elif true[i][idx] == 0 and pred[i][idx] == 1:
                fp += 1
            elif pred[i][idx] == 1:
                tp += 1
            else:
                fn += 1
        
    print('Absolute accuracy (all correct):\t\t' + str(total_correct / total))
    print('Overall accuracy (individual):\t\t\t' + str((tp + tn) / (tp + fp + tn + fn)))
    print('Precision:\t\t\t\t\t' + str(tp / (tp + fp)))
    print('Recall:\t\t\t\t\t\t' + str(tp / (tp + fn)))
    
    print('Average number of incorrect labels per answer:\t' + str(indiv_wrong / total))
    
    return pred

# Examine errors on train and test set (phrase identification)

In this section, I print out examples from training and test in which the phrase identification model incorrectly matched the labeled span.

In [169]:
for idx, row in main_df.sample(100).iterrows():
    sent = row.sent
    toks = tok_to_idx(sent)
    true = row.masks
    pred = predict_masks(toks, main_model)[0]
    pred_red = predict_masks(toks, reduced_model)[0]
    
    true_phr = find_spans(sent, true)
    pred_phr = find_spans(sent, pred)
    pred_phr_red = find_spans(sent, pred_red)
    
    true_l = ', '.join([' '.join(p) for p in true_phr])
    pred_l = ', '.join([' '.join(p) for p in pred_phr])
    pred_l_red = ', '.join([' '.join(p) for p in pred_phr_red])
    
    if true_l != pred_l or true_l != pred_l_red:
        print(' '.join(sent))
        print('TRUE:\t\t\t' + true_l)
        print('PRED (MAIN):\t\t' + pred_l)
        print('PRED (REDUCED):\t\t' + pred_l_red)
        print()

Each side has a cast iron arch in 7 segments from which the iron trough is hung by 35 wrought iron rods .
TRUE:			35 wrought iron rods
PRED (MAIN):		35 wrought iron rods
PRED (REDUCED):		

‘ Three hours ’ restricted leave from six to nine , when you can do what you please .
TRUE:			‘ Three hours ’ restricted leave
PRED (MAIN):		Three hours, restricted leave
PRED (REDUCED):		restricted

Studies of traditional library catalogues have provided little insight into user information needs .
TRUE:			provided little insight into user information needs
PRED (MAIN):		provided little insight into user information needs
PRED (REDUCED):		provided little insight into user information

iv.2.4 : this man is a fifth-century soldier , not the famous fourth-century politician ) is explicitly told to use his fleet round the Peloponnese ‘ as he thought fit ’ ; he took Pylos with it ( p. 132 ) , an act which Thucydides implies was more extempore than it really was , but which was nevertheless not something 

They were , indeed , usually of the kind denoting residence , such as atte Mede — at the mead ; in the putte — in the pit ; which in time became just Mead or Pitt , the definite articles being dropped ; or were toponymical , such as in the hurn ( a hurn being an out-of-the-way corner ) , which evolved into Hurneman and Hurman .
TRUE:			the kind denoting residence
PRED (MAIN):		
PRED (REDUCED):		

The Carolingians ' heartlands , that is , the area where most of their estates were clustered , and where Franks had been settled for centuries , spanned modern Belgium and the Netherlands , northern France , and western Germany , thus lying athwart the `` natural `` frontiers of the Ardennes forest , and the rivers Meuse and Rhine .
TRUE:			thus
PRED (MAIN):		and
PRED (REDUCED):		and

An unexpected internal error has occurred while processing your request .
TRUE:			An unexpected internal error, your request
PRED (MAIN):		An unexpected internal error, your request
PRED (REDUCED):		An unexpecte

In the same way , shipwrights were expected to show initiative when building in the traditional way in wood ; but with the coming of iron and then steel as materials , leading to bigger ships as steam replaced sail , calculations led to exact drawings , which became the vehicle of centralized control and stronger industrial discipline .
TRUE:			initiative
PRED (MAIN):		the
PRED (REDUCED):		

or one-twentieth during the first five years thereafter to have 2s .
TRUE:			thereafter
PRED (MAIN):		thereafter
PRED (REDUCED):		

First , in making her appointments in 1975 to the Consultative Committee or Shadow Cabinet , Mrs Thatcher displayed conciliatory traits in retaining so many of Mr Heath 's appointees .
TRUE:			retaining so many of Mr Heath 's appointees
PRED (MAIN):		retaining so many of Mr Heath 's appointees
PRED (REDUCED):		displayed conciliatory traits, retaining so many

The advisers must not only provide good financial and legal advice , but give continuing support to the manager

# Checking for generalization to phrases not from the original list (phrase identification)

Next, I pull in the Microsoft data to run it through the phrase identification models, and from that check to see how many phrases were found which did not have a word from the original word list.

In [15]:
def predict_find(X, model):
    # Gives samples for already-vectorized data
    pad_X = pad_sequences([X], value = 0, padding = 'post', maxlen = 50).astype('int64')
    pred = np.argmax(model.predict([pad_X], batch_size = 1), axis = 2)
    result = ''
    length = 50 if len(X) > 50 else len(X)
    for i in range(length):
        if X[i] == 0:
            break
        result += str(pred[0][i]) + '/' + idx2w[X[i]] + '\t'
    print(result)

In [16]:
text = """
What are your requirements?
"""
predict_find(seq_to_idx(text), reduced_model)

0/What	0/are	1/your	1/requirements	0/?	


In [60]:
def predict_masks(X, model):
    # Gives samples for already-vectorized data
    pad_X = pad_sequences([X], value = 0, padding = 'post', maxlen = 50).astype('int64')
    pred = np.argmax(model.predict([pad_X], batch_size = 1), axis = 2)
    length = 50 if len(X) > 50 else len(X)
    return pred

In [61]:
def find_spans(sent, labels):
    # takes a sentence (list of tokens) and labels (list of binary values)
    # returns the phrases marked in the sentence
    
    if len(sent) > 50:
        sent = sent[:50]
    
    phrases = []
    phrase = []
    in_phrase = False
    
    for idx in range(len(sent)):
        if in_phrase:
            if labels[idx] == 1:
                phrase.append(sent[idx])
            else:
                in_phrase = False
                phrases.append(phrase)
                phrase = []
        else:
            if labels[idx] == 1:
                in_phrase = True
                phrase.append(sent[idx])
                
    return phrases

In [101]:
# find all phrases found, see if the initial formal words are there, and count them

other_found_reduced = []
other_found_main = []
total_found_reduced = 0
total_found_main = 0
red_ph_df = []
main_ph_df = []
temp_ms_df = ms_df.sample(10000).copy()

for idx, row in tqdm(temp_ms_df.iterrows(), total = 10000):
    item = row.source
    toks = seq_to_idx(item)
    
    # find predictions of both models
    reduced_labels = predict_masks(toks, reduced_model)[0]
    main_labels = predict_masks(toks, main_model)[0]
    
    # gather phrases for both models
    main_phrases = find_spans(word_tokenize(item), main_labels)
    main_ph_df.append(main_phrases)
    reduced_phrases = find_spans(word_tokenize(item), reduced_labels)
    red_ph_df.append(reduced_phrases)
    
    total_found_reduced += len(reduced_phrases)
    total_found_main += len(main_phrases)
    
    found = False
    for phrase in main_phrases:
        for word in initial_list:
            if word in phrase:
                found = True
                break
        if not found:
            other_found_main.append(phrase)
            
    found = False
    for phrase in reduced_phrases:
        for word in initial_list:
            if word in phrase:
                found = True
                break
        if not found:
            other_found_reduced.append(phrase)
            
temp_ms_df['main'] = main_ph_df
temp_ms_df['reduced'] = red_ph_df
            
print('Total number of identified phrases by the main model:\t\t' + str(total_found_main))
print('Total number of identified phrases by the reduced model:\t' + str(total_found_reduced))
print('Number of generalized phrases by the main model:\t\t' + str(len(other_found_main)))
print('Number of generalized phrases by the reduced model:\t\t' + str(len(other_found_reduced)))

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

Total number of identified phrases by the main model:		7093
Total number of identified phrases by the reduced model:	10197
Number of generalized phrases by the main model:		847
Number of generalized phrases by the reduced model:		4308


In [161]:
temp_ms_df.to_pickle('data/lexical_repl_models/ms_test_df.pkl')

In [104]:
other_found_main

[['still', 'referencing', 'the', 'cloud'],
 ['alerts'],
 ['encrypted'],
 ['undesired'],
 ['each'],
 ['your'],
 ['the', 'backups'],
 ['current'],
 ['your'],
 ['one'],
 ['achievable', 'throughput'],
 ['customized', 'offerings'],
 ['to', 'get', 'them'],
 ['indefinitely'],
 ['URL'],
 ['data'],
 ['configure'],
 ['expiry'],
 ['integrates'],
 ['documentation'],
 ['SKU'],
 ['can'],
 ['permissions'],
 ['documentation'],
 ['Results',
  '|',
  '|',
  'Action',
  '|',
  'Microsoft.StorSimple/managers/accessControlRecords/read'],
 ['gets'],
 ['users'],
 ['disable'],
 ['Ensuring'],
 ['registration'],
 ['provisioned'],
 ['to'],
 ['Azure'],
 ['enrollment'],
 ['authentication', 'protocols'],
 ['offline', 'synchronization'],
 ['your'],
 ['a', 'single'],
 ['region', 'tags'],
 ['the'],
 ['Identifier'],
 ['Azure'],
 ['execute'],
 ['provisioned'],
 ['public', 'IP', 'address'],
 ['throughput'],
 ['append'],
 ['URL'],
 ['Complete'],
 ['in', 'the'],
 ['VM', ','],
 ['a', 'user-defined'],
 ['table'],
 ['Embedded

In [105]:
other_found_reduced

[['events'],
 ['content', 'delivery'],
 ['the', 'key', 'delivery'],
 ['monthly'],
 ['annual'],
 ['some', 'additional'],
 ['the', 'client'],
 ['each'],
 ['detect', 'potentially', 'undesired', 'content'],
 ['all', 'your', 'users'],
 ['the'],
 ['your'],
 ['delivery'],
 ['the', 'new', 'pricing'],
 ['user'],
 ['user'],
 ['update'],
 ['prefixes'],
 ['disable'],
 ['zero', 'or'],
 ['your', 'contract'],
 ['secure', 'the', 'communication', 'channel'],
 ['whether'],
 ['a', 'production', 'environment'],
 ['SSO', 'improves', 'security'],
 ['service', 'availability'],
 ['user'],
 ['An', 'enhanced', 'unified', 'alerts'],
 ['An'],
 ['the', 'current', 'index'],
 ['the'],
 ['a'],
 ['a'],
 ['the'],
 ['user'],
 ['the'],
 ['to'],
 ['the', 'paired', 'region'],
 ['perform', 'ICMP'],
 ['a', 'production', 'environment'],
 ['comply'],
 ['an', 'SBD'],
 ['prior'],
 ['file', 'extension'],
 ['the', 'tool'],
 ['calculate', 'achievable', 'throughput'],
 ['supported', 'languages'],
 ['topology'],
 ['the', 'new'],
 ['s

In [179]:
sample(other_found_main, 100)

[['authenticate'],
 ['user'],
 ['to'],
 ['redirected', 'to', 'the', 'Azure', 'Databricks', 'portal'],
 ['asynchronous'],
 ['section'],
 ['tabulated'],
 ['group'],
 ['repository'],
 ['updates'],
 ['When'],
 ['recovery'],
 ['expiry'],
 ['authenticated'],
 ['datasets'],
 ['definition', '|', '|', 'Action'],
 ['detects'],
 ['the', 'stored', 'procedure'],
 ['log'],
 ['group'],
 ['name'],
 ['for'],
 ['Remove', 'the', 'record'],
 ['keys'],
 ['until', 'it', 'is'],
 ['regular', 'updates'],
 ['dataset'],
 ['to'],
 ['a', 'user-defined'],
 ['been'],
 ['dataset'],
 ['options'],
 ['(', 'MSI', ')'],
 ['documentation'],
 ['app'],
 ['overload'],
 ['When', 'you', 'are'],
 ['classifies', 'the', 'data', 'that'],
 ['hash', 'stored', 'in'],
 ['they'],
 ['Identifier'],
 ['provisioned'],
 ['notifications'],
 ['the', 'updates'],
 ['endpoints'],
 ['Caveat'],
 ['SQL'],
 ['proxy', 'authentication', 'using', 'machine', 'context'],
 ['referenced', 'in', 'the'],
 ['Virtual', 'Array'],
 ['IP'],
 ['parameters'],
 ['to'

In [177]:
sample(other_found_reduced, 100)

[['input'],
 ['supported', 'for', 'RHEL', 'for'],
 ['perform', 'operations'],
 ['The', 'resource', 'limits'],
 ['the', 'incoming', 'queries'],
 ['the', 'underlying'],
 ['interactive', 'user'],
 ['deployment'],
 ['the'],
 ['your', 'StorSimple'],
 ['the'],
 ['production'],
 ['for', 'you'],
 ['displayed'],
 ['the', 'security'],
 ['|'],
 ['the'],
 ['new', 'input', 'datasets'],
 ['traffic', 'was'],
 ['the'],
 ['a', 'defined', 'service', 'type'],
 ['default'],
 ['a'],
 ['update', 'your', 'code'],
 ['authentication', 'information'],
 ['2017'],
 ['database'],
 ['these', 'parameters'],
 ['that', 'perform'],
 ['actually', 'delete', 'records'],
 ['evaluate', 'the', 'cost', 'of', 'your'],
 ['owner'],
 ['a', 'single', 'IP', 'address'],
 ['a', 'new'],
 ['the'],
 ['a', 'object'],
 ['if', 'supported', 'for', 'some', 'applications'],
 ['inherited'],
 ['securing', 'your', 'PaaS', 'web', 'and', 'mobile', 'applications'],
 ['The'],
 ['to', 'automatically'],
 ['the', 'sparse', 'track'],
 ['Linux'],
 ['upda

In [145]:
to_be_translated = []

for item in other_found_main:
    if len(item) > 1 and item not in to_be_translated:
        to_be_translated.append(item)
for item in other_found_reduced:
    if len(item) > 1 and item not in to_be_translated:
        to_be_translated.append(item)
        
print(len(to_be_translated))

2010


In [146]:
with open('data/lexical_repl_models/src-mic-test.txt', 'w') as f:
    for x in to_be_translated:
        f.write(' '.join(x) + '\n')

In [173]:
for idx, row in temp_ms_df.sample(100).iterrows():
    print(row.source)
    print(row.main)
    print(row.reduced)
    print()

By mixing the two patterns , an application 's largest tenants can occupy dedicated services while the long tail of less active , smaller tenants can occupy indexes in a shared service .
[['an', 'application', "'s", 'largest', 'tenants']]
[['an', 'application'], ['occupy', 'dedicated'], ['occupy', 'indexes', 'in', 'a', 'shared', 'service']]

Resource governance is supported in Service Fabric in accordance with the service package .
[]
[['supported', 'in', 'Service', 'Fabric']]

Parameters in the index may not be altered once they are created .
[]
[]

Can I synchronize or set the authentication phone , authentication email , or alternate authentication phone fields on behalf of my users ?
[['alternate', 'authentication', 'phone', 'fields']]
[]

To get the correct path , you can use the function Get-MSBuildCmd , as shown in this example .
[['the', 'function', 'Get-MSBuildCmd']]
[['the', 'function']]

Decide the new VM series you will be using .
[]
[]

To insert a parameter , place the cu

# phrase replacement models: binary accuracy evaluation

For the phrase replacement task, I see if the output exactly matches one of the expected translations according to the original word list and dictionary.

In [111]:
with open('data/lexical_repl_models/pred-5000.txt') as f:
    pred_reduced = [l.strip() for l in f.readlines()]
with open('data/lexical_repl_models/pred.txt') as f:
    pred_main = [l.strip() for l in f.readlines()]
with open('data/lexical_repl_models/src-test.txt') as f:
    true = [l.strip() for l in f.readlines()]
with open('data/lexical_repl/acrolinx.json', 'r') as f:
    acro = json.load(f)

In [142]:
assert len(pred_reduced) == len(pred_main) == len(true)

total = len(true)
reduced_correct = 0
main_correct = 0
incorrect_pairs = []

for idx in tqdm(range(total)):
    
    t = word_tokenize(true[idx])
    to_be_repl = []
    
    for word in acro:
        if word in t:
            to_be_repl.append(word)
    
    options = [true[idx]]
    final_options = []
    it = 0
    
    for word in to_be_repl:
        it += 1
        if it == len(to_be_repl):
            for repl in acro[word]:
                for o in options:
                    final_options.append(o.replace(word, repl))
        else:
            temp_options = []
            for repl in acro[word]:
                for o in options:
                    temp_options.append(o.replace(word, repl))
            options = temp_options.copy()
    
    if pred_main[idx] in final_options:
        main_correct += 1
    else:
        incorrect_pairs.append(('main', true[idx], final_options, pred_main[idx]))
    if pred_reduced[idx] in final_options:
        reduced_correct += 1
    else:
        incorrect_pairs.append(('main', true[idx], final_options, pred_reduced[idx]))
        
print('Accuracy for main:\t\t' + str(main_correct * 100 / total))
print('Accuracy for reduced:\t' + str(reduced_correct * 100 / total))

HBox(children=(IntProgress(value=0, max=30672), HTML(value='')))

Accuracy for main:		93.06533646322379
Accuracy for reduced:	91.54603547209182


In [144]:
len(true)

30672

In [143]:
incorrect_pairs

[('main',
  'to determine our attitude to foundationalism in general',
  ['to shape our attitude to foundationalism in general',
   'to find out our attitude to foundationalism in general'],
  'to find our attitude to foundationalism in general'),
 ('main',
  'to determine our attitude to foundationalism in general',
  ['to shape our attitude to foundationalism in general',
   'to find out our attitude to foundationalism in general'],
  'to find our attitude to foundationalism in general'),
 ('main', 'opposed it', ['gone against it', 'went against it'], 'went it'),
 ('main', 'opposed it', ['gone against it', 'went against it'], 'gone it'),
 ('main',
  'to be formally determined it had to be',
  ['to be formally shaped it had to be',
   'to be formally found out it had to be'],
  'to be formally found it had to be'),
 ('main',
  'to be formally determined it had to be',
  ['to be formally shaped it had to be',
   'to be formally found out it had to be'],
  'to be formally found it had t

# phrase replacement: Microsoft data check

Finally, I look at the phrase replacement results on the Microsoft dataset and see how many of the inputs were actually changed during translation.

In [150]:
with open('data/lexical_repl_models/src-mic-test.txt') as f:
    mic_test = [l.strip() for l in f.readlines()]
with open('data/lexical_repl_models/pred-mic-5000.txt') as f:
    mic_pred_5k = [l.strip() for l in f.readlines()]
with open('data/lexical_repl_models/pred-mic.txt') as f:
    mic_pred_100k = [l.strip() for l in f.readlines()]

In [152]:
total = len(mic_test)
changed_5k = 0
changed_100k = 0

for idx in range(total):
    if mic_pred_5k[idx] != mic_test[idx]:
        changed_5k += 1
    if mic_pred_100k[idx] != mic_test[idx]:
        changed_100k += 1
        
print('Percent changed for 5k:\t\t' + str(100 * changed_5k / total))
print('Percent changed for 100k:\t' + str(100 * changed_100k / total))

Percent changed for 5k:		49.55223880597015
Percent changed for 100k:	43.43283582089552
