In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pickle
import random as rand
from nltk import word_tokenize, sent_tokenize
from collections import Counter

import keras
import pydot
from keras.models import Model, load_model
from keras.utils import plot_model, to_categorical

Using TensorFlow backend.


In this notebook, I run tests on the data and the models trained previously for the discourse marker insertion task.

# statistics on original dataset

In [None]:
# OANC sentence-pair dataframe; later cells read its `sent1`, `sent2` and `y_dense` columns.
# NOTE(review): read_pickle unpickles arbitrary objects -- only load files from a trusted source.
oanc_df = pd.read_pickle('data/discourse_markers/oanc_pair_df.zip')

In [None]:
# BNC sentence-pair dataframe; only its `y_dense` column is used below (per-term sample counts).
# NOTE(review): read_pickle unpickles arbitrary objects -- only load files from a trusted source.
bnc_df = pd.read_pickle('data/discourse_markers/bnc_pair_df.zip')

In [None]:
# Load the term -> class-index mapping and derive the reverse lookup
# (class index -> term) from it.
with open('data/discourse_markers/oanc_terms.pkl', 'rb') as f:
    terms_dict = pickle.load(f)

idx_dict = {index: term for term, index in terms_dict.items()}
# presumably class 9 stands for "no discourse marker" (rows with y_dense == 9
# are skipped when examples are printed) -- confirm against the data prep notebook
idx_dict.update({9: 'NULL'})

In [None]:
# For every chosen term, print the term followed by the number of rows
# labelled with its class index across both corpora (OANC + BNC).
for t, idx in terms_dict.items():
    print(t)
    oanc_hits = oanc_df.y_dense == idx
    bnc_hits = bnc_df.y_dense == idx
    num = int(oanc_hits.sum()) + int(bnc_hits.sum())
    print(num)

In [None]:
# Draw 100 random OANC rows and print those whose label is not class 9:
# first sentence, second sentence, the term name, then a blank line.
for idx, row in oanc_df.sample(100).iterrows():
    if row['y_dense'] == 9:
        continue
    first_sentence = ' '.join(row['sent1'])
    second_sentence = ' '.join(row['sent2'])
    print(first_sentence, second_sentence, idx_dict[row['y_dense']], '', sep='\n')

# visualize model architecture

In [2]:
# Load a single trained model (the one saved for the term 'But') so that its
# architecture can be plotted in the next cell.
model = load_model('data/discourse_markers_models/model.But.h5')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [4]:
# Write a diagram of the loaded model to graph.png; show_shapes = True asks
# keras to annotate the layers with their shapes.
plot_model(model, to_file = 'data/discourse_markers/graph.png', show_shapes = True)

# testing and evaluation

## load necessities

In [2]:
# Load the term -> class-index mapping and derive the reverse lookup
# (class index -> term) from it.
with open('data/discourse_markers/oanc_terms.pkl', 'rb') as f:
    terms_dict = pickle.load(f)

idx_dict = {index: term for term, index in terms_dict.items()}
# presumably class 9 stands for "no discourse marker" -- confirm against the
# data preparation notebook
idx_dict.update({9: 'NULL'})

In [3]:
# One trained keras model and one dataframe per term, both keyed by the term string.
models, dfs = {}, {}

for term in terms_dict:
    model_path = 'data/discourse_markers_models/model.' + term + '.h5'
    frame_path = 'data/discourse_markers_models/' + term + '_df.pkl'
    models[term] = load_model(model_path)
    dfs[term] = pd.read_pickle(frame_path)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [4]:
# Doc2Vec model; predict_from_text uses it to turn each tokenized sentence into a vector.
d2v = Doc2Vec.load("data/discourse_markers/d2v.model")

## define helper functions

In [5]:
def pred(vecs, term):
    """Return the predicted class index of each sample for one term's model.

    vecs -- array in shape [None, 2, 100] (one pair of sentence vectors per sample)
    term -- key into the global ``models`` dict selecting which model to use

    Returns an array holding, for every sample, the index of the
    highest-scoring output of ``models[term]``.
    """
    class_scores = models[term].predict(vecs)
    return np.argmax(class_scores, axis = 1)

In [6]:
def series_to_vec(series):
    """Stack the elements of a series (or any iterable) into one np.array.

    The result is the array format expected by the ``pred`` function.
    """
    rows = [row for row in series]
    return np.array(rows)

In [173]:
def predict_from_text(text, threshold = None):
    """Print suggested discourse markers for each pair of adjacent sentences.

    The passage is split into sentences, each sentence is tokenized and
    embedded with the global ``d2v`` model, and every model in the global
    ``models`` dict is asked whether its term could start the second sentence
    of each adjacent pair.

    text      -- a single string holding the whole passage
    threshold -- optional cut-off on the model's output at index 1 (the
                 positive class). When given (including 0 / 0.0), a term is
                 suggested only if that value is strictly greater than the
                 threshold. When None, a term is suggested if the model's
                 highest-scoring class is not class 0.

    Nothing is returned. For every pair with at least one suggestion the
    function prints the first sentence, then '[term/term/...] ' followed by
    the second sentence, then a blank line. Passages with fewer than two
    sentences print nothing.
    """
    sents = sent_tokenize(text)
    tok_sents = [word_tokenize(sent) for sent in sents]
    vectors = [d2v.infer_vector(sent) for sent in tok_sents]

    for idx in range(len(vectors) - 1):
        # A batch holding exactly one (sentence, next sentence) pair; built once
        # per pair and shared by all term models.
        pair_batch = np.array([[vectors[idx], vectors[idx + 1]]])

        ans = []

        for term in terms_dict.keys():
            scores = models[term].predict(pair_batch)[0]
            # `is not None` rather than truthiness, so an explicit threshold of 0
            # is honoured instead of silently falling back to the argmax rule.
            if threshold is not None:
                suggested = scores[1] > threshold
            else:
                suggested = bool(np.argmax(scores))
            if suggested:
                ans.append(term)

        if len(ans) > 0:
            ans_str = '/'.join(ans)
            print(sents[idx])
            print('[' + ans_str + '] ' + sents[idx+1])
            print()

## evaluate training accuracy for all models, to see results without dropout

In [51]:
# Evaluate every term's model on its own training split and print the accuracy.
for term in terms_dict.keys():
    # Select the training rows once and reuse them for both inputs and labels.
    train_df = dfs[term][dfs[term].set == 'train']
    X_train = series_to_vec(train_df.X)

    # The dataframes are saved back to the same pickle files after `y_dense`
    # has been replaced by the binary `y` column, so accept either layout;
    # otherwise this cell fails on any run that starts from the saved files.
    if 'y_dense' in train_df.columns:
        labels = [1 if x==terms_dict[term] else 0 for x in train_df.y_dense]
    else:
        labels = list(train_df.y)

    # num_classes = 2 keeps the one-hot width fixed even if a split happens to
    # contain a single class only.
    y_train = to_categorical(labels, num_classes = 2)
    loss, accuracy = models[term].evaluate(X_train, y_train, batch_size = 32, verbose = 0)
    print('Training accuracy for term ' + term + ' on complete model:\t' + str(accuracy))

Training accuracy for term But on complete model:	0.7709707578039786
Training accuracy for term Now on complete model:	0.724487497143768
Training accuracy for term First on complete model:	0.7263157894736842
Training accuracy for term Also on complete model:	0.6947754353257519
Training accuracy for term And on complete model:	0.789050404256046
Training accuracy for term Yet on complete model:	0.746433770014556
Training accuracy for term Well on complete model:	0.8546695927822482
Training accuracy for term Or on complete model:	0.7796610170316218
Training accuracy for term So on complete model:	0.774496014946456


## update dataframes with predictions immediately, and save

In [24]:
# Add a `pred` column (model prediction) and a binary `y` column (1 if the row's
# label is this term, else 0) to every term's dataframe.
for term in terms_dict:
    print('working with term:\t' + term)
    term_df = dfs[term]

    # Predict all rows in one batched call instead of one predict() call per row.
    term_df['pred'] = pred(series_to_vec(term_df.X), term)

    # Guarded so the cell can be re-run (or run on dataframes that were already
    # converted and saved): `y_dense` only exists before the first conversion.
    if 'y_dense' in term_df.columns:
        # change to a simpler and easier to use binary system
        term_df['y'] = (term_df.y_dense == terms_dict[term]).astype(int)
        term_df = term_df.drop('y_dense', axis = 1) # we no longer need this column

    dfs[term] = term_df

working with term:	First


HBox(children=(IntProgress(value=0, max=3272), HTML(value='')))

working with term:	Yet


HBox(children=(IntProgress(value=0, max=3817), HTML(value='')))

working with term:	Or


HBox(children=(IntProgress(value=0, max=3212), HTML(value='')))

working with term:	Now


HBox(children=(IntProgress(value=0, max=4932), HTML(value='')))

working with term:	But


HBox(children=(IntProgress(value=0, max=42784), HTML(value='')))

working with term:	So


HBox(children=(IntProgress(value=0, max=9480), HTML(value='')))

working with term:	Also


HBox(children=(IntProgress(value=0, max=2424), HTML(value='')))

working with term:	Well


HBox(children=(IntProgress(value=0, max=4557), HTML(value='')))

working with term:	And


HBox(children=(IntProgress(value=0, max=25836), HTML(value='')))



In [29]:
# Persist every term's dataframe.
# NOTE(review): these are the same paths the dataframes were loaded from, so
# the original files are overwritten.
for term in terms_dict:
    out_path = 'data/discourse_markers_models/' + term + '_df.pkl'
    dfs[term].to_pickle(out_path)

## calculate precision, recall and f-score over all sets

In [53]:
def metrics(true, pred):
    """Print and return evaluation metrics for a binary classifier.

    true -- iterable of true labels; the value 1 is the positive class and
            anything else counts as negative
    pred -- iterable of predicted labels, same length and encoding as `true`
            (inside this function the name shadows the notebook's `pred`
            helper, which is not needed here)

    Prints the confusion-matrix counts followed by accuracy, precision, recall
    and f-score, and returns ``(precision, recall, f_score)``.

    Any ratio whose denominator is zero (no samples, no predicted positives,
    no actual positives) is reported as 0.0 instead of raising
    ZeroDivisionError.

    Raises AssertionError if the two inputs differ in length.
    """
    true = list(true)
    pred = list(pred)
    assert len(true) == len(pred)

    def safe_ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0 rather than crashing the notebook.
        return numerator / denominator if denominator else 0.0

    tp = 0
    tn = 0
    fp = 0
    fn = 0

    for actual, predicted in zip(true, pred):
        if actual == 1:
            if predicted == 1:
                tp += 1
            else:
                fn += 1
        else:
            if predicted == 1:
                fp += 1
            else:
                tn += 1

    print('true positives:\t\t' + str(tp))
    print('true negatives:\t\t' + str(tn))
    print('false positives:\t' + str(fp))
    print('false negatives:\t' + str(fn))

    a = safe_ratio(tp + tn, tp + tn + fp + fn)
    p = safe_ratio(tp, tp + fp)
    r = safe_ratio(tp, tp + fn)
    f = safe_ratio(2 * (p * r), p + r)

    print('accuracy:\t\t' + str(a))
    print('precision:\t\t' + str(p))
    print('recall:\t\t\t' + str(r))
    print('f-score:\t\t' + str(f))

    return p, r, f

In [56]:
# Report the metrics of every term's model on the rows marked as the test set.
for term in terms_dict:
    print('working with term:\t' + term)
    test_rows = dfs[term][dfs[term].set == 'test']
    metrics(test_rows.y, test_rows.pred)
    print()

working with term:	First
true positives:		91
true negatives:		136
false positives:	6
false negatives:	94
accuracy:		0.6941896024464832
precision:		0.9381443298969072
recall:			0.4918918918918919
f-score:		0.6453900709219857

working with term:	Yet
true positives:		126
true negatives:		149
false positives:	44
false negatives:	63
accuracy:		0.7198952879581152
precision:		0.7411764705882353
recall:			0.6666666666666666
f-score:		0.701949860724234

working with term:	Or
true positives:		117
true negatives:		110
false positives:	57
false negatives:	37
accuracy:		0.7071651090342679
precision:		0.6724137931034483
recall:			0.7597402597402597
f-score:		0.7134146341463414

working with term:	Now
true positives:		152
true negatives:		220
false positives:	39
false negatives:	82
accuracy:		0.7545638945233266
precision:		0.7958115183246073
recall:			0.6495726495726496
f-score:		0.7152941176470589

working with term:	But
true positives:		1370
true negatives:		1902
false positives:	243
false negative

## manual inspection of examples

In [79]:
# Display the terms for which a model is available (candidates for `term` below).
terms_dict.keys()

dict_keys(['First', 'Yet', 'Or', 'Now', 'But', 'So', 'Also', 'Well', 'And'])

In [None]:
# stopped at but, so, also, well, have 1 example from: and, but. none from: so, also, well

In [160]:
# Term to inspect by hand, and the test-set rows of its dataframe.
term = 'But'
df = dfs[term].loc[dfs[term].set == 'test']

In [165]:
# FALSE POSITIVES
# Rows where the term is absent (y == 0) but the model predicted it (pred == 1).
false_positives = df[(df.y == 0) & (df.pred == 1)]
print(len(false_positives))
print()
# DataFrame.sample raises ValueError when asked for more rows than exist, so
# show at most 5 examples and fewer when the category is smaller.
n_examples = min(5, len(false_positives))
examples = false_positives.sample(n_examples) if n_examples else false_positives
for idx, row in examples.iterrows():
    print(' '.join(row['sent1']))
    print('[FP/' + term + '] ' + ' '.join(row['sent2']))
    print()

243

We have great confidence in our nation 's legal system and look for a timely resolution of this important matter .
[FP/But] If you have any questions , comments or concerns , please do not hesitate to contact me .

It also noted that the French national rugby team , currently in England for that sport 's World Cup , had ordered a meal of roast beef from room service at their hotel in Windsor with `` absolutely no stipulations as to where it came from . ''
[FP/But] The paper wished the French team `` all the best '' in its semifinal on Sunday .

Lewinsky -- dubbed a `` zaftig little rascal '' by the hometown LAT -- is generally perceived as childlike , weak , and sort of , like , stupid .
[FP/But] Tripp is said to be a meatier character -- genuinely caring at times , staggeringly manipulative at others .

Between 1993 and 1994 , they aired more than 12,000 times , and at one point Inphomation was shelling out half a million dollars a week to buy air time on cable stations .
[FP/But

In [170]:
# FALSE NEGATIVES
# Rows where the term is present (y == 1) but the model did not predict it (pred == 0).
false_negatives = df[(df.y == 1) & (df.pred == 0)]
print(len(false_negatives))
print()
# DataFrame.sample raises ValueError when asked for more rows than exist, so
# show at most 5 examples and fewer when the category is smaller.
n_examples = min(5, len(false_negatives))
examples = false_negatives.sample(n_examples) if n_examples else false_negatives
for idx, row in examples.iterrows():
    print(' '.join(row['sent1']))
    print('[FN/' + term + '] ' + ' '.join(row['sent2']))
    print()

763

The mostly one-piece garments worn by the inmates , which come in bright hues of yellow , blue , and ( especially ) red -- to judge from the photos , red is the dominant color this season on death row -- are not , Chatterbox presumes , actually available at Benetton outlets .
[FN/But] Apparently , Benetton hopes that some of the death-row inmates ' existential glamour ( `` They broke the rules .

If he had lied under oath about parking illegally I would n't be so disgusted .
[FN/But] For a married man to have oral sex with a woman employee less than half his age in the Oval Office -- I ca n't claim not to be offended by that .

They agreed on most matters regarding the runic characters and on many features of language .
[FN/But] If Chapman had hoped to convert Einar Haugen to his own views of the authenticity of the stones , he did not succeed , for the Professor never deviated from his conviction that they were modern .

I am glad that you talked to Ken Arrow .
[FN/But] Nobel lau

In [163]:
# TRUE POSITIVES
# Rows where the term is present (y == 1) and the model predicted it (pred == 1).
true_positives = df[(df.y == 1) & (df.pred == 1)]
print(len(true_positives))
print()
# DataFrame.sample raises ValueError when asked for more rows than exist, so
# show at most 5 examples and fewer when the category is smaller.
n_examples = min(5, len(true_positives))
examples = true_positives.sample(n_examples) if n_examples else true_positives
for idx, row in examples.iterrows():
    print(' '.join(row['sent1']))
    print('[TP/' + term + '] ' + ' '.join(row['sent2']))
    print()

1370

Whether this reflects the inviolable rule of hospitality , the undeniable misogyny in the Old Testament , or that angels have higher standing than humans is open to question .
[TP/But] The question can not even be asked when the details are mislaid .

As a result of the no-confidence vote , Poland faces four months with a lame-duck government at a time when hard decisions about privatisation , tax and banking reforms have been left untaken .
[TP/But] That is all .

But she knew it would be hard and that the time would surely come when Creggan would grow angry with her as Kraal had done .
[TP/But] She also knew that anger would be part of Creggan 's survival , part of the strength he would need if , as she hoped , his chance for freedom ever came .

That sort of gossip certainly should be condemned ; that is the sort Hesiod warned against .
[TP/But] Even he went on to say that gossip has ‘ a kind of divinity ’ .

Normally Alexandra was not susceptible to that kind of come-on rubbi

In [164]:
# TRUE NEGATIVES
# Rows where the term is absent (y == 0) and the model did not predict it (pred == 0).
true_negatives = df[(df.y == 0) & (df.pred == 0)]
print(len(true_negatives))
print()
# DataFrame.sample raises ValueError when asked for more rows than exist, so
# show at most 5 examples and fewer when the category is smaller.
n_examples = min(5, len(true_negatives))
examples = true_negatives.sample(n_examples) if n_examples else true_negatives
for idx, row in examples.iterrows():
    print(' '.join(row['sent1']))
    print('[TN/' + term + '] ' + ' '.join(row['sent2']))
    print()

1902

It is noticeable that the Russell-Copleston debate became embroiled in a discussion of necessary propositions , a discussion made necessary by Copleston 's desire to show Russell that the world is such that it must be the case that it has a Creator . But does theism have to make such a case ? Is n't it making the mistake of claiming too much ? Is n't it unnecessarily raising the stakes here ? Do we really have to be sure that God exists in order to believe in God ? Can not we argue , indeed , on the basis of the usual meaning of ‘ faith ’ that involves trust in the face of intellectual un certainty , that Russell 's uncertainty as to whether or not God exists , the agnostic position , is the one the theist in fact should hold ?
[TN/But] Suppose there existed a God who wished us to be unsure whether or not He existed .

Reviewers praise the second volume of Cook 's biography as well researched , thorough , and fascinating .
[TN/But] Many also take it as a point of departure for ta

# test on Microsoft example text

In [182]:
# Example passage (several sentences with no sentence-initial discourse markers)
# used below to try the models on text from outside the training corpora.
text = "The Azure portal is a web-based, unified console that provides an alternative to command-line tools. With the Azure portal, you can manage your Azure subscription using a graphical user interface. You can build, manage, and monitor everything from simple web apps to complex cloud deployments, create custom dashboards for an organized view of resources, and configure accessibility options for the best experience. The Azure portal is designed for resiliency and continuous availability. It has a presence in every Azure datacenter thereby making it resilient to individual datacenter failures and also avoids network slow-downs by being close to users. The Azure portal updates continuously and requires no downtime for maintenance activities."

In [183]:
# Show the raw passage before any markers are suggested.
print(text)

The Azure portal is a web-based, unified console that provides an alternative to command-line tools. With the Azure portal, you can manage your Azure subscription using a graphical user interface. You can build, manage, and monitor everything from simple web apps to complex cloud deployments, create custom dashboards for an organized view of resources, and configure accessibility options for the best experience. The Azure portal is designed for resiliency and continuous availability. It has a presence in every Azure datacenter thereby making it resilient to individual datacenter failures and also avoids network slow-downs by being close to users. The Azure portal updates continuously and requires no downtime for maintenance activities.


In [198]:
# No threshold: a term is suggested whenever its model's top-scoring class is the positive one.
predict_from_text(text)

The Azure portal is a web-based, unified console that provides an alternative to command-line tools.
[Also/And] With the Azure portal, you can manage your Azure subscription using a graphical user interface.

With the Azure portal, you can manage your Azure subscription using a graphical user interface.
[But/Also/And] You can build, manage, and monitor everything from simple web apps to complex cloud deployments, create custom dashboards for an organized view of resources, and configure accessibility options for the best experience.

You can build, manage, and monitor everything from simple web apps to complex cloud deployments, create custom dashboards for an organized view of resources, and configure accessibility options for the best experience.
[But/And] The Azure portal is designed for resiliency and continuous availability.

The Azure portal is designed for resiliency and continuous availability.
[First/Now/And] It has a presence in every Azure datacenter thereby making it resili

In [202]:
# With a threshold: a term is suggested only if its model's positive-class output exceeds 0.75.
predict_from_text(text, 0.75)

The Azure portal is a web-based, unified console that provides an alternative to command-line tools.
[And] With the Azure portal, you can manage your Azure subscription using a graphical user interface.

You can build, manage, and monitor everything from simple web apps to complex cloud deployments, create custom dashboards for an organized view of resources, and configure accessibility options for the best experience.
[And] The Azure portal is designed for resiliency and continuous availability.

It has a presence in every Azure datacenter thereby making it resilient to individual datacenter failures and also avoids network slow-downs by being close to users.
[So] The Azure portal updates continuously and requires no downtime for maintenance activities.

