In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pickle
import random as rand
from nltk import word_tokenize, sent_tokenize
from collections import Counter

from keras.models import Model, load_model

Using TensorFlow backend.


In this notebook, I run tests on the data and the models trained previously for the discourse marker insertion task.

# Statistics on original dataset

In [2]:
# Load the pickled OANC sentence-pair DataFrame; later cells read its
# sent1, sent2, X and y_dense columns.
oanc_df = pd.read_pickle('data/discourse_markers/oanc_pair_df.zip')

In [3]:
# Load the pickled BNC sentence-pair DataFrame; later cells read its
# sent1, sent2, X and y_dense columns.
bnc_df = pd.read_pickle('data/discourse_markers/bnc_pair_df.zip')

In [17]:
# Read the term -> class-index mapping from disk and build the reverse lookup.
with open('data/discourse_markers/oanc_terms.pkl', 'rb') as terms_file:
    terms_dict = pickle.load(terms_file)
idx_dict = {index: term for term, index in terms_dict.items()}
# Register 'NULL' under index 9 (presumably the "no marker" class — TODO confirm).
idx_dict[9] = 'NULL'

In [43]:
# Per-term sample counts, summed over the OANC and BNC frames.
for marker, marker_idx in terms_dict.items():
    print(marker)
    oanc_hits = oanc_df[oanc_df.y_dense == marker_idx]
    bnc_hits = bnc_df[bnc_df.y_dense == marker_idx]
    print(len(oanc_hits) + len(bnc_hits))

Now
2466
Yet
1909
And
12918
So
4740
First
1636
Well
2279
Or
1606
Also
1212
But
21392


In [32]:
# Draw 100 random OANC pairs and print each one with its marker,
# leaving out the pairs labelled 9 (mapped to 'NULL' in idx_dict).
for row_id, pair in oanc_df.sample(100).iterrows():
    marker_idx = pair['y_dense']
    if marker_idx == 9:
        continue
    print(' '.join(pair['sent1']))
    print(' '.join(pair['sent2']))
    print(idx_dict[marker_idx])
    print()

There is nothing wrong with trying to change that shape , and advocates of nonsexist English have worked miracles in the short time since they have succeeded in making their concerns known .
To deny that the oblique senses of man are still very much with us is mere optimistic folly .
But

The paper points out that the episode is apt to rekindle consumer concerns about online credit card security .
Since the jerkball 's e-mail trail leads to eastern Europe , the case stands to highlight the freedom online criminals have to operate beyond U.S. jurisdiction .
Also

For example , the amount of PfHRP2 secreted per parasite varies between different parasite strains .
In high transmission areas , where partial immunity against the disease develops , clearance of PfHRP2 might be increased in the presence of antibodies against the protein ; in these areas—such as countries in sub-Saharan Africa—the model would thus underestimate the parasite burden and might need to be adapted further for use .

# Visualize model architecture

In [None]:
# https://www.graphviz.org/
# https://keras.io/visualization/

In [2]:
# Load the saved Keras model from model.But.h5; pred() below calls this global's predict().
model = load_model('data/discourse_markers_models/model.But.h5')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [3]:
# plot_model is not among the notebook's top-level imports, so import it here
# to keep this cell runnable on a fresh kernel.
from keras.utils import plot_model

# Write the loaded model's layer graph to a PNG. This needs pydot and the
# GraphViz executables on $PATH; otherwise Keras raises the OSError recorded
# in this cell's output.
plot_model(model, to_file = 'data/discourse_markers/graph.png')

OSError: `pydot` failed to call GraphViz.Please install GraphViz (https://www.graphviz.org/) and ensure that its executables are in the $PATH.

In [None]:
    loss, accuracy = model.evaluate(X_test, y_test, batch_size = 32, verbose = 0)
    print('Test accuracy:\t\t\t\t' + str(accuracy) + '\n')

In [4]:
# Load the pickled DataFrame stored next to the saved models as But_df.pkl.
but_df = pd.read_pickle('data/discourse_markers_models/But_df.pkl')

In [5]:
# Preview the first rows; the columns include X, y, y_dense and a train/test 'set' flag.
but_df.head()

Unnamed: 0,sent1,sent2,label,X,y,y_dense,sent2_orig,set
0,"[Short, leaves, of, 6, weeks, or, less, (, the...","[Longer, leaves, ,, of, 12, weeks, or, more, ,...",non-fiction/OUP/Berk/ch1,"[[-0.071559414, 0.030517343, -0.017138, -0.333...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]",8,,train
1,"[Studies, carried, out, during, the, 1970s, an...","[Repeatedly, ,, eective, parenting, mediated, ...",non-fiction/OUP/Berk/ch1,"[[0.35579225, 0.03694494, 0.41214183, -0.16565...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]",8,,train
2,"[Parents, anxious, for, their, children, to, d...","[Research, eventually, documented, otherwise—t...",non-fiction/OUP/Berk/ch1,"[[0.25921673, 0.16878448, -0.14332393, -0.1533...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]",8,,train
3,"[As, Damon, explains, ,, the, child-centered, ...","[Damon, contends, ,, modern, child-centerednes...",non-fiction/OUP/Berk/ch1,"[[0.15743977, 0.17438783, -0.03214123, -0.0016...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]",8,,train
4,"[Damon, acknowledges, that, economic, constrai...","[He, places, most, blame, on, how, contemporar...",non-fiction/OUP/Berk/ch1,"[[0.06932011, 0.08450028, -0.03265151, -0.1084...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]",8,,test


In [3]:
# Load the trained Doc2Vec model that pred() uses to embed raw sentences.
d2v = Doc2Vec.load("data/discourse_markers/d2v.model")

# Seed NumPy's global RNG. np.random.seed has to be *called*: assigning to it
# (`np.random.seed = 47`) replaces the function with an int, seeds nothing, and
# makes any later np.random.seed(...) call fail.
# NOTE(review): this only affects code drawing from np.random; whether it makes
# d2v.infer_vector reproducible depends on gensim's own RNG — confirm.
np.random.seed(47)

In [4]:
# Maps the argmax of the model's output to a printable label; read by pred()
# and by the error-analysis cell below.
ind_dict = {0: 'NULL', 1: 'But'}

In [5]:
def _predict_marker(first_vec, second_vec):
    """Return the label ``model`` predicts for one pair of sentence vectors."""
    pair = np.array([first_vec, second_vec])
    # The model is fed a batch holding this single pair.
    scores = model.predict(np.array([pair]))[0]
    return ind_dict[np.argmax(scores)]


def pred(passage, vectorized = False):
    """Predict the discourse marker linking consecutive sentences.

    Relies on the notebook globals ``model`` (the loaded Keras model),
    ``ind_dict`` (class index -> label) and, for raw text, ``d2v`` (Doc2Vec).

    Args:
        passage: With ``vectorized=False``, a string of raw text. It is split
            into sentences, each sentence is word-tokenized and embedded with
            ``d2v.infer_vector``. With ``vectorized=True``, a sequence of
            sentence vectors that is used as-is.
        vectorized: Whether ``passage`` already holds sentence vectors.

    Returns:
        With ``vectorized=True``: the label predicted for the first two
        vectors, or ``None`` when fewer than two vectors are given.
        With ``vectorized=False``: always ``None``. The passage is printed one
        sentence per line, every sentence after the first prefixed with
        ``[label]``.
    """
    if vectorized:
        # Only the first pair is classified in this mode.
        if len(passage) < 2:
            return None
        return _predict_marker(passage[0], passage[1])

    sentences = sent_tokenize(passage)
    vectors = [d2v.infer_vector(word_tokenize(sentence)) for sentence in sentences]

    # Echo the opening sentence before the loop so that a one-sentence passage
    # is still printed (it has no preceding sentence, hence no marker).
    if sentences:
        print(sentences[0])

    for idx in range(len(vectors) - 1):
        label = _predict_marker(vectors[idx], vectors[idx + 1])
        print('[' + label + '] ' + sentences[idx + 1])
    return None

In [6]:
# Sample multi-paragraph passage handed to pred() in the next cell. The
# triple-quoted literal keeps its leading/trailing newlines and the blank
# lines between paragraphs.
text = """
Philosophy of Education is a label applied to the study of the purpose, process, nature and ideals of education. It can be considered a branch of both philosophy and education. Education can be defined as the teaching and learning of specific skills, and the imparting of knowledge, judgment and wisdom, and is something broader than the societal institution of education we often speak of.

Many educationalists consider it a weak and woolly field, too far removed from the practical applications of the real world to be useful. Philosophers dating back to Plato and the Ancient Greeks have given the area much thought and emphasis, and there is little doubt that their work has helped shape the practice of education over the millennia.

Plato is the earliest important educational thinker, and education is an essential element in "The Republic" (his most important work on philosophy and political theory, written around 360 B.C.). In it, he advocates some rather extreme methods: removing children from their mothers' care and raising them as wards of the state, and differentiating children suitable to the various castes, the highest receiving the most education, so that they could act as guardians of the city and care for the less able. He believed that education should be holistic, including facts, skills, physical discipline, music and art. Plato believed that talent and intelligence is not distributed genetically and thus is be found in children born to all classes, although his proposed system of selective public education for an educated minority of the population does not really follow a democratic model.

Aristotle considered human nature, habit and reason to be equally important forces to be cultivated in education, the ultimate aim of which should be to produce good and virtuous citizens. He proposed that teachers lead their students systematically, and that repetition be used as a key tool to develop good habits, unlike Socrates' emphasis on questioning his listeners to bring out their own ideas. He emphasized the balancing of the theoretical and practical aspects of subjects taught, among which he explicitly mentions reading, writing, mathematics, music, physical education, literature, history, and a wide range of sciences, as well as play, which he also considered important.

During the Medieval period, the idea of Perennialism was first formulated by St. Thomas Aquinas in his work "De Magistro". Perennialism holds that one should teach those things deemed to be of everlasting importance to all people everywhere, namely principles and reasoning, not just facts (which are apt to change over time), and that one should teach first about people, not machines or techniques. It was originally religious in nature, and it was only much later that a theory of secular perennialism developed.

During the Renaissance, the French skeptic Michel de Montaigne (1533 - 1592) was one of the first to critically look at education. Unusually for his time, Montaigne was willing to question the conventional wisdom of the period, calling into question the whole edifice of the educational system, and the implicit assumption that university-educated philosophers were necessarily wiser than uneducated farm workers, for example.

"""

In [10]:
# Run the model over the sample passage: pred() prints each sentence, prefixing
# every sentence after the first with the predicted label in brackets.
pred(text)


Philosophy of Education is a label applied to the study of the purpose, process, nature and ideals of education.
[But] It can be considered a branch of both philosophy and education.
[But] Education can be defined as the teaching and learning of specific skills, and the imparting of knowledge, judgment and wisdom, and is something broader than the societal institution of education we often speak of.
[But] Many educationalists consider it a weak and woolly field, too far removed from the practical applications of the real world to be useful.
[But] Philosophers dating back to Plato and the Ancient Greeks have given the area much thought and emphasis, and there is little doubt that their work has helped shape the practice of education over the millennia.
[But] Plato is the earliest important educational thinker, and education is an essential element in "The Republic" (his most important work on philosophy and political theory, written around 360 B.C.).
[But] In it, he advocates some rath

In [11]:
# Tally mistakes over X_test as "true => predicted" keys, and count how often
# each label is predicted overall.

errors = Counter()
total_pred = Counter()

for sample_no in tqdm(range(len(X_test))):
    guess = pred(X_test[sample_no], True)
    gold = ind_dict[np.argmax(y_test[sample_no])]
    total_pred.update([guess])
    if guess == gold:
        continue
    errors.update([gold + ' => ' + guess])

NameError: name 'X_test' is not defined

In [None]:
# Error types ("true => predicted"), most frequent first.
errors.most_common()

In [None]:
# How often each label was predicted, most frequent first.
total_pred.most_common()

In [13]:
# y_dense value used to select rows in the next cell.
# NOTE(review): presumably the index of 'But' in terms_dict — TODO confirm.
term_idx = 8

In [23]:
# Spot-check 20 random BNC pairs labelled with term_idx: first sentence,
# the model's predicted label, then the second sentence.
marked_pairs = bnc_df[bnc_df.y_dense == term_idx]
for row_id, pair in marked_pairs.sample(20).iterrows():
    print(' '.join(pair['sent1']))
    predicted = pred(np.array(pair['X']), True)
    print(predicted, ' '.join(pair['sent2']), '', sep='\n')


On large farms they ran up to fifty or sixty acres .
But
In grazing country these larger fields were soon reduced to a number of smaller fields of round about ten acres apiece .

If we do nothing the badger is doomed to suffer , and terrible suffering goes on all the time .
But
Our work costs money .

It is only recently that the municipal authorities and hospital boards have begun to discuss the possibility of changing the insurers ' conditions of funding so that they are more permissive of non-medical intervention and support .
NULL
There are signs too that the professional care providers seek to halt the slight trend in reduction which has taken place in the role of the mental hospital .

When market forces alone are allowed to determine the exchange value of currencies , a free-floating exchange rate system is said to exist .
But
Will a free-floating exchange rate system operate smoothly ?

‘ Black , ’ she said , ‘ well polished and without broken laces .
But
They were boots .

Nat

In [18]:
# Spot-check 20 random OANC pairs labelled 9 ('NULL' in idx_dict): first
# sentence, the model's predicted label, then the second sentence.
null_pairs = oanc_df[oanc_df.y_dense == 9]
for row_id, pair in null_pairs.sample(20).iterrows():
    print(' '.join(pair['sent1']))
    predicted = pred(np.array(pair['X']), True)
    print(predicted, ' '.join(pair['sent2']), '', sep='\n')


Kenneth Starr , who has been poking into Whitewater for three years , is the least of them .
NULL
One special prosecutor is still deciding whether to prosecute Henry Cisneros , the former housing secretary , for understating payments to an ex-mistress .

Renamed Northwest Texas Legal Aid , its headquarters will be in Arlington .
NULL
Created by Congress in 1974 , Legal Services Corp. is frequently a budgetary target .

Laboratory data should be recorded on a real-time basis to prevent the loss of information or inadvertent introduction of errors into the record .
NULL
Original data sheets should be signed and dated by the laboratory personnel performing the tests .

Historically the stronghold of merchants and royalty , today it remains the home of commerce and government .
NULL
Faubourg Saint-Honoré offers the luxury of jewelry shops and haute couture ; the Champs-Elysées claims the first-run cinemas , airline companies , and car showrooms .

Wandering the side streets at night allows