In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pickle
import random as rand
from nltk import word_tokenize, sent_tokenize
from collections import Counter

import keras
import pydot
from keras.models import Model, load_model
from keras.utils import plot_model

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress
Using TensorFlow backend.


In this notebook, I run tests on the data and the models trained previously for the discourse marker insertion task.

# statistics on original dataset

In [None]:
# Load the pickled OANC sentence-pair DataFrame; later cells read its
# `y_dense`, `sent1`, `sent2` and `X` columns.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
oanc_df = pd.read_pickle('data/discourse_markers/oanc_pair_df.zip')

In [None]:
# Load the pickled BNC sentence-pair DataFrame; later cells read its
# `y_dense`, `sent1`, `sent2` and `X` columns.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
bnc_df = pd.read_pickle('data/discourse_markers/bnc_pair_df.zip')

In [None]:
# Term -> class-index mapping stored next to the OANC data.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open('data/discourse_markers/oanc_terms.pkl', 'rb') as f:
    terms_dict = pickle.load(f)

# Reverse lookup (class index -> term); index 9 is given the label 'NULL'.
idx_dict = {index: term for term, index in terms_dict.items()}
idx_dict[9] = 'NULL'

In [None]:
# For every chosen term, print its name followed by the number of samples
# carrying its label, summed over the OANC and BNC frames.
for term, label in terms_dict.items():
    print(term)
    oanc_count = int((oanc_df.y_dense == label).sum())
    bnc_count = int((bnc_df.y_dense == label).sum())
    print(oanc_count + bnc_count)

In [None]:
# Draw 100 random OANC pairs and print those whose label is not 9 ('NULL'):
# first sentence, second sentence, marker term, then a blank separator line.
for _, pair in oanc_df.sample(100).iterrows():
    label = pair['y_dense']
    if label == 9:
        continue
    for tokens in (pair['sent1'], pair['sent2']):
        print(' '.join(tokens))
    print(idx_dict[label])
    print()

# visualize model architecture

In [2]:
# Load the saved Keras model for the term "But"; it is plotted in the next cell.
model = load_model('data/discourse_markers_models/model.But.h5')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [4]:
# Write a diagram of `model` to graph.png, with `show_shapes` switched on.
plot_model(model, to_file = 'data/discourse_markers/graph.png', show_shapes = True)

# testing and evaluation

## load necessities

In [5]:
# Term -> class-index mapping (the same file the dataset-statistics section
# reads; reloaded here so the evaluation section can run on its own).
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open('data/discourse_markers/oanc_terms.pkl', 'rb') as f:
    terms_dict = pickle.load(f)

# Reverse lookup (class index -> term); index 9 is given the label 'NULL'.
idx_dict = {index: term for term, index in terms_dict.items()}
idx_dict[9] = 'NULL'

In [9]:
# One trained classifier and one DataFrame per discourse marker, keyed by term.
models, dfs = {}, {}

for term in terms_dict:
    model_path = 'data/discourse_markers_models/model.' + term + '.h5'
    frame_path = 'data/discourse_markers_models/' + term + '_df.pkl'
    models[term] = load_model(model_path)
    dfs[term] = pd.read_pickle(frame_path)

In [59]:
# Load the Doc2Vec model that `predict_from_text` uses to embed sentences.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError,
# which leaves `d2v` undefined -- confirm the model path before relying on
# the cells below.
d2v = Doc2Vec.load("data/discourse_markers/d2v.model")

FileNotFoundError: [Errno 2] No such file or directory: 'data/discourse_markers/d2v.model'

## define helper functions

In [46]:
def pred(vecs, term):
    """Return the predicted class index of every sample for one term.

    vecs -- array in shape [None, 2, 100]
    term -- key into the module-level `models` dict selecting the classifier

    Returns the argmax over axis 1 of that classifier's `predict` output,
    i.e. one class index per sample.
    """
    scores = models[term].predict(vecs)
    return np.argmax(scores, axis = 1)

In [47]:
def df_to_vec(df):
    """Stack the `X` column of a DataFrame into one np.array for `pred`.

    df -- DataFrame with an `X` column holding one entry per row

    Returns np.array built from the list of all `X` entries, in row order.
    """
    per_row = list(df['X'])
    return np.array(per_row)

In [57]:
def predict_from_text(text):
    """Print a passage sentence by sentence with the predicted discourse markers.

    The passage is split into sentences, each sentence is tokenised and
    embedded with the module-level `d2v` model, and every pair of adjacent
    sentences is scored by each per-term classifier through `pred`.

    For each adjacent pair the first sentence is printed, followed by the
    second sentence prefixed with `[term1/term2/...]`, listing the terms whose
    classifier predicted a non-zero class (empty brackets when none did).

    text -- a single string holding the passage

    Returns None. A passage with fewer than two sentences prints nothing.
    """
    sents = sent_tokenize(text)
    vectors = [d2v.infer_vector(word_tokenize(sent)) for sent in sents]

    # No adjacent pair to score -- stop before building an empty batch.
    if len(vectors) < 2:
        return

    # One [2, dim] sample per adjacent sentence pair, stacked into a single
    # batch so each classifier is invoked once instead of once per pair.
    pairs = np.array([[first, second] for first, second in zip(vectors, vectors[1:])])

    # Per-term class indices for the whole batch (non-zero means "marker fits").
    fired = {term: pred(pairs, term) for term in terms_dict.keys()}

    for idx in range(len(pairs)):
        print(sents[idx])
        ans = [term for term in terms_dict.keys() if fired[term][idx]]
        print('[' + '/'.join(ans) + ']' + sents[idx+1])

## test

In [51]:
# Smoke test: run the "But" classifier on 10 randomly sampled rows of its DataFrame.
pred(df_to_vec(dfs['But'].sample(10)), 'But')

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 1])

In [58]:
# Sample multi-paragraph passage used as demo input for `predict_from_text`.
text = """
Philosophy of Education is a label applied to the study of the purpose, process, nature and ideals of education. It can be considered a branch of both philosophy and education. Education can be defined as the teaching and learning of specific skills, and the imparting of knowledge, judgment and wisdom, and is something broader than the societal institution of education we often speak of.

Many educationalists consider it a weak and woolly field, too far removed from the practical applications of the real world to be useful. Philosophers dating back to Plato and the Ancient Greeks have given the area much thought and emphasis, and there is little doubt that their work has helped shape the practice of education over the millennia.

Plato is the earliest important educational thinker, and education is an essential element in "The Republic" (his most important work on philosophy and political theory, written around 360 B.C.). In it, he advocates some rather extreme methods: removing children from their mothers' care and raising them as wards of the state, and differentiating children suitable to the various castes, the highest receiving the most education, so that they could act as guardians of the city and care for the less able. He believed that education should be holistic, including facts, skills, physical discipline, music and art. Plato believed that talent and intelligence is not distributed genetically and thus is be found in children born to all classes, although his proposed system of selective public education for an educated minority of the population does not really follow a democratic model.

Aristotle considered human nature, habit and reason to be equally important forces to be cultivated in education, the ultimate aim of which should be to produce good and virtuous citizens. He proposed that teachers lead their students systematically, and that repetition be used as a key tool to develop good habits, unlike Socrates' emphasis on questioning his listeners to bring out their own ideas. He emphasized the balancing of the theoretical and practical aspects of subjects taught, among which he explicitly mentions reading, writing, mathematics, music, physical education, literature, history, and a wide range of sciences, as well as play, which he also considered important.

During the Medieval period, the idea of Perennialism was first formulated by St. Thomas Aquinas in his work "De Magistro". Perennialism holds that one should teach those things deemed to be of everlasting importance to all people everywhere, namely principles and reasoning, not just facts (which are apt to change over time), and that one should teach first about people, not machines or techniques. It was originally religious in nature, and it was only much later that a theory of secular perennialism developed.

During the Renaissance, the French skeptic Michel de Montaigne (1533 - 1592) was one of the first to critically look at education. Unusually for his time, Montaigne was willing to question the conventional wisdom of the period, calling into question the whole edifice of the educational system, and the implicit assumption that university-educated philosophers were necessarily wiser than uneducated farm workers, for example.

"""

# Print the passage with the predicted markers between adjacent sentences.
# NOTE(review): the recorded run raised NameError because `d2v` was never
# loaded (its loading cell failed) -- load the Doc2Vec model first.
predict_from_text(text)

NameError: name 'd2v' is not defined

In [None]:
    # Evaluate `model` on a held-out set and print the accuracy.
    # NOTE(review): `X_test` and `y_test` are not defined in any visible cell,
    # and the leading indentation suggests this was lifted out of a function
    # body -- confirm where the test split comes from before running.
    loss, accuracy = model.evaluate(X_test, y_test, batch_size = 32, verbose = 0)
    print('Test accuracy:\t\t\t\t' + str(accuracy) + '\n')

In [None]:
# see where the errors are landing

# NOTE(review): this cell does not match the helpers defined in this notebook:
#   * `X_test` / `y_test` are not defined in any visible cell;
#   * `pred(vecs, term)` uses `term` as a key into `models`, but `True` is
#     passed here;
#   * `ind_dict` is never defined (`idx_dict` is the index -> term mapping);
#   * `predicted` is used as a Counter key and joined with strings, whereas
#     `pred` returns the result of `np.argmax(..., axis = 1)`.
# Presumably written against an earlier `pred` -- confirm before running.
errors = Counter()      # 'true => predicted' -> number of such mistakes
total_pred = Counter()  # predicted value -> number of times it was predicted

for idx in tqdm(range(len(X_test))):
    predicted = pred(X_test[idx], True)
    true = ind_dict[np.argmax(y_test[idx])]
    total_pred[predicted] += 1
    if predicted != true:
        errors[true + ' => ' + predicted] += 1

In [None]:
# Confusions ('true => predicted'), most frequent first.
errors.most_common()

In [None]:
# How often each value was predicted, most frequent first.
total_pred.most_common()

In [None]:
# Label index (compared against `y_dense`) of the term inspected in the next cell.
term_idx = 8

In [None]:
# Show 20 random BNC pairs labelled `term_idx`; between the two sentences,
# print the class index predicted by that term's classifier.
# `pred` looks its second argument up in `models`, which is keyed by term
# name, so the term is resolved through `idx_dict` (passing `True` cannot
# match any key). `term_idx` must not be 9: 'NULL' has no classifier.
term = idx_dict[term_idx]
for idx, row in bnc_df[bnc_df.y_dense == term_idx].sample(20).iterrows():
    print(' '.join(row['sent1']))
    # `pred` takes a batch, so wrap the single sample -- assumes row['X'] is
    # one [2, 100] sentence-vector pair, TODO confirm
    result = pred(np.array([row['X']]), term)[0]
    print(result)
    print(' '.join(row['sent2']))
    print()


In [None]:
# Show 20 random OANC pairs labelled 9 ('NULL'); between the two sentences,
# print the list of terms whose classifier predicts a non-zero class for the
# pair (an empty list means no marker was proposed).
# `pred` needs a term name as its second argument and no classifier exists
# for 'NULL', so every per-term classifier is queried instead of passing `True`.
for idx, row in oanc_df[oanc_df.y_dense == 9].sample(20).iterrows():
    print(' '.join(row['sent1']))
    # `pred` takes a batch, so wrap the single sample -- assumes row['X'] is
    # one [2, 100] sentence-vector pair, TODO confirm
    batch = np.array([row['X']])
    result = [term for term in terms_dict if pred(batch, term)[0]]
    print(result)
    print(' '.join(row['sent2']))
    print()
