In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pickle
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.utils import shuffle
import random as rand
from nltk import word_tokenize, sent_tokenize
from collections import Counter
import math

import keras.backend as K
import tensorflow as tf

from keras.models import Model
from keras.layers import Input, Dense, Lambda, Dropout, Bidirectional, SimpleRNN
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [42]:
oanc_df = pd.read_pickle('data/discourse_markers/oanc_pair_df.zip')

In [46]:
oanc_df.sample(5)

Unnamed: 0,sent1,sent2,label,X,y,y_dense
236078,"[The, documents, show, ,, as, Gerth, himself, ...","[The, Washington, Post, added, that, even, the...",journal/slate/50/ArticleIP_25878,"[[-0.021916695, -0.22111335, -0.10068608, -0.2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",9
106741,"[BMIs, straddle, the, worlds, of, fact, and, f...","[While, the, entertainment, industry, has, foc...",technical/plos/journal.pbio.0020430,"[[-0.10693365, -0.015572075, -0.13940139, -0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",9
66434,"[Executive, Order, No, .]","[According, to, the, Department, ,, this, rule...",technical/government/Gen_Account_Office/og97002,"[[0.0038720437, 0.046461582, -0.021168824, 0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",9
63300,"[See, chapter, 6, for, a, discussion, of, the,...","[Significant, findings, and, recommendations, ...",technical/government/Gen_Account_Office/Govern...,"[[0.009962393, -0.039003048, 0.014711713, 0.09...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",9
173657,"[He, ignites, kegs, of, dynamite, in, his, Asp...","[To, ring, in, the, new, year, in, 1997, ,, he...",journal/slate/30/ArticleIP_1867,"[[0.040279258, 0.052571785, -0.07801149, -0.02...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",9


In [14]:
bnc_df = pd.read_pickle('data/discourse_markers/bnc_pair_df.zip')

In [49]:
bnc_df.sample(5)

Unnamed: 0,label,sent1,sent2,sent2_orig,y,y_dense
541376,Tales I tell my mother. Sample containing a...,"[Jo, 's, voice, follows, her, heels, clicking,...","[Go, anywhere, ,, out, of, this, whole, mess, .]",,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",9
501311,Dostoevsky. Sample containing about 34134 wo...,"[Petersburg, encourages, his, vicious, loose-e...","[Raskolnikov, 's, ‘, incomplete, smile, ’, is,...",,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",9
302103,Armada. Sample containing about 37064 words ...,"[She, pushed, a, dark, lock, of, hair, back, o...","["", Drowned, ,, "", he, said, ,, and, told, her...",,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",9
30521,Britain and Europe - European art: radio pro...,"[That, does, n't, help, !]","[He, does, more, or, less, blank, paintings, o...",,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",9
431291,Introduction to computer law. Sample contain...,"[Most, of, these, points, are, self-explanator...","[The, misrepresentation, is, not, necessarily,...",,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",9


In [22]:
with open('data/discourse_markers/oanc_terms.pkl', 'rb') as f:
    terms_dict = pickle.load(f)
ind_dict = {v: k for k, v in terms_dict.items()}
ind_dict[9] = 'NULL'
terms_dict

{'Also': 4,
 'And': 7,
 'But': 8,
 'First': 3,
 'Now': 5,
 'Or': 2,
 'So': 6,
 'Well': 0,
 'Yet': 1}

In [44]:
counts = Counter()
for count in oanc_df["y_dense"]:
    counts[count] += 1
print(counts)

Counter({9: 252574, 8: 9685, 7: 5545, 6: 1629, 4: 850, 5: 837, 2: 795, 3: 763, 1: 625, 0: 399})


In [48]:
counts = Counter()
for count in bnc_df["y_dense"]:
    counts[count] += 1
print(counts)

Counter({9: 532618, 8: 11707, 7: 7373, 6: 3111, 0: 1880, 5: 1629, 1: 1284, 3: 873, 2: 811, 4: 362})


# re-vectorization with new data using doc2vec

In [51]:
X_tokens = []
for idx, row in tqdm(bnc_df.iterrows(), total=len(bnc_df)):
    X_tokens.append(row['sent1'])
for idx, row in tqdm(oanc_df.iterrows(), total=len(oanc_df)):
    X_tokens.append(row['sent1'])

HBox(children=(IntProgress(value=0, max=561648), HTML(value='')))

HBox(children=(IntProgress(value=0, max=273702), HTML(value='')))

In [52]:
tagged = []
for i, sent in enumerate(tqdm(X_tokens)):
    tagged.append(TaggedDocument(words = sent, tags = [str(i)]))

HBox(children=(IntProgress(value=0, max=835350), HTML(value='')))

In [53]:
d2v = Doc2Vec(vector_size = 100, min_count = 1, dm = 0)
d2v.build_vocab(tagged)
print('vocabulary built')
d2v.train(tagged, total_examples = d2v.corpus_count, epochs = 20)
print('training finished')
d2v.save("data/discourse_markers/d2v.model")
print("trained & saved")

vocabulary built
training finished
trained & saved


In [59]:
index = 0

X = []
for idx, row in tqdm(bnc_df.iterrows(), total=len(bnc_df)):
    assert tagged[index].words == row['sent1']
    sent1_vec = d2v.docvecs[str(index)]
    sent2_vec = d2v.infer_vector(row['sent2'])
    if index + 1 < len(tagged):
        if tagged[index+1].words == row['sent2']:
            sent2_vec = d2v.docvecs[str(index)]
    index += 1
    X.append([sent1_vec, sent2_vec])  
bnc_df['X'] = X

X = []
for idx, row in tqdm(oanc_df.iterrows(), total=len(oanc_df)):
    assert tagged[index].words == row['sent1']
    sent1_vec = d2v.docvecs[str(index)]
    sent2_vec = d2v.infer_vector(row['sent2'])
    if index + 1 < len(tagged):
        if tagged[index+1].words == row['sent2']:
            sent2_vec = d2v.docvecs[str(index)]
    index += 1
    X.append([sent1_vec, sent2_vec])  
oanc_df['X'] = X

HBox(children=(IntProgress(value=0, max=561648), HTML(value='')))

HBox(children=(IntProgress(value=0, max=273702), HTML(value='')))

In [62]:
oanc_df.to_pickle('data/discourse_markers/oanc_pair_df.zip')
bnc_df.to_pickle('data/discourse_markers/bnc_pair_df.zip')

# create X and y

In [110]:
X = []
y = []

num = 8 # But

X.extend(oanc_df[oanc_df.y_dense == num].X)
X.extend(bnc_df[bnc_df.y_dense == num].X)
y.extend(oanc_df[oanc_df.y_dense == num].y_dense)
y.extend(bnc_df[bnc_df.y_dense == num].y_dense)

sampled_oanc = oanc_df[oanc_df.y_dense != num].sample(n=int(len(X)/2), random_state=1)
sampled_bnc = bnc_df[bnc_df.y_dense != num].sample(n=int(len(X)/2), random_state=1)

X.extend(sampled_oanc.X)
y.extend(sampled_oanc.y_dense)
X.extend(sampled_bnc.X)
y.extend(sampled_bnc.y_dense)

y = [1 if x == num else 0 for x in y]

X, y = shuffle(X, y, random_state=0)

In [115]:
X = np.array(X)
y = to_categorical(y, 2)

In [116]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=47)

# model building

In [128]:
input_len = 2
num_units = 128
embed_dim = 100

In [129]:
K.clear_session()

In [130]:
main_input = Input(shape = (input_len, embed_dim), dtype = 'float32', name = 'main_input')

rnn = Bidirectional(SimpleRNN(return_sequences = False, units = num_units), name = 'rnn')(main_input)
dropout = Dropout(rate = 0.25, name = 'dropout')(rnn)
output = Dense(2, activation='softmax', name = 'output')(dropout)

In [131]:
model = Model(inputs = main_input, outputs = output)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 2, 100)            0         
_________________________________________________________________
rnn (Bidirectional)          (None, 128)               21120     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
output (Dense)               (None, 2)                 258       
Total params: 21,378
Trainable params: 21,378
Non-trainable params: 0
_________________________________________________________________


# train

In [132]:
model.compile(optimizer = 'adam',
             loss = 'binary_crossentropy',
             metrics = ['accuracy'])

history = model.fit(X_train, y_train, 
                    epochs = 5, 
                    batch_size = 32, 
                    validation_split = 0.1)

loss, accuracy = model.evaluate(X_test, y_test, batch_size = 32)

print('\naccuracy:\t' + str(accuracy))

Train on 34654 samples, validate on 3851 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy:	0.7190932460437452


# test

In [93]:
d2v = Doc2Vec.load("data/discourse_markers/d2v.model")
np.random.seed = 47
#X_pad = np.random.rand(50)
#with open('data/discourse_markers/oanc_terms.pkl', 'rb') as f:
#    terms_dict = pickle.load(f)
#ind_dict = {v: k for k, v in terms_dict.items()}
#ind_dict[9] = 'NULL'

In [94]:
ind_dict = {0: 'NULL', 1: 'But'}

In [95]:
def pred(passage, vectorized = False):
    if not vectorized:
        sentences = sent_tokenize(passage)
        tok_sent = [word_tokenize(sentence) for sentence in sentences]
        vectors = [d2v.infer_vector(sentence) for sentence in tok_sent]
    else:
        vectors = passage
    
    for idx in range(len(vectors) - 1):
        if idx == 0 and not vectorized:
            print(sentences[idx])
        input_vec = np.array([vectors[idx], vectors[idx+1]])
        ans = model.predict(np.array([input_vec,]))
        if not vectorized:
            print('[' + ind_dict[np.argmax(ans[0])] + '] ' + sentences[idx+1])
        else:
            return(ind_dict[np.argmax(ans[0])])

In [96]:
text = """
Philosophy of Education is a label applied to the study of the purpose, process, nature and ideals of education. It can be considered a branch of both philosophy and education. Education can be defined as the teaching and learning of specific skills, and the imparting of knowledge, judgment and wisdom, and is something broader than the societal institution of education we often speak of.

Many educationalists consider it a weak and woolly field, too far removed from the practical applications of the real world to be useful. Philosophers dating back to Plato and the Ancient Greeks have given the area much thought and emphasis, and there is little doubt that their work has helped shape the practice of education over the millennia.

Plato is the earliest important educational thinker, and education is an essential element in "The Republic" (his most important work on philosophy and political theory, written around 360 B.C.). In it, he advocates some rather extreme methods: removing children from their mothers' care and raising them as wards of the state, and differentiating children suitable to the various castes, the highest receiving the most education, so that they could act as guardians of the city and care for the less able. He believed that education should be holistic, including facts, skills, physical discipline, music and art. Plato believed that talent and intelligence is not distributed genetically and thus is be found in children born to all classes, although his proposed system of selective public education for an educated minority of the population does not really follow a democratic model.

Aristotle considered human nature, habit and reason to be equally important forces to be cultivated in education, the ultimate aim of which should be to produce good and virtuous citizens. He proposed that teachers lead their students systematically, and that repetition be used as a key tool to develop good habits, unlike Socrates' emphasis on questioning his listeners to bring out their own ideas. He emphasized the balancing of the theoretical and practical aspects of subjects taught, among which he explicitly mentions reading, writing, mathematics, music, physical education, literature, history, and a wide range of sciences, as well as play, which he also considered important.

During the Medieval period, the idea of Perennialism was first formulated by St. Thomas Aquinas in his work "De Magistro". Perennialism holds that one should teach those things deemed to be of everlasting importance to all people everywhere, namely principles and reasoning, not just facts (which are apt to change over time), and that one should teach first about people, not machines or techniques. It was originally religious in nature, and it was only much later that a theory of secular perennialism developed.

During the Renaissance, the French skeptic Michel de Montaigne (1533 - 1592) was one of the first to critically look at education. Unusually for his time, Montaigne was willing to question the conventional wisdom of the period, calling into question the whole edifice of the educational system, and the implicit assumption that university-educated philosophers were necessarily wiser than uneducated farm workers, for example.

"""

In [97]:
pred(text)


Philosophy of Education is a label applied to the study of the purpose, process, nature and ideals of education.
[But] It can be considered a branch of both philosophy and education.
[But] Education can be defined as the teaching and learning of specific skills, and the imparting of knowledge, judgment and wisdom, and is something broader than the societal institution of education we often speak of.
[NULL] Many educationalists consider it a weak and woolly field, too far removed from the practical applications of the real world to be useful.
[NULL] Philosophers dating back to Plato and the Ancient Greeks have given the area much thought and emphasis, and there is little doubt that their work has helped shape the practice of education over the millennia.
[But] Plato is the earliest important educational thinker, and education is an essential element in "The Republic" (his most important work on philosophy and political theory, written around 360 B.C.).
[But] In it, he advocates some ra

In [125]:
# see where the errors are landing

errors = Counter()
total_pred = Counter()

for idx in tqdm(range(len(X_test))):
    predicted = pred(X_test[idx], True)
    true = ind_dict[np.argmax(y_test[idx])]
    total_pred[predicted] += 1
    if predicted != true:
        errors[true + ' => ' + predicted] += 1

HBox(children=(IntProgress(value=0, max=4279), HTML(value='')))

In [126]:
errors.most_common()

[('But => NULL', 754), ('NULL => But', 443)]

In [127]:
total_pred.most_common()

[('NULL', 2429), ('But', 1850)]

In [101]:
for idx, row in bnc_df[bnc_df.y_dense == num].sample(20).iterrows():
    print(' '.join(row['sent1']))
    result = pred(np.array(row['X']), True)
    print(result)
    print(' '.join(row['sent2']))
    print()


If he could convince her that nothing she could say or do would make him change his mind about Carrie , the way would be open for him to prove to Carrie he had enough love for the two of them .
But
There was so little time left now , for he could get his papers any day … He had the urge to run .

’
But
She knew that this was more important .

She could believe that this was the creature that had floated inside her — yes , like a starry astronaut in his liquid capsule , attached to his red life-support cable — she had pored over photographs of embryos and imagined him a hundred times .
But
Surely only his perfect oblivious innocence of all evil including pain had enabled him to survive the journey down .

and the car Will get priority .
But
Does it have to be Curtains

So long as the traveller historians and geographers , such as Herodotus and Pliny the Elder , Chau Ju-Kwa and Ibn Batuta , Friar Odoric and Marco Polo , are writing as eye-witness observers , most of what they report is e

In [102]:
for idx, row in oanc_df[oanc_df.y_dense == 9].sample(20).iterrows():
    print(' '.join(row['sent1']))
    result = pred(np.array(row['X']), True)
    print(result)
    print(' '.join(row['sent2']))
    print()


NULL
What publication , what danger ?

My significant other is driving me berserk .
NULL
He changes religious beliefs like some people change clothes .

Lincoln once told his biographer and friend William Herndon that he had been infected with syphilis by a prostitute in Beardstown around 1835 [ 6 ] .
NULL
What if a future test could prove that Lincoln had spoken the truth ?

There ’ s not a hint of the 14th century in its splendid western façade , however .
NULL
This Baroque renovation , one of Jaime Bort ’ s celebrated designs , was undertaken when the original Gothic front suffered irreparable damage in a disastrous flood of the Segura in 1735 .

Christmas Jeers Please send your questions for publication to prudence @ slate.com .
NULL
Dear Prudie , I received the worst version of the dreaded Christmas letter -- addressed to no one in particular -- with an early Christmas card .

Seven Years in Tibet ends with the Dalai Lama 's enthronement in 1950 at age 15 and his assumption of the