In [14]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pickle
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.utils import shuffle
import random as rand
from nltk import word_tokenize, sent_tokenize
from collections import Counter
import math

import keras.backend as K
import tensorflow as tf

from keras.models import Model
from keras.layers import Input, Dense, Lambda, Dropout, Bidirectional, SimpleRNN
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the OANC sentence-pair DataFrame from its pickle.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
oanc_df = pd.read_pickle('data/discourse_markers/oanc_pair_df.pkl')

In [20]:
# Preview the first rows; the X and y_dense columns shown here are the ones used to build the training set.
oanc_df.head()

Unnamed: 0,sent1,sent2,label,X,y,y_dense
0,"[In, my, three, decades, of, teaching, univers...","[I, also, served, on, boards, of, directors, a...",non-fiction/OUP/Berk/ch1,"[[0.054467976, 0.1869069, 0.06425432, 0.130815...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",9
1,"[I, also, served, on, boards, of, directors, a...","[My, research, continually, drew, me, into, cl...",non-fiction/OUP/Berk/ch1,"[[-0.04379302, 0.39241648, 0.17816554, 0.17242...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",7
2,"[As, a, byproduct, of, those, experiences, ,, ...","[Their, fervent, questions, ,, at, times, ridd...",non-fiction/OUP/Berk/ch1,"[[0.23088518, 5.699069e-05, -0.19189279, 0.195...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",9
3,"[When, we, looked, for, a, preschool, ,, many,...","[To, me, ,, Lydia, ’, s, preschool, seems, lik...",non-fiction/OUP/Berk/ch1,"[[0.16534813, 0.3831386, -0.071578294, 0.27849...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",9
4,"[To, me, ,, Lydia, ’, s, preschool, seems, lik...","[Why, is, Lydia, ,, who, ’, s, always, been, a...",non-fiction/OUP/Berk/ch1,"[[0.22163266, 0.12578495, 0.051759634, 0.14625...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",9


In [4]:
# Load the BNC DataFrame from a zipped pickle.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
bnc_df = pd.read_pickle('data/discourse_markers/bnc_df.zip')
# Disabled: derives the dense class index from the one-hot `y` column.
# Presumably no longer needed because y_dense is already stored in the pickle — confirm.
#bnc_df['y_dense'] = bnc_df['y'].apply(np.argmax)

In [21]:
# Preview the first rows of the BNC frame.
# NOTE(review): this preview shows no `X` column, yet later cells read bnc_df.X / row['X'];
# the column is presumably added by a cell that is not in this notebook view — confirm
# before relying on Restart & Run All.
bnc_df.head()

Unnamed: 0,label,sent1,sent2,sent2_orig,y,y_dense
0,[Central television news scripts]. Sample co...,"[Well, ,, the, sun, may, have, turned, to, sto...","[Some, villagers, are, now, worried, that, a, ...","[And, some, villagers, are, now, worried, that...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...",7
1,[Central television news scripts]. Sample co...,"[And, some, villagers, are, now, worried, that...","[Gareth, Furby, reports, .]",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9
2,[Central television news scripts]. Sample co...,"[Gareth, Furby, reports, .]","[It, may, have, rained, for, hours, ,, but, it...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9
3,[Central television news scripts]. Sample co...,"[It, may, have, rained, for, hours, ,, but, it...","[It, 's, source, ,, a, chalk, spring, ,, staye...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9
4,[Central television news scripts]. Sample co...,"[It, 's, source, ,, a, chalk, spring, ,, staye...","[The, river, and, its, fish, remained, just, a...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9


In [22]:
# Read the pickled marker -> class-index mapping.
with open('data/discourse_markers/oanc_terms.pkl', 'rb') as f:
    terms_dict = pickle.load(f)

# Reverse lookup (class index -> marker); index 9 is registered as the 'NULL' class.
ind_dict = dict(zip(terms_dict.values(), terms_dict.keys()))
ind_dict[9] = 'NULL'

terms_dict

{'Also': 4,
 'And': 7,
 'But': 8,
 'First': 3,
 'Now': 5,
 'Or': 2,
 'So': 6,
 'Well': 0,
 'Yet': 1}

In [6]:
# Class distribution of the OANC pairs, keyed by dense label index.
# Counter consumes the column directly, so no manual increment loop is needed.
counts = Counter(oanc_df["y_dense"])
print(counts)

Counter({9: 252574, 8: 9685, 7: 5545, 6: 1629, 5: 924, 4: 850, 2: 795, 3: 763, 1: 625, 0: 399})


In [5]:
# Class distribution of the BNC pairs, keyed by dense label index.
# Counter consumes the column directly, so no manual increment loop is needed.
counts = Counter(bnc_df["y_dense"])
print(counts)

Counter({9: 657673, 8: 15941, 7: 10360, 6: 4184, 5: 2308, 0: 2202, 1: 1656, 2: 1169, 3: 1089, 4: 492})


# create X and y

In [324]:
# Build a balanced binary dataset for a single discourse marker.
num = 5 # class index of the target marker: 'Now' in terms_dict

# Positive examples: every pair labelled with the target marker, from both corpora.
oanc_pos = oanc_df[oanc_df.y_dense == num]
bnc_pos = bnc_df[bnc_df.y_dense == num]

X = list(oanc_pos.X) + list(bnc_pos.X)
y = list(oanc_pos.y_dense) + list(bnc_pos.y_dense)

# Negative examples: as many non-target OANC pairs as there are positives (fixed seed).
sampled_oanc = oanc_df[oanc_df.y_dense != num].sample(n=len(X), random_state=1)

X.extend(sampled_oanc.X)
y.extend(sampled_oanc.y_dense)

# Collapse the multi-class labels to binary: 1 = target marker, 0 = anything else.
y = [1 if label == num else 0 for label in y]

# Shuffle features and labels together so the classes are interleaved.
X, y = shuffle(X, y, random_state=0)

# model building

In [325]:
# Model hyper-parameters.
embed_dim = 50    # width of each sentence vector fed to the network
input_len = 2     # sentence vectors per sample
num_units = 256   # SimpleRNN units per direction

In [326]:
# Reset the Keras backend session before (re)building the model below.
K.clear_session()

In [327]:
# Network: (input_len, embed_dim) input -> bidirectional SimpleRNN -> dropout -> 2-way softmax.
main_input = Input(name='main_input', shape=(input_len, embed_dim), dtype='float32')

# Only the final state of each direction is kept (return_sequences=False).
rnn = Bidirectional(
    SimpleRNN(units=num_units, return_sequences=False),
    name='rnn',
)(main_input)
dropout = Dropout(name='dropout', rate=0.25)(rnn)
output = Dense(2, name='output', activation='softmax')(dropout)

In [328]:
# Wrap the layer graph (main_input -> output) in a Model and print its layer/parameter summary.
model = Model(inputs = main_input, outputs = output)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 2, 50)             0         
_________________________________________________________________
rnn (Bidirectional)          (None, 512)               157184    
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
output (Dense)               (None, 2)                 1026      
Total params: 158,210
Trainable params: 158,210
Non-trainable params: 0
_________________________________________________________________


# train

In [329]:
# Convert the Python lists into arrays Keras can consume.
# NOTE(review): this cell rebinds X and y in place, so it is not idempotent — running it a
# second time would one-hot encode the already one-hot y. Re-run the "create X and y" cell first.
X = np.array(X)
# One-hot encode the binary labels into two columns.
y = to_categorical(y, 2)

In [330]:
# Hold out 10% of the samples as a test set (fixed random_state for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=47)

In [331]:
# The output layer is a 2-unit softmax trained against one-hot targets, so the matching
# loss is categorical cross-entropy. binary_crossentropy only coincides with it because
# there are exactly two classes.
model.compile(optimizer = 'adam',
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

# Train for 5 epochs; a further 10% of the training split is held out for validation.
history = model.fit(X_train, y_train,
                    epochs = 5,
                    batch_size = 32,
                    validation_split = 0.1)

Train on 5235 samples, validate on 582 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# test

In [292]:
# Load the trained Doc2Vec model used by pred() to embed raw sentences.
d2v = Doc2Vec.load("data/discourse_markers/d2v_oanc.model")

# Seed NumPy's global RNG. This must be a call: assigning `np.random.seed = 47` would
# replace the seeding function with an int and leave the generator unseeded.
np.random.seed(47)

In [332]:
# Label names for the binary classifier; this replaces the 10-way ind_dict built earlier.
ind_dict = {
    0: 'NULL',  # negative class: pair not labelled with the target marker
    1: 'Now',   # positive class: the target marker
}

In [333]:
def pred(passage, vectorized = False):
    """Predict the discourse-marker label between consecutive sentences.

    Relies on the notebook globals `model` (the trained classifier), `ind_dict`
    (class index -> label name) and, for raw text, `d2v` (the Doc2Vec model).

    Parameters
    ----------
    passage : str or sequence of vectors
        Raw text when `vectorized` is False; otherwise a sequence of sentence
        vectors that is fed to the model as-is.
    vectorized : bool, default False
        Whether `passage` already holds sentence vectors.

    Returns
    -------
    str or None
        With `vectorized=True`, the label predicted for the first pair
        (vectors[0], vectors[1]); any further vectors are ignored.
        With `vectorized=False`, nothing is returned: the passage is printed
        with the predicted label in brackets before every sentence after the first.
        None is also returned when there are fewer than two sentences/vectors,
        because there is no pair to classify.
    """
    if vectorized:
        sentences = None
        vectors = passage
    else:
        sentences = sent_tokenize(passage)
        vectors = [d2v.infer_vector(word_tokenize(sentence)) for sentence in sentences]

    # No adjacent pair exists, so there is nothing to predict (or print).
    if len(vectors) < 2:
        return None

    if vectorized:
        # Callers pass a single pair; classify it as a batch of one.
        first_pair = np.array([[vectors[0], vectors[1]]])
        return ind_dict[np.argmax(model.predict(first_pair)[0])]

    # Score every adjacent sentence pair in one batched call instead of one call per pair.
    pairs = np.array([[vectors[idx], vectors[idx + 1]] for idx in range(len(vectors) - 1)])
    scores = model.predict(pairs)

    print(sentences[0])
    for idx, pair_scores in enumerate(scores):
        print('[' + ind_dict[np.argmax(pair_scores)] + '] ' + sentences[idx + 1])

In [334]:
# Tally test-set predictions: how often each label is predicted and which confusions occur.

total = len(X_test)
correct = 0
total_pred = Counter()
errors = Counter()

for sample_idx in tqdm(range(total)):
    true = ind_dict[np.argmax(y_test[sample_idx])]
    predicted = pred(X_test[sample_idx], True)

    total_pred[predicted] += 1

    if predicted != true:
        # Key the confusion as "<true label> => <predicted label>".
        errors[true + ' => ' + predicted] += 1
    else:
        correct += 1

# Test accuracy as a percentage.
print(str(correct / float(total) * 100))

HBox(children=(IntProgress(value=0, max=647), HTML(value='')))


78.05255023183926


In [335]:
# Confusions on the test set, most frequent first ("<true label> => <predicted label>").
errors.most_common()

[('NULL => Now', 74), ('Now => NULL', 68)]

In [336]:
# How often each label was predicted on the test set, most frequent first.
total_pred.most_common()

[('NULL', 327), ('Now', 320)]

In [337]:
# Spot check on 20 random BNC pairs that carry the target marker:
# print the first sentence, the predicted label, then the second sentence.
marker_sample = bnc_df[bnc_df.y_dense == num].sample(20)
for row_label, row in marker_sample.iterrows():
    print(' '.join(row['sent1']))
    print(pred(np.array(row['X']), True))
    # end='\n\n' leaves a blank separator line after each example.
    print(' '.join(row['sent2']), end='\n\n')


He had been looking for the Face of Death .
Now
He had found it .

could carry water to him in a basket .
Now
‘ That you have shown me all that is

In the old days the journey took nine or ten days by camel .
Now
It is eight hours by bus along a narrow asphalt road .

In the ballot for the Leadership of the new , merged party , the votes were as follows : Paddy Ashdown 41,401 Alan Beith 16,202
Now
That each party has formal election procedures through which to choose its Leader , the Queen in normal circumstances will not have to make a personal choice between rival candidates for Prime Minister , a choice which she had to make in 1957 and in 1963 before the Conservative Party adopted election rules .

At the far side , the peninsula of Northmavine forms a boundary which ends at Fedeland , one of the famous ‘ haaf stations ’ of last century , from which the Shetland fisherman used to row or sail in open boats to the ‘ far haaf ’ , the fishing grounds which could be as far as sixty or m

In [338]:
# Spot check on 20 random OANC pairs labelled 9 (the 'NULL' class):
# print the first sentence, the predicted label, then the second sentence.
null_sample = oanc_df[oanc_df.y_dense == 9].sample(20)
for row_label, row in null_sample.iterrows():
    print(' '.join(row['sent1']))
    print(pred(np.array(row['X']), True))
    # end='\n\n' leaves a blank separator line after each example.
    print(' '.join(row['sent2']), end='\n\n')


For GAO to effectively do its job and obtain all the facts , we must have unfettered access to records no matter where the federal dollar goes and services are delivered .
Now
As I 've stressed , we are making major changes in how GAO will face the future , both to support Congress and to lead the government in strategic planning , human capital management , information technology , and other areas .

The New York Times reveals that nonprofits , including churches , pocketed millions in Federal grants that are designated to feed poor children .
NULL
The government nourishes 2.4 million day care kids by reimbursing intermediary organizations , which oversee the doling out of meals .

The road climbs via a series of switchback turns , and once at the top you ’ ll have a clear view of the town lying in the flat plain below .
Now
The lifestyle of the people who live high on the hillside is fascinating .

November 15 , 1996 Dear Personal Donor : In the short while since Goodwill helped him 