In [58]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pickle
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.utils import shuffle
import random as rand
from nltk import word_tokenize, sent_tokenize
from collections import Counter
import math

import keras.backend as K
import tensorflow as tf

from keras.models import Model
from keras.layers import Input, Dense, Lambda, Dropout, Bidirectional, LSTM
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# train doc2vec encodings

In [2]:
df = pd.read_pickle('data/discourse_markers/oanc_df.zip')

In [4]:
X_tokens = []
for idx, row in tqdm(df.iterrows(), total=len(df)):
    for item in row['clean_and_tokenized']:
        X_tokens.append(item)

HBox(children=(IntProgress(value=0, max=65101), HTML(value='')))




In [5]:
tagged = []
for i, sent in enumerate(tqdm(X_tokens)):
    tagged.append(TaggedDocument(words = sent, tags = [str(i)]))

HBox(children=(IntProgress(value=0, max=338890), HTML(value='')))




In [6]:
d2v_oanc = Doc2Vec(vector_size = 50, min_count = 1, dm = 1)
d2v_oanc.build_vocab(tagged)
print('vocabulary built')

vocabulary built


In [7]:
d2v_oanc.train(tagged, total_examples = d2v_oanc.corpus_count, epochs = 20)
print('training finished')
d2v_oanc.save("data/discourse_markers/d2v_oanc.model")
print("trained & saved")

training finished
trained & saved


# vectorize texts and add to new df

In [42]:
vecs = []

index = 0
for idx, row in tqdm(df.iterrows(), total = len(df)):
    current_vecs = []
    for item in row['clean_and_tokenized']:
        assert tagged[index].words == item
        current_vecs.append(d2v_oanc.docvecs[str(index)])
        index += 1
    vecs.append(current_vecs)
        
df['X'] = vecs

HBox(children=(IntProgress(value=0, max=65101), HTML(value='')))




In [43]:
df = df.drop(columns=['sents', 'text'])
df = df.rename(columns={"vectors": "y"})
df.to_pickle('data/discourse_markers/vectorized_oanc_df.zip')

# extract X and y, prepare balancing

## padding

# Dataset take 2: only sentence pairs

In [136]:
df = pd.read_pickle('data/discourse_markers/vectorized_oanc_df.zip')
with open('data/discourse_markers/oanc_terms.pkl', 'rb') as f:
    terms_dict = pickle.load(f)
ind_dict = {v: k for k, v in terms_dict.items()}
ind_dict[9] = 'NULL'

In [137]:
df.head()

Unnamed: 0,label,clean_and_tokenized,y,X
0,non-fiction/OUP/Berk/ch1,"[[In, my, three, decades, of, teaching, univer...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, ...","[[0.054467976, 0.1869069, 0.06425432, 0.130815..."
1,non-fiction/OUP/Berk/ch1,"[[As, a, byproduct, of, those, experiences, ,,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, ...","[[0.23088518, 5.699069e-05, -0.19189279, 0.195..."
2,non-fiction/OUP/Berk/ch1,"[[When, we, looked, for, a, preschool, ,, many...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, ...","[[0.16534813, 0.3831386, -0.071578294, 0.27849..."
3,non-fiction/OUP/Berk/ch1,"[[I, ’, ve, read, that, it, ’, s, the, quality...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, ...","[[-0.16124506, 0.21021883, -0.029272433, -0.12..."
4,non-fiction/OUP/Berk/ch1,"[[His, father, ﬁrmly, insists, that, he, do, i...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, ...","[[0.17460386, 0.14078914, -0.039566375, -0.223..."


In [138]:
# create new DF of all sentence pairs
sent_1 = []
sent_2 = []
label = []
X = []
y = []

for idx, row in tqdm(df.iterrows(), total = len(df)):    
    seq_len = len(row['clean_and_tokenized'])
    assert seq_len == len(row['y']) 
    assert seq_len == len(row['X'])
    
    for i in range(seq_len - 1):
        label.append(row['label'])
        sent_1.append(row['clean_and_tokenized'][i])
        sent_2.append(row['clean_and_tokenized'][i+1])
        X.append(row['X'][i:i+2])
        y.append(row['y'][i+1])

pair_df = pd.DataFrame()
pair_df['sent1'] = sent_1
pair_df['sent2'] = sent_2
pair_df['label'] = label
pair_df['X'] = X
pair_df['y'] = y
pair_df.head()

HBox(children=(IntProgress(value=0, max=65101), HTML(value='')))




Unnamed: 0,sent1,sent2,label,X,y
0,"[In, my, three, decades, of, teaching, univers...","[I, also, served, on, boards, of, directors, a...",non-fiction/OUP/Berk/ch1,"[[0.054467976, 0.1869069, 0.06425432, 0.130815...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
1,"[I, also, served, on, boards, of, directors, a...","[My, research, continually, drew, me, into, cl...",non-fiction/OUP/Berk/ch1,"[[-0.04379302, 0.39241648, 0.17816554, 0.17242...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
2,"[As, a, byproduct, of, those, experiences, ,, ...","[Their, fervent, questions, ,, at, times, ridd...",non-fiction/OUP/Berk/ch1,"[[0.23088518, 5.699069e-05, -0.19189279, 0.195...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
3,"[When, we, looked, for, a, preschool, ,, many,...","[To, me, ,, Lydia, ’, s, preschool, seems, lik...",non-fiction/OUP/Berk/ch1,"[[0.16534813, 0.3831386, -0.071578294, 0.27849...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
4,"[To, me, ,, Lydia, ’, s, preschool, seems, lik...","[Why, is, Lydia, ,, who, ’, s, always, been, a...",non-fiction/OUP/Berk/ch1,"[[0.22163266, 0.12578495, 0.051759634, 0.14625...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"


In [139]:
pair_df['y_dense'] = pair_df['y'].apply(np.argmax)

In [140]:
pair_df.to_pickle('data/discourse_markers/oanc_pair_df.pkl')

In [141]:
# check distribution of classes
counts = Counter()
for count in pair_df["y_dense"]:
    counts[count] += 1
print(counts)

Counter({9: 252574, 8: 9685, 7: 5545, 6: 1629, 5: 924, 4: 850, 2: 795, 3: 763, 1: 625, 0: 399})


In [157]:
X = []
y = []

for idx in range(9):
    X.extend(pair_df[pair_df.y_dense == idx].X)
    y.extend(pair_df[pair_df.y_dense == idx].y_dense)

sampled_df = pair_df[pair_df.y_dense == 9].sample(n=10000, random_state=1)

X.extend(sampled_df.X)
y.extend(sampled_df.y_dense)

X, y = shuffle(X, y, random_state=0)

In [158]:
with open('data/discourse_markers/oanc_X_pair.pkl', 'wb') as f:
    pickle.dump(X, f)
with open('data/discourse_markers/oanc_y_pair.pkl', 'wb') as f:
    pickle.dump(y, f)

# model building

In [178]:
input_len = 2
num_units = 256
embed_dim = 50

In [209]:
K.clear_session()

In [210]:
main_input = Input(shape = (input_len, embed_dim), dtype = 'float32', name = 'main_input')

lstm = Bidirectional(LSTM(return_sequences = False, units = num_units), name = 'lstm')(main_input)
dropout = Dropout(rate = 0.25, name = 'dropout')(lstm)
output = Dense(10, activation='softmax', name = 'output')(dropout)

In [211]:
model = Model(inputs = main_input, outputs = output)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 2, 50)             0         
_________________________________________________________________
lstm (Bidirectional)         (None, 512)               628736    
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
output (Dense)               (None, 10)                5130      
Total params: 633,866
Trainable params: 633,866
Non-trainable params: 0
_________________________________________________________________


# train

In [163]:
with open('data/discourse_markers/oanc_X_pair.pkl', 'rb') as f:
    X = pickle.load(f)
with open('data/discourse_markers/oanc_y_pair.pkl', 'rb') as f:
    y = pickle.load(f)

In [164]:
class_weights = class_weight.compute_class_weight('balanced', np.unique(y), y)

In [200]:
class_weights

array([7.82330827, 4.9944    , 3.92641509, 4.09108781, 3.67235294,
       3.37824675, 1.91620626, 0.56293959, 0.32230253, 0.31215   ])

In [166]:
X = np.array(X)

In [167]:
y = tf.keras.utils.to_categorical(y, num_classes=10, dtype='float32')

In [168]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=47)

In [212]:
model.compile(optimizer = 'adam',
             loss = 'categorical_crossentropy',
             metrics = ['accuracy'])

history = model.fit(X, y, 
                    epochs = 5, 
                    batch_size = 32, 
                    validation_split = 0.1,
                    class_weight = class_weights)

Train on 28093 samples, validate on 3122 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
d2v = Doc2Vec.load("data/discourse_markers/d2v_oanc.model")
np.random.seed = 47
X_pad = np.random.rand(50)
with open('data/discourse_markers/oanc_terms.pkl', 'rb') as f:
    terms_dict = pickle.load(f)
ind_dict = {v: k for k, v in terms_dict.items()}
ind_dict[9] = 'NULL'

In [229]:
def pred(passage, vectorized = False):
    if not vectorized:
        sentences = sent_tokenize(passage)
        tok_sent = [word_tokenize(sentence) for sentence in sentences]
        vectors = [d2v.infer_vector(sentence) for sentence in tok_sent]
    else:
        vectors = passage
    
    for idx in range(len(vectors) - 1):
        if idx == 0 and not vectorized:
            print(sentences[idx])
        input_vec = np.array([vectors[idx], vectors[idx+1]])
        ans = model.predict(np.array([input_vec,]))
        if not vectorized:
            print('[' + ind_dict[np.argmax(ans[0])] + '] ' + sentences[idx+1])
        else:
            return(ind_dict[np.argmax(ans[0])])

In [230]:
text = """
If you’re going to try, go all the way. Otherwise, don’t even start. This could mean losing girlfriends, wives, relatives and maybe even your mind. It could mean not eating for three or four days. It could mean freezing on a park bench. It could mean jail. It could mean derision. It could mean mockery–isolation. Isolation is the gift. All the others are a test of your endurance, of how much you really want to do it. And, you’ll do it, despite rejection and the worst odds. And it will be better than anything else you can imagine. If you’re going to try, go all the way. There is no other feeling like that. You will be alone with the gods, and the nights will flame with fire. You will ride life straight to perfect laughter. It’s the only good fight there is.

"""

In [231]:
pred(text) # Also, she had


If you’re going to try, go all the way.
[But] Otherwise, don’t even start.
[But] This could mean losing girlfriends, wives, relatives and maybe even your mind.
[But] It could mean not eating for three or four days.
[But] It could mean freezing on a park bench.
[But] It could mean jail.
[But] It could mean derision.
[But] It could mean mockery–isolation.
[But] Isolation is the gift.
[NULL] All the others are a test of your endurance, of how much you really want to do it.
[But] And, you’ll do it, despite rejection and the worst odds.
[But] And it will be better than anything else you can imagine.
[But] If you’re going to try, go all the way.
[But] There is no other feeling like that.
[NULL] You will be alone with the gods, and the nights will flame with fire.
[But] You will ride life straight to perfect laughter.
[But] It’s the only good fight there is.


In [235]:
# see where the errors are landing

errors = Counter()
total_pred = Counter()
correct = 0
total = len(X)

for idx in tqdm(range(len(X))):
    predicted = pred(X[idx], True)
    true = ind_dict[np.argmax(y[idx])]
    
    total_pred[predicted] += 1
    
    if predicted == true:
        correct += 1
    else:
        errors[true + ' => ' + predicted] += 1

HBox(children=(IntProgress(value=0, max=31215), HTML(value='')))

In [236]:
print(str(correct/float(total)*100))

39.02610924235143


In [237]:
total_pred.most_common()

[('NULL', 16788),
 ('But', 13235),
 ('And', 1144),
 ('First', 18),
 ('Or', 17),
 ('So', 8),
 ('Now', 4),
 ('Well', 1)]

In [238]:
errors.most_common()

[('But => NULL', 4267),
 ('NULL => But', 3125),
 ('And => NULL', 2708),
 ('And => But', 2474),
 ('So => But', 825),
 ('So => NULL', 704),
 ('Also => NULL', 578),
 ('Now => NULL', 535),
 ('First => NULL', 454),
 ('Or => NULL', 391),
 ('Now => But', 349),
 ('Yet => But', 317),
 ('Or => But', 310),
 ('Yet => NULL', 293),
 ('First => But', 280),
 ('Also => But', 250),
 ('But => And', 244),
 ('NULL => And', 226),
 ('Well => NULL', 215),
 ('Well => But', 141),
 ('So => And', 96),
 ('Or => And', 85),
 ('Well => And', 42),
 ('Now => And', 40),
 ('Also => And', 20),
 ('First => And', 20),
 ('Yet => And', 14),
 ('And => Or', 4),
 ('But => First', 3),
 ('But => So', 3),
 ('NULL => Or', 3),
 ('But => Or', 2),
 ('And => First', 2),
 ('But => Now', 2),
 ('NULL => So', 2),
 ('Also => First', 1),
 ('Yet => First', 1),
 ('So => First', 1),
 ('So => Now', 1),
 ('First => Or', 1),
 ('NULL => First', 1),
 ('Or => First', 1),
 ('Or => So', 1),
 ('Also => Now', 1)]