In [8]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pickle
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.utils import shuffle
import random as rand
from nltk import word_tokenize, sent_tokenize
from collections import Counter
import math

import keras.backend as K
import tensorflow as tf

from keras.models import Model
from keras.layers import Input, Dense, Lambda, Dropout, Bidirectional, SimpleRNN
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.utils import plot_model

This notebook was used to build model(s) to predict the different chosen discourse markers which might come between sentence pairs. 

# import OANC and BNC datasets

In [9]:
oanc_df = pd.read_pickle('data/discourse_markers/oanc_pair_df.zip')

In [10]:
bnc_df = pd.read_pickle('data/discourse_markers/bnc_pair_df.zip')

# vectorize using doc2vec

Here I trained a doc2vec model using gensim on all (original) sentences included in the datasets, and then vectorized all texts to include in the DataFrame.

In [51]:
X_tokens = []
for idx, row in tqdm(bnc_df.iterrows(), total=len(bnc_df)):
    X_tokens.append(row['sent1'])
for idx, row in tqdm(oanc_df.iterrows(), total=len(oanc_df)):
    X_tokens.append(row['sent1'])

HBox(children=(IntProgress(value=0, max=561648), HTML(value='')))

HBox(children=(IntProgress(value=0, max=273702), HTML(value='')))

In [52]:
tagged = []
for i, sent in enumerate(tqdm(X_tokens)):
    tagged.append(TaggedDocument(words = sent, tags = [str(i)]))

HBox(children=(IntProgress(value=0, max=835350), HTML(value='')))

In [53]:
d2v = Doc2Vec(vector_size = 100, min_count = 1, dm = 0)
d2v.build_vocab(tagged)
print('vocabulary built')
d2v.train(tagged, total_examples = d2v.corpus_count, epochs = 20)
print('training finished')
d2v.save("data/discourse_markers/d2v.model")
print("trained & saved")

vocabulary built
training finished
trained & saved


In [59]:
index = 0

X = []
for idx, row in tqdm(bnc_df.iterrows(), total=len(bnc_df)):
    assert tagged[index].words == row['sent1']
    sent1_vec = d2v.docvecs[str(index)]
    sent2_vec = d2v.infer_vector(row['sent2'])
    if index + 1 < len(tagged):
        if tagged[index+1].words == row['sent2']:
            sent2_vec = d2v.docvecs[str(index)]
    index += 1
    X.append([sent1_vec, sent2_vec])  
bnc_df['X'] = X

X = []
for idx, row in tqdm(oanc_df.iterrows(), total=len(oanc_df)):
    assert tagged[index].words == row['sent1']
    sent1_vec = d2v.docvecs[str(index)]
    sent2_vec = d2v.infer_vector(row['sent2'])
    if index + 1 < len(tagged):
        if tagged[index+1].words == row['sent2']:
            sent2_vec = d2v.docvecs[str(index)]
    index += 1
    X.append([sent1_vec, sent2_vec])  
oanc_df['X'] = X

HBox(children=(IntProgress(value=0, max=561648), HTML(value='')))

HBox(children=(IntProgress(value=0, max=273702), HTML(value='')))

In [62]:
oanc_df.to_pickle('data/discourse_markers/oanc_pair_df.zip')
bnc_df.to_pickle('data/discourse_markers/bnc_pair_df.zip')

# create X and y

Here the data is selected for training. First, a marker is chosen via its index in the provided dictionary. The resulting training set is balanced between positive and negative samples. A balanced test set is created by splitting the first results, and then an unbalanced dataset that matches the distribution in the original datasets is also sampled.

In [15]:
def build_X_y(term_idx):
    # takes an index for a term
    # returns X_train, X_test, y_train, y_test
    
    df = pd.DataFrame()
    df = pd.concat([df, 
                    oanc_df[oanc_df.y_dense == term_idx], 
                    bnc_df[bnc_df.y_dense == term_idx]],
                  sort = False)
    
    sampled_oanc = oanc_df[oanc_df.y_dense != term_idx].sample(n=int(len(df)/2), random_state = 47)
    sampled_bnc = bnc_df[bnc_df.y_dense != term_idx].sample(n=int(len(df)/2), random_state = 47)
    df = pd.concat([df, sampled_oanc, sampled_bnc], sort = False)
    df = df.reset_index(drop = True)
    
    indices = list(df.sample(frac=0.1, random_state = 47).index.values)
    
    set_labels = []
    for idx, row in df.iterrows():
        if idx in indices:
            set_labels.append('test')
        else:
            set_labels.append('train')
    df['set'] = set_labels
    
    X_train = np.stack(df[df.set == 'train'].X, axis = 0)
    X_test = np.stack(df[df.set == 'test'].X, axis = 0)
    y_train = [1 if x == term_idx else 0 for x in df[df.set == 'train'].y_dense]
    y_train = to_categorical(y_train, 2)
    y_test = [1 if x == term_idx else 0 for x in df[df.set == 'test'].y_dense]
    y_test = to_categorical(y_test, 2)
    
    X_train, y_train = shuffle(X_train, y_train, random_state=0)
    
    return df, (X_train, X_test, y_train, y_test)

# build the model

In [12]:
def build(X_train, y_train, rnn_units = 128, dense_units = 100, dropout_rate = 0.5):
    
    input_len = 2
    embed_dim = 100
    batch_size = 32
    
    # make sure weights are empty
    #model.load_weights('data/discourse_markers/model.clean')
    
    K.clear_session()

    main_input = Input(shape = (input_len, embed_dim), dtype = 'float32', name = 'main_input')

    rnn = Bidirectional(SimpleRNN(return_sequences = False, units = rnn_units), name = 'rnn')(main_input)
    rnn_dropout = Dropout(rate = dropout_rate, name = 'dropout')(rnn)

    dense = Dense(dense_units, activation = 'relu', name = 'dense')(rnn_dropout)
    dense_dropout = Dropout(rate = dropout_rate, name = 'dense_dropout')(dense)

    dense2 = Dense(dense_units, activation = 'relu', name = 'dense2')(dense_dropout)
    dense2_dropout = Dropout(rate = dropout_rate, name = 'dense2_dropout')(dense2)

    output = Dense(2, activation = 'softmax', name = 'output')(dense2_dropout)
    
    model = Model(inputs = main_input, outputs = output)
    #model.summary()
    
    model.compile(optimizer = 'adam',
             loss = 'binary_crossentropy',
             metrics = ['accuracy'])
    
    # save "clear" weights to retrain on different datasets
    #model.save_weights('data/discourse_markers/model.clean')
    
    history = model.fit(X_train, y_train, 
                        epochs = 5, 
                        batch_size = batch_size, 
                        validation_split = 0.1,
                       verbose = 0)
    
    return history, model

# train

In [13]:
with open('data/discourse_markers/oanc_terms.pkl', 'rb') as f:
    terms_dict = pickle.load(f)

In [16]:
for term in terms_dict:
    print('Building model for discourse marker:\t' + term)
    
    term_df, (X_train, X_test, y_train, y_test) = build_X_y(terms_dict[term])
    term_df.to_pickle('data/discourse_markers_models/' + term + '_df.pkl')
    print('Length of training set:\t\t\t' + str(int(len(X_train)*.9)))
    print('Length of validation set:\t\t' + str(int(len(X_train)*.1)))
    print('Length of test set:\t\t\t' + str(len(y_test)))
    
    history, model = build(X_train, y_train)
    print('Train accuracy:\t\t\t\t' + str(history.history['acc'][-1]))
    print('Validation accuracy:\t\t\t' + str(history.history['val_acc'][-1]))
        
    loss, accuracy = model.evaluate(X_test, y_test, batch_size = 32, verbose = 0)
    print('Test accuracy:\t\t\t\t' + str(accuracy) + '\n')
    
    model.save('data/discourse_markers_models/model.' + term + '.h5')

Building model for discourse marker:	Yet
Length of training set:			3091
Length of validation set:		343
Length of test set:			382
Train accuracy:				0.7094791329088276
Validation accuracy:			0.7005813939626827
Test accuracy:				0.7198952888943138

Building model for discourse marker:	So
Length of training set:			7678
Length of validation set:		853
Length of test set:			948
Train accuracy:				0.7219327949521194
Validation accuracy:			0.7564402817283916
Test accuracy:				0.7710970469164949

Building model for discourse marker:	Or
Length of training set:			2601
Length of validation set:		289
Length of test set:			321
Train accuracy:				0.7412533641365664
Validation accuracy:			0.7517241379310344
Test accuracy:				0.7071651090342679

Building model for discourse marker:	And
Length of training set:			20926
Length of validation set:		2325
Length of test set:			2584
Train accuracy:				0.7520787536864366
Validation accuracy:			0.7687016336546821
Test accuracy:				0.7859907120743034

Building mo