In [137]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pickle
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.utils import shuffle
import random as rand
from nltk import word_tokenize
from collections import Counter
import math

import keras.backend as K
import tensorflow as tf

from keras.models import Model
from keras.layers import Input, Dense, Lambda, Dropout, Bidirectional, LSTM
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# train doc2vec encodings

In [40]:
# Load the OANC DataFrame from its pickled archive
# (presumably one paragraph per row — verify against the preprocessing notebook).
df = pd.read_pickle('data/discourse_markers/oanc_df.zip')

In [4]:
# Flatten the per-row lists in 'clean_and_tokenized' into one corpus-wide list,
# preserving row order and the order of items within each row.
X_tokens = [
    sentence
    for sentences in tqdm(df['clean_and_tokenized'], total=len(df))
    for sentence in sentences
]

HBox(children=(IntProgress(value=0, max=65101), HTML(value='')))




In [5]:
# Wrap every entry of X_tokens in a TaggedDocument whose single tag is the
# entry's position in X_tokens, rendered as a string.
tagged = [
    TaggedDocument(words = tokens, tags = [str(position)])
    for position, tokens in enumerate(tqdm(X_tokens))
]

HBox(children=(IntProgress(value=0, max=338890), HTML(value='')))




In [6]:
# Create an untrained Doc2Vec model with 50-dimensional vectors. min_count = 1
# keeps even tokens that occur once; dm = 1 selects gensim's distributed-memory
# (PV-DM) training mode.
d2v_oanc = Doc2Vec(vector_size = 50, min_count = 1, dm = 1)
# Build the vocabulary from the tagged documents before training.
d2v_oanc.build_vocab(tagged)
print('vocabulary built')

vocabulary built


In [7]:
# Train for 20 epochs over the tagged documents; total_examples reuses the
# corpus size recorded on the model (d2v_oanc.corpus_count).
d2v_oanc.train(tagged, total_examples = d2v_oanc.corpus_count, epochs = 20)
print('training finished')
# Persist the trained model to disk.
d2v_oanc.save("data/discourse_markers/d2v_oanc.model")
print("trained & saved")

training finished
trained & saved


# vectorize texts and add to new df

In [42]:
# For every row, look up the trained document vector of each of its sentences.
# `index` is the running position in the flat `tagged` list, which follows the
# same row-by-row order as this loop.
vecs = []
index = 0

for sentences in tqdm(df['clean_and_tokenized'], total = len(df)):
    paragraph_vecs = []
    for position, tokens in enumerate(sentences, start = index):
        # guard against the DataFrame and the tagged corpus drifting out of sync
        assert tagged[position].words == tokens
        paragraph_vecs.append(d2v_oanc.docvecs[str(position)])
    index += len(sentences)
    vecs.append(paragraph_vecs)

df['X'] = vecs

HBox(children=(IntProgress(value=0, max=65101), HTML(value='')))




In [43]:
# Remove the 'sents' and 'text' columns, expose the 'vectors' column under the
# name 'y', then persist the result.
df = (
    df
    .drop(columns=['sents', 'text'])
    .rename(columns={"vectors": "y"})
)
df.to_pickle('data/discourse_markers/vectorized_oanc_df.zip')

# extract X and y, prepare balancing

In [54]:
# Reload the vectorized DataFrame and the pickled term dictionary.
df = pd.read_pickle('data/discourse_markers/vectorized_oanc_df.zip')
with open('data/discourse_markers/oanc_terms.pkl', 'rb') as terms_file:
    terms_dict = pickle.load(terms_file)

# Invert the mapping (value -> key) and register index 9 as the NULL class.
ind_dict = {}
for term, term_index in terms_dict.items():
    ind_dict[term_index] = term
ind_dict[9] = 'NULL'

In [55]:
# Preview the first rows of the reloaded DataFrame.
df.head()

Unnamed: 0,label,clean_and_tokenized,y,X
0,non-fiction/OUP/Berk/ch1,"[[In, my, three, decades, of, teaching, univer...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, ...","[[0.054467976, 0.1869069, 0.06425432, 0.130815..."
1,non-fiction/OUP/Berk/ch1,"[[As, a, byproduct, of, those, experiences, ,,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, ...","[[0.23088518, 5.699069e-05, -0.19189279, 0.195..."
2,non-fiction/OUP/Berk/ch1,"[[When, we, looked, for, a, preschool, ,, many...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, ...","[[0.16534813, 0.3831386, -0.071578294, 0.27849..."
3,non-fiction/OUP/Berk/ch1,"[[I, ’, ve, read, that, it, ’, s, the, quality...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, ...","[[-0.16124506, 0.21021883, -0.029272433, -0.12..."
4,non-fiction/OUP/Berk/ch1,"[[His, father, ﬁrmly, insists, that, he, do, i...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, ...","[[0.17460386, 0.14078914, -0.039566375, -0.223..."


In [102]:
# check distribution of classes
distribution = [0] * 10
num_samples = 0
class_weights = {}
count = Counter()

for row_idx, paragraph in tqdm(df.iterrows(), total = len(df)):
    labels = paragraph['y']

    # to make things a bit more balanced, drop paragraphs whose sentences are
    # all in the NULL class, as well as super long paragraphs (> 8 sentences)
    all_null = all(item == [0, 0, 0, 0, 0, 0, 0, 0, 0, 1] for item in labels)
    if all_null or len(labels) > 8:
        continue

    # histogram of surviving paragraph lengths
    count[len(labels)] += 1

    # column-wise sum of the label vectors = per-class counts of this paragraph
    per_class = list(map(sum, zip(*labels)))
    distribution = [running + extra for running, extra in zip(distribution, per_class)]

    num_samples += 1

print(distribution)
total = sum(distribution)
print('number of samples\t' + str(num_samples))
print('number of subsamples\t' + str(total))
print()

# share of each class among all counted sentence labels
for idx, class_total in enumerate(distribution):
    print(ind_dict[idx] + '\t' + "{0:.0%}".format(class_total/float(total)))

HBox(children=(IntProgress(value=0, max=65101), HTML(value='')))


[436, 586, 691, 678, 692, 978, 1658, 4858, 8762, 55002]
number of samples	15699
number of subsamples	74341

Well	1%
Yet	1%
Or	1%
First	1%
Also	1%
Now	1%
So	2%
And	7%
But	12%
NULL	74%


In [103]:
def create_class_weight(dist, mu=0.15):
    """Compute inverse-frequency weights for each class index.

    Args:
        dist: Sequence of per-class counts; position ``i`` holds the count of
            class ``i``.
        mu: Unused. It was reserved for a log-smoothed weighting that was never
            enabled; kept so existing calls that pass it keep working.

    Returns:
        dict mapping every class index to ``sum(dist) / dist[index]``. A class
        with a count of zero gets weight ``0.0`` instead of raising
        ``ZeroDivisionError``; such a class never occurs in the data, so its
        weight is never applied.
    """
    total = sum(dist)
    # Named `weights` so the sklearn `class_weight` module imported at the top
    # of the notebook is not shadowed inside this function.
    weights = {}

    for label, label_count in enumerate(dist):
        weights[label] = total / label_count if label_count else 0.0

    return weights

# Derive the per-class weights from the class distribution computed earlier;
# the bare expression on the last line displays the resulting dict.
class_weights = create_class_weight(distribution)
class_weights

{0: 170.50688073394497,
 1: 126.8617747440273,
 2: 107.58465991316932,
 3: 109.64749262536873,
 4: 107.42919075144509,
 5: 76.01329243353783,
 6: 44.83775633293124,
 7: 15.302799505969535,
 8: 8.484478429582287,
 9: 1.3516053961674122}

In [115]:
# Collect the feature sequences (X) and label sequences (y) of every paragraph
# that survives the filters below.
X = []
y = []

for row_idx, paragraph in tqdm(df.iterrows(), total = len(df)):
    labels = paragraph['y']

    # to make things a bit more balanced, drop paragraphs whose sentences are
    # all in the NULL class, as well as super long paragraphs (> 8 sentences)
    all_null = all(item == [0, 0, 0, 0, 0, 0, 0, 0, 0, 1] for item in labels)
    if all_null or len(labels) > 8:
        continue

    vectors = paragraph['X']
    # every sentence must have exactly one label vector
    assert len(labels) == len(vectors)

    X.append(vectors)
    y.append(labels)

HBox(children=(IntProgress(value=0, max=65101), HTML(value='')))




## padding

In [140]:
# Seed NumPy's global RNG so the padding vector is reproducible.
# `np.random.seed` must be *called*: assigning `np.random.seed = 47` replaces
# the function with an int, seeds nothing, and breaks later seeding calls.
# (If the old assignment already ran in this kernel, restart it first.)
np.random.seed(47)
# One random 50-dimensional vector used to fill the missing timesteps of X.
X_pad = np.random.rand(50)

# Pad every paragraph to 8 timesteps at the end of the sequence.
X = pad_sequences(X, maxlen=8, dtype='float32', padding='post', truncating='post', value=X_pad)
print(X.shape)
# Labels are padded with an all-zero vector. `truncating` now matches X so the
# two arrays would stay aligned if a longer sequence ever got through.
y = pad_sequences(y, maxlen=8, dtype='int32', padding='post', truncating='post', value = [0]*10)
print(y.shape)

(15699, 8, 50)
(15699, 8, 10)


In [141]:
# Persist the padded arrays, one pickle file per array (X first, then y).
for pickle_path, array in (
    ('data/discourse_markers/oanc_X.pkl', X),
    ('data/discourse_markers/oanc_y.pkl', y),
):
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(array, out_file)

# model building

In [142]:
# Model hyperparameters.
# input_len is the second axis of the padded X array.
input_len = X.shape[1]
# Number of units passed to the LSTM layer.
num_units = 128
# NOTE(review): presumably has to match the Doc2Vec vector_size (50) — confirm.
embed_dim = 50

In [151]:
# Reset the Keras backend session before (re)building the model.
K.clear_session()

In [152]:
# Input: a sequence of `input_len` vectors, each of size `embed_dim`.
main_input = Input(
    name = 'main_input',
    shape = (input_len, embed_dim),
    dtype = 'float32',
)

# Bidirectional LSTM that returns an output for every timestep, followed by
# dropout and a 10-way softmax applied at each timestep.
bi_lstm = Bidirectional(
    LSTM(units = num_units, return_sequences = True),
    name = 'bi-lstm',
)(main_input)
dropout = Dropout(name = 'dropout', rate = 0.25)(bi_lstm)
output = Dense(units = 10, activation = 'softmax', name = 'output')(dropout)

In [153]:
# Assemble the functional model from the input and output tensors, then print
# its layer-by-layer summary.
model = Model(inputs = main_input, outputs = output)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 8, 50)             0         
_________________________________________________________________
bi-lstm (Bidirectional)      (None, 8, 256)            183296    
_________________________________________________________________
dropout (Dropout)            (None, 8, 256)            0         
_________________________________________________________________
output (Dense)               (None, 8, 10)             2570      
Total params: 185,866
Trainable params: 185,866
Non-trainable params: 0
_________________________________________________________________


# train

In [148]:
# Reload the padded arrays written by the padding section.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open('data/discourse_markers/oanc_X.pkl', 'rb') as x_file:
    X = pickle.load(x_file)
with open('data/discourse_markers/oanc_y.pkl', 'rb') as y_file:
    y = pickle.load(y_file)

In [149]:
# Hold out 10% of the samples as a test set; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=47)

In [155]:
# Compile with the Adam optimizer and categorical cross-entropy, tracking accuracy.
model.compile(optimizer = 'adam',
             loss = 'categorical_crossentropy',
             metrics = ['accuracy'])

# Train for 5 epochs in batches of 32. Class weighting is currently disabled:
# the class_weight argument is left commented out at the end of the call.
history = model.fit(X_train, y_train, epochs = 5, batch_size = 32)#, class_weight = class_weights)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# https://github.com/tbennun/keras-bucketed-sequence