In [16]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pickle
from sklearn.model_selection import train_test_split

import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense

# Brown dataset: vectorization using doc2vec

In [2]:
with open('data/discourse_markers/X_50dim_d2v.pkl', 'rb') as f:
    X = pickle.load(f)
with open('data/discourse_markers/y.pkl', 'rb') as f:
    y = pickle.load(f)

In [3]:
print(X.shape)
print(y.shape)

(57340, 50)
(57340, 59)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
batch_size = 32
vector_size = X.shape[1] # 50
output_shape = y.shape[1] # 59

In [26]:
K.clear_session()

In [27]:
def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [28]:
model = Sequential()
model.add(Dense(64, activation = 'relu', input_dim = 50, name = 'dense_1'))
model.add(Dense(59, activation = 'sigmoid', name = 'dense_2'))
model.compile(optimizer = 'adam',
              loss = 'sparse_categorical_crossentropy',
              metrics = ['accuracy', f1_m, precision_m, recall_m])

history = model.fit(X_train, y_train[:, 0], validation_split = 0.3, epochs = 5, batch_size = 32, verbose = 2)

loss, accuracy, f1_score, precision, recall = model.evaluate(X_test, y_test[:, 0], verbose=2)

Train on 32110 samples, validate on 13762 samples
Epoch 1/5
 - 2s - loss: 0.5143 - acc: 0.9749 - f1_m: 0.0301 - precision_m: 0.0157 - recall_m: 0.5386 - val_loss: 0.0840 - val_acc: 0.9848 - val_f1_m: 0.0290 - val_precision_m: 0.0152 - val_recall_m: 0.3976
Epoch 2/5
 - 2s - loss: 0.0825 - acc: 0.9842 - f1_m: 0.0301 - precision_m: 0.0158 - recall_m: 0.3936 - val_loss: 0.0783 - val_acc: 0.9848 - val_f1_m: 0.0290 - val_precision_m: 0.0152 - val_recall_m: 0.3976
Epoch 3/5
 - 2s - loss: 0.0799 - acc: 0.9842 - f1_m: 0.0301 - precision_m: 0.0158 - recall_m: 0.4056 - val_loss: 0.0772 - val_acc: 0.9848 - val_f1_m: 0.0290 - val_precision_m: 0.0152 - val_recall_m: 0.3976
Epoch 4/5
 - 2s - loss: 0.0793 - acc: 0.9842 - f1_m: 0.0301 - precision_m: 0.0158 - recall_m: 0.3956 - val_loss: 0.0782 - val_acc: 0.9848 - val_f1_m: 0.0290 - val_precision_m: 0.0152 - val_recall_m: 0.3976
Epoch 5/5
 - 2s - loss: 0.0792 - acc: 0.9842 - f1_m: 0.0302 - precision_m: 0.0158 - recall_m: 0.4106 - val_loss: 0.0771 - val_

In [29]:
print(f1_score)
print(precision)
print(recall)

0.035949912123618574
0.018835019183815836
0.4715730299881455
