In [126]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [119]:
b = pd.read_csv('../data/brown.csv')

In [59]:
import re

docs = []
non_alpha = re.compile(r"[^ a-z\-']")

doc_name = None
for i in range(b.shape[0]):
    if b['filename'][i] != doc_name:
        if doc_name is not None:
            docs.append(doc)
        doc_name = b['filename'][i]
        doc = ''
        
    sent = re.sub(non_alpha, '', b['tokenized_text'][i].lower().replace('--', ' ').replace("''", '')).strip() + "|| "
    doc += sent

In [64]:
vocab = {}
vectors = [np.zeros(300)]

idx = 1
with open('../data/glove.6B.300d.txt') as f:
    for line in f:
        tokens = line.split()
        vocab[tokens[0]] = idx
        idx += 1
        vector = np.array(tokens[1:], dtype='float')
        vectors.append(vector)

embedding_matrix = np.vstack(vectors)

In [120]:
X = []
Y = []
window_width = 7
mid_point = window_width // 2

for doc in docs:
    tokens = doc.split()
    for pos in range(len(tokens) - window_width):
        window = tokens[pos:(pos + window_width)]
        if window[mid_point].endswith('||'):
            Y.append(1)
        else:
            Y.append(0)
            
        indices = [vocab.get(word.replace('||', ''), 0) for word in window]
        X.append(np.array(indices))
        
Y = tf.keras.utils.to_categorical(np.array(Y))
X = np.vstack(X)

In [136]:
from tensorflow.keras import layers, Input
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers


def make_simple_model(word_vectors, window_width, lstm_size, dense_size):
    embed = layers.Embedding(word_vectors.shape[0],
                             word_vectors.shape[1],
                             input_length = window_width,
                             weights = [word_vectors],
                             trainable = False,
                             mask_zero = True)
    
    word_input = Input(shape=(window_width,), dtype='float32')
    vectors = embed(word_input)
    
    rnn = layers.Bidirectional(layers.SimpleRNN(lstm_size))
    rnn_out = rnn(vectors)
    #rnn_out = layers.Flatten()(vectors)
    
    out = layers.Dense(dense_size, activation='relu')(rnn_out)
    out = layers.Dropout(rate=0.2)(out)
    out = layers.Dense(dense_size, activation='relu')(out)
    out = layers.Dropout(rate=0.2)(out)
    
    output = layers.Dense(2, activation='softmax')(out)
    
    model = Model(word_input, output)

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    
    
    return model

In [137]:
m1 = make_simple_model(embedding_matrix, window_width, 128, 200)

In [138]:
m1.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        [(None, 7)]               0         
_________________________________________________________________
embedding_9 (Embedding)      (None, 7, 300)            120000300 
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               109824    
_________________________________________________________________
dense_21 (Dense)             (None, 200)               51400     
_________________________________________________________________
dropout_14 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 200)               40200     
_________________________________________________________________
dropout_15 (Dropout)         (None, 200)               0   

In [139]:
m1.fit(X, Y, batch_size=256, epochs=10, validation_split=0.2)

Train on 800648 samples, validate on 200162 samples
Epoch 1/10
 57088/800648 [=>............................] - ETA: 27:30 - loss: 0.1940 - acc: 0.9434

KeyboardInterrupt: 

In [140]:
np.unique(Y, return_counts=True)

(array([0., 1.], dtype=float32), array([1000810, 1000810]))