In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
b = pd.read_csv('../data/brown.csv')

In [3]:
import re

docs = []
non_alpha = re.compile(r"[^ a-z\-']")

doc_name = None
for i in range(b.shape[0]):
    if b['filename'][i] != doc_name:
        if doc_name is not None:
            docs.append(doc)
        doc_name = b['filename'][i]
        doc = ''
        
    sent = re.sub(non_alpha, '', b['tokenized_text'][i].lower().replace('--', ' ').replace("''", '')).strip() + "|| "
    doc += sent

In [4]:
vocab = {}
vectors = [np.zeros(300)]

idx = 1
with open('../data/glove.6B.300d.txt') as f:
    for line in f:
        tokens = line.split()
        vocab[tokens[0]] = idx
        idx += 1
        vector = np.array(tokens[1:], dtype='float')
        vectors.append(vector)

embedding_matrix = np.vstack(vectors)

In [14]:
window_width = 13
mid_point = window_width // 2
final = []
non_final = []
sample_factor = 1

for doc in docs:
    tokens = doc.split()
    for pos in range(len(tokens) - window_width):
        window = tokens[pos:(pos + window_width)]
        indices = [vocab.get(word.replace('||', ''), 0) for word in window]
        
        if window[mid_point].endswith('||'):
            yx = [1]
            yx.extend(indices)
            final.append(yx)
        else:
            yx = [0]
            yx.extend(indices)
            non_final.append(yx)
            

sample_idx = np.random.choice(len(non_final), len(final) * sample_factor)

sampled_non_final = [non_final[i] for i in sample_idx]
YX = np.vstack([np.vstack(final), np.vstack(sampled_non_final)])
np.random.shuffle(YX)
Y = YX[:, 0]
X = YX[:, 1:]
        
#Y = 
#X = np.vstack(X)

In [6]:
Y

array([1, 0, 0, ..., 1, 1, 1])

In [7]:
from tensorflow.keras import layers, Input
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers


def make_simple_model(word_vectors, window_width, dense_size):
    embed = layers.Embedding(word_vectors.shape[0],
                             word_vectors.shape[1],
                             input_length = window_width,
                             weights = [word_vectors],
                             trainable = False,
                             mask_zero = True)
    
    word_input = Input(shape=(window_width,), dtype='float32')
    vectors = embed(word_input)
    
    out = layers.Flatten()(vectors)
    out = layers.Dropout(rate=0.4)(out)
    
    out = layers.Dense(dense_size, activation='relu')(out)
    out = layers.Dropout(rate=0.2)(out)
    out = layers.Dense(dense_size, activation='relu')(out)
    out = layers.Dropout(rate=0.2)(out)
    
    output = layers.Dense(1, activation='sigmoid')(out)
    
    model = Model(word_input, output)

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    
    
    return model

In [9]:
m1 = make_simple_model(embedding_matrix, window_width, 200)

In [None]:
m1.summary()

In [None]:
m1.fit(X, Y, batch_size=64, epochs=20, validation_split=0.2)

In [57]:
np.unique(Y, return_counts=True)

(array([0, 1]), array([56694, 56694]))

In [58]:
YX1 = np.vstack([non_final, final])
np.random.shuffle(YX1)

In [61]:
Y1 = YX1[:, 0]
X1 = YX1[:, 1:]

In [62]:
m1.fit(X1, Y1, batch_size=128, epochs=1, validation_split=0.2)

Train on 801446 samples, validate on 200362 samples


<tensorflow.python.keras.callbacks.History at 0x7f0002a385f8>

In [82]:
gold = Y1[:1000]

In [80]:
scores = m1.predict(X1[:1000])

In [91]:
tp, tn, fp, fn = 0,0,0,0
for i, l in enumerate(gold):
    s = scores[i]
    
    if s >= 0.1:
        if l == 1:
            tp += 1
        else:
            fp += 1
    else:
        if l == 1:
            fn += 1
        else:
            tn += 1

In [92]:
print (tp, tn, fp, fn)

50 829 114 7


In [93]:
fp/(fp+tn)

0.12089077412513255

In [94]:
tp/(tp+fn)

0.8771929824561403

In [15]:
def make_self_attention_model(word_vectors, window_width, dense_size):
    embed = layers.Embedding(word_vectors.shape[0],
                             word_vectors.shape[1],
                             input_length = window_width,
                             weights = [word_vectors],
                             trainable = False,
                             mask_zero = True)
    
    word_input = Input(shape=(window_width,), dtype='float32')
    vectors = embed(word_input)
    
    cnn_layer = layers.Conv1D(
        filters=100,
        kernel_size=4,
        padding='same')
    query_value = cnn_layer(vectors)
    
    self_attended = layers.Attention()([query_value, query_value])
    out = layers.Concatenate()([query_value, self_attended])
    
    out = layers.Dropout(rate=0.4)(out)
    
    out = layers.Flatten()(out)
    out = layers.Dense(dense_size, activation='relu')(out)
    out = layers.Dropout(rate=0.2)(out)
    out = layers.Dense(dense_size, activation='relu')(out)
    out = layers.Dropout(rate=0.2)(out)
    
    output = layers.Dense(1, activation='sigmoid')(out)
    
    model = Model(word_input, output)

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    
    
    return model

In [16]:
m2 = make_self_attention_model(embedding_matrix, window_width, 200)

In [17]:
m2.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 13)]         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 13, 300)      120000300   input_3[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 13, 100)      120100      embedding_2[0][0]                
__________________________________________________________________________________________________
attention_1 (Attention)         (None, 13, 100)      0           conv1d_1[0][0]                   
                                                                 conv1d_1[0][0]             

In [18]:
m2.fit(X, Y, batch_size=64, epochs=20, validation_split=0.2)

Train on 90580 samples, validate on 22646 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f766c015e80>