## load data

In [1]:
import csv 
import re
import gensim
import numpy as np
# from keras.preprocessing.sequence import pad_sequences


def process_csv(path):
    """Read the CSV at ``path`` and return its rows with sentence ids filled in.

    Rows whose "Sentence #" cell is blank inherit the value of the closest
    preceding row that has one, so every returned row names its sentence.

    Args:
        path: location of a CSV file whose header includes "Sentence #".

    Returns:
        A list with one dict per CSV row, as produced by ``csv.DictReader``.
        Rows that appear before the first non-blank "Sentence #" get ``None``
        in that column.
    """
    print("in process")
    with open(path) as f1:
        reader = csv.DictReader(f1)
        sent = None
        to_ret = []
        for row in reader:
            # csv.DictReader yields "" for an empty cell (None only for a cell
            # missing from a short row), so test truthiness: an
            # `is not None` check never triggers the forward-fill.
            if row["Sentence #"]:
                sent = row["Sentence #"]
            else:
                row["Sentence #"] = sent

            to_ret.append(row)
        return to_ret

def get_all_sentences(data):
    """Group consecutive rows that share a "Sentence #" into sentences.

    Args:
        data: sequence of dicts with "Sentence #", "Word" and "Tag" keys.

    Returns:
        A list of sentences in input order; each sentence is a list of
        ``(word, tag)`` tuples. Empty ``data`` yields an empty list.
    """
    all_sents = []
    sentence = []
    current_sent = None
    for row in data:
        # A change of sentence id closes the group collected so far.
        if sentence and row["Sentence #"] != current_sent:
            all_sents.append(sentence)
            sentence = []
        current_sent = row["Sentence #"]
        sentence.append((row["Word"], row["Tag"]))
    # Flush the trailing group; without this the last sentence is lost.
    if sentence:
        all_sents.append(sentence)
    return all_sents

def make_x_y(data, size):
    """Turn annotated rows into padded word-vector and one-hot tag sequences.

    A Word2Vec model is trained on the words found in ``data``; each word is
    then replaced by its vector and each tag by a one-hot vector. Every
    sentence is right-padded with all-zero vectors up to the length of the
    longest sentence.

    Args:
        data: sequence of dicts with "Sentence #", "Word" and "Tag" keys.
        size: dimensionality of the word vectors.

    Returns:
        A tuple ``(x_full, y_full, model, (tag_to_one_hot, one_hot_to_tag))``:
        ``x_full`` and ``y_full`` are lists with one entry per sentence, each
        a list of exactly ``max_len`` numpy vectors; ``model`` is the trained
        Word2Vec model; ``tag_to_one_hot`` maps a tag to its one-hot vector
        and ``one_hot_to_tag`` maps the vector (as a tuple) back to the tag.

    Raises:
        ValueError: if no sentence can be extracted from ``data``.
    """
    all_sentences = get_all_sentences(data)
    if not all_sentences:
        raise ValueError("make_x_y needs at least one sentence in data")

    just_sents = [[word for word, _ in sent] for sent in all_sentences]
    max_len = max(len(sent) for sent in all_sentences)

    # NOTE(review): `size=` is the keyword of gensim releases before 4.0;
    # later releases name it `vector_size` -- confirm against the installed
    # gensim version.
    model = gensim.models.Word2Vec(just_sents, min_count=1, size=size)

    # Sorting makes the tag -> index assignment deterministic across runs.
    all_tags = sorted(set(row["Tag"] for row in data))
    n_tags = len(all_tags)
    tag_to_one_hot = {}
    one_hot_to_tag = {}
    for i, tag in enumerate(all_tags):
        tag_vector = np.zeros(n_tags)
        tag_vector[i] = 1
        tag_to_one_hot[tag] = tag_vector
        one_hot_to_tag[tuple(tag_vector)] = tag

    # All-zero vectors mark padding positions in both inputs and targets.
    null_embedding = np.zeros(size)
    null_tag = np.zeros(n_tags)

    x_full = []
    y_full = []

    for sent in all_sentences:
        # Look vectors up through `.wv`; indexing the model object itself is
        # the deprecated spelling of the same lookup.
        w2v_seq = [model.wv[word] for word, _ in sent]
        tag_seq = [tag_to_one_hot[tag] for _, tag in sent]

        # Pad to exactly max_len. Counting from the last index instead of the
        # sentence length would add one vector too many.
        n_pad = max_len - len(sent)
        w2v_seq.extend([null_embedding] * n_pad)
        tag_seq.extend([null_tag] * n_pad)

        x_full.append(w2v_seq)
        y_full.append(tag_seq)

    return x_full, y_full, model, (tag_to_one_hot, one_hot_to_tag)



# data = process_csv("../data/entity-annotated-corpus/ner_dataset.csv")
# X, Y, word_embeddings, tag_embeddings = make_x_y(data, 300)




In [4]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM

# from data import process_csv, make_x_y
import numpy as np 
import math

def define_model(embedding_size, tag_size, max_len=None):
    """Build and compile an LSTM sequence tagger.

    Args:
        embedding_size: length of each per-token input vector.
        tag_size: number of output units per timestep (one per tag).
        max_len: number of timesteps per input sequence, or ``None`` (the
            default) to accept sequences of any length.

    Returns:
        A compiled keras ``Sequential`` model that emits one distribution
        over tags for every timestep.
    """
    model = Sequential()
    # The input shape follows the arguments instead of a fixed (104, 300),
    # so the model matches whatever embeddings it is given.
    model.add(LSTM(128, return_sequences=True,
                   input_shape=(max_len, embedding_size)))
    model.add(Dropout(0.5))
    # Softmax yields a proper distribution over the mutually exclusive tags,
    # which is what categorical cross-entropy expects.
    model.add(Dense(tag_size, activation="softmax"))

    model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

    return model


print("loading data")
data = process_csv("../data/entity-annotated-corpus/ner_dataset.csv")
print("getting X Y sets")
X, Y, word_embeddings, tag_embeddings = make_x_y(data, 300)

# Fractions of the corpus used for training, testing and development.
train, test, dev = .7, .2, .1

# Boundaries are contiguous: [0, train_split) is train,
# [train_split, test_split) is test and the remainder is dev.
train_split = int(len(X)*train)
test_split = train_split + int(math.floor(len(X)*test))

# Shuffle inputs and targets with one shared permutation so the pairs stay
# aligned; seeding makes the split reproducible across kernel restarts.
np.random.seed(0)
order = np.random.permutation(len(X))

# Stack into dense arrays (sentences, timesteps, features) before training;
# float32 halves the memory needed compared with the float64 default.
X = np.asarray(X, dtype="float32")[order]
Y = np.asarray(Y, dtype="float32")[order]

X_train, Y_train = X[:train_split], Y[:train_split]
X_test, Y_test = X[train_split:test_split], Y[train_split:test_split]
X_dev, Y_dev = X[test_split:], Y[test_split:]
assert len(X_train) + len(X_test) + len(X_dev) == len(X)


tag_to_vec, vec_to_tag = tag_embeddings
print(word_embeddings.vector_size)

tag_length = len(tag_to_vec)
print("defining model...")
model = define_model(word_embeddings.vector_size, tag_length)
print("training...")
# Fit on the training split; the dev split is only used for validation.
model.fit(X_train, Y_train, epochs=10, batch_size=32,
          validation_data=(X_dev, Y_dev))
scores = model.evaluate(X_test, Y_test, verbose=1)








loading data
in process
getting X Y sets
300
defining model...
training...


KeyboardInterrupt: 