In [None]:
# Imports
import heapq
import os
from copy import deepcopy

import matplotlib.pyplot as plt
import numpy as np
import regex as re
from keras.layers import Embedding, Dense, LSTM, SimpleRNN, Flatten, Dropout, Bidirectional
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

In [None]:
# Load the pre-trained GloVe vectors into a {word: vector} dictionary
glove_dir = '/Users/ericrostedt/Desktop/Python/Tillämpad_Maskininlärning/Labb4/glove.6B.100d.txt'

embeddings_index = {}
# The context manager closes the file even if parsing fails, and the explicit
# encoding keeps the result independent of the platform's default encoding.
with open(glove_dir, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line carries no word; skip it instead of failing on values[0]
            continue
        # First column is the word, the remaining columns are its coefficients
        word = values[0].lower()
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Sanity check: display the vector stored for one known word
print(embeddings_index['sweden'])

In [None]:
# Nearest neighbours of a word in the embedding space
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between the vectors ``v1`` and ``v2``."""
    return v1 @ v2 / (np.linalg.norm(v1) * np.linalg.norm(v2))


def get_5_closest_words(word, embeddings=None, n_closest=5):
    """Return the words whose vectors are most similar to the one of ``word``.

    :param word: the query word; it is never part of the result
    :param embeddings: mapping from word to vector; defaults to the
        notebook-level ``embeddings_index``
    :param n_closest: maximum number of neighbours to return (default 5)
    :return: list of at most ``n_closest`` words, ordered from the most to the
        least similar; words with equal similarity keep the iteration order
        of ``embeddings``
    :raises KeyError: if ``word`` has no vector in ``embeddings``
    """
    if embeddings is None:
        embeddings = embeddings_index
    target = np.asarray(embeddings[word])
    # Lazily score every other word; nlargest keeps only the best candidates,
    # so no auxiliary list grows with the size of the vocabulary.
    scored = (
        (cosine_similarity(target, np.asarray(vector)), other)
        for other, vector in embeddings.items()
        if other != word
    )
    best = heapq.nlargest(n_closest, scored, key=lambda pair: pair[0])
    return [other for _, other in best]

In [None]:
# Try the nearest-neighbour lookup on a few sample words
words = ['table','france','sweden']
for w in words:
    closest = get_5_closest_words(w)
    # The format string expects the query word followed by exactly five neighbours
    print("Closest words to %s are %s, %s, %s, %s, %s" % tuple([w] + closest))

In [None]:
# Read the CoNLL-2003 English NER corpus
BASE_DIR = '/Users/ericrostedt/Desktop/Python/Tillämpad_Maskininlärning/Labb4/NER-data/'


def _read_stripped(path):
    """Return the full text of ``path`` without leading/trailing whitespace."""
    # The context manager makes sure the handle is released after reading
    with open(path, encoding='utf-8') as corpus_file:
        return corpus_file.read().strip()


def load_conll2003_en(base_dir=None):
    """Load the raw train, validation and test splits of the corpus.

    :param base_dir: directory holding ``eng.train``, ``eng.valid`` and
        ``eng.test``; defaults to the notebook-level ``BASE_DIR``
    :return: tuple ``(train, dev, test, column_names)`` where the first three
        items are the stripped file contents and ``column_names`` names the
        four columns of a corpus row
    :raises OSError: if one of the three files cannot be opened
    """
    if base_dir is None:
        base_dir = BASE_DIR
    train_file = os.path.join(base_dir, 'eng.train')
    dev_file = os.path.join(base_dir, 'eng.valid')
    test_file = os.path.join(base_dir, 'eng.test')
    column_names = ['form', 'ppos', 'pchunk', 'ner']
    train_sentences = _read_stripped(train_file)
    dev_sentences = _read_stripped(dev_file)
    test_sentences = _read_stripped(test_file)
    return train_sentences, dev_sentences, test_sentences, column_names


In [None]:
# Helpers that turn raw CoNLL text into lists of token dictionaries
class Token(dict):
    """Dictionary holding the column values of one corpus row."""


class CoNLLDictorizer:
    """Split a CoNLL-formatted corpus into sentences of ``Token`` objects.

    :param column_names: names given, in order, to the columns of a row
    :param sent_sep: regular expression separating two sentences
    :param col_sep: regular expression separating two columns of a row
    """

    def __init__(self, column_names, sent_sep='\n\n', col_sep=' +'):
        self.column_names = column_names
        self.sent_sep = sent_sep
        self.col_sep = col_sep

    def fit(self):
        """Do nothing; there is no state to learn."""
        return None

    def transform(self, corpus):
        """Return one list of ``Token`` per sentence found in ``corpus``."""
        blocks = re.split(self.sent_sep, corpus.strip())
        return [self._split_in_words(block) for block in blocks]

    def fit_transform(self, corpus):
        """Same result as ``transform``."""
        return self.transform(corpus)

    def _split_in_words(self, sentence):
        """Turn the rows of one sentence into ``Token`` objects."""
        tokens = []
        for row in sentence.split('\n'):
            fields = re.split(self.col_sep, row)
            # zip stops at the shorter of the two, so extra fields or
            # missing columns are dropped without an error
            tokens.append(Token(zip(self.column_names, fields)))
        return tokens

In [None]:
# Load the three raw corpus splits and prepare the row parser
(train_sentences,
 dev_sentences,
 test_sentences,
 column_names) = load_conll2003_en()
conll_dict = CoNLLDictorizer(column_names=column_names, col_sep=' +')

In [None]:
# Function to create the input and output sequences
def build_sequences(corpus_dict, key_x='form', key_y='pos', tolower=True):
    """Extract parallel input/output sequences from a parsed corpus.

    :param corpus_dict: list of sentences, each a list of token mappings
    :param key_x: token key whose values form the input sequences
    :param key_y: token key whose values form the output sequences
    :param tolower: when true, the input values are lowercased
    :return: tuple ``(X, Y)`` of two lists holding one list per sentence
    :raises KeyError: if a token lacks ``key_x`` or ``key_y``
    """
    X, Y = [], []
    for sentence in corpus_dict:
        inputs = [token[key_x] for token in sentence]
        outputs = [token[key_y] for token in sentence]
        X.append([form.lower() for form in inputs] if tolower else inputs)
        Y.append(outputs)
    return X, Y

In [None]:
# Build the vocabulary and the set of NER tags
train_dict = conll_dict.transform(train_sentences)
X_train_cat, Y_train_cat = build_sequences(train_dict, key_y='ner')

# Vocabulary = words of the training set + every word that has a GloVe vector.
# A set union gives the same result as flattening a copied corpus, without
# duplicating all training sentences in memory first.
train_words = {word for sentence in X_train_cat for word in sentence}
word_set = sorted(train_words.union(embeddings_index))
ner_set = sorted({tag for sentence in Y_train_cat for tag in sentence})
print(len(word_set))
print(len(ner_set))

In [None]:
# Lookup tables between words/tags and their indices.
# Numbering starts at 2, which leaves the indices 0 and 1 unassigned here.
word_idx = {word: position for position, word in enumerate(word_set, start=2)}
ner_idx = {tag: position for position, tag in enumerate(ner_set, start=2)}
rev_word_idx = {position: word for word, position in word_idx.items()}
rev_ner_idx = {position: tag for tag, position in ner_idx.items()}

print(word_idx["sweden"])
# NOTE(review): 351800 is a hard-coded index, presumably the value printed
# above for "sweden" on the author's data — confirm
print(rev_word_idx[351800])

In [None]:
# Random start values in [-0.5, 0.5) for every row of the embedding matrix
m = len(word_set) + 2                  # two more rows than there are words
n = len(embeddings_index["sweden"])    # length of one embedding vector
matrix = np.random.rand(m, n) - 0.5
print(matrix.shape)

In [None]:
# Overwrite the random rows of the words that have a GloVe vector
for key, value in embeddings_index.items():
    matrix[word_idx[key]] = value

In [None]:
# Function mapping symbols to indices
def to_index(X, idx):
    """
    Replace every item of every sequence by its index.

    :param X: list of sequences (lists) of words or tags
    :param idx: dictionary mapping an item to its number
    :return: list of lists of numbers with the same layout as X; items
        missing from idx are mapped to 1
    """
    return [[idx.get(item, 1) for item in sequence] for sequence in X]

In [None]:
# Training set: words and tags are replaced by their indices

# Parallel sequences of word indices and tag indices
X_train_idx = to_index(X_train_cat, word_idx)
Y_train_idx = to_index(Y_train_cat, ner_idx)

# Both sides are brought to a common length of 150
X_train_padded, Y_train_padded = (
    pad_sequences(indexed, maxlen=150) for indexed in (X_train_idx, Y_train_idx)
)

# One-hot encoding of the tags; two classes more than there are NER tags
Y_train_padded_vectorized = to_categorical(
    Y_train_padded, num_classes=len(ner_set) + 2)

In [None]:
# Validation set: words and tags are replaced by their indices
dev_dict = conll_dict.transform(dev_sentences)
X_dev_cat, Y_dev_cat = build_sequences(dev_dict, key_y='ner')

# Parallel sequences of word indices and tag indices
X_dev_idx = to_index(X_dev_cat, word_idx)
Y_dev_idx = to_index(Y_dev_cat, ner_idx)

# Both sides are brought to a common length of 150
X_dev_padded, Y_dev_padded = (
    pad_sequences(indexed, maxlen=150) for indexed in (X_dev_idx, Y_dev_idx)
)

# One-hot encoding of the tags; two classes more than there are NER tags
Y_dev_padded_vectorized = to_categorical(
    Y_dev_padded, num_classes=len(ner_set) + 2)

In [None]:
# Test set: words and tags are replaced by their indices
test_dict = conll_dict.transform(test_sentences)
X_test_cat, Y_test_cat = build_sequences(test_dict, key_y='ner')

# Parallel sequences of word indices and tag indices
X_test_idx = to_index(X_test_cat, word_idx)
Y_test_idx = to_index(Y_test_cat, ner_idx)

# Both sides are brought to a common length of 150
X_test_padded, Y_test_padded = (
    pad_sequences(indexed, maxlen=150) for indexed in (X_test_idx, Y_test_idx)
)

# One-hot encoding of the tags; two classes more than there are NER tags
Y_test_padded_vectorized = to_categorical(
    Y_test_padded, num_classes=len(ner_set) + 2)


In [None]:
# Build the simple RNN tagger on top of the pre-computed embedding matrix
model = Sequential()
# The embedding layer takes its dimensions from `matrix` itself, so the
# weights assigned below always fit, whatever vector length was loaded.
model.add(Embedding(matrix.shape[0],
                    matrix.shape[1],
                    mask_zero=True, input_length=150, trainable=False))
model.add(SimpleRNN(32, activation='relu', return_sequences=True))
# One output per tag index, including the two indices not used by any tag
model.add(Dense(len(ner_set) + 2, activation='softmax'))
model.layers[0].set_weights([matrix])

In [None]:
# Compile the RNN model

model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

In [None]:
# Show the layers of the RNN model
model.summary()

In [None]:
# Train the RNN model, validating on the dev set after each epoch
history = model.fit(
    x=X_train_padded,
    y=Y_train_padded_vectorized,
    validation_data=(X_dev_padded, Y_dev_padded_vectorized),
    batch_size=32,
    epochs=30,
)

In [None]:
# Loss and accuracy of the RNN model on the test set
(test_loss, test_acc) = model.evaluate(x=X_test_padded,
                                       y=Y_test_padded_vectorized)
print('Test acc: ', test_acc)

In [None]:
# Plot the training curves of the RNN model, one figure per metric
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

# Accuracy figure
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(epochs, acc, 'bo', label='Training acc')
ax_acc.plot(epochs, val_acc, 'b', label='Validation acc')
ax_acc.set_title('Training and validation accuracy')
ax_acc.legend()

# Loss figure
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epochs, loss, 'bo', label='Training loss')
ax_loss.plot(epochs, val_loss, 'b', label='Validation loss')
ax_loss.set_title('Training and validation loss')
ax_loss.legend()

plt.show()

In [None]:
# Build and compile the bidirectional LSTM tagger
model_LSTM = Sequential()
# The embedding layer takes its dimensions from `matrix` itself, so the
# weights assigned below always fit, whatever vector length was loaded.
model_LSTM.add(Embedding(matrix.shape[0],
                    matrix.shape[1],
                    mask_zero=True, input_length=150, trainable=False))
model_LSTM.add(Dropout(0.3))
model_LSTM.add(Bidirectional(LSTM(100, activation='relu', return_sequences=True)))
model_LSTM.add(Dropout(0.3))
model_LSTM.add(Bidirectional(LSTM(100, return_sequences=True)))
model_LSTM.add(Dropout(0.5))
# One output per tag index, including the two indices not used by any tag
model_LSTM.add(Dense(len(ner_set) + 2, activation='softmax'))

model_LSTM.layers[0].set_weights([matrix])

model_LSTM.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])

In [None]:
# Show the layers of the LSTM model
model_LSTM.summary()

In [None]:
# Load weights from a file on disk before training continues.
# NOTE(review): no visible cell of this notebook writes 'weights_LSTM_final.h5'
# (the save cell uses another file name), so a fresh run presumably needs the
# file to exist beforehand — confirm.
model_LSTM.load_weights('weights_LSTM_final.h5')

In [None]:
# Train the LSTM model for a few more epochs (no validation data is passed)
model_LSTM.fit(
    x=X_train_padded,
    y=Y_train_padded_vectorized,
    batch_size=32,
    epochs=5,
)

In [None]:
# Store the current weights of the LSTM model on disk
model_LSTM.save_weights('weights_LSTM_final1.h5')

In [None]:
# Loss and accuracy of the LSTM model on the test set
(test_loss, test_acc) = model_LSTM.evaluate(x=X_test_padded,
                                            y=Y_test_padded_vectorized)
print('Test acc: ', test_acc)

In [None]:
# Run the LSTM model on the whole padded test set; the result is indexed
# per sentence by the following cells
corpus_pos_predictions = model_LSTM.predict(X_test_padded)

In [None]:
# Keep, for each test sentence, only the last len(sentence) predictions
pos_pred_num = [
    corpus_pos_predictions[sent_nbr][-len(X_test_cat[sent_nbr]):]
    for sent_nbr in range(len(corpus_pos_predictions))
]
print(pos_pred_num[:2])

In [None]:
# Turn every prediction vector into the tag with the highest score
pos_pred = [
    [rev_ner_idx.get(np.argmax(scores)) for scores in sentence]
    for sentence in pos_pred_num
]

# Compare predictions, words and gold tags of the first two sentences
print(pos_pred[:2])
print(X_test_cat[:2])
print(Y_test_cat[:2])

In [None]:
# Token-level accuracy over the test set, overall and for the words that
# are missing from word_idx
total = correct = total_ukn = correct_ukn = 0
for id_s, sentence in enumerate(X_test_cat):
    for id_w, word in enumerate(sentence):
        is_match = pos_pred[id_s][id_w] == Y_test_cat[id_s][id_w]
        is_unknown = word not in word_idx
        total += 1
        correct += int(is_match)
        total_ukn += int(is_unknown)
        correct_ukn += int(is_unknown and is_match)

print('total %d, correct %d, accuracy %f' %
      (total, correct, correct / total))
if total_ukn:
    print('total unknown %d, correct %d, accuracy %f' %
          (total_ukn, correct_ukn, correct_ukn / total_ukn))

In [None]:
def predict_sentence(sentence, model, word_idx, 
                     vocabulary_words, idx_pos, verbose=False, maxlen=150):
    """Predict one tag for each whitespace-separated token of a sentence.

    :param sentence: the sentence as a single string
    :param model: object whose ``predict`` method scores a padded index matrix
    :param word_idx: dictionary mapping a word to its index
    :param vocabulary_words: unused; kept so existing calls keep working
    :param idx_pos: dictionary mapping a predicted index to its tag
    :param verbose: when true, print the intermediate values
    :param maxlen: length the index sequence is padded to (default 150)
    :return: list of tags, ``None`` for indices missing from ``idx_pos``;
        an empty list when the sentence holds no token
    """
    # NOTE(review): for sentences with more than maxlen tokens the padded input
    # is shorter than the sentence, so fewer tags than tokens come back — confirm
    # which end pad_sequences drops before relying on the alignment.
    tokens = sentence.split()
    len_sentence = len(tokens)
    if len_sentence == 0:
        # seq[-0:] selects the whole sequence, so without this guard an empty
        # sentence would get one tag per padded position
        return []
    word_idxs = to_index([tokens], word_idx)
    word_idxs = pad_sequences(word_idxs, maxlen=maxlen)

    pos_idx_pred = model.predict(word_idxs)
    pos_idxs = [np.argmax(x) for x in pos_idx_pred[0]]
    # Keep only the trailing positions, one per token of the sentence
    pos_idxs = pos_idxs[-len_sentence:]
    pos = list(map(idx_pos.get, pos_idxs))
    if verbose:
        print('Sentence', tokens)
        print('Sentence word indexes', word_idxs)
        print('POS predicted', pos_idx_pred[0])
        print('POS shape', pos_idx_pred.shape)
    return pos

In [None]:
# Rebuild each test sentence as one string, every word followed by a space
new_X_test_cat = [
    "".join(w + " " for w in sentence)
    for sentence in X_test_cat
]

In [None]:
# Tag the test sentences one at a time with the LSTM model
y_test_pred_cat = [
    predict_sentence(sentence.lower(), model_LSTM, word_idx, word_set, rev_ner_idx)
    for sentence in new_X_test_cat
]

In [None]:
# Second test sentence: word list, rebuilt string and predicted tags
for shown in (X_test_cat[1], new_X_test_cat[1], y_test_pred_cat[1]):
    print(shown)

In [None]:
# Write one "<gold tag> <predicted tag>" line per predicted word.
# The context manager closes the file, which guarantees that every line is
# flushed to disk when the cell finishes.
with open("results.txt", "w", encoding="utf-8") as file:
    for i, prediction in enumerate(y_test_pred_cat):
        for j, pred_word in enumerate(prediction):
            file.write(Y_test_cat[i][j] + " " + str(pred_word) + "\n")

In [None]:
if np.argmax(conf_mat.T[i]) == np.argmax(conf_mat[i])