# EDAN95 - Applied Machine Learning
### LTH Fall 2019

# Lab 4
### David Larsson & Jonas Lundgren

### GloVe embeddings

In [143]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
def read_GloVe(path = "data/glove.6B.100d.txt"):
    """
    Read GloVe embeddings from a text file into a dictionary.

    Every non-blank line is split on whitespace: the first field is used as
    the word and the remaining fields are parsed as floats.

    :param path: path to the embeddings text file
    :return: dict mapping each word to a 1-D numpy array of floats
    :raises ValueError: if a vector component cannot be parsed as a float
    """
    d = {}
    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding (the vocabulary is not restricted to ASCII).
    with open(path, encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank line has no word field; skip it instead of raising IndexError.
                continue
            d[fields[0]] = np.array([float(value) for value in fields[1:]])

    return d

In [3]:
# Load the embeddings from read_GloVe's default path into a word -> vector dictionary.
glove_dict = read_GloVe()

In [4]:
def cosine_sim(word, glove_dict = glove_dict, num_closest = 5):
    """
    Unfinished draft: computes the cosine similarity between word and every
    entry of glove_dict (including word itself), but never ranks or returns
    the results.

    NOTE(review): the body ends in a bare `max` expression and has no return
    statement, so this version returns None and num_closest is unused.
    cosine_sim is defined again further down in this notebook, and that later
    definition replaces this one once its cell runs; this cell looks safe to
    delete -- confirm.
    """
    A = glove_dict[str(word)]
    A_norm = np.linalg.norm(A, 2)
    
    # One similarity slot per dictionary entry, filled in iteration order.
    cos_sims = np.zeros(len(glove_dict))
    
    for i, (k, v) in enumerate(glove_dict.items()):
        B = v
        B_norm = np.linalg.norm(B, 2)
    
        cos_sim = A.dot(B) / (A_norm * B_norm)
        cos_sims[i] = cos_sim
    
    # Leftover from editing: evaluates the builtin `max` and discards it.
    max

In [5]:
def cosine_sim(word, glove_dict = glove_dict, num_closest = 5):
    """
    Using a cosine similarity, computes the num_closest
    closest words to word in glove_dict.

    :param word: query word; it is converted with str() before the lookup
    :param glove_dict: dict mapping words to 1-D numpy embedding vectors
    :param num_closest: how many neighbours to return
    :return: list of at most num_closest words, ordered from least to most
        similar (the closest word is last). The query word itself is never
        returned. Empty if num_closest <= 0.
    :raises KeyError: if word is not a key of glove_dict
    """
    # Normalise once so the lookup and the self-exclusion test use the same key.
    word = str(word)
    if num_closest <= 0:
        # Guard: argsort()[-0:] would otherwise select *every* word.
        return []

    A = glove_dict[word]
    A_norm = np.linalg.norm(A, 2)

    # Build the key list once instead of once per returned index.
    words = list(glove_dict.keys())

    # Start at -inf so that the query word and entries whose similarity is
    # undefined (zero-norm vector) can never outrank a real neighbour.
    cos_sims = np.full(len(words), -np.inf)

    for i, k in enumerate(words):
        if k == word:
            continue
        B = glove_dict[k]
        denominator = A_norm * np.linalg.norm(B, 2)
        if denominator > 0:
            cos_sims[i] = A.dot(B) / denominator

    # argsort is ascending, so the last num_closest indices are the most similar.
    max_idxs = cos_sims.argsort()[-num_closest:]

    # Drop placeholder slots in case fewer than num_closest real candidates exist.
    return [words[i] for i in max_idxs if cos_sims[i] > -np.inf]

In [6]:
# Print the nearest neighbours of a few query words. The label is built from
# the query itself so it cannot drift out of sync with the word being looked up
# (previously all three lines were labelled "table").
for query in ('table', 'france', 'sweden'):
    print("Closest 5 words to", query, ":", cosine_sim(query))

Closest 5 words to table : ['side', 'room', 'bottom', 'place', 'tables']
Closest 5 words to table : ['paris', 'spain', 'britain', 'french', 'belgium']
Closest 5 words to table : ['austria', 'netherlands', 'finland', 'norway', 'denmark']


## Load Files

In [8]:
# NOTE(review): machine-specific absolute path; pass base_dir to
# load_conll2003_en (or edit this constant) when running elsewhere.
BASE_DIR = r'/home/jonas/projects/edan95/Lab4/data'


def load_conll2003_en(base_dir=None):
    """
    Read the eng.train, eng.valid and eng.test corpus files as raw text.

    :param base_dir: directory containing the three files; defaults to BASE_DIR
    :return: (train_sentences, dev_sentences, test_sentences, column_names),
        where the first three are the stripped file contents and column_names
        is ['form', 'ppos', 'pchunk', 'ner']
    :raises OSError: if one of the files cannot be opened
    """
    if base_dir is None:
        base_dir = BASE_DIR
    base_dir = str(base_dir)

    def read_stripped(file_name):
        # The context manager closes the handle; the encoding is explicit so
        # the result does not depend on the platform default.
        with open(base_dir + '/' + file_name, encoding='utf-8') as f:
            return f.read().strip()

    column_names = ['form', 'ppos', 'pchunk', 'ner']
    train_sentences = read_stripped('eng.train')
    dev_sentences = read_stripped('eng.valid')
    test_sentences = read_stripped('eng.test')
    return train_sentences, dev_sentences, test_sentences, column_names

In [9]:
# Read the raw train/dev/test corpus text together with the column names.
train_sentences, dev_sentences, test_sentences, column_names = load_conll2003_en()

The following code is taken from the course GitHub repository: https://github.com/pnugues/edan95/

In [13]:
import re

class Token(dict):
    """A single corpus row: a dict mapping column names to that row's values."""
    pass

class CoNLLDictorizer:
    """
    Converts a corpus string into a list of sentences, where each sentence is
    a list of Token dicts keyed by the configured column names.
    """

    def __init__(self, column_names, sent_sep='\n\n', col_sep=' +'):
        """
        :param column_names: names assigned, in order, to the columns of a row
        :param sent_sep: regular expression separating sentences
        :param col_sep: regular expression separating columns within a row
        """
        self.column_names = column_names
        self.sent_sep = sent_sep
        self.col_sep = col_sep

    def fit(self):
        """No-op; there is nothing to learn from the data."""
        pass

    def transform(self, corpus):
        """
        Split corpus into sentences and each sentence into Tokens.

        :param corpus: the whole corpus as one string
        :return: list of sentences, each a list of Token
        """
        corpus = corpus.strip()
        sentences = re.split(self.sent_sep, corpus)
        return list(map(self._split_in_words, sentences))

    def fit_transform(self, corpus):
        """Same as transform(); fit() has nothing to do."""
        return self.transform(corpus)

    def _split_in_words(self, sentence):
        """
        Turn one sentence (one row per line) into a list of Tokens.

        Blank rows and rows whose first character is '#' are skipped.
        NOTE(review): the '#' filter also drops a row whose first column
        starts with '#' -- confirm that is intended for this corpus.
        """
        rows = re.split('\n', sentence)
        # `row.strip()` guards against empty rows, which appear when sentences
        # are separated by more than one blank line and would otherwise make
        # `row[0]` raise IndexError.
        rows = [row for row in rows if row.strip() and row[0] != '#']
        # zip() stops at the shorter sequence, so surplus columns are ignored.
        return [Token(dict(zip(self.column_names,
                               re.split(self.col_sep, row))))
                for row in rows]

In [14]:
# Parse each raw corpus split into a list of sentences made of Token dicts.
conll_dict = CoNLLDictorizer(column_names, col_sep=' +')
train_dict, dev_dict, test_dict = (
    conll_dict.transform(raw_split)
    for raw_split in (train_sentences, dev_sentences, test_sentences)
)

Named-entity recognition (NER) tags

In [19]:
# Display the first parsed training sentence.
train_dict[0]

[{'form': '-DOCSTART-', 'ppos': '-X-', 'pchunk': 'O', 'ner': 'O'}]

In [34]:
def build_sequences(corpus_dict, key_x='form', key_y='ner', tolower=True):
    """
    Extract parallel input/output sequences from a parsed corpus.

    :param corpus_dict: list of sentences, each a list of dict-like tokens
    :param key_x: token key whose values form the input sequences
    :param key_y: token key whose values form the output sequences
    :param tolower: if true, lowercase every input value
    :return: (X, Y), two lists holding one list per sentence
    """
    X, Y = [], []
    for sentence in corpus_dict:
        # Read both fields of a token together, in token order.
        pairs = [(token[key_x], token[key_y]) for token in sentence]
        inputs = [value_x for value_x, _ in pairs]
        outputs = [value_y for _, value_y in pairs]
        if tolower:
            inputs = [str.lower(value_x) for value_x in inputs]
        X.append(inputs)
        Y.append(outputs)
    return X, Y

The tags printed for `Y_train_cat[1]` should line up, token by token, with the words of `X_train_cat[1]`.

In [40]:
# Word and tag sequences for each split (words are lowercased by default).
X_train_cat, Y_train_cat = build_sequences(train_dict)
X_dev_cat, Y_dev_cat = build_sequences(dev_dict)
X_test_cat, Y_test_cat = build_sequences(test_dict)

# Sanity check: show one sentence next to its tags.
print(X_train_cat[1], Y_train_cat[1], sep='\n')

['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.']
['I-ORG', 'O', 'I-MISC', 'O', 'O', 'O', 'I-MISC', 'O', 'O']


In [50]:
# Distinct training words and distinct NER tags, each in sorted order.
vocabulary_words = sorted({word for sentence in X_train_cat for word in sentence})
ner = sorted({tag for sentence in Y_train_cat for tag in sentence})

print(vocabulary_words[:10])
print(ner[:10])

['!', '"', '$', '%', '&', "'", "'d", "'ll", "'m", "'re"]
['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


In [54]:
# All words that have an embedding, in sorted order (iterating a dict yields its keys).
embeddings_words = sorted(glove_dict)
embeddings_words[:10]

['!', '!!', '!!!', '!!!!', '!!!!!', '!?', '!?!', '"', '#', '##']

In [104]:
# Extend the vocabulary with every embedding word; the union removes duplicates.
vocabulary_words = sorted(set(vocabulary_words).union(embeddings_words))
vocabulary_words[:10]

['!', '!!', '!!!', '!!!!', '!!!!!', '!?', '!?!', '"', '#', '##']

In [105]:
# Index -> symbol tables. Numbering starts at 2: index 1 is what
# sentences_to_idx assigns to unknown symbols, and 0 is presumably left free
# for padding -- TODO confirm.
tok_words_reversed = {idx: word for idx, word in enumerate(vocabulary_words, start=2)}
tok_ners_reversed = {idx: tag for idx, tag in enumerate(ner, start=2)}

In [106]:
# Invert the index -> symbol tables into symbol -> index lookups.
tokenized_words = dict(zip(tok_words_reversed.values(), tok_words_reversed.keys()))
tokenized_ners = dict(zip(tok_ners_reversed.values(), tok_ners_reversed.keys()))

print('word index:', list(tokenized_words.items())[:10])
print('\nNER index:', list(tokenized_ners.items())[:10])

word index: [('!', 2), ('!!', 3), ('!!!', 4), ('!!!!', 5), ('!!!!!', 6), ('!?', 7), ('!?!', 8), ('"', 9), ('#', 10), ('##', 11)]

NER index: [('B-LOC', 2), ('B-MISC', 3), ('B-ORG', 4), ('I-LOC', 5), ('I-MISC', 6), ('I-ORG', 7), ('I-PER', 8), ('O', 9)]


In [140]:
def sentences_to_idx(X, tokenized_x):
    """
    Replace every symbol of every sentence by its index.

    :param X: list of sentences, each a list of symbols
    :param tokenized_x: dict mapping a symbol to its index
    :return: list of index lists with the same nesting as X; symbols missing
        from tokenized_x are mapped to 1
    """
    unknown_idx = 1
    return [
        [tokenized_x.get(symbol, unknown_idx) for symbol in sentence]
        for sentence in X
    ]

In [153]:
# Convert words and tags of every split to integer indices.
X_idx, Y_idx = (
    sentences_to_idx(X_train_cat, tokenized_words),
    sentences_to_idx(Y_train_cat, tokenized_ners),
)
X_idx_dev, Y_idx_dev = (
    sentences_to_idx(X_dev_cat, tokenized_words),
    sentences_to_idx(Y_dev_cat, tokenized_ners),
)
X_idx_test, Y_idx_test = (
    sentences_to_idx(X_test_cat, tokenized_words),
    sentences_to_idx(Y_test_cat, tokenized_ners),
)

In [158]:
# Pad the training sequences; without maxlen the width follows the longest sequence.
X, Y = pad_sequences(X_idx), pad_sequences(Y_idx)

print(X.shape, Y.shape, sep='\n')

(14987, 113)
(14987, 113)


In [156]:
# Width of the padded training matrix; reused below as maxlen for the dev and test splits.
max_len = X.shape[1]

In [157]:
# Pad dev and test to the training width.
# NOTE(review): sequences longer than max_len are truncated by pad_sequences --
# confirm that no dev/test sentence exceeds the training maximum.
X_dev, Y_dev = (
    pad_sequences(X_idx_dev, maxlen = max_len),
    pad_sequences(Y_idx_dev, maxlen = max_len),
)

X_test, Y_test = (
    pad_sequences(X_idx_test, maxlen = max_len),
    pad_sequences(Y_idx_test, maxlen = max_len),
)