# Italian Word Lemmatizer

### Import


In [208]:
import seaborn as sns
import matplotlib.pyplot as plt
from keras import backend as K
from keras.layers import Input, LSTM, Dense, Embedding, Concatenate, Bidirectional, TimeDistributed, RepeatVector, Activation, Dot, Lambda, Dropout, Add, Multiply, Masking, Attention
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
import numpy as np
import gensim
import tensorflow_addons as tfa
import tensorflow as tf
import pandas as pd
import datetime
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# set all random seeds
np.random.seed(42)
tf.random.set_seed(42)


## Dataset

In [209]:

dataset_path = "./dev.csv"
df_dev = pd.read_csv(dataset_path, sep="\t", header=None,
                     names=["word", "tag", "lemm"])

dataset_path = "./test.csv"
df_test = pd.read_csv(dataset_path, sep="\t", header=None,
                      names=["word", "tag", "lemm"])

df_dev["word"] = df_dev["word"].astype(str)
df_dev["tag"] = df_dev["tag"].astype(str)
df_dev["lemm"] = df_dev["lemm"].astype(str)

df_test["word"] = df_test["word"].astype(str)
df_test["tag"] = df_test["tag"].astype(str)
df_test["lemm"] = df_test["lemm"].astype(str)

# remove head
df_dev = df_dev.iloc[1:]
df_test = df_test.iloc[1:]

# removing rows where tag is nan
df_dev = df_dev.dropna(subset=["tag"])
df_dev = df_dev[df_dev["tag"] != "nan"]
df_test = df_test.dropna(subset=["tag"])
df_test = df_test[df_test["tag"] != "nan"]

# lower case all words
df_test["word"] = df_test["word"].str.lower()
df_dev["word"] = df_dev["word"].str.lower()


def get_sentences(df):
    words = []
    tags = []
    lemmas = []
    sentence = []
    max_s = 0
    for index, row in df.iterrows():
        word = row["word"]
        tag = row["tag"]
        lemm = row["lemm"]
        sentence.append([word, tag, lemm])

        if row["word"] in [".", "?", "!", ";"]:
            words.append([word for word, tag, lemm in sentence])
            tags.append([tag for word, tag, lemm in sentence])
            lemmas.append([lemm for word, tag, lemm in sentence])
            max_s = max(max_s, len(sentence))
            sentence = []

    print("Max sentence length: ", max_s)
    return words, tags, lemmas

# _s is for string
dev_words_s, dev_tags_s, dev_lemmas_s = get_sentences(df_dev)
test_words_s, test_tags_s, test_lemmas_s = get_sentences(df_test)
print("Number of sentences in dev set: ", len(dev_words_s))
print("Number of sentences in test set: ", len(test_words_s))

for i in range(len(dev_words_s)):
    if len(dev_words_s[i]) != len(dev_tags_s[i]) or len(dev_words_s[i]) != len(dev_lemmas_s[i]):
        print("Dimension mismatch in sentence: ", i)
        print("Words: ", dev_words_s[i])
        print("Tags: ", dev_tags_s[i])
        print("Lemmas: ", dev_lemmas_s[i])
        break


Max sentence length:  95
Max sentence length:  107
Number of sentences in dev set:  703
Number of sentences in test set:  5596


## Word Context

In [210]:
CTX_DIM = 13
PRE_VALUE = "<PRE>"
POST_VALUE = "<POST>"

def get_context(words, tags, lemmas):
    ctx = []
    w = []
    tag = []
    lemma = []

    for s_index in range(len(words)):
        s = words[s_index]
        sentence = " ".join(s)
        s = [PRE_VALUE] * CTX_DIM + s + [POST_VALUE] * CTX_DIM

        for w_index in range(len(s)):
            if w_index < CTX_DIM or w_index >= len(s) - CTX_DIM:
                continue

            context = s[w_index - CTX_DIM:w_index] + s[w_index + 1:w_index  + CTX_DIM + 1]
            context = " ".join(context)
            ctx.append(context)
            w.append(words[s_index][w_index-CTX_DIM])
            tag.append(tags[s_index][w_index-CTX_DIM])
            lemma.append(lemmas[s_index][w_index-CTX_DIM])

    return ctx, w, tag, lemma

dev_ctx, dev_words, dev_tags, dev_lemmas = get_context(dev_words_s, dev_tags_s, dev_lemmas_s)
test_ctx, test_words, test_tags, test_lemmas = get_context(test_words_s, test_tags_s, test_lemmas_s)

dev_ctx, val_ctx, dev_words, val_words, dev_tags, val_tags, dev_lemmas, val_lemmas = train_test_split(dev_ctx, dev_words, dev_tags, dev_lemmas, test_size=0.01, random_state=42)

### Example of context

In [211]:
print("CTX Dim:", CTX_DIM, "\n")
for i in range(3):
    index = np.random.randint(0, len(dev_ctx))
    print("Context: ", dev_ctx[index])
    print("Word: ", dev_words[index])
    print("Tag: ", dev_tags[index])
    print("Lemma: ", dev_lemmas[index])
    print()


CTX Dim: 13 

Context:  con anni o mesi di anticipo , per gli asteroidi vaganti il rischio collisione inaspettata &egrave; molto pi&ugrave; grande e il fenomeno deve essere preso sul
Word:  di
Tag:  prep
Lemma:  di

Context:  <PRE> <PRE> <PRE> <PRE> <PRE> <PRE> <PRE> <PRE> <PRE> <PRE> perfino negli usa che pure sono all' avanguardia e vantano una diffusione capillare del telefono (
Word:  ,
Tag:  p_oth
Lemma:  ,

Context:  <PRE> <PRE> con il risultato che i loro figli si trovano avvantaggiati , hanno pi&ugrave; tempo per crescere prima_che giunga la cattiva stagione e pi&ugrave; tempo
Word:  in_quanto
Tag:  conj_s
Lemma:  in_quanto



## Open Class Words
The evaluation is done only on open-class words and not to functional words: only the tokens having a PoS-tag comprised in the set ADJ *, ADV, NN, V * had to be lemmatised, in all the other cases the token could be copied unchanged into the lemma column as they were not considered for the evaluation (the asterisk indicates all PoS-tag possibilities beginning with that prefix).

In [212]:
def get_open_class_words(ctx, words, tags, lemmas):
    open_class_words = []
    open_class_ctx = []
    open_class_tags = []
    open_class_lemmas = []

    for i in range(len(words)):
        if "adj_" in tags[i] or "adv" in tags[i] or "nn" in tags[i] or "v_" in tags[i]:
            open_class_words.append(words[i])
            open_class_ctx.append(ctx[i])
            open_class_tags.append(tags[i])
            open_class_lemmas.append(lemmas[i])

    return open_class_ctx, open_class_words, open_class_tags, open_class_lemmas


test_ctx, test_words, test_tags, test_lemmas = get_open_class_words(test_ctx, test_words, test_tags, test_lemmas)
dev_ctx, dev_words, dev_tags, dev_lemmas = get_open_class_words(dev_ctx, dev_words, dev_tags, dev_lemmas)
val_ctx, val_words, val_tags, val_lemmas = get_open_class_words(val_ctx, val_words, val_tags, val_lemmas)

## Encoding

In [213]:
# word encoder
word_tokenizer = Tokenizer(filters="")
word_tokenizer.fit_on_texts(dev_ctx + test_ctx + val_ctx)

# tag encoder
tag_tokenizer = Tokenizer(filters="")
tag_tokenizer.fit_on_texts(dev_tags_s + test_tags_s)

# lemma encoder
lemma_tokenizer = Tokenizer(filters="")
lemma_tokenizer.fit_on_texts(dev_lemmas_s + test_lemmas_s)

dev_ctx_e = word_tokenizer.texts_to_sequences(dev_ctx)
val_ctx_e = word_tokenizer.texts_to_sequences(val_ctx)
test_ctx_e = word_tokenizer.texts_to_sequences(test_ctx)

PRE_E = word_tokenizer.texts_to_sequences([PRE_VALUE])[0][0]
POST_E = word_tokenizer.texts_to_sequences([POST_VALUE])[0][0]


dev_tags_e = tag_tokenizer.texts_to_sequences(dev_tags)
val_tags_e = tag_tokenizer.texts_to_sequences(val_tags)
test_tags_e = tag_tokenizer.texts_to_sequences(test_tags)


# get all unique letter in words
characters = set()

for lemma in df_dev["lemm"].unique():
    for letter in lemma:
        characters.add(letter)

for lemma in df_test["lemm"].unique():
    for letter in lemma:
        characters.add(letter)

# add padding and unknown to characters
characters.add(" ")

# the length of the vocab for one-hot encoded char
VOCAB_SIZE = len(characters)

print ("Vocab size: ", VOCAB_SIZE)
# order characters
characters = sorted(list(characters))


char2idx = {char: idx for idx, char in enumerate(characters)}
idx2char = {idx: char for idx, char in enumerate(characters)}

MAX_WORD_LENGTH = 0
for w in dev_words + test_words + dev_lemmas + test_lemmas:
    MAX_WORD_LENGTH = max(MAX_WORD_LENGTH, len(w))
MAX_WORD_LENGTH += 1
print("Max word length: ", MAX_WORD_LENGTH)

def encode_words(words):
    encoded_words = []
    for word in words:
        word_e = []
        for letter in word:
            word_e.append(characters.index(letter))
        encoded_words.append(word_e)
    return encoded_words

dev_words_e = encode_words(dev_words)
test_words_e = encode_words(test_words)
val_words_e = encode_words(val_words)

dev_lemmas_e = encode_words(dev_lemmas)
test_lemmas_e = encode_words(test_lemmas)
val_lemmas_e = encode_words(val_lemmas)

dev_words_e = tf.keras.preprocessing.sequence.pad_sequences(dev_words_e, maxlen=MAX_WORD_LENGTH, padding="post")
test_words_e = tf.keras.preprocessing.sequence.pad_sequences(test_words_e, maxlen=MAX_WORD_LENGTH, padding="post")

dev_lemmas_e = tf.keras.preprocessing.sequence.pad_sequences(dev_lemmas_e, maxlen=MAX_WORD_LENGTH, padding="post")
test_lemmas_e = tf.keras.preprocessing.sequence.pad_sequences(test_lemmas_e, maxlen=MAX_WORD_LENGTH, padding="post")

val_words_e = tf.keras.preprocessing.sequence.pad_sequences(val_words_e, maxlen=MAX_WORD_LENGTH, padding="post")
val_lemmas_e = tf.keras.preprocessing.sequence.pad_sequences(val_lemmas_e, maxlen=MAX_WORD_LENGTH, padding="post")

# show random data point
index = np.random.randint(0, len(dev_ctx))
print("Context: ", dev_ctx[index], " -> ", dev_ctx_e[index])
print("Words: ", dev_words[index], " -> ", dev_words_e[index])
print("Tag: ", dev_tags[index], " -> ", dev_tags_e[index])
print("Lemma: ", dev_lemmas[index], " -> ", dev_lemmas_e[index])

Vocab size:  60
Max word length:  26
Context:  <PRE> <PRE> <PRE> <PRE> <PRE> <PRE> <PRE> <PRE> <PRE> <PRE> <PRE> ha una grossa con denti bianchi e forti , tranne_che per un canino che si  ->  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 26, 22, 1964, 23, 1757, 3474, 5, 1116, 3, 14624, 13, 11, 6239, 7, 17]
Words:  bocca  ->  [35 48 36 36 34  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0]
Tag:  nn  ->  [1]
Lemma:  bocca  ->  [35 48 36 36 34  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0]


In [214]:
# one hot encode the characters for the lemmas
dev_lemmas_e = tf.one_hot(dev_lemmas_e, VOCAB_SIZE)
test_lemmas_e = tf.one_hot(test_lemmas_e, VOCAB_SIZE)
val_lemmas_e = tf.one_hot(val_lemmas_e, VOCAB_SIZE)

In [215]:
# trnasform to numpy array
dev_ctx_e = np.array(dev_ctx_e)
dev_words_e = np.array(dev_words_e)
dev_tags_e = np.array(dev_tags_e)
dev_lemmas_e = np.array(dev_lemmas_e)

test_ctx_e = np.array(test_ctx_e)
test_words_e = np.array(test_words_e)
test_tags_e = np.array(test_tags_e)
test_lemmas_e = np.array(test_lemmas_e)

val_ctx_e = np.array(val_ctx_e)
val_words_e = np.array(val_words_e)
val_tags_e = np.array(val_tags_e)
val_lemmas_e = np.array(val_lemmas_e)

print("Context shape: ", dev_ctx_e.shape)
print("Words shape: ", dev_words_e.shape)
print("Tags shape: ", dev_tags_e.shape)
print("Lemmas shape: ", dev_lemmas_e.shape)

Context shape:  (8086, 26)
Words shape:  (8086, 26)
Tags shape:  (8086, 1)
Lemmas shape:  (8086, 26, 60)


## Model

### Lemmatization Accuracy

In [216]:
def accuracy(y_true, y_pred):
    y_true = tf.argmax(y_true, axis=-1)
    y_pred = tf.argmax(y_pred, axis=-1)
    correct_predictions = tf.reduce_all(tf.equal(y_true, y_pred), axis=-1)
    accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))
    return accuracy


### Word2Vec

In [217]:
EMBEDDING_DIM = 512
VOCABULARY_SIZE = len(word_tokenizer.word_index) + 1

def get_word2vec_weights(DIM):
    # train word2vec model
    word2vec = gensim.models.Word2Vec(dev_ctx, vector_size=DIM, window=10, min_count=1, workers=8)

    # create an empty embedding matix
    embedding_weights = np.zeros((VOCABULARY_SIZE, DIM))

    # create a word to index dictionary mapping
    word2id = word_tokenizer.word_index

    # copy vectors from word2vec model to the words present in corpus
    for word, index in word2id.items():
        try:
            embedding_weights[index, :] = word2vec.wv[word]
        except KeyError:
            pass

    return embedding_weights

embedding_weights = get_word2vec_weights(EMBEDDING_DIM)

In [218]:
# Neural network model
# inputs:
#   - context: (batch_size, CTX_DIM * 2) 
#   - tags: encoded tags: (batch_size, 1)
#   - words: encoded words: (batch_size, MAX_WORD_LENGTH)
# outputs:
#  - lemma: encoded lemma: (batch_size, MAX_WORD_LENGTH)

def get_model():
    # context
    context_input = Input(shape=(CTX_DIM * 2,), name="context_input")
    context_embedding = Embedding(len(word_tokenizer.word_index) + 1, EMBEDDING_DIM, input_length=CTX_DIM * 2,name="context_embedding", trainable=False, weights=[embedding_weights])(context_input)
    context_embedding = Bidirectional(LSTM(int(EMBEDDING_DIM/2), return_sequences=True, name="context_lstm"))(context_embedding)

    # tags
    tags_input = Input(shape=(1,), name="tags_input")
    tags_embedding = RepeatVector(MAX_WORD_LENGTH)(tags_input)
    tags_embedding = Dense(EMBEDDING_DIM, activation="swish",name="tags_embedding_dense2")(tags_embedding)

    # words
    words_input = Input(shape=(MAX_WORD_LENGTH,), name="words_input")
    words_input = Masking(mask_value=0)(words_input)

    words_embedding = Embedding(VOCAB_SIZE, int(EMBEDDING_DIM), input_length=MAX_WORD_LENGTH, name="words_embedding", trainable=True)(words_input)

    attention = tf.keras.layers.Attention(use_scale=True)([context_embedding, tags_embedding])

    # combine
    combine = Concatenate()([attention, words_embedding])

    combine = Dropout(0.5)(combine)
    lstm = Bidirectional(LSTM(EMBEDDING_DIM, return_sequences=True), name="lstm")(combine)
    lstm = Bidirectional(LSTM(EMBEDDING_DIM, return_sequences=True), name="lstm1")(lstm)

    # dense layers
    dense1 = Dense(EMBEDDING_DIM, activation="swish", name="dense1")(lstm)
    dense1 = Dropout(0.5)(dense1)
    dense2 = Dense(EMBEDDING_DIM, activation="swish", name="dense2")(dense1)
    dense2 = Dropout(0.5)(dense2)
    dense3 = Dense(EMBEDDING_DIM, activation="swish", name="dense3")(dense2)
    dense4 = Dense(EMBEDDING_DIM, activation="swish", name="dense4")(dense3)

    # output
    output = Dense(VOCAB_SIZE, activation="softmax", name="output")(dense3)

    model = Model(inputs=[context_input, tags_input,
                  words_input], outputs=output)
    return model


model = get_model()
model.summary()


Model: "model_20"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 context_input (InputLayer)     [(None, 26)]         0           []                               
                                                                                                  
 tags_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 context_embedding (Embedding)  (None, 26, 512)      10096128    ['context_input[0][0]']          
                                                                                                  
 repeat_vector_20 (RepeatVector  (None, 26, 1)       0           ['tags_input[0][0]']             
 )                                                                                         

### Training

In [219]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=[accuracy])

early_stopping = EarlyStopping(monitor="val_loss", patience=20, restore_best_weights=True)

# train model
history = model.fit([dev_ctx_e, dev_tags_e, dev_words_e], dev_lemmas_e, epochs=100, batch_size=256, validation_data=([val_ctx_e, val_tags_e, val_words_e], val_lemmas_e), callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

## Evaluation

In [220]:
# evaluate model
result = model.evaluate([test_ctx_e, test_tags_e, test_words_e], test_lemmas_e)
print("Test loss: ", result[0])
print("Test accuracy: ", result[1])


Test loss:  0.02970941551029682
Test accuracy:  0.9537237882614136
