# Italian Word Lemmatizer

### Import


In [66]:
# Standard-library imports come first so that TF_CPP_MIN_LOG_LEVEL is already
# in the environment when keras/tensorflow are imported below; assigning it
# after those imports is commonly too late to silence TensorFlow's start-up
# log messages.
import datetime
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import seaborn as sns
import matplotlib.pyplot as plt
from keras import backend as K
from keras.layers import Input, LSTM, Dense, Embedding, Concatenate, Bidirectional, TimeDistributed, RepeatVector, Activation, Dot, Lambda, Dropout
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import gensim
import tensorflow_addons as tfa
import tensorflow as tf
import pandas as pd

# set all random seeds (NumPy global RNG and TensorFlow global seed)
np.random.seed(42)
tf.random.set_seed(42)


## Dataset

In [67]:

def _load_tokens(path):
    """Read one tab-separated token file and return the cleaned token table.

    The file is read without a header into the columns "word", "tag" and
    "lemm"; every column is converted to ``str``, the first row (the file's
    own header line) is removed, rows whose tag is missing are discarded and
    the words are lower-cased.

    Args:
        path: location of the tab-separated file.

    Returns:
        A DataFrame with string columns "word", "tag" and "lemm".
    """
    # NOTE(review): read_csv is left on its default quoting and NA parsing, so
    # a token that is a bare double quote, or a word such as "NA"/"null",
    # would not survive loading verbatim - check the data files for such tokens.
    frame = pd.read_csv(path, sep="\t", header=None,
                        names=["word", "tag", "lemm"])
    frame = frame.astype({"word": str, "tag": str, "lemm": str})

    # remove head
    frame = frame.iloc[1:]

    # removing rows where tag is nan: after the str conversion a missing tag
    # is the literal string "nan", so this comparison is the only filter that
    # can match (a dropna() at this point would never remove anything)
    frame = frame[frame["tag"] != "nan"]

    # lower case all words; assign() returns a new frame instead of writing
    # into the filtered slice
    return frame.assign(word=frame["word"].str.lower())


dataset_path = "./dev.csv"
df_dev = _load_tokens(dataset_path)

dataset_path = "./test.csv"
df_test = _load_tokens(dataset_path)


def get_sentences(df):
    """Group the token rows of ``df`` into sentences.

    Rows are consumed in order; a sentence is closed right after a token whose
    word is ".", "?", "!" or ";". Tokens that follow the last terminator are
    returned as a final sentence rather than being silently discarded.

    Prints the length of the longest sentence.

    Args:
        df: DataFrame with "word", "tag" and "lemm" columns, one token per row.

    Returns:
        A tuple ``(words, tags, lemmas)`` of three parallel lists, each
        holding one list per sentence.
    """
    terminators = {".", "?", "!", ";"}

    sentences = []
    current = []
    # Iterating the three columns directly avoids building a Series per row.
    for token in zip(df["word"], df["tag"], df["lemm"]):
        current.append(token)
        if token[0] in terminators:
            sentences.append(current)
            current = []

    # Keep a trailing sentence that has no closing punctuation.
    if current:
        sentences.append(current)

    words = [[word for word, _, _ in sent] for sent in sentences]
    tags = [[tag for _, tag, _ in sent] for sent in sentences]
    lemmas = [[lemm for _, _, lemm in sent] for sent in sentences]
    max_s = max((len(sent) for sent in sentences), default=0)

    print("Max sentence length: ", max_s)
    return words, tags, lemmas

# The "_s" suffix marks the string (not yet encoded) form of the data.
dev_words_s, dev_tags_s, dev_lemmas_s = get_sentences(df_dev)
test_words_s, test_tags_s, test_lemmas_s = get_sentences(df_test)
print("Number of sentences in dev set: ", len(dev_words_s))
print("Number of sentences in test set: ", len(test_words_s))

# Sanity check: report the first dev sentence whose word, tag and lemma lists
# do not all have the same length, then stop.
for i, sent_words in enumerate(dev_words_s):
    sent_tags = dev_tags_s[i]
    sent_lemmas = dev_lemmas_s[i]
    if len(sent_words) == len(sent_tags) == len(sent_lemmas):
        continue
    print("Dimension mismatch in sentence: ", i)
    print("Words: ", sent_words)
    print("Tags: ", sent_tags)
    print("Lemmas: ", sent_lemmas)
    break


Max sentence length:  95
Max sentence length:  107
Number of sentences in dev set:  703
Number of sentences in test set:  5596


## Word Context

In [68]:
CTX_DIM = 13
PRE_VALUE = "<PRE>"
POST_VALUE = "<POST>"

def get_context(words, tags, lemmas):
    ctx = []
    w = []
    tag = []
    lemma = []

    for s_index in range(len(words)):
        s = words[s_index]
        sentence = " ".join(s)
        s = [PRE_VALUE] * CTX_DIM + s + [POST_VALUE] * CTX_DIM

        for w_index in range(len(s)):
            if w_index < CTX_DIM or w_index >= len(s) - CTX_DIM:
                continue

            context = s[w_index - CTX_DIM:w_index] + s[w_index + 1:w_index  + CTX_DIM + 1]
            context = " ".join(context)
            ctx.append(context)
            w.append(words[s_index][w_index-CTX_DIM])
            tag.append(tags[s_index][w_index-CTX_DIM])
            lemma.append(lemmas[s_index][w_index-CTX_DIM])

    return ctx, w, tag, lemma

# Flatten both splits into per-token samples: for every token of every
# sentence one context string plus the token's word, tag and lemma.
dev_ctx, dev_words, dev_tags, dev_lemmas = get_context(dev_words_s, dev_tags_s, dev_lemmas_s)
test_ctx, test_words, test_tags, test_lemmas = get_context(test_words_s, test_tags_s, test_lemmas_s)

### Example of context

In [69]:
print("CTX Dim:", CTX_DIM, "\n")
# Print three samples drawn at random (with NumPy's global RNG) from the dev set.
for i in range(3):
    index = np.random.randint(0, len(dev_ctx))
    for label, column in (
        ("Context: ", dev_ctx),
        ("Word: ", dev_words),
        ("Tag: ", dev_tags),
        ("Lemma: ", dev_lemmas),
    ):
        print(label, column[index])
    print()


CTX Dim: 13 

Context:  , e le sorelle che non trovavano marito neanche a regalarle , e mamma la quale filava al buio per risparmiar l' olio della lucerna ,
Word:  la
Tag:  art
Lemma:  la

Context:  esiste una grave frattura tra gli stati uniti e altre grandi potenze locali internazionali , egli sfider&agrave; apertamente le risoluzioni onu provocando la reazione militare statunitense
Word:  e
Tag:  conj_c
Lemma:  e

Context:  <PRE> <PRE> <PRE> <PRE> <PRE> <PRE> <PRE> <PRE> <PRE> era una bandiera che tempo aveva cessato di sventolare . <POST> <POST> <POST> <POST> <POST> <POST> <POST>
Word:  da
Tag:  prep
Lemma:  da



## Encoding

In [70]:
# word encoder: word-level vocabulary built from every context string;
# filters="" keeps punctuation and the padding markers as ordinary tokens
word_tokenizer = Tokenizer(filters="")
word_tokenizer.fit_on_texts(dev_ctx + test_ctx)

# tag encoder
tag_tokenizer = Tokenizer(filters="")
tag_tokenizer.fit_on_texts(dev_tags_s + test_tags_s)

# lemma encoder
lemma_tokenizer = Tokenizer(filters="")
lemma_tokenizer.fit_on_texts(dev_lemmas_s + test_lemmas_s)

dev_ctx_e = word_tokenizer.texts_to_sequences(dev_ctx)
test_ctx_e = word_tokenizer.texts_to_sequences(test_ctx)

dev_tags_e = tag_tokenizer.texts_to_sequences(dev_tags)
test_tags_e = tag_tokenizer.texts_to_sequences(test_tags)

# Character vocabulary. It has to cover the words as well as the lemmas,
# because both are encoded with it further down; collecting the characters
# from the lemmas alone would make encoding fail on any character that only
# occurs in a word.
PAD_CHAR = " "
characters = {
    letter
    for text in dev_words + test_words + dev_lemmas + test_lemmas
    for letter in text
}

# The padding character is placed first explicitly: padded positions are
# filled with 0, so index 0 must belong to the padding character and not to
# whichever character happens to sort first.
characters.discard(PAD_CHAR)
characters = [PAD_CHAR] + sorted(characters)

# the length of the vocab for one-hot encoded char
VOCAB_SIZE = len(characters)

char2idx = {char: idx for idx, char in enumerate(characters)}
idx2char = {idx: char for idx, char in enumerate(characters)}

# Longest word or lemma plus one, so every padded sequence ends with at least
# one padding position.
MAX_WORD_LENGTH = max(
    (len(w) for w in dev_words + test_words + dev_lemmas + test_lemmas),
    default=0,
) + 1
print("Max word length: ", MAX_WORD_LENGTH)

def encode_words(words, char_index=None):
    """Encode each word as the list of its character indices.

    Args:
        words: iterable of strings.
        char_index: optional mapping from character to integer index. When
            omitted, the notebook-level ``char2idx`` mapping is used.

    Returns:
        A list with one list of integer indices per input word (an empty
        string gives an empty list).

    Raises:
        ValueError: if a word contains a character missing from the mapping.
    """
    # Dictionary lookup is O(1) per character, unlike list.index().
    lookup = char2idx if char_index is None else char_index

    encoded_words = []
    for word in words:
        try:
            encoded_words.append([lookup[letter] for letter in word])
        except KeyError as missing:
            raise ValueError(
                f"character {missing.args[0]!r} of word {word!r} is not in the character vocabulary"
            ) from None
    return encoded_words

def _encode_padded(texts):
    """Encode strings character by character and post-pad them to MAX_WORD_LENGTH."""
    return tf.keras.preprocessing.sequence.pad_sequences(
        encode_words(texts), maxlen=MAX_WORD_LENGTH, padding="post"
    )


dev_words_e = _encode_padded(dev_words)
test_words_e = _encode_padded(test_words)

dev_lemmas_e = _encode_padded(dev_lemmas)
test_lemmas_e = _encode_padded(test_lemmas)


# show random data point: raw value next to its encoded form
index = np.random.randint(0, len(dev_ctx))
for label, raw, encoded in (
    ("Context: ", dev_ctx, dev_ctx_e),
    ("Words: ", dev_words, dev_words_e),
    ("Tag: ", dev_tags, dev_tags_e),
    ("Lemma: ", dev_lemmas, dev_lemmas_e),
):
    print(label, raw[index], " -> ", encoded[index])

Max word length:  26
Context:  <PRE> <PRE> <PRE> <PRE> <PRE> <PRE> <PRE> sullo scalandrone invece il tempo della &egrave; assai pi&ugrave; disteso e il ritmo necessariamente pi&ugrave; lento , pi&ugrave; o  ->  [1, 1, 1, 1, 1, 1, 1, 1460, 4566, 214, 8, 111, 24, 14, 585, 29, 7720, 5, 8, 1867, 3149, 29, 1692, 3, 29, 35]
Words:  danza  ->  [37 34 47 59 34  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0]
Tag:  nn  ->  [1]
Lemma:  danza  ->  [37 34 47 59 34  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0]


In [71]:
# one hot encode the characters for the lemmas
# Each padded matrix of character ids gains a trailing axis of size
# VOCAB_SIZE; these tensors are the training and evaluation targets.
# NOTE: the names are rebound in place, so this cell is not idempotent -
# running it twice would one-hot encode data that is already one-hot.
dev_lemmas_e = tf.one_hot(dev_lemmas_e, VOCAB_SIZE)
test_lemmas_e = tf.one_hot(test_lemmas_e, VOCAB_SIZE)

In [72]:
# transform to numpy array
# The context and tag encodings are Python lists of lists up to this point and
# the one-hot lemmas are TensorFlow tensors; everything becomes an ndarray here.
dev_ctx_e = np.array(dev_ctx_e)
dev_words_e = np.array(dev_words_e)
dev_tags_e = np.array(dev_tags_e)
dev_lemmas_e = np.array(dev_lemmas_e)

test_ctx_e = np.array(test_ctx_e)
test_words_e = np.array(test_words_e)
test_tags_e = np.array(test_tags_e)
test_lemmas_e = np.array(test_lemmas_e)

# Shapes of the dev arrays (first axis = number of token samples).
print("Context shape: ", dev_ctx_e.shape)
print("Words shape: ", dev_words_e.shape)
print("Tags shape: ", dev_tags_e.shape)
print("Lemmas shape: ", dev_lemmas_e.shape)

Context shape:  (17301, 26)
Words shape:  (17301, 26)
Tags shape:  (17301, 1)
Lemmas shape:  (17301, 26, 60)


## Model

In [73]:
EMBEDDING_SIZE = 256
# One extra row because the tokenizer's word indices start at 1.
VOCABULARY_SIZE = len(word_tokenizer.word_index) + 1

# train word2vec model on the un-padded sentences of both splits
word2vec = gensim.models.Word2Vec(
    dev_words_s + test_words_s,
    vector_size=EMBEDDING_SIZE,
    window=10,
    min_count=1,
    workers=8,
)

# word -> row index mapping taken from the context tokenizer
word2id = word_tokenizer.word_index

# start from an all-zero matrix and fill in the rows of the words word2vec knows
embedding_weights = np.zeros((VOCABULARY_SIZE, EMBEDDING_SIZE))

for word, index in word2id.items():
    # Words without a word2vec vector keep their all-zero row.
    if word in word2vec.wv:
        embedding_weights[index, :] = word2vec.wv[word]

In [74]:
# Character-level lemmatizer network built on two stacked bidirectional LSTMs
# inputs:
#   - context: encoded context words: (batch_size, CTX_DIM * 2)
#   - tags: encoded tags: (batch_size, 1)
#   - words: encoded words: (batch_size, MAX_WORD_LENGTH)
# outputs:
#   - lemma: softmax over the character vocabulary for every position:
#     (batch_size, MAX_WORD_LENGTH, VOCAB_SIZE)

EMBEDDING_DIM = 256

def get_model():
    """Build the (uncompiled) lemmatization model.

    The three inputs are each projected to EMBEDDING_DIM features per time
    step, summed element-wise, passed through two bidirectional LSTMs and
    three dense layers, and finally mapped to a per-position softmax over the
    VOCAB_SIZE characters.

    Returns:
        A Keras ``Model`` taking ``[context, tags, words]`` as inputs.

    Raises:
        ValueError: if the context length differs from MAX_WORD_LENGTH, or if
            ``embedding_weights`` does not have the shape the context
            embedding layer needs.
    """
    # The Add() layer below sums the context branch (CTX_DIM * 2 time steps)
    # with the tag/word branches (MAX_WORD_LENGTH time steps), so the two
    # lengths must coincide. Fail early with an explicit message instead of an
    # opaque shape error from inside Keras.
    if CTX_DIM * 2 != MAX_WORD_LENGTH:
        raise ValueError(
            f"context length CTX_DIM * 2 = {CTX_DIM * 2} must equal "
            f"MAX_WORD_LENGTH = {MAX_WORD_LENGTH} for the branches to be added"
        )

    # The pre-trained matrix is built with its own size constant; make sure it
    # matches what the embedding layer is declared with.
    expected_shape = (len(word_tokenizer.word_index) + 1, EMBEDDING_DIM)
    if embedding_weights.shape != expected_shape:
        raise ValueError(
            f"embedding_weights has shape {embedding_weights.shape}, expected {expected_shape}"
        )

    # context: frozen word2vec embeddings followed by a trainable projection
    context_input = Input(shape=(CTX_DIM * 2,), name="context_input")
    context_embedding = Embedding(len(word_tokenizer.word_index) + 1, EMBEDDING_DIM, input_length=CTX_DIM * 2, name="context_embedding", trainable=False, weights=[embedding_weights])(context_input)
    context_embedding = Dense(EMBEDDING_DIM, activation="swish", name="context_embedding_dense")(context_embedding)

    # tags: the raw tag id is repeated along the character axis and projected
    # by a dense layer (no embedding lookup is used for it)
    tags_input = Input(shape=(1,), name="tags_input")
    tags_embedding = RepeatVector(MAX_WORD_LENGTH)(tags_input)
    tags_embedding = Dense(EMBEDDING_DIM, activation="swish", name="tags_embedding_dense")(tags_embedding)

    # words: trainable character embeddings
    words_input = Input(shape=(MAX_WORD_LENGTH,), name="words_input")
    words_embedding = Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_WORD_LENGTH, name="words_embedding", trainable=True)(words_input)
    words_embedding = Dense(EMBEDDING_DIM, activation="swish", name="words_embedding_dense")(words_embedding)

    # combine context, tags and words without using concatenation
    combine = tf.keras.layers.Add()([context_embedding, tags_embedding, words_embedding])

    combine = Dropout(0.5)(combine)
    lstm = Bidirectional(LSTM(EMBEDDING_DIM, return_sequences=True), name="lstm")(combine)
    lstm = Bidirectional(LSTM(EMBEDDING_DIM, return_sequences=True), name="lstm2")(lstm)

    # dense layers
    dense1 = Dense(EMBEDDING_DIM, activation="swish", name="dense1")(lstm)
    dense1 = Dropout(0.5)(dense1)
    dense2 = Dense(EMBEDDING_DIM, activation="swish", name="dense2")(dense1)
    dense3 = Dense(EMBEDDING_DIM, activation="swish", name="dense3")(dense2)

    # output: one character distribution per position
    output = Dense(VOCAB_SIZE, activation="softmax", name="output")(dense3)

    model = Model(inputs=[context_input, tags_input, words_input], outputs=output)
    return model

model = get_model()
model.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 context_input (InputLayer)     [(None, 26)]         0           []                               
                                                                                                  
 tags_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 words_input (InputLayer)       [(None, 26)]         0           []                               
                                                                                                  
 context_embedding (Embedding)  (None, 26, 256)      5051392     ['context_input[0][0]']          
                                                                                            

### Lemmatization Accuracy

In [75]:
def accuracy(y_true, y_pred):
    """Exact-match accuracy over whole sequences.

    Both arguments are reduced with an arg-max over their last axis; a sample
    counts as correct only when every position along the (then) last axis
    agrees. The result is the fraction of correct samples as a float32 scalar.
    """
    true_ids = tf.argmax(y_true, axis=-1)
    predicted_ids = tf.argmax(y_pred, axis=-1)
    # True for a sample only if all of its positions match.
    sample_is_correct = tf.reduce_all(tf.equal(true_ids, predicted_ids), axis=-1)
    return tf.reduce_mean(tf.cast(sample_is_correct, tf.float32))

### Training

In [76]:
# Categorical cross-entropy against the one-hot lemma targets, Adam with its
# default settings, and the exact-match `accuracy` function as the only metric.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=[accuracy])

# train model
# validation_split=0.05 makes Keras hold out the last 5% of the samples (taken
# before shuffling) for validation. No callbacks are passed, so training runs
# for the full 100 epochs unless interrupted.
history = model.fit([dev_ctx_e, dev_tags_e, dev_words_e], dev_lemmas_e, epochs=100, batch_size=128, validation_split=0.05)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100

## Evaluation

In [None]:
# evaluate model
# `result` holds the loss followed by the compiled metric (the exact-match
# accuracy), in that order.
# NOTE(review): the metric is presumably averaged over batches by Keras, which
# would explain a tiny difference from a per-sample recomputation - confirm.
result = model.evaluate([test_ctx_e, test_tags_e, test_words_e], test_lemmas_e)
print("Test loss: ", result[0])
print("Test accuracy: ", result[1])


Test loss:  0.019336679950356483
Test accuracy:  0.969872236251831


In [None]:
# Recompute the exact-match accuracy directly from the predictions: a lemma
# counts as correct only if every character position (padding included)
# matches the target.
pred = model.predict([test_ctx_e, test_tags_e, test_words_e])

# Most probable character id per position, for predictions and targets.
y_pred = np.argmax(pred, axis=-1)
y_true = np.argmax(test_lemmas_e, axis=-1)

total = len(test_lemmas)
# One vectorised comparison replaces the per-sample Python loop.
correct = int(np.all(y_pred == y_true, axis=-1).sum())

# Guard against an empty test set instead of dividing by zero.
print("Accuracy: ", correct / total if total else float("nan"))


Accuracy:  0.9698715406887574
