# Italian sentences lemmatization

## Dataset

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
from keras import backend as K
from keras.layers import Input, LSTM, Dense, Embedding, Concatenate, Bidirectional, TimeDistributed, RepeatVector, Activation, Dot, Lambda
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import gensim
import tensorflow_addons as tfa
import tensorflow as tf
import pandas as pd
import datetime
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# set all random seeds
np.random.seed(42)
tf.random.set_seed(42)

dataset_path = "./dev.csv"
df_dev = pd.read_csv(dataset_path, sep="\t", header=None,names=["word", "tag", "lemm"])

dataset_path = "./test.csv"
df_test = pd.read_csv(dataset_path, sep="\t",header=None, names=["word", "tag", "lemm"])

df_dev["word"] = df_dev["word"].astype(str)
df_dev["tag"] = df_dev["tag"].astype(str)
df_dev["lemm"] = df_dev["lemm"].astype(str)

df_test["word"] = df_test["word"].astype(str)
df_test["tag"] = df_test["tag"].astype(str)
df_test["lemm"] = df_test["lemm"].astype(str)

# remove head
df_dev = df_dev.iloc[1:]
df_test = df_test.iloc[1:]

# removing rows where tag is nan
df_dev = df_dev.dropna(subset=["tag"])
df_dev = df_dev[df_dev["tag"] != "nan"]
df_test = df_test.dropna(subset=["tag"])
df_test = df_test[df_test["tag"] != "nan"]

# lower case all words
df_test["word"] = df_test["word"].str.lower()
df_dev["word"] = df_dev["word"].str.lower()


def get_sentences(df):
    words = []
    tags = []
    lemmas = []
    sentence = []
    max_s = 0
    for index, row in df.iterrows():
        word = row["word"]
        tag = row["tag"]
        lemm = row["lemm"]
        sentence.append([word, tag, lemm])

        if row["word"] in [".", "?", "!", ";"]:
            words.append([word for word, tag, lemm in sentence])
            tags.append([tag for word, tag, lemm in sentence])
            lemmas.append([lemm for word, tag, lemm in sentence])
            max_s = max(max_s, len(sentence))
            sentence = []

    print("Max sentence length: ", max_s)
    return words, tags, lemmas

s_dev_words, s_dev_tags, s_dev_lemmas = get_sentences(df_dev)
s_test_words, s_test_tags, s_test_lemmas = get_sentences(df_test)

print("Number of sentences in dev set: ", len(s_dev_words))
print("Number of sentences in test set: ", len(s_test_words))

for i in range(len(s_dev_words)):
    if len(s_dev_words[i]) != len(s_dev_tags[i]) or len(s_dev_words[i]) != len(s_dev_lemmas[i]):
        print("Dimension mismatch in sentence: ", i)
        print("Words: ", s_dev_words[i])
        print("Tags: ", s_dev_tags[i])
        print("Lemmas: ", s_dev_lemmas[i])
        break

Max sentence length:  95
Max sentence length:  107
Number of sentences in dev set:  703
Number of sentences in test set:  5596


In [2]:
# encode words
word_tokenizer = Tokenizer(filters="")
word_tokenizer.fit_on_texts(s_dev_words + s_test_words)
s_dev_words_e = word_tokenizer.texts_to_sequences(s_dev_words)
s_test_words_e = word_tokenizer.texts_to_sequences(s_test_words)

# encode tags
tag_tokenizer = Tokenizer(filters="")
tag_tokenizer.fit_on_texts(s_dev_tags + s_test_tags)
s_dev_tags_e = tag_tokenizer.texts_to_sequences(s_dev_tags)
s_test_tags_e = tag_tokenizer.texts_to_sequences(s_test_tags)

# encode lemmas
lemma_tokenizer = Tokenizer(filters="")
lemma_tokenizer.fit_on_texts(s_dev_lemmas + s_test_lemmas)
s_dev_lemmas_e = lemma_tokenizer.texts_to_sequences(s_dev_lemmas)
s_test_lemmas_e = lemma_tokenizer.texts_to_sequences(s_test_lemmas)

# look at first encoded data point
print("Word: ", s_dev_words[0])
print("Tag: ", s_dev_tags[0])
print("Lemma: ", s_dev_lemmas[0])
print("Encoded word: ", s_dev_words_e[0])
print("Encoded tag: ", s_dev_tags_e[0])
print("Encoded lemma: ", s_dev_lemmas_e[0])

Word:  ['mi', 'riferisco', 'al', 'lavoro', 'dove', 'non', "c'", '&egrave;', ',', 'innanzitutto', 'nel', 'mezzogiorno', ',', 'e', 'al', 'lavoro', 'che', 'cambia', '.']
Tag:  ['pron_per', 'v_gvrb', 'prep_a', 'nn', 'conj_s', 'adv', 'adv', 'v_essere', 'p_oth', 'adv', 'prep_a', 'nn_p', 'p_oth', 'conj_c', 'prep_a', 'nn', 'pron_rel', 'v_gvrb', 'p_eos']
Lemma:  ['mi', 'riferire', 'al', 'lavoro', 'dove', 'non', 'ci', 'essere', ',', 'innanzitutto', 'nel', 'mezzogiorno', ',', 'e', 'al', 'lavoro', 'che', 'cambiare', '.']
Encoded word:  [153, 8977, 31, 162, 99, 13, 75, 11, 1, 4242, 35, 1277, 1, 4, 31, 162, 5, 3359, 2]
Encoded tag:  [14, 5, 7, 1, 16, 8, 8, 13, 4, 8, 7, 9, 4, 11, 7, 1, 15, 5, 10]
Encoded lemma:  [167, 2613, 32, 147, 104, 14, 41, 4, 1, 3755, 39, 1380, 1, 5, 32, 147, 6, 642, 2]


In [3]:
# find max len
max_len = 0
for i in range(len(s_dev_words_e)):
    max_len = max(max_len, len(s_dev_words_e[i]))

for i in range(len(s_test_words_e)):
    max_len = max(max_len, len(s_test_words_e[i]))
    
print("Max len: ", max_len)
padding_type = "pre"

s_dev_words_e = tf.keras.preprocessing.sequence.pad_sequences(s_dev_words_e, maxlen=max_len, padding=padding_type)
s_test_words_e = tf.keras.preprocessing.sequence.pad_sequences(s_test_words_e, maxlen=max_len, padding=padding_type)
s_dev_tags_e = tf.keras.preprocessing.sequence.pad_sequences(s_dev_tags_e, maxlen=max_len, padding=padding_type)
s_test_tags_e = tf.keras.preprocessing.sequence.pad_sequences(s_test_tags_e, maxlen=max_len, padding=padding_type)
s_dev_lemmas_e = tf.keras.preprocessing.sequence.pad_sequences(s_dev_lemmas_e, maxlen=max_len, padding=padding_type)
s_test_lemmas_e = tf.keras.preprocessing.sequence.pad_sequences(s_test_lemmas_e, maxlen=max_len, padding=padding_type)

# print first encoded data point
print("Encoded words: ", s_dev_words_e[0])
print("Encoded tags: ", s_dev_tags_e[0])
print("Encoded lemmas: ", s_dev_lemmas_e[0])


Max len:  107
Encoded words:  [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0  153 8977   31  162   99   13   75   11    1 4242
   35 1277    1    4   31  162    5 3359    2]
Encoded tags:  [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 14  5  7  1 16  8  8 13
  4  8  7  9  4 11  7  1 15  5 10]
Encoded lemmas:  [   0    0    0    0    0    0    0    0    0    0    0    0    0  

In [4]:
EMBEDDING_SIZE = 300
VOCABULARY_SIZE = len(word_tokenizer.word_index) + 1

# train word2vec model
word2vec = gensim.models.Word2Vec(s_dev_words + s_test_words, vector_size=EMBEDDING_SIZE, window=7, min_count=1, workers=4)

# create an empty embedding matix
embedding_weights = np.zeros((VOCABULARY_SIZE, EMBEDDING_SIZE))

# create a word to index dictionary mapping
word2id = word_tokenizer.word_index

# copy vectors from word2vec model to the words present in corpus
for word, index in word2id.items():
    try:
        embedding_weights[index, :] = word2vec.wv[word]
    except KeyError:
        pass

In [5]:
tag_enc = OneHotEncoder(sparse_output=False)

# get all tags
tags = list(tag_tokenizer.word_index.values())
tags = [0] + tags
tags = np.array(tags).reshape(-1, 1)

dev_tags_1he = np.empty((len(s_dev_tags_e), max_len, len(tags)))
test_tags_1he = np.empty((len(s_test_tags_e), max_len, len(tags)))

# fit and transform all tags
tag_enc.fit(tags)
for i in range(len(s_dev_tags_e)):
    for j in range(len(s_dev_tags_e[i])):
        dev_tags_1he[i][j] = tag_enc.transform([[s_dev_tags_e[i][j]]])[0]


for i in range(len(s_test_tags_e)):
    for j in range(len(s_test_tags_e[i])):
        test_tags_1he[i][j] = tag_enc.transform([[s_test_tags_e[i][j]]])[0]

In [6]:
lemmas_enc = OneHotEncoder(sparse_output=False)

# get all lemmas
lemmas = list(lemma_tokenizer.word_index.values())
lemmas = [0] + lemmas
lemmas = np.array(lemmas).reshape(-1, 1)

dev_lemmas_1he = np.empty((len(s_dev_lemmas_e), max_len, len(lemmas)))
test_lemmas_1he = np.empty((len(s_test_lemmas_e), max_len, len(lemmas)))

# fit and transform all lemmas
lemmas_enc.fit(lemmas)
for i in range(len(s_dev_lemmas_e)):
    for j in range(len(s_dev_lemmas_e[i])):
        dev_lemmas_1he[i][j] = lemmas_enc.transform([[s_dev_lemmas_e[i][j]]])[0]

for i in range(len(s_test_lemmas_e)):
    for j in range(len(s_test_lemmas_e[i])):
        test_lemmas_1he[i][j] = lemmas_enc.transform([[s_test_lemmas_e[i][j]]])[0]

In [7]:
# get all unique letter in words
characters = set()

for lemma in df_test["lemm"]:
    for letter in lemma:
        characters.add(letter)

for word in df_dev["word"]:
    for letter in word:
        characters.add(letter)

# add padding and unknown to characters
characters.add(" ")

# the length of the vocab for one-hot encoded char
vocab_size = len(characters)

#print("\n### Number of unique characters:", vocab_size)

max_word_length = max(df_dev["word"].str.len().max(), df_test["lemm"].str.len().max())
#print("\n### Max word length:", max_word_length)
max_word_length += 1
max_word_length = int(max_word_length)


# Each word is encoded as a list of one-hot encoded characters
char_enc = OneHotEncoder(sparse_output=False)
char_enc.fit([[char] for char in characters])

def pad_word(word, max_word_length):
    return word + " " * (max_word_length - len(word))

def encode_word(word):
    return char_enc.transform([[char] for char in word])

def encode_words(s):
    words = []
    for w in s:
        # transform w from int to string
        if w == 0:
            w = ""
        else:
            w = lemma_tokenizer.index_word[w]

        w = pad_word(w, max_word_length)
        words.append(encode_word(w))
    return words

mask_value = char_enc.transform([[" "]])[0]

d_lemmas_e = np.array([encode_words(s) for s in s_dev_lemmas_e])
t_leamms_e = np.array([encode_words(s) for s in s_test_lemmas_e])
print("Shape of encoded words:", d_lemmas_e.shape)


Shape of encoded words: (703, 107, 26, 60)


In [16]:
# bidirectional LSTM
def get_model():
    # input layers
    word_input = Input(shape=(max_len,), name="word_input")
    tag_input = Input(shape=(max_len, len(tags)), name="tag_input")

    # embedding layers
    word_embedding = Embedding(VOCABULARY_SIZE, EMBEDDING_SIZE, weights=[embedding_weights], trainable=True, name="word_embedding")(word_input)
    tag_embedding = TimeDistributed(Dense(EMBEDDING_SIZE, activation="swish"), name="tag_embedding")(tag_input)

    # concatenate embeddings
    concat = Concatenate(axis=-1, name="concat")([word_embedding, tag_embedding])

    # bidirectional LSTM
    lstm = Bidirectional(LSTM(EMBEDDING_SIZE, return_sequences=True), name="lstm")(concat)
    lstm = Bidirectional(LSTM(EMBEDDING_SIZE, return_sequences=True), name="lstm2")(lstm)


    dense = TimeDistributed(Dense(EMBEDDING_SIZE, activation="swish"), name="dense")(lstm)
    dense = TimeDistributed(Dense(EMBEDDING_SIZE, activation="swish"), name="dense2")(dense)
    
    # output layer
    output = TimeDistributed(Dense(len(lemmas), activation="softmax"), name="output")(dense)

    model = Model(inputs=[word_input, tag_input], outputs=output)
    return model

model = get_model()
model.summary()


Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 word_input (InputLayer)        [(None, 107)]        0           []                               
                                                                                                  
 tag_input (InputLayer)         [(None, 107, 32)]    0           []                               
                                                                                                  
 word_embedding (Embedding)     (None, 107, 300)     5919000     ['word_input[0][0]']             
                                                                                                  
 tag_embedding (TimeDistributed  (None, 107, 300)    9900        ['tag_input[0][0]']              
 )                                                                                          

In [9]:
# custom accuracy metric
# a word is correct if all letters are correct
def accuracy(y_true, y_pred):
    y_true = K.argmax(y_true, axis=-1)
    y_pred = K.argmax(y_pred, axis=-1)
    correct = K.cast(K.equal(y_true, y_pred), K.floatx())
    return K.mean(correct)

In [17]:
# train model
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit([s_dev_words_e, dev_tags_1he], dev_lemmas_1he, epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
# test model
model.evaluate([s_test_words_e, test_tags_1he], test_lemmas_1he)

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

## Lemmatization Accuracy

Lemmatisation accuracy is defined as the number of correct lemma assignment divided by the total number of tokens in the test set belonging to the considered lexical classes (ADJ_, ADV,NN, V_). 

(Evalita2011)