# Lemmatization

# Dataset

In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding, Concatenate, Bidirectional, TimeDistributed, RepeatVector, Activation, Dot, Lambda
from keras import backend as K


dataset_path = "./out.csv"
# Tab-separated file with three columns. It is read with header=None, so the
# first line of the file ends up as an ordinary data row (index 0).
df = pd.read_csv(dataset_path, sep="\t", header=None, names=["word", "tag", "lemm"])

# Record missing values BEFORE casting to str: astype(str) turns NaN into the
# literal text "nan", after which isnull()/dropna() can no longer detect it.
missing = df.isna()

df["word"] = df["word"].astype(str)
df["tag"] = df["tag"].astype(str)
df["lemm"] = df["lemm"].astype(str)

# remove duplicates in word columns
df = df.drop_duplicates(subset=["word"])

# remove head: drop the first remaining row
# (presumably the file's own header line, read as data - confirm against out.csv)
df = df.iloc[1:]

# # removing punctuation and numbers
# df = df[df["tag"] != "p_oth"]
# df = df[df["tag"] != "c_num"]

print("### DF shape:" ,df.shape)
print("\n### NaN values:")
# report the values that were really missing in the file for the rows still kept
print(missing.loc[df.index].sum())

# removing rows where tag is nan (using the mask taken before the str cast);
# .copy() gives an independent frame so the column assignment below is safe
tag_missing = missing.loc[df.index, "tag"]
df = df.loc[~tag_missing].copy()
print("\n### DF shape after removing rows where tag is nan:" ,df.shape)

# print number of unique values for each column
print("\n### Unique values:")
print(df.nunique())

# lower case all words
df["word"] = df["word"].str.lower()

# count number of row where word is equal to lemm
print("\n### Number of word that are equals to lemm:")
print(df[df["word"] == df["lemm"]].shape[0])

df.head()

### DF shape: (5113, 3)

### NaN values:
word    0
tag     0
lemm    0
dtype: int64

### DF shape after removing rows where tag is nan: (5113, 3)

### Unique values:
word    5113
tag       31
lemm    3952
dtype: int64

### Number of word that are equals to lemm:
2886


Unnamed: 0,word,tag,lemm
1,mi,pron_per,mi
2,riferisco,v_gvrb,riferire
3,al,prep_a,al
4,lavoro,nn,lavoro
5,dove,conj_s,dove


## Data Processing

In [2]:
# Collect every character that occurs in an inflected form or in a lemma
characters = set()
for column in ("word", "lemm"):
    for token in df[column]:
        # a string is an iterable of its characters
        characters.update(token)

# the space is used as padding symbol, so it has to be part of the alphabet
characters.add(" ")

# the length of the vocab for one-hot encoded char
vocab_size = len(characters)

print("\n### Number of unique characters:", vocab_size)

# The LSTM is fed sequences of one common length,
# so shorter words are right-padded with spaces
def pad_word(word, max_word_length):
    """Return `word` right-padded with spaces up to `max_word_length` characters.

    A word that already has that length (or more) is returned unchanged.
    """
    return word.ljust(max_word_length)

# Longest string found in either column
longest_word = df["word"].str.len().max()
longest_lemma = df["lemm"].str.len().max()
max_word_length = max(longest_word, longest_lemma)
print("\n### Max word length:", max_word_length)
# one extra position, so every padded string ends with at least one space
max_word_length = int(max_word_length + 1)

# pad both columns to the common length
for column in ("word", "lemm"):
    df[column] = df[column].apply(pad_word, args=(max_word_length,))

# order characters
characters = sorted(characters)

def word2int(word):
    """Map each character of `word` to its position in `characters`, as floats.

    Raises ValueError if a character is not in `characters`.
    """
    positions = []
    for symbol in word:
        positions.append(float(characters.index(symbol)))
    return positions

def int2word(ints):
    """Rebuild a string from positions in `characters`.

    Accepts integer positions as well as the float positions produced by
    `word2int`, so `int2word(word2int(w)) == w` holds.

    Raises IndexError if a position is outside `characters`.
    """
    # word2int yields floats, and a list cannot be indexed with a float
    return "".join(characters[int(i)] for i in ints)


### Number of unique characters: 57

### Max word length: 22


## Word Encoding

In [3]:
# One-hot encoder over the character inventory: a word becomes a sequence
# of one-hot rows, one row per character
char_enc = OneHotEncoder(sparse_output=False).fit([[symbol] for symbol in characters])

def encode_word(word):
    """One-hot encode `word` with `char_enc`, one row per character."""
    # the encoder expects a 2D input: one single-feature sample per character
    samples = [[symbol] for symbol in word]
    return char_enc.transform(samples)

def decode_word(word):
    """Decode a sequence of per-character rows back into a string.

    Parameters
    ----------
    word : array-like, one row per character
        Rows in the layout produced by `encode_word` (one column per
        character known to `char_enc`).

    Returns
    -------
    str
        The decoded characters joined together; padding spaces are kept.
    """
    # nothing to decode; also avoids handing an empty array to the encoder
    if len(word) == 0:
        return ""
    # Decode all rows in a single call. The result has one row per character
    # and a single column, so flatten it before joining. Joining (instead of
    # `"" += array`) yields a real str rather than a one-element array.
    decoded = char_enc.inverse_transform(word)
    return "".join(decoded.ravel())

# Encode the inflected form and the lemma of every row
for source, target in (("word", "word_e"), ("lemm", "lemm_e")):
    df[target] = df[source].apply(encode_word)

## Tag Encoding

In [4]:
# One-hot encode the POS tag of every row (fit and transform in one step)
tag_enc = OneHotEncoder(sparse_output=False)
df["tag_e"] = tag_enc.fit_transform(df[["tag"]]).tolist()

# the length of the vocab for one-hot encoded pos
pos_size = len(tag_enc.categories_[0])
print("### Number of POS tags:", pos_size)

### Number of POS tags: 31


## Train Test Split

In [5]:
# Stack the per-row encodings into dense arrays
x_word = np.asarray(list(df["word_e"]))
x_tag = np.asarray(list(df["tag_e"]))
y = np.asarray(list(df["lemm_e"]))

# Hold out 10% of the rows as test set ...
word_train, word_test, tag_train, tag_test, y_train, y_test = train_test_split(
    x_word, x_tag, y, test_size=0.1, random_state=42
)
# ... then 1% of what is left as validation set
word_train, word_val, tag_train, tag_val, y_train, y_val = train_test_split(
    word_train, tag_train, y_train, test_size=0.01, random_state=42
)

print("### Word train shape:", word_train.shape)
print("### Tag train shape:", tag_train.shape)
print("### Y train shape:", y_train.shape)

print("\n### Word val shape:", word_val.shape)
print("### Tag val shape:", tag_val.shape)
print("### Y val shape:", y_val.shape)

print("\n### Word test shape:", word_test.shape)
print("### Tag test shape:", tag_test.shape)
print("### Y test shape:", y_test.shape)

print("\n### Vocab size:", vocab_size)
print("### POS tag size:", pos_size)

### Word train shape: (4554, 23, 57)
### Tag train shape: (4554, 31)
### Y train shape: (4554, 23, 57)

### Word val shape: (47, 23, 57)
### Tag val shape: (47, 31)
### Y val shape: (47, 23, 57)

### Word test shape: (512, 23, 57)
### Tag test shape: (512, 31)
### Y test shape: (512, 23, 57)

### Vocab size: 57
### POS tag size: 31


# Model

In [6]:
def get_model():
    """Build the character-level lemmatizer network.

    Inputs
    ------
    word_input : shape ``(max_word_length, vocab_size)``
        The padded word, one row per character.
    tag_input : shape ``(pos_size,)``
        The encoded POS tag.

    The word passes through two stacked bidirectional LSTMs. The tag passes
    through a dense layer and is repeated once per character position. Both
    are concatenated position by position and mapped, through one more dense
    layer, to a softmax over the character vocabulary.

    Reads the module-level ``max_word_length``, ``vocab_size`` and ``pos_size``.

    Returns
    -------
    tf.keras.Model
        The uncompiled model taking ``[word_input, tag_input]``.
    """
    layers = tf.keras.layers

    # Inputs
    word_input = layers.Input(name="word_input", shape=(max_word_length, vocab_size))
    # shape has to be a real tuple: "(pos_size)" is only a parenthesised int
    tag_input = layers.Input(name="tag_input", shape=(pos_size,))

    # Bidirectional LSTM layers; return_sequences keeps one vector per character
    lstm = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(word_input)
    lstm = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(lstm)

    # Fully connected layer for tag_input, copied to every character position
    tag_fc = layers.Dense(64, name="tag_dense", activation='relu')(tag_input)
    tag_fc = layers.RepeatVector(max_word_length)(tag_fc)

    # Concatenate the two branches
    concat = layers.Concatenate()([lstm, tag_fc])

    # Fully connected layer
    fc = layers.Dense(64, name="lstm2dense", activation='relu')(concat)

    # Output layer: one distribution over the vocabulary per position
    output = layers.Dense(vocab_size, name="output", activation='softmax')(fc)

    # Create model
    return tf.keras.models.Model(inputs=[word_input, tag_input], outputs=output)


# Build the network and print its layer-by-layer summary
model = get_model()
model.summary()

Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



2023-01-14 12:36:49.710829: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-01-14 12:36:49.710970: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 word_input (InputLayer)        [(None, 23, 57)]     0           []                               
                                                                                                  
 tag_input (InputLayer)         [(None, 31)]         0           []                               
                                                                                                  
 bidirectional (Bidirectional)  (None, 23, 128)      62464       ['word_input[0][0]']             
                                                                                                  
 tag_dense (Dense)              (None, 64)           2048        ['tag_input[0][0]']              
                                                                                              

In [7]:
# Hyperparameters
# Both values are handed to model.fit below under the same names.
epochs = 50
batch_size = 256

def accuracy(y_true, y_pred):
    """Whole-word accuracy.

    A sample counts as correct only if the arg-max class matches at every
    position along the last-but-one axis; the result is the mean over samples.
    """
    true_ids = tf.argmax(y_true, axis=-1)
    pred_ids = tf.argmax(y_pred, axis=-1)
    # True only where all positions of a sample agree
    word_is_correct = tf.reduce_all(tf.equal(true_ids, pred_ids), axis=-1)
    return tf.reduce_mean(tf.cast(word_is_correct, tf.float32))

# Train with Adam on the per-character cross-entropy; the custom
# whole-word accuracy is reported next to the loss
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=[accuracy],
)
model.fit(
    x=[word_train, tag_train],
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=([word_val, tag_val], y_val),
)

Epoch 1/50


2023-01-14 12:36:50.436223: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-01-14 12:36:52.872066: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-14 12:36:53.243910: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-14 12:36:53.265969: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-14 12:36:53.451239: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-14 12:36:53.468114: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-14 12:36:53.687124: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114

 1/18 [>.............................] - ETA: 1:03 - loss: 4.0360 - accuracy: 0.0000e+00

2023-01-14 12:36:53.969032: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-14 12:36:53.993488: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-01-14 12:36:56.181795: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-14 12:36:56.327628: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-14 12:36:56.339576: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-14 12:36:56.435630: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-14 12:36:56.447555: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x2e80cce50>

# Evaluation

In [8]:
# Score the held-out test split; the two returned values are the loss
# and the metric the model was compiled with
loss, acc = model.evaluate(
    x=[word_test, tag_test],
    y=y_test,
    verbose=0,
)
print('### Test Accuracy: %f' % (100 * acc))
print('### Test Loss: %f' % (loss))

### Test Accuracy: 81.054688
### Test Loss: 0.059654


## Lemmatization Accuracy

Lemmatisation accuracy is defined as the number of correct lemma assignments divided by the total number of tokens in the test set belonging to the considered lexical classes (ADJ_, ADV, NN, V_).

(Evalita2011)

In [9]:
# Generate predictions for the test set
predictions = model.predict([word_test, tag_test], verbose=0)

# Convert predictions and gold labels to lemmas (each one decoded exactly once)
predicted_lemmas = [decode_word(pred) for pred in predictions]
true_lemmas = [decode_word(gold) for gold in y_test]

# Compare predictions to actual lemmas.
# bool(...) keeps the comparison valid whether decode_word yields a plain
# string or a one-element array.
correct = sum(
    bool(predicted == expected)
    for predicted, expected in zip(predicted_lemmas, true_lemmas)
)

# Calculate accuracy.
# Kept under its own name: rebinding `accuracy` here would replace the metric
# function the model was compiled with by a float.
# NOTE(review): the definition quoted above this cell restricts the score to
# tokens of the lexical classes, while this counts every test token - confirm
# whether a filter on the tag is intended.
lemmatization_accuracy = correct / len(y_test)
print("Lemmatization accuracy:", lemmatization_accuracy)

2023-01-14 12:37:49.629589: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-14 12:37:49.750224: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-14 12:37:49.750283: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-14 12:37:49.853008: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-14 12:37:49.864540: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Lemmatization accuracy: 0.810546875


## Demo

In [22]:
# try the model with an example
# Sample-level names are used so that the dataset-level `y` array built for
# the train/test split is not overwritten by a single sample.
sample_idx = np.random.randint(0, len(word_test))
sample_word = word_test[sample_idx]
sample_tag = tag_test[sample_idx]
sample_lemma = y_test[sample_idx]

print("### Word:", decode_word(sample_word))
print("### Tag:", tag_enc.inverse_transform([sample_tag])[0])
print("### Lemma:", decode_word(sample_lemma))

# the model works on batches, so the single sample is wrapped in a batch of one
prediction = model.predict([np.array([sample_word]), np.array([sample_tag])], verbose=0)
print("### Lemma prediction:", decode_word(prediction[0]))

### Word: ['sgraziata              ']
### Tag: ['adj']
### Lemma: ['sgraziato              ']
### Lemma prediction: ['sgraziato              ']


In [11]:
# Save the model to "lemmatizer.h5" in the working directory.
# NOTE(review): the model was compiled with the custom `accuracy` metric, so
# loading this file presumably needs custom_objects={"accuracy": accuracy}
# or compile=False - confirm.
model.save("lemmatizer.h5")