# Lemmatization

# Dataset

In [110]:
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding, Concatenate, Bidirectional, TimeDistributed, RepeatVector, Activation, Dot, Lambda
from keras import backend as K


dataset_path = "./out.csv"

# Read every column as text so tokens are kept verbatim. Cells that pandas
# recognises as missing stay NaN here, which is what allows the NaN report and
# the dropna below to actually see them (casting to str first would turn them
# into the literal string "nan").
df = pd.read_csv(
    dataset_path,
    sep="\t",
    header=None,
    names=["word", "tag", "lemm"],
    dtype=str,
)

# remove duplicates in word columns (the first occurrence of each word is kept)
df = df.drop_duplicates(subset=["word"])

# remove head: drop the first remaining row
# (presumably the CSV header line, since the file is read with header=None -- confirm)
df = df.iloc[1:]

# removing punctuation and numbers
df = df[df["tag"] != "p_oth"]
df = df[df["tag"] != "c_num"]

print("### DF shape:", df.shape)
print("\n### NaN values:")
print(df.isnull().sum())

# removing rows where tag is nan
df = df.dropna(subset=["tag"])
print("\n### DF shape after removing rows where tag is nan:", df.shape)

# Cast to plain Python strings only now that the missing tags are gone.
# Any remaining missing word/lemma becomes the string "nan".
df = df.astype({"word": str, "tag": str, "lemm": str})

# print number of unique values for each column
print("\n### Unique values:")
print(df.nunique())

# lower case all words
# NOTE: this runs after the de-duplication above, so the "word" column may
# contain duplicates again from here on (e.g. forms differing only in case).
df["word"] = df["word"].str.lower()

# count number of rows where word is equal to lemm
print("\n### Number of word that are equals to lemm:")
print(df[df["word"] == df["lemm"]].shape[0])

# optional: discard words that are equal to their lemma
# df = df[df["word"] != df["lemm"]]

df.head()

### DF shape: (404711, 3)

### NaN values:
word    0
tag     0
lemm    0
dtype: int64

### DF shape after removing rows where tag is nan: (404711, 3)

### Unique values:
word    404711
tag        581
lemm     33245
dtype: int64

### Number of word that are equals to lemm:
31830


Unnamed: 0,word,tag,lemm
1,!,pon,!
3,!!!,sent,!!!
4,"""",pon,""""
5,#,sym,#
6,$,sym,$


## Data Processing

In [111]:
# Build the character vocabulary from both the surface forms and the lemmas
characters = set()
for column in ("word", "lemm"):
    for text in df[column]:
        # a string is an iterable of its characters
        characters.update(text)

# the blank is used as the padding symbol, so it must be part of the vocabulary
characters.add(" ")
print("\n### Number of unique characters:", len(characters))

# sort to obtain a stable character -> index mapping
characters = sorted(characters)

# longest lemma in the data, printed for information only
max_word_length = df["lemm"].str.len().max()
print("\n### Max word length:", max_word_length)
# fixed padded width used from here on; it replaces the measured value above
max_word_length = 35

# right-pad a word with blanks up to a fixed width
def pad_word(word, max_word_length):
    """Return `word` right-padded with spaces to `max_word_length` characters.

    Words that are already at least `max_word_length` long are returned
    unchanged (they are not truncated).
    """
    return word.ljust(max_word_length)

# Pad surface forms and lemmas to the same fixed width
for column in ("word", "lemm"):
    df[column] = df[column].apply(pad_word, args=(max_word_length,))

def word2int(word):
    """Map each letter of `word` to its position in `characters`, as floats.

    Raises ValueError if a letter is not present in `characters`.
    """
    indices = []
    for letter in word:
        position = characters.index(letter)
        indices.append(float(position))
    return indices

def int2word(ints):
    """Map a sequence of character indices back to the corresponding string.

    Each index is looked up in `characters`. Indices are cast to int first so
    that the float indices produced by `word2int` can be fed back in directly;
    without the cast a float index raises TypeError when indexing the list.

    Raises IndexError if an index is outside the vocabulary.
    """
    return "".join(characters[int(i)] for i in ints)


### Number of unique characters: 82

### Max word length: 29


In [112]:
# import numpy as np
# from keras.layers import Embedding

# # create a dictionary that maps characters to integers
# char2int = {c: i for i, c in enumerate(sorted(characters))}

# # create a dictionary that maps integers to characters
# int2char = {i: c for i, c in enumerate(sorted(characters))}


# # Create an embedding layer
# vocab_size = len(characters)
# embedding_dim = 4
# embedding_layer = Embedding(vocab_size, embedding_dim, input_length=1)

# # Convert characters to dense vectors using the embedding layer
# word = "abc"

# dense_vectors = []
# for char in word:
#     char_index = char2int[char]
#     dense_vector = embedding_layer(np.array([char_index]))
#     dense_vectors.append(dense_vector)
#     print("Character: {}, Index: {}, Vector: {}".format(char, char_index, dense_vector))
  
# # Concatenate dense vectors into a single tensor
# dense_tensor = np.concatenate(dense_vectors, axis=0)

# print("Word: {}".format(word))


# # apply the embedding layer to the all the words in the dataset
# with tf.device('/CPU:0'):
#     df["word_e"] = df["word"].apply(lambda x: embedding_layer(np.array([char2int[char] for char in x])))

# df.head()

## Word Encoding

In [113]:
# One-hot encoder over the character vocabulary: one single-feature row per character
char_rows = [[char] for char in characters]
char_enc = OneHotEncoder(sparse_output=False)
char_enc.fit(char_rows)

def encode_word(word):
    """One-hot encode `word` character by character with `char_enc`.

    Each character becomes one single-feature sample, so the result has one
    row per character of `word`.
    """
    samples = []
    for char in word:
        samples.append([char])
    return char_enc.transform(samples)

def decode_word(word):
    """Decode a one-hot (or per-character score) matrix back into a string.

    `word` holds one row per character position; every row is mapped to a
    character by `char_enc.inverse_transform`.

    The whole matrix is decoded in a single `inverse_transform` call instead of
    one call per character, and the characters are joined into a real `str`.
    The previous implementation accumulated with `"" += ndarray`, which
    silently produced a one-element array instead of a string.

    Returns an empty string for an empty input.
    """
    if len(word) == 0:
        return ""
    # inverse_transform returns one column per encoded feature; there is a
    # single feature here (the character), so take column 0.
    decoded = char_enc.inverse_transform(word)
    return "".join(decoded[:, 0])
        
# One-hot encode the padded surface forms and lemmas into new columns
for source, target in (("word", "word_e"), ("lemm", "lemm_e")):
    df[target] = df[source].apply(encode_word)

## Tag Encoding

In [114]:
# One-hot encode the POS tags; the encoder is kept so tags can be decoded later
tag_column = df[["tag"]]
tag_enc = OneHotEncoder(sparse_output=False)
tag_enc.fit(tag_column)
tag_onehot = tag_enc.transform(tag_column)
df["tag_e"] = tag_onehot.tolist()

# number of distinct tags == width of the one-hot tag vector
tag_categories = tag_enc.categories_[0]
pos_tag_size = len(tag_categories)
print("### Number of POS tags:", pos_tag_size)

### Number of POS tags: 581


## Train Test Split

In [115]:
# Stack the per-row encodings into dense arrays
x_word = np.array(df["word_e"].tolist())
x_tag = np.array(df["tag_e"].tolist())
y = np.array(df["lemm_e"].tolist())

# First hold out 10% of the data as the test set ...
(
    word_train, word_test,
    tag_train, tag_test,
    y_train, y_test,
) = train_test_split(x_word, x_tag, y, test_size=0.1, random_state=42)

# ... then take 1% of the remaining training data for validation
(
    word_train, word_val,
    tag_train, tag_val,
    y_train, y_val,
) = train_test_split(word_train, tag_train, y_train, test_size=0.01, random_state=42)

print("### Word train shape:", word_train.shape)
print("### Tag train shape:", tag_train.shape)
print("### Y train shape:", y_train.shape)

print("\n### Word val shape:", word_val.shape)
print("### Tag val shape:", tag_val.shape)
print("### Y val shape:", y_val.shape)

print("\n### Word test shape:", word_test.shape)
print("### Tag test shape:", tag_test.shape)
print("### Y test shape:", y_test.shape)

### Word train shape: (360596, 35, 82)
### Tag train shape: (360596, 581)
### Y train shape: (360596, 35, 82)

### Word val shape: (3643, 35, 82)
### Tag val shape: (3643, 581)
### Y val shape: (3643, 35, 82)

### Word test shape: (40472, 35, 82)
### Tag test shape: (40472, 581)
### Y test shape: (40472, 35, 82)


# Model

In [116]:
# Character-level lemmatizer: a stacked bidirectional LSTM reads the one-hot
# encoded word, the POS tag is projected and repeated along the character axis,
# and a per-position softmax predicts the lemma character.

# the length of the vocab for one-hot encoded char
vocab_size = len(characters)
pos_size = pos_tag_size  # the length of the vocab for one-hot encoded pos

# NOTE: the three constants below are not referenced by any layer in this
# cell; the layer widths are written out literally further down.
d_model = 64
nhead = 8
num_layers = 2

print("### Vocab size:", vocab_size)
print("### POS tag size:", pos_size)

# Inputs
word_input = tf.keras.layers.Input(name="word_input", shape=(max_word_length, vocab_size))
# `shape` must be a tuple: `(pos_size)` is just an int, `(pos_size,)` is the
# one-element tuple Keras documents for a flat feature vector.
tag_input = tf.keras.layers.Input(name="tag_input", shape=(pos_size,))

# Two stacked bidirectional LSTM layers; return_sequences=True keeps one
# output vector per character position
lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(word_input)
lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(lstm)

# Fully connected layer for tag_input, repeated once per character position so
# it can be concatenated with the LSTM sequence
tag_fc = tf.keras.layers.Dense(64, name="tag_dense", activation='relu')(tag_input)
tag_fc = tf.keras.layers.RepeatVector(max_word_length)(tag_fc)

# Concatenate the two inputs
concat = tf.keras.layers.Concatenate()([lstm, tag_fc])

# Fully connected layer
fc = tf.keras.layers.Dense(64, name="lstm2dense", activation='relu')(concat)

# Output layer: a distribution over the character vocabulary at every position
output = tf.keras.layers.Dense(vocab_size, name="output", activation='softmax')(fc)

# Create model
model = tf.keras.models.Model(inputs=[word_input, tag_input], outputs=output)

# Compile model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

### Vocab size: 82
### POS tag size: 581
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 word_input (InputLayer)        [(None, 35, 82)]     0           []                               
                                                                                                  
 tag_input (InputLayer)         [(None, 581)]        0           []                               
                                                                                                  
 bidirectional_4 (Bidirectional  (None, 35, 128)     75264       ['word_input[0][0]']             
 )                                                                                                
                                                                                                  
 tag_dense (Dense)              (None, 64)         

In [117]:
# (Re)compile with the same settings as the model cell, then train for 10 epochs
model.compile(
    optimizer='adam',
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)
model.fit(
    [word_train, tag_train],
    y_train,
    epochs=10,
    batch_size=128,
    validation_data=([word_val, tag_val], y_val),
)

Epoch 1/10


2023-01-12 14:51:00.631494: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-12 14:51:01.016649: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-12 14:51:01.046477: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-12 14:51:01.181506: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-12 14:51:01.198602: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-12 14:51:01.459359: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-12 14:51:01.490024: I tensorflow/core/grappler/optimizers/cust



2023-01-12 14:54:12.252204: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-12 14:54:12.399242: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-12 14:54:12.411948: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-12 14:54:12.559077: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-12 14:54:12.571472: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10

KeyboardInterrupt: 

# Evaluation

In [118]:
# Score the model on the held-out test split.
# NOTE(review): this is the Keras 'accuracy' metric on the per-position softmax
# output, presumably a per-character score -- it is not the whole-lemma
# accuracy computed in the next section.
test_metrics = model.evaluate([word_test, tag_test], y_test, verbose=0)
loss, acc = test_metrics
print('Test Accuracy: %f' % (acc*100))


Test Accuracy: 99.661851


## Lemmatization Accuracy

Lemmatisation accuracy is defined as the number of correct lemma assignments divided by the total number of tokens in the test set belonging to the considered lexical classes (ADJ_, ADV, NN, V_).

(Evalita2011)

In [120]:
# Run the model over the whole test split
predictions = model.predict([word_test, tag_test], verbose=0)

# Turn every predicted matrix back into a (padded) lemma
predicted_lemmas = list(map(decode_word, predictions))

# Count exact matches between the predicted and the gold lemma.
# NOTE: every test token is scored here; no filtering by lexical class
# is applied in this cell.
correct = 0
for predicted, target in zip(predicted_lemmas, y_test):
    if predicted == decode_word(target):
        correct += 1

# Fraction of test tokens whose lemma was reproduced exactly
accuracy = correct / len(y_test)
print("Lemmatization accuracy:", accuracy)


Lemmatization accuracy: 0.9300751136588259


## Demo

In [None]:
# Pick one random test sample and compare the model output with the gold lemma.
# NOTE: `y` is rebound here to a single sample; it previously held the full
# target array built in the split cell.
n = np.random.randint(0, word_test.shape[0])
word, tag, y = word_test[n], tag_test[n], y_test[n]

print("### Word:", decode_word(word))
print("### Tag:", tag_enc.inverse_transform([tag])[0])
print("### Lemma:", decode_word(y))

# wrap the single sample into a batch of one for each model input
single_batch = [np.array([word]), np.array([tag])]
prediction = model.predict(single_batch, verbose=0)
print("### Lemma prediction:", decode_word(prediction[0]))

### Word: ['downing                            ']
### Tag: ['nn_p']
### Lemma: ['downing                            ']
### Lemma prediction: ['downing                            ']
