# Lemmatization

# Dataset

In [4]:
import pandas as pd
import tensorflow as tf
import numpy as np

dataset_path = "./out.csv"
df = pd.read_csv(dataset_path, sep="\t", header=None, names=["word", "tag", "lemm"])

df["word"] = df["word"].astype(str) 
df["tag"] = df["tag"].astype(str)
df["lemm"] = df["lemm"].astype(str)

# remove duplicates in word columns
df = df.drop_duplicates(subset=["word"])

# remove head
df = df.iloc[1:]

# remove tag P_OTH
df = df[df["tag"] != "P_OTH"]

print("### DF shape:" ,df.shape)
print("\n### NaN values:")
print(df.isnull().sum())

# removing rows where tag is nan
df = df.dropna(subset=["tag"])
print("\n### DF shape after removing rows where tag is nan:" ,df.shape)

# print number of unique values for each column
print("\n### Unique values:")
print(df.nunique())

# lower case all words
df["word"] = df["word"].str.lower()




### DF shape: (18506, 3)

### NaN values:
word    0
tag     0
lemm    0
dtype: int64

### DF shape after removing rows where tag is nan: (18506, 3)

### Unique values:
word    18506
tag        32
lemm    12001
dtype: int64


# Data Processing

# Keras Embedding

In [5]:
# get all unique letter in words
characters = set()
for word in df["lemm"]:
    for letter in word:
        characters.add(letter)

# add padding and unknown to characters
characters.add(" ")


print("\n### Number of unique characters:", len(characters))

# create a dictionary that maps characters to integers
char2int = {c: i for i, c in enumerate(sorted(characters))}

# create a dictionary that maps integers to characters
int2char = {i: c for i, c in enumerate(sorted(characters))}


max_word_length = df["lemm"].str.len().max()
print("\n### Max word length:", max_word_length)


# create a function that will pad a word
def pad_word(word, max_word_length):
    return word + " " * (max_word_length - len(word))


df["word"] = df["word"].apply(lambda x: pad_word(x, max_word_length))
df["lemm"] = df["lemm"].apply(lambda x: pad_word(x, max_word_length))



### Number of unique characters: 60

### Max word length: 25


In [6]:
import numpy as np
from keras.layers import Embedding

# Create an embedding layer
vocab_size = len(characters)
embedding_dim = 4
embedding_layer = Embedding(vocab_size, embedding_dim, input_length=1)


# Convert characters to dense vectors using the embedding layer
word = "abc"

dense_vectors = []
for char in word:
    char_index = char2int[char]
    dense_vector = embedding_layer(np.array([char_index]))
    dense_vectors.append(dense_vector)
    print("Character: {}, Index: {}, Vector: {}".format(char, char_index, dense_vector))
  
# Concatenate dense vectors into a single tensor
dense_tensor = np.concatenate(dense_vectors, axis=0)

print("Word: {}".format(word))


# apply the embedding layer to the all the words in the dataset
with tf.device('/CPU:0'):
    df["word_e"] = df["word"].apply(lambda x: embedding_layer(np.array([char2int[char] for char in x])))

df.head()


Character: a, Index: 34, Vector: [[ 0.02220435  0.03470185 -0.03728743 -0.01627105]]
Character: b, Index: 35, Vector: [[-0.02766057  0.0440712  -0.04992331  0.03657413]]
Character: c, Index: 36, Vector: [[ 0.04293891  0.04462409 -0.04066285  0.04084182]]
Word: abc


Unnamed: 0,word,tag,lemm,word_e
1,"""",p_oth,"""","((tf.Tensor(0.012610484, shape=(), dtype=float..."
2,craxi,nn_p,craxi,"((tf.Tensor(0.042938914, shape=(), dtype=float..."
3,:,p_oth,:,"((tf.Tensor(0.014609661, shape=(), dtype=float..."
4,non,adv,non,"((tf.Tensor(0.04233743, shape=(), dtype=float3..."
5,ci,adv,ci,"((tf.Tensor(0.042938914, shape=(), dtype=float..."


In [7]:
# one hot encode tags
from sklearn.preprocessing import OneHotEncoder

char_enc = OneHotEncoder(sparse_output=False)
char_enc.fit([[char] for char in characters])

def encode_word(word):
    return char_enc.transform([[char] for char in word])

def decode_word(encoded_word):
    return "".join(char_enc.inverse_transform(encoded_word))

# encode lemmas
df["lemm_e"] = df["lemm"].apply(encode_word)

In [14]:
# one hot encode tags
from sklearn.preprocessing import OneHotEncoder

# create the encoder
encoder = OneHotEncoder(sparse_output=False)

# fit the encoder
encoder.fit(df[["tag"]])

# transform the data
df["tag_e"] = encoder.transform(df[["tag"]]).tolist()

# split the dataset into train and test
from sklearn.model_selection import train_test_split

x = df[["word_e", "tag_e"]]
y = df[["lemm_e"]]

x_train, x_test, y_train, y_test = train_test_split(x, y , test_size=0.2, random_state=42)

print("### Train shape:", x_train.shape)

print("### Test shape:", x_test.shape)


### Train shape: (14804, 2)
### Test shape: (3702, 2)


# Model

In [None]:
# The lemmatizer is a sequence-to-sequence model that uses a recurrent LSTM 
# layer to generate a sequence of characters for each lemma. The input to the
#  LSTM consists of the embedding of the previous output character, 
# a character-level attention mechanism that uses the outputs of a character-level BRNN,
#  and information about the word embedding, surrounding sentence context, and output of a tagger. 
# The lemmatizer uses greedy decoding to predict the character outputs and stops either when 
# it produces an end-of-word character or reaches a character limit. 


from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding, Concatenate, Bidirectional, TimeDistributed, RepeatVector, Activation, Dot, Lambda
from keras import backend as K

# disable gpu in tensorflow



# define the model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    # define training encoder
    encoder_inputs = Input(shape=(src_timesteps, src_vocab))
    encoder = Bidirectional(LSTM(n_units, return_state=True))
    encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder(encoder_inputs)
    state_h = Concatenate()([forward_h, backward_h])
    state_c = Concatenate()([forward_c, backward_c])
    encoder_states = [state_h, state_c]
    # define training decoder
    decoder_inputs = Input(shape=(None, tar_vocab))
    decoder_lstm = LSTM(n_units*2, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    decoder_dense = Dense(tar_vocab, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    # define inference encoder
    encoder_model = Model(encoder_inputs, encoder_states)
    # define inference decoder
    decoder_state_input_h = Input(shape=(n_units*2,))
    decoder_state_input_c = Input(shape=(n_units*2,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
    # return all models
    return model, encoder_model, decoder_model

# define model
model, encoder_model, decoder_model = define_model(60, 60, max_word_length, max_word_length, 256)

# compile model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=["accuracy"])

# fit model
model.fit([X_train, X_train], y_train, batch_size=512 , epochs=10, validation_data=([X_test, X_test], y_test))



In [3]:
# lstm model with masking layer

from keras.models import Sequential
from keras.layers import LSTM, Dense, Masking

padding_index = char2int[" "]
padding = embedding_layer(np.array([padding_index]))

model = Sequential()
model.add(Masking(mask_value=padding, input_shape=(max_word_length , embedding_dim)))
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(256))
model.add(Dense(256, activation="swish"))
model.add(Dense(128, activation="swish"))
model.add(Dense(64, activation="swish"))
model.add(Dense(32, activation="softmax"))

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))
    

KeyError: ' '