# Lemmatization



# Dataset

In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np

dataset_path = "./out.csv"

# Load the tab-separated dataset with three columns: surface word, tag, lemma.
# dtype=str keeps every token verbatim as text, while missing fields are still
# read as NaN so the missing-value checks below can actually see them.
df = pd.read_csv(
    dataset_path,
    sep="\t",
    header=None,
    names=["word", "tag", "lemm"],
    dtype=str,
)

# remove head (the first row is read as data because header=None)
df = df.iloc[1:]

# remove tag P_OTH
df = df[df["tag"] != "P_OTH"]

print("### DF shape:", df.shape)
print("\n### NaN values:")
print(df.isnull().sum())

# removing rows where tag is nan
# This has to run BEFORE the string cast below: casting first turns NaN into
# the text "nan", after which isnull()/dropna() no longer detect anything.
df = df.dropna(subset=["tag"])
print("\n### DF shape after removing rows where tag is nan:", df.shape)

# Cast all three columns to plain strings for the character-level processing.
# NOTE(review): a missing word or lemm may become the literal text "nan" in
# this cast - confirm that is acceptable, or drop those rows as well.
df = df.astype({"word": str, "tag": str, "lemm": str})

# print number of unique values for each column
print("\n### Unique values:")
print(df.nunique())

# lower case all words
df["word"] = df["word"].str.lower()


### DF shape: (133756, 3)

### NaN values:
word    0
tag     0
lemm    0
dtype: int64

### DF shape after removing rows where tag is nan: (133756, 3)

### Unique values:
word    18506
tag        32
lemm    12202
dtype: int64


# Data Processing

In [10]:
# Collect every distinct character used in the dataset.
# Both text columns are scanned: the surface forms in "word" are encoded with
# char2int as well, so a vocabulary built from "lemm" alone would raise a
# KeyError for any character that only ever appears in a word.
characters = set()
for column in ("word", "lemm"):
    for word in df[column]:
        characters.update(word)

# add the padding character (a single space) to the vocabulary
characters.add(" ")


print("\n### Number of unique characters:", len(characters))

# create a dictionary that maps characters to integers
# (sorted so the numbering is stable from run to run)
char2int = {c: i for i, c in enumerate(sorted(characters))}

# create a dictionary that maps integers to characters (exact inverse of char2int)
int2char = {i: c for c, i in char2int.items()}


# Longest lemma, in characters.
# NOTE(review): this is measured on "lemm" but is later used to pad "word";
# a word longer than max_word_length + 1 would be left unpadded - confirm.
max_word_length = df["lemm"].str.len().max()
print("\n### Max word length:", max_word_length)


# create a function that will pad a word
def pad_word(word, max_word_length):
    """Right-pad ``word`` with spaces to ``max_word_length + 1`` characters.

    A word that is already that long, or longer, is returned unchanged.
    """
    target_width = max_word_length + 1
    return word.ljust(target_width)

# Pad every surface form; max_word_length is forwarded as pad_word's second argument.
df["word"] = df["word"].apply(pad_word, args=(max_word_length,))




### Number of unique characters: 60

### Max word length: 25


# Embedding

In [14]:
import numpy as np
from keras.layers import Embedding

# Create an embedding layer: one dense vector of size embedding_dim per
# character index in the vocabulary.
vocab_size = len(characters)
embedding_dim = 8
# input_length is intentionally not passed: the layer is called below with
# whole padded words (many indices at once), so a fixed length of 1 was
# misleading, and the argument is deprecated in Keras 3.
embedding_layer = Embedding(vocab_size, embedding_dim)

# Convert characters to dense vectors using the embedding layer
# (small demonstration on a three-letter word, one character per call)
word = "abc"

dense_vectors = []
for char in word:
    char_index = char2int[char]
    dense_vector = embedding_layer(np.array([char_index]))
    dense_vectors.append(dense_vector)
    print("Character: {}, Index: {}, Vector: {}".format(char, char_index, dense_vector))

# Concatenate dense vectors into a single tensor (one row per character)
dense_tensor = np.concatenate(dense_vectors, axis=0)

# apply the embedding layer to all the words in the dataset:
# each word is turned into its list of character indices and embedded in one call
df["word_e"] = df["word"].apply(lambda x: embedding_layer(np.array([char2int[char] for char in x])))

# NOTE: not the last expression of the cell, so this preview is not displayed
df.head()

# check if the size of word_e is always the same
df["word_e"].apply(lambda x: x.shape).unique()



Character: a, 
Index: 34, 
Vector: [[-0.00634288 -0.0281008   0.02099048  0.00537093 -0.04169939  0.04311992
   0.03632576  0.03021384]]

Character: b, 
Index: 35, 
Vector: [[ 0.00803609 -0.00380163 -0.03261193 -0.02743235  0.02289846 -0.02173448
   0.02918745 -0.04527965]]

Character: c, 
Index: 36, 
Vector: [[-0.04879359  0.04591043 -0.03960212 -0.04415265 -0.0101123  -0.00556413
  -0.02741838  0.01529712]]


array([TensorShape([26, 8])], dtype=object)