# Italian Word Lemmatizer

### Import


In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import datetime
import time 
import os

## Dataset

In [4]:
def load_dataset(path):
    """Read a tab-separated token file into a DataFrame of strings.

    The file is read without header detection into the columns
    ``word``, ``tag`` and ``lemm``; the first row is then discarded (it is the
    file's head line) and the ``word`` column is lower-cased.

    ``keep_default_na=False`` stops pandas from converting tokens such as
    "NA", "null" or "nan" into missing values, which the later string cast
    would otherwise turn into the literal text "nan".

    Parameters
    ----------
    path : str
        Path of the tab-separated file to read.

    Returns
    -------
    pandas.DataFrame
        One row per token, all three columns holding ``str`` values.
    """
    df = pd.read_csv(path, sep="\t", header=None,
                     names=["word", "tag", "lemm"],
                     keep_default_na=False)

    # remove head, then force every cell to be a plain string
    df = df.iloc[1:].astype(str)

    # lower case all words
    df["word"] = df["word"].str.lower()
    return df


dataset_path = "./dev.csv"
df_dev = load_dataset(dataset_path)

dataset_path = "./test.csv"
df_test = load_dataset(dataset_path)


def get_sentences(df, terminators=(".", "?", "!", ";")):
    """Split a token-per-row DataFrame into sentences.

    Rows are read in order; a sentence is closed as soon as a token in
    ``terminators`` is seen (the terminator belongs to the sentence it closes).
    Tokens that follow the last terminator are kept as a final sentence
    instead of being silently discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``word``, ``tag`` and ``lemm``.
    terminators : collection of str, optional
        Tokens that end a sentence.

    Returns
    -------
    tuple of (list, list, list)
        ``words``, ``tags`` and ``lemmas``: three parallel lists holding, for
        each sentence, the list of its words, tags and lemmas.

    Side effects
    ------------
    Prints the length of the longest sentence.
    """
    sentences = []
    current = []
    for word, tag, lemm in zip(df["word"], df["tag"], df["lemm"]):
        current.append((word, tag, lemm))
        if word in terminators:
            sentences.append(current)
            current = []

    # keep a trailing fragment that has no closing punctuation
    if current:
        sentences.append(current)

    words = [[item[0] for item in sentence] for sentence in sentences]
    tags = [[item[1] for item in sentence] for sentence in sentences]
    lemmas = [[item[2] for item in sentence] for sentence in sentences]

    max_s = max((len(sentence) for sentence in sentences), default=0)
    print("Max sentence length: ", max_s)
    return words, tags, lemmas

# the "_s" suffix marks lists of raw strings, grouped sentence by sentence
dev_words_s, dev_tags_s, dev_lemmas_s = get_sentences(df_dev)
test_words_s, test_tags_s, test_lemmas_s = get_sentences(df_test)
print("Number of sentences in dev set: ", len(dev_words_s))
print("Number of sentences in test set: ", len(test_words_s))

# print number of unique tags
print("Number of unique tags: ", len(df_dev["tag"].unique()))

# sanity check: every dev sentence must have as many tags and lemmas as words;
# report the first sentence that violates this and stop
for i, (sent_words, sent_tags, sent_lemmas) in enumerate(
        zip(dev_words_s, dev_tags_s, dev_lemmas_s)):
    n_words = len(sent_words)
    if n_words == len(sent_tags) and n_words == len(sent_lemmas):
        continue
    print("Dimension mismatch in sentence: ", i)
    print("Words: ", sent_words)
    print("Tags: ", sent_tags)
    print("Lemmas: ", sent_lemmas)
    break

Max sentence length:  95
Max sentence length:  107
Number of sentences in dev set:  703
Number of sentences in test set:  5596
Number of unique tags:  32


## Word Context

In [5]:
# Code reused from it_lemmatizer.ipynb

CTX_DIM = 12            # number of context words kept on each side of the target
PRE_VALUE = "<PRE>"     # padding token placed before the sentence start
POST_VALUE = "<POST>"   # padding token placed after the sentence end
NONE_TAG = "<NONE>"     # tag assigned to padding positions


def get_context(words, tags, lemmas):
    """Build one fixed-width context window per token.

    Every sentence is padded with ``CTX_DIM`` padding entries on both sides,
    so the window around each token always spans ``2 * CTX_DIM + 1`` positions.

    Parameters
    ----------
    words, tags, lemmas : list of list of str
        Parallel per-sentence lists of words, PoS tags and lemmas.

    Returns
    -------
    tuple of five lists, one entry per token:
        ``ctx``   - the window of words joined by single spaces,
        ``w``     - the token itself,
        ``tag``   - the window of tags (a list),
        ``t``     - the tag of the token,
        ``lemma`` - the lemma of the token.
    """
    ctx, w, tag, t, lemma = [], [], [], [], []
    window = 2 * CTX_DIM + 1

    for s_index, sentence in enumerate(words):
        sentence_tags = tags[s_index]
        padded_words = [PRE_VALUE] * CTX_DIM + sentence + [POST_VALUE] * CTX_DIM
        padded_tags = [NONE_TAG] * CTX_DIM + sentence_tags + [NONE_TAG] * CTX_DIM

        # position `pos` in the sentence sits at `pos + CTX_DIM` in the padded
        # lists, so its window starts exactly at padded index `pos`
        for pos, token in enumerate(sentence):
            ctx.append(" ".join(padded_words[pos:pos + window]))
            w.append(token)

            tag.append(padded_tags[pos:pos + window])
            t.append(sentence_tags[pos])

            lemma.append(lemmas[s_index][pos])

    return ctx, w, tag, t, lemma

# One entry per token: context string, word, context tags, word tag, lemma
(dev_ctx, dev_words, dev_tags, dev_tag, dev_lemmas) = get_context(
    dev_words_s, dev_tags_s, dev_lemmas_s
)
(test_ctx, test_words, test_tags, test_tag, test_lemmas) = get_context(
    test_words_s, test_tags_s, test_lemmas_s
)

print("Number of sample in dev set: ", len(dev_lemmas))
print("Number of sample in test set: ", len(test_lemmas))

Number of sample in dev set:  17313
Number of sample in test set:  133756


In [6]:
# Removing padding tokens and NONE_TAG:
# the padding is not useful for GPT-3, which accepts variable-length prompts

def strip_padding(ctx, tags):
    """Remove padding from context strings and context tag lists.

    Parameters
    ----------
    ctx : list of str
        Context strings that may contain PRE_VALUE / POST_VALUE tokens.
    tags : list of list of str
        Context tag lists that may contain NONE_TAG entries.

    Returns
    -------
    tuple of (list of str, list of list of str)
        New lists: contexts without padding tokens and with whitespace runs
        collapsed to single spaces, and tag lists without NONE_TAG.
    """
    clean_ctx = [
        # split()/join collapses the multiple spaces left by the removals
        " ".join(c.replace(PRE_VALUE, "").replace(POST_VALUE, "").split())
        for c in ctx
    ]
    clean_tags = [[x for x in t if x != NONE_TAG] for t in tags]
    return clean_ctx, clean_tags


test_ctx, test_tags = strip_padding(test_ctx, test_tags)
dev_ctx, dev_tags = strip_padding(dev_ctx, dev_tags)

print("Number of sample in dev set: ", len(dev_lemmas))
print("Number of sample in test set: ", len(test_lemmas))


Number of sample in dev set:  17313
Number of sample in test set:  133756


### Example of context

In [7]:
print("CTX Dim:", CTX_DIM, "\n")

# (label, per-token list) pairs, printed in this order for each sampled token
sample_fields = (
    ("CTX: ", dev_ctx),
    ("CTX Tags: ", dev_tags),
    ("Word: ", dev_words),
    ("Tag: ", dev_tag),
    ("Lemma: ", dev_lemmas),
)

# show three randomly chosen dev samples
for i in range(3):
    index = np.random.randint(0, len(dev_ctx))
    for label, values in sample_fields:
        print(label, values[index])
    print()

CTX Dim: 12 

CTX:  dei rinogradi era comunque l' abnorme sviluppo del nasario , una struttura che pu&ograve; essere grossolanamente assimilata al naso ma che nella realt&agrave; aveva un'
CTX Tags:  ['prep_a', 'nn_p', 'v_essere', 'adv', 'art', 'adj', 'nn', 'prep_a', 'nn', 'p_oth', 'art', 'nn', 'pron_rel', 'v_mod', 'v_essere', 'adv', 'v_pp', 'prep_a', 'nn', 'conj_c', 'pron_rel', 'prep_a', 'nn', 'v_avere', 'art']
Word:  che
Tag:  pron_rel
Lemma:  che

CTX:  regolamenti locali , il vicino pu&ograve; chiedere la comunione del muro soltanto allo_scopo_di fabbricare contro il muro stesso , pagando , oltre il valore della
CTX Tags:  ['nn', 'adj', 'p_oth', 'art', 'nn', 'v_mod', 'v_gvrb', 'art', 'nn', 'prep_a', 'nn', 'adv', 'conj_s', 'v_gvrb', 'prep', 'art', 'nn', 'adj_dim', 'p_oth', 'v_gvrb', 'p_oth', 'prep', 'art', 'nn', 'prep_a']
Word:  allo_scopo_di
Tag:  conj_s
Lemma:  allo_scopo_di

CTX:  " cosa sono le banane , padre ?
CTX Tags:  ['p_oth', 'pron_ies', 'v_essere', 'art', 'nn', 'p_oth', '

## Open Class Words
The evaluation is performed only on open-class words, not on function words: only tokens whose PoS tag belongs to the set ADJ\*, ADV, NN, V\* have to be lemmatised. In all other cases the token can be copied unchanged into the lemma column, since those tokens are not considered in the evaluation (the asterisk stands for every PoS tag beginning with that prefix).

In [8]:
def get_open_class_words(ctx, words, tags, tag, lemmas):
    """Keep only the samples whose own PoS tag is an open-class tag.

    Parameters
    ----------
    ctx, words, tags, tag, lemmas : list
        Parallel per-token lists (context, word, context tags, word tag,
        lemma). ``tag[i]`` decides whether sample ``i`` is kept.

    Returns
    -------
    tuple of five lists
        The filtered ``ctx``, ``words``, ``tags``, ``tag`` and ``lemmas``,
        in that order, preserving the original sample order.
    """
    open_classes = ("nn", "v_gvrb", "v_essere", "v_avere", "v_pp", "v_mod", "v_clit", "adv", "adj_ind", "adj_num", "adj", "adj_pos", "adj_dim", "adj_ies")

    # positions of the samples to keep
    kept = [i for i in range(len(words)) if tag[i] in open_classes]

    open_class_ctx = [ctx[i] for i in kept]
    open_class_words = [words[i] for i in kept]
    open_class_tags = [tags[i] for i in kept]
    open_class_tag = [tag[i] for i in kept]
    open_class_lemmas = [lemmas[i] for i in kept]

    return open_class_ctx, open_class_words, open_class_tags, open_class_tag, open_class_lemmas


# Only the test set is restricted to open-class words
(test_ctx, test_words, test_tags, test_tag, test_lemmas) = get_open_class_words(
    test_ctx, test_words, test_tags, test_tag, test_lemmas
)

print("Number of open class words in test set: ", len(test_words))

Number of open class words in test set:  65210


# Prompt creation

In [9]:
import openai 

def create_prompt_data(ctx, tags, word, lemmas):
    """Turn one dataset sample into a (prompt, completion) pair for GPT-3.

    Parameters
    ----------
    ctx : str
        Context string surrounding the word.
    tags : iterable of str
        PoS tags of the context, joined with single spaces in the prompt.
    word : str
        The word to lemmatise.
    lemmas : str
        The expected lemma of ``word``.

    Returns
    -------
    tuple of (str, str)
        The prompt (three newline-terminated lines: Context, Tags, Word) and
        the completion (the lemma with a leading space and trailing newline).
    """
    prompt_lines = (
        "Context: " + ctx,
        "Tags: " + " ".join(tags),
        "Word: " + word,
    )
    prompt = "".join(line + "\n" for line in prompt_lines)
    completion = " " + lemmas + "\n"
    return prompt, completion

# Prompt / completion lists for the training (dev) and test sets
train_prompt, train_lemma = [], []

test_prompt, test_lemma = [], []

In [10]:
# Build the prompt/completion lists from scratch so that re-running this cell
# never appends the same samples a second time.
train_pairs = [
    create_prompt_data(dev_ctx[i], dev_tags[i], dev_words[i], dev_lemmas[i])
    for i in range(len(dev_words))
]
train_prompt = [pair[0] for pair in train_pairs]
train_lemma = [pair[1] for pair in train_pairs]

print("Number of training samples: ", len(train_prompt))

test_pairs = [
    create_prompt_data(test_ctx[i], test_tags[i], test_words[i], test_lemmas[i])
    for i in range(len(test_words))
]
test_prompt = [pair[0] for pair in test_pairs]
test_lemma = [pair[1] for pair in test_pairs]

print("Number of test samples: ", len(test_prompt))

Number of training samples:  17313
Number of test samples:  65210


In [11]:
# Export the prompt/completion pairs as JSON Lines (one record per line)
train_df = pd.DataFrame({'prompt': train_prompt, 'completion': train_lemma})
train_df.to_json("dev_prompt.jsonl", orient='records', lines=True)

test_df = pd.DataFrame({'prompt': test_prompt, 'completion': test_lemma})
test_df.to_json("test_prompt.jsonl", orient='records', lines=True)

In [12]:
# Preview the first training prompt/completion pairs
train_df.head()

Unnamed: 0,prompt,completion
0,Context: mi riferisco al lavoro dove non c' &e...,mi\n
1,Context: mi riferisco al lavoro dove non c' &e...,riferire\n
2,Context: mi riferisco al lavoro dove non c' &e...,al\n
3,Context: mi riferisco al lavoro dove non c' &e...,lavoro\n
4,Context: mi riferisco al lavoro dove non c' &e...,dove\n


In [13]:
# Dataset preparation with openai tools.
# Runs the OpenAI CLI data-preparation tool on the exported dev prompts.
# NOTE(review): -q presumably accepts the tool's suggestions without asking;
# the fine-tune below uses files named dev_prompt_prepared_train.jsonl and
# dev_prompt_prepared_valid.jsonl, which look like this tool's output - confirm.
!openai tools fine_tunes.prepare_data -f dev_prompt.jsonl -q

Analyzing...

- Your file contains 17313 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- There are 77 duplicated prompt-completion sets. These are rows: [12, 222, 580, 594, 956, 1216, 1793, 1909, 2222, 2471, 2525, 2675, 3032, 3050, 3073, 3373, 3439, 3517, 3558, 4116, 4121, 4436, 4619, 4626, 4630, 5146, 5325, 6487, 6532, 6776, 6804, 6808, 6843, 6907, 6960, 6964, 6965, 7108, 7171, 7835, 7836, 8262, 8458, 8504, 9102, 9765, 9817, 9933, 10270, 10866, 11948, 12059, 12214, 12224, 12238, 12598, 12600, 12602, 12711, 12746, 12747, 13521, 13722, 13827, 13906, 14561, 15200, 15201, 15202, 15203, 16143, 16954, 16960, 17082, 17083, 17309, 17311]
- All prompts end with suffix `\n`
` more than once. We strongly su

In [14]:
# API key for openai.
# The key is read from a local JSON file (expected to hold an "api_key" entry)
# so it never appears in the notebook itself; keep that file out of version control.
import json
with open('openai_api_key.json') as f:
    data = json.load(f)
    api_key = data['api_key']

# Make the key available to the Python client ...
openai.api_key = api_key
# ... and, through the environment, to the `!openai` shell commands
os.environ["OPENAI_API_KEY"] = api_key

In [305]:
# Fine-tuning of the ada GPT-3 model.
# -t / -v are the IDs of the already uploaded training and validation files;
# -m names a previously fine-tuned ada model, so this run continues training
# from it for 6 more epochs.
!openai api fine_tunes.create -t file-WFxP8GC9Fl3jHmfYxpYt4FxB -v file-dsbAKSdqDFncrnqrSGqMR7gO -m "ada:ft-personal-2023-06-16-15-25-19" --n_epochs 6

Created fine-tune: ft-TJyXJ9Vs1vWWG7QkDDtod6Rx
Streaming events until fine-tuning is complete...

(Ctrl-C will interrupt the stream, but not cancel the fine-tune)
[2023-06-16 22:01:56] Created fine-tune: ft-TJyXJ9Vs1vWWG7QkDDtod6Rx



In [323]:
# Status of fine tuning: fetch the job created above by its ID and display
# its description (hyperparameters, training/validation files, ...)
openai.FineTune.retrieve("ft-TJyXJ9Vs1vWWG7QkDDtod6Rx")

<FineTune fine-tune id=ft-TJyXJ9Vs1vWWG7QkDDtod6Rx at 0x2b240fc50> JSON: {
  "object": "fine-tune",
  "id": "ft-TJyXJ9Vs1vWWG7QkDDtod6Rx",
  "hyperparams": {
    "n_epochs": 6,
    "batch_size": 32,
    "prompt_loss_weight": 0.01,
    "learning_rate_multiplier": 0.1
  },
  "organization_id": "org-Vo2aGTMZQOXwFWCVzZKQZ9HU",
  "model": "ada:ft-personal-2023-06-16-15-25-19",
  "training_files": [
    {
      "object": "file",
      "id": "file-WFxP8GC9Fl3jHmfYxpYt4FxB",
      "purpose": "fine-tune",
      "filename": "dev_prompt_prepared_train.jsonl",
      "bytes": 4544763,
      "created_at": 1686926701,
      "status": "processed",
      "status_details": null
    }
  ],
  "validation_files": [
    {
      "object": "file",
      "id": "file-dsbAKSdqDFncrnqrSGqMR7gO",
      "purpose": "fine-tune",
      "filename": "dev_prompt_prepared_valid.jsonl",
      "bytes": 279695,
      "created_at": 1686926704,
      "status": "processed",
      "status_details": null
    }
  ],
  "result_file

In [15]:
# Model ID of the fine-tuned model queried in the evaluation below.
# NOTE(review): presumably the model produced by the fine-tune job above - confirm.
model_id = "ada:ft-personal-2023-06-16-20-34-34"

# Evaluation

API Limits:
* Pay-as-you-go users (first 48 hours) - 60 RPM 
* Pay-as-you-go users (after 48 hours) - 3,500 RPM 
* ada 250,000 tokens per minute


In [16]:
# Plain Python list of the test prompts, in test_df row order
prompts = list(test_df['prompt'])
print("Number of prompts: ", len(prompts))

Number of prompts:  65210


In [326]:
predictions = []
batch_size = 20 # Max number of prompts per request
# Ceiling division: a final, partially filled batch counts as a batch too
total_batches = (len(prompts) + batch_size - 1) // batch_size

# NOTE(review): no `stop` sequence or `temperature` is passed, so each
# completion may contain several sampled lines; the post-processing step
# depends on that. Changing it would alter the reported results.
for i in range(0, len(prompts), batch_size):
    # sleep for 1 second to avoid rate limit
    time.sleep(1)

    # Get batch of prompts
    batch = prompts[i:i+batch_size]

    # Get predictions
    response = openai.Completion.create(
        model=model_id,
        prompt=batch,
        max_tokens=20,
    )

    # With several prompts in one request the choices are not guaranteed to
    # come back in prompt order: each choice's `index` identifies its prompt.
    results = sorted(response.choices, key=lambda choice: choice.index)
    if len(results) != len(batch):
        # A missing completion would silently shift every later prediction
        raise RuntimeError(
            "Expected %d completions, got %d (batch starting at prompt %d)"
            % (len(batch), len(results), i)
        )

    # Store predictions
    predictions.extend(choice.text for choice in results)

    print("Completed batch ", i//batch_size + 1, "/", total_batches, end="\r")

Completed batch  3261 / 3260

In [337]:
from collections import Counter


def most_frequent_lemma(prediction):
    """Pick the lemma out of one raw model completion.

    A single completion can contain several candidate lemmas, one per line.
    All spaces are removed, the text is split on newlines, empty lines are
    ignored and the most frequent candidate is returned (the earliest one
    wins a tie).

    Parameters
    ----------
    prediction : str
        Raw completion text returned by the model.

    Returns
    -------
    str
        The chosen lemma, or "" when the completion has no non-empty line.
    """
    candidates = [c for c in prediction.replace(" ", "").split("\n") if c]
    if not candidates:
        return ""
    freq = Counter(candidates)
    return max(freq, key=freq.get)


# A single prediction contains multiple possible lemmas
# for each prompt. We take the most frequent lemma as the
# predicted lemma
predicted_lemmas = [most_frequent_lemma(p) for p in predictions]

In [24]:
# Save predictions to file, one lemma per line
with open("predictions.txt", "w") as f:
    f.writelines(lemma_text + "\n" for lemma_text in predicted_lemmas)

In [17]:
# Load predictions from file; every entry keeps its trailing newline,
# which is stripped when the predictions are compared during evaluation
with open("predictions.txt", "r") as f:
    predicted_lemmas = list(f)

In [23]:
# Evaluate predictions on test set

# Predictions, tags and expected lemmas are matched by position, so they must
# all describe the same samples; fail loudly rather than score shifted data.
if not (len(predicted_lemmas) == len(test_tag) == len(test_df)):
    raise ValueError(
        "Misaligned evaluation data: %d predictions, %d tags, %d test samples"
        % (len(predicted_lemmas), len(test_tag), len(test_df))
    )

tot_error = 0           # number of wrong lemmas
error_per_tag = {}      # coarse tag -> number of wrong lemmas
tag_count = {}          # coarse tag -> number of samples
correct_lemmas = 0      # number of exact matches

# enumerate() pairs by position and does not depend on test_df's index labels
for index, expected in enumerate(test_df['completion'].tolist()):
    completion = predicted_lemmas[index].strip()
    expected_completion = expected.strip()

    # coarse PoS class: the part of the tag before the first underscore
    tag = test_tag[index].split("_")[0]
    tag_count[tag] = tag_count.get(tag, 0) + 1

    if completion != expected_completion:
        tot_error += 1
        error_per_tag[tag] = error_per_tag.get(tag, 0) + 1
    else:
        correct_lemmas += 1

# avoid dividing by zero when there is nothing to evaluate
accuracy = correct_lemmas / len(test_df) if len(test_df) else float("nan")
print("Accuracy: ", accuracy)


Accuracy:  0.9753565404079129


In [21]:
# Share of all errors that falls on each coarse PoS tag
print("Absolute error with respect to the PoS tag: ")
for tag, n_errors in error_per_tag.items():
    print(tag, round(n_errors / tot_error, 3))

# Error rate within each coarse PoS tag (errors / samples carrying that tag)
print("\nRelative error with respect to the PoS tag: ")
for tag, n_errors in error_per_tag.items():
    print(tag, round(n_errors / tag_count[tag], 3))

Absolute error with respect to the PoS tag: 
v 0.313
adj 0.262
nn 0.398
adv 0.027

Relative error with respect to the PoS tag: 
v 0.029
adj 0.037
nn 0.026
adv 0.007
