# LLM for Definition Modeling

Definition modeling is a task where a language model is requested to generate a definition for a given word.
In this notebook, we want to explore the ability of large language models to generate definitions for unknown words.
Because LLMs are trained on very large corpora, their training data may already contain even obscure words, so existing
words cannot reliably be used to test this ability. To address this issue, we create several made-up words and write example
sentences that use them. The models then have to generate a definition for each word based solely on its context.

In [1]:
import os
import openai
import json


# Default for the Hugging Face token; it keeps this value if reading the keys fails.
HF_TOKEN = None

# The API credentials live in the local "keys.json" file.
with open('keys.json') as key_file:
    keys = json.load(key_file)

# Hand the credentials to the OpenAI client and keep the Hugging Face token
# around for the inference requests.
openai.api_key = keys['openai']
openai.organization = keys['openai-organization']
HF_TOKEN = keys['huggingface']

In [2]:
def build_prompt(before_examples, examples, after_examples):
    """Assemble a prompt from a header, the example sentences and a footer.

    The examples are joined with newlines and placed between
    ``before_examples`` and ``after_examples`` without any extra separator.
    """
    example_block = "\n".join(examples)
    return "{}{}{}".format(before_examples, example_block, after_examples)

def build_openai_completion(prompt, model="text-davinci-003", temperature=0, max_tokens=300, top_p=1, frequency_penalty=0, presence_penalty=0):
    """Run ``prompt`` through an OpenAI completion model.

    All sampling settings are forwarded unchanged to
    ``openai.Completion.create``.

    Returns:
        The ``text`` field of the first choice in the response.
    """
    request_args = {
        "prompt": prompt,
        "model": model,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "top_p": top_p,
        "frequency_penalty": frequency_penalty,
        "presence_penalty": presence_penalty,
    }
    completion = openai.Completion.create(**request_args)
    first_choice = completion["choices"][0]
    return first_choice["text"]

def build_openai_chat(prompt, examples, model="gpt-3.5-turbo"):
    """Ask an OpenAI chat model to respond to the examples.

    ``prompt`` is sent as the system message and the newline-joined
    ``examples`` as the single user message.

    Returns:
        The message content of the first choice in the response.
    """
    system_message = {"role": "system", "content": prompt}
    user_message = {"role": "user", "content": "\n".join(examples)}
    chat = openai.ChatCompletion.create(
        model=model,
        messages=[system_message, user_message],
    )
    first_choice = chat["choices"][0]
    return first_choice["message"]["content"]

In [51]:
# Query HuggingFace models
import requests

def hf_query(payload, model_id):
    """Post ``payload`` to the Hugging Face hosted inference endpoint of a model.

    Args:
        payload: JSON-serializable request body, sent as-is.
        model_id: Model identifier, interpolated into the endpoint URL.

    Returns:
        The decoded JSON body of the response. If the body cannot be decoded
        as JSON, the response is printed and a one-element list with a
        placeholder ``generated_text`` entry is returned instead.
    """
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    api_url = f"https://api-inference.huggingface.co/models/{model_id}"
    response = requests.post(api_url, headers=headers, json=payload)
    try:
        return response.json()
    except ValueError:
        # A body that is not valid JSON makes ``response.json()`` raise a
        # ValueError subclass. Catching only that keeps the best-effort
        # fallback without also swallowing KeyboardInterrupt or unrelated bugs.
        print(f"Error: {response}")
        return [{'generated_text': 'Error generating the definition'}]

In [8]:
def load_training_data():
    """Load the word dataset from ``words.json`` in the working directory.

    Returns:
        The parsed JSON content of the file. The other cells use it as a
        mapping from each word to an entry holding an ``"examples"`` list.
    """
    # JSON text is UTF-8; reading it with the platform default encoding would
    # corrupt non-ASCII characters on systems whose default is not UTF-8.
    with open("words.json", encoding="utf-8") as f:
        return json.load(f)

# Get definitions for all words in the dataset

In [29]:
# Read the made-up words once and report how many there are.
dataset = load_training_data()
print("Total words:", len(dataset))

from tqdm import tqdm

Total words: 20


In [30]:
def get_definitions(getter_func, dataset):
    """Generate one definition per word in ``dataset``.

    ``getter_func`` is called with the ``"examples"`` value of every entry,
    and progress is reported through ``tqdm``.

    Returns:
        A dict mapping each word to whatever ``getter_func`` returned for it.
    """
    definitions_by_word = {}
    for word, entry in tqdm(dataset.items()):
        definitions_by_word[word] = getter_func(entry["examples"])
    return definitions_by_word

def save_definitions(definitions, filename):
    """Write ``definitions`` to ``filename`` as JSON indented by four spaces.

    An existing file with that name is overwritten.
    """
    with open(filename, mode="w") as out_file:
        json.dump(definitions, out_file, indent=4)

In [13]:
# OpenAI completion models

# Text placed before and after the example sentences in the completion prompt.
OPENAI_COMPLETION_BEFORE = (
    "Write a dictionary definition for the word in asterisks, given its context.\n"
    "Examples:\n"
)
OPENAI_COMPLETION_AFTER = "\nDefinition:\n"

def openai_completion_helper(model, examples, **kwargs):
    """Build the completion prompt for ``examples`` and run it on ``model``.

    Extra keyword arguments are forwarded to ``build_openai_completion``.
    """
    return build_openai_completion(
        build_prompt(OPENAI_COMPLETION_BEFORE, examples, OPENAI_COMPLETION_AFTER),
        model=model,
        **kwargs,
    )

def _completion_getter_for(model):
    """Return a function of ``examples`` that queries the given completion model."""
    def getter(examples):
        return openai_completion_helper(model, examples)
    return getter

# One definition getter per OpenAI completion model, keyed by model name.
model_getter_func = {
    name: _completion_getter_for(name)
    for name in (
        "text-davinci-003",
        "text-curie-001",
        "text-babbage-001",
        "text-ada-001",
    )
}

# Generate and store the definitions of every model in its own file.
for model_name, getter_func in model_getter_func.items():
    definitions = get_definitions(getter_func, dataset)
    output_file = f"definitions-{model_name}.json"
    save_definitions(definitions, output_file)


In [14]:
# Instructions for the chat model. openai_chat_helper hands this text to
# build_openai_chat, which sends it as the "system" message.
OPENAI_CHAT_PROMPT = """You are a language researcher at the Real Academia Española. Your job is to write dictionary definitions for words from its context. Many of those words have been recently discovered or created, so are not registered in any dictionary.

Your responses must be formated like a dictionary definition.
The response must be in Spanish.
Itemize each found meaning and cluster similar meanings into a single definition.
Instead of using the given examples, generate a new one of your own.
Specify if the word is a verb, noun, adjective, adverb, etc.
In your definition, do not write the word within asterisks.
never mention facts that cannot be deduced from the examples given."""

def openai_chat_helper(examples, **kwargs):
    """Ask the chat model for a definition, using ``OPENAI_CHAT_PROMPT``.

    Extra keyword arguments are forwarded to ``build_openai_chat``.
    """
    chat_reply = build_openai_chat(OPENAI_CHAT_PROMPT, examples, **kwargs)
    return chat_reply


def _chat_getter(examples):
    """Query the chat helper with its default model."""
    return openai_chat_helper(examples)

# Definition getters for the OpenAI chat models, keyed by model name.
model_getter_func = {"gpt-3.5-turbo": _chat_getter}

# Generate and store the definitions of every model in its own file.
for model_name, getter_func in model_getter_func.items():
    definitions = get_definitions(getter_func, dataset)
    output_file = f"definitions-{model_name}.json"
    save_definitions(definitions, output_file)

In [48]:
# Try the Hugging Face endpoint on the examples of a single word.
hf_query(
    payload={
        "inputs": build_prompt(
            OPENAI_COMPLETION_BEFORE,
            dataset["kiliche"]["examples"],
            OPENAI_COMPLETION_AFTER,
        ),
    },
    model_id="google/flan-t5-xxl",
)

[{'generated_text': 'a small, secluded, or secret place'}]

In [None]:
# Text placed before and after the example sentences in the Hugging Face prompt.
HF_COMPLETION_BEFORE = "Write a dictionary definition for the word in asterisks, given its context.\nExamples:\n"
HF_COMPLETION_AFTER = "\nDefinition:\n"

def hf_completion_helper(model, examples, **kwargs):
    """Build the prompt for ``examples`` and run it on a Hugging Face model.

    Args:
        model: Model identifier passed to ``hf_query``.
        examples: Example sentences inserted into the prompt.
        **kwargs: Extra generation parameters, merged into the request's
            ``"parameters"`` object (they override the default
            ``max_new_tokens``).

    Returns:
        The ``generated_text`` of the first item in the response.
    """
    prompt = build_prompt(HF_COMPLETION_BEFORE, examples, HF_COMPLETION_AFTER)
    # The Inference API reads generation settings from "parameters" and
    # request flags from "options"; sent as top-level keys they are not applied.
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": 250, **kwargs},
        "options": {"wait_for_model": True},
    }
    response = hf_query(payload, model)
    print(response)
    return response[0]['generated_text']

def _hf_getter_for(model_id):
    """Return a function of ``examples`` that queries the given Hugging Face model."""
    def getter(examples):
        return hf_completion_helper(model_id, examples)
    return getter

# One definition getter per Hugging Face model, keyed by the label used in
# the output file name.
model_getter_func = {
    label: _hf_getter_for(model_id)
    for label, model_id in (
        ("google-flan-t5-small", "google/flan-t5-small"),
        ("google-flan-t5-base", "google/flan-t5-base"),
        ("google-flan-t5-xxl", "google/flan-t5-xxl"),
    )
}

# Generate and store the definitions of every model in its own file.
for model_name, getter_func in model_getter_func.items():
    print(model_name)
    definitions = get_definitions(getter_func, dataset)
    output_file = f"definitions-{model_name}.json"
    save_definitions(definitions, output_file)