In [1]:
import os
import dotenv
from groq import Groq

In [4]:
# Load variables from a local .env file into the process environment so that
# the os.getenv("GROQ_API_KEY") lookup in the configuration cell can find the key.
dotenv.load_dotenv()

True

### Generating data using Groq API

In [24]:
# Groq API key, read from the environment (create a .env file that defines
# GROQ_API_KEY). os.getenv returns None when the variable is not set.
GROQ_API = os.getenv("GROQ_API_KEY")

# Prompts used by the cells below.

# Asks the model for a tiny synthetic NER dataset as a Python-dict literal.
# In the worked example every label list has exactly one label per token, and
# neither the template nor the example is wrapped in quotes, so the example
# itself parses as a dict literal.
PROMPT_DATA = """
Generate NER data for mountains.
Generate 3 sentences and 3 labels to them.
The output should look like this and dont contain anything else (1 as a label represents a mountain):
{'tokens':[[token11, token12, token13, token14],
           [token21, token22, token23],
           [token31, token32, token33, token34]],
 'labels':[[1, 0, 0, 1],
           [0, 0, 0],
           [1, 1, 0, 0]]}
           
Here is the example of correct output:

{'tokens':[['The', 'Mount', 'Everest', 'is', 'one', 'of', 'the', 'tallest', 'in', 'the', 'world'],
           ['The', 'Himalayas', 'are', 'home', 'to', 'many', 'such', 'peaks'],
           ['Mount', 'Kilimanjaro', 'is', 'located', 'in', 'Tanzania']],
 'labels':[[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 1, 0, 0, 0, 0, 0, 0],
           [1, 1, 0, 0, 0, 0]]}

"""

# Template for labelling one tokenised sentence. It contains a single %s
# placeholder that is filled with the token list; the example labels have the
# same length (11) as the example tokens.
PROMPT_NER = """
You faced NER problem and have to answer questions.
Highlight the mountains by labels:
You get tokenized input that looks like that:
['The', 'Mount', 'Everest', 'is', 'one', 'of', 'the', 'tallest', 'in', 'the', 'world']
and have to output labels correcponding to the input:
[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]

DO NOT OUTPUT ANYTHING ELSE, JUST LABELS


THE INPUT:

%s

THE OUTPUT:

"""

In [19]:
# Generate a small synthetic NER dataset by sending PROMPT_DATA to an LLM
# through the Groq chat-completions client.

client = Groq(api_key=GROQ_API)

# Single-turn conversation: the entire instruction is one user message.
# NOTE(review): confirm that this model id is still served by Groq.
chat_completion = client.chat.completions.create(
    model="llama3-8b-8192",
    messages=[dict(role="user", content=PROMPT_DATA)],
)

# Print only the text of the first returned choice.
print(chat_completion.choices[0].message.content)

Here is the generated NER data for mountains:

{'tokens':[['The', 'Rocky', 'Mountains', 'are', 'in', 'North', 'America'],
           ['The', 'Cascade', 'Range', 'is', 'in', 'Washington'],
           ['K2', 'is', 'in', 'Pakistan', 'and', 'China']],
 'labels':[[1, 0, 1, 0, 0, 0, 0],
           [0, 0, 1, 0, 0, 0],
           [1, 0, 0, 0, 0, 0]]}

Here are the explanations for the labels:

* 'Rocky Mountains' is a mountain range and corresponds to label 1.
* 'Cascade Range' is a mountain range and corresponds to label 1.
* 'K2' is a mountain and corresponds to label 1.


### Simple, fast, and cheap solutions

#### NER using prompting with Groq

In [25]:
# Solve the NER problem by prompting an LLM: read a sentence from the user,
# split it into tokens and ask the model for one 0/1 label per token.

# Whitespace tokenisation; blank input produces an empty token list.
sentence = input("Enter the sentance:").split()

client = Groq(
    api_key=GROQ_API,
)

if not sentence:
    # Nothing to label: skip the API call instead of asking the model about [].
    chat_completion = None
    print([])
else:
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                # Wrap the list in a 1-tuple so %-formatting always treats it as
                # the single %s argument, whatever sequence type `sentence` is.
                "content": PROMPT_NER % (sentence,),
            }
        ],
        # NOTE(review): confirm that this model id is still served by Groq.
        model="llama3-8b-8192",
    )

    # The prompt asks for labels only, so the raw reply text is printed as-is.
    print(chat_completion.choices[0].message.content)

Enter the sentance:The Rocky Mountains are in North America
[0, 1, 1, 0, 0, 0, 1]
