In [1]:
import os

from groq import Groq
from dotenv import load_dotenv

# Load variables from a .env file into the process environment so the API key
# does not have to be hardcoded in the notebook.
load_dotenv()

# Shared Groq client used by the cells below; the key is read from GROQ_API_KEY.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

In [2]:
def get_key_phrase(text):
    """Ask the chat model to list the keyphrases it finds in ``text``.

    A single user message is sent to the ``llama3-70b-8192`` model through the
    module-level ``client``. The prompt instructs the model to answer with
    nothing but an array of keyphrases (or an empty array).

    Args:
        text: The passage to extract keyphrases from; it is interpolated at
            the end of the prompt.

    Returns:
        The message content of the first returned choice, unparsed.
    """
    # The literal below is sent to the model verbatim, including its leading
    # newline and the indentation of every line, so it must not be re-indented.
    prompt = f'''
                from the following text, extract the keyphrases and write them down as an array, be careful to keep the phrases intact, do not edit them. 
                Note that keyphrases are often longer than keywords and they can help learners understand the main idea of ​​the text by reading only the keyphrases. 
                There may be no keyphrases if the text has no specific content:
                Write only an array containing keyphrases or an empty array. Do not write any additional information. Do not introduce. Just generate [...]
                text:
                {text}'''
    user_message = {"role": "user", "content": prompt}

    response = client.chat.completions.create(
        messages=[user_message],
        model="llama3-70b-8192",
    )
    first_choice = response.choices[0]
    return first_choice.message.content

In [3]:
import re

def extract_text_between_brackets(text):
    """Return the text enclosed by the first ``[...]`` pair found in ``text``.

    The match is non-greedy, so it ends at the first ``]`` that follows the
    opening ``[``. The brackets themselves are not included in the result.

    Args:
        text: String to search.

    Returns:
        The content between the first pair of brackets (possibly an empty
        string for ``[]``), or the sentinel string ``'<NONE>'`` when ``text``
        contains no bracketed span.
    """
    # re.DOTALL lets "." cross line breaks, so an array that is printed over
    # several lines is still recognised instead of being reported as missing.
    match = re.search(r'\[(.*?)\]', text, flags=re.DOTALL)
    if match is None:
        return '<NONE>'
    return match.group(1)

In [4]:
def get_final_keyphrase(text):
    """Query the model for keyphrases of ``text`` and return them as a list.

    The raw model answer is reduced to the content of its first ``[...]``
    span, which is then split into the individual quoted items.

    Args:
        text: The passage to extract keyphrases from.

    Returns:
        A list of keyphrase strings with the surrounding double quotes
        removed. An empty array in the model answer yields ``[]``. When the
        answer contains no bracketed span at all, the list holds the single
        sentinel ``'<NONE>'`` produced by ``extract_text_between_brackets``.
    """
    llm_out = get_key_phrase(text)
    array_body = extract_text_between_brackets(llm_out)

    # Split on the closing-quote / comma / opening-quote boundary while
    # tolerating any whitespace (including none, or line breaks) around the
    # comma, so '"a","b"' and one-item-per-line arrays are split correctly.
    pieces = re.split(r'"\s*,\s*"', array_body)

    # Trim padding left next to the brackets before removing the outer quotes.
    keyphrases = [piece.strip().strip('"') for piece in pieces]

    # An empty array would otherwise produce a bogus '' keyphrase.
    return [phrase for phrase in keyphrases if phrase]

In [5]:
def label_keyphrase(texts, keyphrases):
    """Build a 0/1 mask over ``texts`` marking tokens covered by a keyphrase.

    Each keyphrase is split on whitespace and every position where its token
    sequence occurs contiguously in ``texts`` is marked; all occurrences are
    marked, and spans from different keyphrases may overlap.

    Args:
        texts: Sequence of tokens to label.
        keyphrases: Iterable of keyphrase strings.

    Returns:
        A list with one entry per token in ``texts``: 1 where the token is
        part of a matched keyphrase, 0 elsewhere.
    """
    # Start with every token unlabelled.
    mask = [0] * len(texts)

    for phrase in keyphrases:
        phrase_tokens = phrase.split()
        width = len(phrase_tokens)

        # Slide a window of the keyphrase's length over the tokens.
        for start in range(len(texts) - width + 1):
            stop = start + width
            if texts[start:stop] == phrase_tokens:
                # Flag the whole matched span at once.
                mask[start:stop] = [1] * width

    return mask

In [6]:
import json

# Read the whole dataset into memory; the cells below index it as a list of
# records.
with open('new_data.json', encoding='utf-8') as json_file:
    data = json.loads(json_file.read())

In [9]:
# Annotate one batch of records (indices 5500 up to, not including, 6000) in
# place: each gets the model's keyphrases and a per-token 0/1 label list.
for record in data[5500:6000]:
    tokens = record['text']  # presumably a list of token strings; verify against the dataset
    phrases = get_final_keyphrase(' '.join(tokens))
    record['keyphrase'] = phrases
    record['label'] = label_keyphrase(tokens, phrases)

In [10]:
# Persist the annotated records back to new_data.json.
# Opening the target with 'w' would truncate it immediately, so a failure while
# serialising would destroy the only copy of the dataset. Dump to a sibling
# temp file first and swap it into place only once the dump has succeeded.
tmp_path = 'new_data.json.tmp'
with open(tmp_path, 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4, ensure_ascii=False)
os.replace(tmp_path, 'new_data.json')