In [1]:
from ollama import Client, AsyncClient
import json
from copy import deepcopy
from tqdm import tqdm
import aiohttp
import asyncio
import requests
import os

In [2]:
class VLLMClient:
    """Async client for an HTTP chat-completions server (the class name suggests vLLM).

    On construction the client asks `{base_url}/models` for the available
    models and uses the first listed id for every later request.

    Args:
        system_prompt: Text sent as the `system` message of every request.
        base_url: Root of the HTTP API (no trailing slash).
        timeout: Seconds to wait for the model-discovery request.
    """

    # Sampling temperature used when the caller's options do not provide one.
    DEFAULT_TEMPERATURE = 0.4

    def __init__(self, system_prompt='A helpful assistant', base_url="http://192.168.1.5:8000/v1", timeout=30):
        self.base_url = base_url
        self.timeout = timeout
        self.system = system_prompt
        self.model = self.initialize()

    def initialize(self):
        """Return the id of the first model listed by `GET {base_url}/models`.

        Raises:
            Exception: if the request fails or the payload has no `data[0]['id']`.
        """
        try:
            # A timeout keeps the constructor from hanging forever on an unreachable host.
            response = requests.get(f"{self.base_url}/models", timeout=self.timeout)
            data = response.json()
            return data['data'][0]['id']
        except Exception as e:
            raise Exception(f"Failed to initialize the model: {e}, No Connection") from e

    async def chat(self, prompt: str = '', messages=None, model='', max_tokens: int = 150, options=None):
        """Send one chat request and return the assistant reply.

        Args:
            prompt: User text, used only when `messages` is empty or None.
            messages: Conversation as a list of `{'role', 'content'}` dicts; the
                system prompt is prepended automatically.
            model: Accepted but not used; requests always go to `self.model`
                (presumably kept for call compatibility with ollama's `Client.chat`).
            max_tokens: Forwarded as `max_tokens`.
            options: Optional dict; only its `'temperature'` key is read. When the
                dict is None, empty, or lacks that key, `DEFAULT_TEMPERATURE` is used.

        Returns:
            `{'message': {'role': 'assistant', 'content': <text>}}` on success, or the
            string `"An error occurred: ..."` if anything fails. Callers that index
            the result must handle the string case.
        """
        system_message = {"role": "system", "content": self.system}
        if messages:
            messages = [system_message, *messages]
        else:
            messages = [system_message, {"role": "user", "content": prompt}]

        # Always bind a temperature: previously it stayed undefined unless
        # options contained the key, which made the request below fail.
        temperature = (options or {}).get('temperature', self.DEFAULT_TEMPERATURE)

        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(f"{self.base_url}/chat/completions", json={
                    'model': self.model,
                    'messages': messages,
                    'max_tokens': max_tokens,
                    'temperature': temperature
                }) as response:
                    response_data = await response.json()
                    return {
                        'message': {
                            'role': 'assistant',
                            'content': response_data['choices'][0]['message']['content']
                        }
                    }
        except Exception as e:
            return f"An error occurred: {e}"

    async def batch_chat(self, multimessages, options=None):
        """Launch one `chat` request per conversation concurrently.

        Args:
            multimessages: Iterable of conversations (each a list of message dicts).
            options: Forwarded unchanged to every `chat` call.

        Returns:
            List of `chat` results in the same order as `multimessages`.
        """
        tasks = [
            asyncio.create_task(self.chat(messages=messages, options=options))
            for messages in multimessages
        ]
        return await asyncio.gather(*tasks)

In [3]:
# Smoke test of VLLMClient: one single request, then a small concurrent batch.
# System prompt shared by every request issued through this client.
system_prompt = '''You are a precise and specialized translator from Portuguese to English, focusing on technical translations in agronomy. Your responses must contain only the translated text, maintaining the original formatting of the input. Prioritize the accuracy of agronomy terminology and ensure consistency in technical language throughout the translation. Pay close attention to the meaning and context of the original text, ensuring that the translation reads naturally in English while retaining technical precision. Preserve specific formatting guidelines (e.g., bullet points, headings) in the input.'''

# Constructing the client performs a blocking model-discovery request.
client = VLLMClient(system_prompt=system_prompt)
# Single request; the return value is not stored.
await client.chat('Hello, how are you?')

# 10 identical one-message conversations sent concurrently.
messages_ = ['Hello, how are you?'] * 10
messages = [[{'role': 'user', 'content': message}] for message in messages_]
responses = await client.batch_chat(messages,options={'temperature':0.8})

In [4]:
# Load test of VLLMClient: same steps as the 10-request smoke test, with 1000 conversations.
# System prompt shared by every request issued through this client.
system_prompt = '''You are a precise and specialized translator from Portuguese to English, focusing on technical translations in agronomy. Your responses must contain only the translated text, maintaining the original formatting of the input. Prioritize the accuracy of agronomy terminology and ensure consistency in technical language throughout the translation. Pay close attention to the meaning and context of the original text, ensuring that the translation reads naturally in English while retaining technical precision. Preserve specific formatting guidelines (e.g., bullet points, headings) in the input.'''

# Rebinds the global `client`; constructing it performs a blocking model-discovery request.
client = VLLMClient(system_prompt=system_prompt)
# Single request; the return value is not stored.
await client.chat('Hello, how are you?')

# 1000 identical one-message conversations; batch_chat starts one task per conversation.
messages_ = ['Hello, how are you?'] * 1000
messages = [[{'role': 'user', 'content': message}] for message in messages_]
responses = await client.batch_chat(messages,options={'temperature':0.8})

In [15]:
# Instruction text placed before every QA; `{book_title}` and `{chapter_title}`
# are filled per record and the trailing newline separates it from the QA text.
prompt_template = '''The following text is a question and answer extracted from an agronomy book published by the Brazilian Agricultural Research Corporation. This question and answer belong to the book titled '{book_title}' and the chapter titled '{chapter_title}'. Please translate the following text from Portuguese to English, ensuring technical accuracy and clarity. Your translation must contain only the translated text, with no preambles or additional commentary. The markers '<Q>' for the question and '<A>' for the answer and any other markers must remain unchanged. Aim for terminology consistency and readability for an audience familiar with agronomy. Be mindful of the context to maintain the original meaning.\n'''

def prepare_QAs(QAs, prompt_template=prompt_template):
    """Turn QA records into translation requests.

    Args:
        QAs: Iterable of dicts, each providing the keys "QA id", "question",
            "answer", "book title" and "chapter title" (other keys, such as
            "book id", are ignored).
        prompt_template: Format string with `{book_title}` and
            `{chapter_title}` placeholders, prepended to every QA text.

    Returns:
        {'QA ids': [...], 'originals': [...], 'messages': [...]} with three
        parallel lists:
          - 'QA ids': the "QA id" of each record;
          - 'originals': the text "<Q> question\\n<A> answer" for each record
            (usable later to evaluate the translation);
          - 'messages': one single-message conversation per record, of the
            form [{'role': 'user', 'content': prompt + original}].
    """
    QA_ids, originals, messages = [], [], []

    for record in QAs:
        QA_ids.append(record['QA id'])

        # Text to translate, with the markers the model is told to preserve.
        source_text = "<Q> {}\n<A> {}".format(record["question"], record["answer"])
        originals.append(source_text)

        instructions = prompt_template.format(
            book_title=record['book title'],
            chapter_title=record['chapter title'],
        )
        messages.append([{'role': 'user', 'content': instructions + source_text}])

    return dict(zip(('QA ids', 'originals', 'messages'), (QA_ids, originals, messages)))

In [24]:
def extract_QA(text):
    """Split a translated "<Q> ... <A> ..." string into (question, answer).

    The text must contain exactly one '<A>' marker. Otherwise a message is
    printed and `(text, "")` is returned, so the caller can detect the failure
    through the empty answer. Any exception (for instance a non-string
    input) is printed and reported as `("", "")`.

    Note: a later cell of this notebook defines another `extract_QA` that
    raises ValueError instead; whichever cell ran last is the one in effect.
    """
    try:
        pieces = text.split('<A>')
        if len(pieces) == 2:
            before_answer, after_answer = pieces
            # Keep only what follows the last '<Q>' marker (or everything if there is none).
            return before_answer.rpartition('<Q>')[2].strip(), after_answer.strip()

        print(f'Error spllitting text:\n{text}\n')
        return text, ""

    except Exception as e:
        print(f"An error occurred: {e}")
        return "", ""

async def translate_QAs(QAs, prompt_template=prompt_template, options={'temperature':0.2}):
    """Translate a list of QA records with the global `client`.

    Args:
        QAs: List of dicts with the keys "QA id", "question", "answer",
            "book title" and "chapter title" (see `prepare_QAs`).
        prompt_template: Instruction template forwarded to `prepare_QAs`.
        options: Forwarded to `client.batch_chat` (not modified here).

    Returns:
        {'QA ids': [...], 'originals': [...], 'translations': [...]} with
        three parallel lists: the QA ids, the "<Q> ...\\n<A> ..." source texts,
        and the raw model output for each source text.

        When a request failed, its entry in 'translations' is the error text
        instead of a translation. It contains no '<A>' marker, so the
        downstream QA extraction flags that record rather than the whole batch
        crashing.
    """
    prepared = prepare_QAs(QAs, prompt_template=prompt_template)

    responses = await client.batch_chat(prepared['messages'], options=options)

    # VLLMClient.chat returns a plain string (not a dict) when a request fails;
    # indexing that string with ['message'] would raise TypeError and lose
    # every other translation of the batch.
    translations = [
        response['message']['content'] if isinstance(response, dict) else str(response)
        for response in responses
    ]

    return {
        'QA ids': prepared['QA ids'],
        'originals': prepared['originals'],
        'translations': translations,
    }

async def translate_for_json(QAs, prompt_template=prompt_template, options={'temperature':0.2}):
    """Return copies of the QA records with question and answer translated.

    Each output record keeps every key of its source record, with 'question'
    and 'answer' replaced by the values extracted from the model output. When
    no answer could be extracted, the record also gets an 'ERROR' key holding
    what `extract_QA` returned as the question, and 'answer' is empty.

    Args:
        QAs: List of QA dicts (see `translate_QAs`).
        prompt_template: Forwarded to `translate_QAs`.
        options: Forwarded to `translate_QAs`.

    Returns:
        List of dicts, one per input record, in input order.
    """
    batch = await translate_QAs(QAs, prompt_template=prompt_template, options=options)

    records = []
    for source_QA, translated_text in zip(QAs, batch['translations']):
        record = dict(source_QA)
        question_en, answer_en = extract_QA(translated_text)
        if not answer_en:
            # Extraction failed: keep the returned text for later inspection.
            record['ERROR'] = question_en
        record.update(question=question_en, answer=answer_en)
        records.append(record)

    return records
    


In [25]:
source_folder = '/workdir/500P_500R_json'
target_folder = '/workdir/500P_500R_json_EN'

for filename in tqdm(os.listdir(source_folder)):
    if filename.endswith('.json'):
        source_file_path = os.path.join(source_folder, filename)
        with open(source_file_path, 'r', encoding='utf-8') as file:
            try:
                QAs = json.load(file)['QAs']  # Load the QAs from JSON data
            except json.JSONDecodeError as e:
                print(f"Error decoding {filename}: {e}")
                
        translated_QAs = await translate_for_json(QAs)
        translated_json = {'QAs': translated_QAs}
        
        target_file_path = os.path.join(target_folder, filename)
        with open(target_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(translated_json, json_file, ensure_ascii=False, indent=4)          

            

100%|██████████| 41/41 [33:12<00:00, 48.60s/it]


In [21]:
# Print each entry of the source folder with its index, in the order
# os.listdir returns them (the recorded output shows that order is not sorted).
for i, filename in enumerate(os.listdir(source_folder)):
    print(i, filename)


0 18.json
1 26.json
2 36.json
3 27.json
4 3.json
5 38.json
6 35.json
7 14.json
8 32.json
9 24.json
10 12.json
11 31.json
12 41.json
13 29.json
14 39.json
15 9.json
16 17.json
17 40.json
18 15.json
19 10.json
20 6.json
21 28.json
22 16.json
23 37.json
24 33.json
25 8.json
26 21.json
27 4.json
28 22.json
29 1.json
30 30.json
31 23.json
32 11.json
33 7.json
34 13.json
35 25.json
36 19.json
37 5.json
38 20.json
39 34.json
40 2.json


In [74]:
# Translate only the first 10 records of the global `QAs`
# (presumably the list left by the last file the batch loop loaded — confirm).
x = await translate_for_json(QAs[:10])

In [75]:
# Display the translated sample records.
x

[{'QA id': 10501,
  'question': 'How to choose an appropriate area to cultivate castor oil?',
  'answer': 'The selection of the area to plant castor oil is one of the most important steps to achieve good productivity and avoid soil erosion problems. Preference should be given to areas with more fertile soil, with acidity close to neutrality, and mainly with good drainage, in order to avoid waterlogging, which can cause significant damage to castor oil plants.\n\nAreas with steep slopes should be avoided, as the castor oil plant does not provide significant protection to the soil against erosion.',
  'book title': 'Colección 500 Preguntas 500 Respuestas: Ricino',
  'book id': 22,
  'chapter title': 'Manejo del Cultivo'},
 {'QA id': 10502,
  'question': 'Is crop rotation necessary in ricino production areas?',
  'answer': 'Crop rotation is a necessary practice to prevent soil chemical degradation and reduce the occurrence of diseases and pests responsible for decreased productivity and i

In [None]:
# Ask the model a set of exam questions and print each question with its reply.
# Rebinds the global `client` to a VLLMClient with an agriculture system prompt.
client = VLLMClient(system_prompt='You are an assistant specialized in agriculture, you have a good knowledge and respond only the right answer.')
# NOTE(review): `parse_json` is not defined in the visible part of this notebook;
# it presumably returns a list of dicts with a 'formatted' key — confirm.
questions = parse_json('JSON/agri-business-management-jrf-2020.json')
# One single-message conversation per question.
mmessages=[]
for question in questions:
    mmessages.append([
    {
    'role': 'user',
    'content': 'Respond only the correct answer.\n'+question['formatted'],
    },
    ])
# All questions are sent concurrently, with temperature 0.0.
responses = await client.batch_chat(mmessages,options={'temperature':0.0})

# NOTE(review): VLLMClient.chat returns a plain string when a request fails;
# indexing such a result with ['message'] below would raise TypeError.
for question, response in zip(questions,responses):
    print(question['formatted'])
    print(f"LLM Response: {response['message']['content']}")

In [2]:
# Rebinds the global `client` to an ollama `Client` (imported at the top of the notebook).
# NOTE(review): other cells bind `client` to VLLMClient and call `batch_chat` on it;
# which object `client` refers to depends on the cell that ran last.
client = Client(host='http://192.168.1.5:11434')
#aclient= AsyncClient(host='http://192.168.1.5:11434')

In [3]:
# client.create(model='test',path='/workdir/modelfile')
# client.create(model='test70b',path='/workdir/modelfile70b')
# client.create(model='evaluator70',path='/workdir/evaluatormodelfile')
# client.create(model='evaluator70good',path='/workdir/evaluatormodelfile_copy')


In [None]:
# Ad-hoc chat call with an explicit system message and temperature 0.4.
# NOTE(review): `translation_input` is only assigned inside translate_QA in the visible
# notebook, and `model` is assigned in a later cell — this cell appears to depend on
# kernel state that a fresh top-to-bottom run would not provide.
client.chat(model=model, messages=[{'role': 'system', 'content': "SYSTEM INSTRUCTION"}, {'role': 'user', 'content': translation_input}], options={'temperature': 0.4})

In [4]:
# GLOBALS

# Name of the model used by default by the translation helpers defined after this cell.
model = 'test'
# Instruction template; `{book_title}` and `{chapter_title}` are filled per QA.
# This rebinds `prompt_template`, which an earlier cell also defines. Unlike that earlier
# definition, this literal has no trailing newline, so `prompt + original` in translate_QA
# joins the instruction and the "<Q> ..." text with no separator.
# NOTE(review): confirm whether the missing newline is intended.
prompt_template = '''The following text is a question and answer extracted from an agronomy book published by the Brazilian Agricultural Research Corporation. This question and answer belong to the book titled '{book_title}' and the chapter titled '{chapter_title}'. Please translate the following text from Portuguese to English, ensuring technical accuracy and clarity. Your translation must contain only the translated text, with no preambles or additional commentary. The markers '<Q>' for the question and '<A>' for the answer and any other markers must remain unchanged. Aim for terminology consistency and readability for an audience familiar with agronomy. Be mindful of the context to maintain the original meaning.'''

In [5]:
def prepare_QA_for_translation(QA, prompt_template=prompt_template):
    """Build the prompt and the text to translate for a single QA record.

    Args:
        QA: Dict read with `.get`; the keys used are "QA id", "question",
            "answer", "book title" and "chapter title". Missing titles fall
            back to "Unknown title" / "Unknown chapter title"; a missing
            id, question or answer becomes None.
        prompt_template: Format string with `{book_title}` and
            `{chapter_title}` placeholders.

    Returns:
        {"QA id": ..., "prompt": <filled template>, 'original': "<Q> ...\\n<A> ..."}.
    """
    source_text = "<Q> {}\n<A> {}".format(QA.get("question"), QA.get("answer"))
    return {
        "QA id": QA.get("QA id"),
        "prompt": prompt_template.format(
            book_title=QA.get("book title", "Unknown title"),
            chapter_title=QA.get("chapter title", "Unknown chapter title"),
        ),
        'original': source_text,
    }

def translate_QA(QA_for_translation, model=model):
    """Send one prepared QA to the global `client` and return its translation.

    Args:
        QA_for_translation: Dict with the keys 'QA id', 'prompt' and 'original'
            (the shape produced by `prepare_QA_for_translation`).
        model: Model name passed to `client.chat`.

    Returns:
        {"QA id": ..., "translation": <model reply text>, 'original': ...}.
    """
    # Read all three fields before calling the model, so a malformed input fails early.
    qa_id, instructions, source_text = (
        QA_for_translation[key] for key in ('QA id', 'prompt', 'original')
    )
    request = [{'role': 'user', 'content': instructions + source_text}]
    reply = client.chat(model=model, messages=request)
    return {"QA id": qa_id, "translation": reply['message']['content'], 'original': source_text}

def extract_QA(text):
    """Split a "<Q> ... <A> ..." string into (question, answer).

    Note: an earlier cell of this notebook defines a function with the same
    name that returns `(text, "")` instead of raising; whichever cell ran last
    is the one in effect.

    Raises:
        ValueError: if `text` does not contain exactly one '<A>' marker.
    """
    pieces = text.split('<A>')
    if len(pieces) == 2:
        before_answer, after_answer = pieces
        # The question is whatever follows the last '<Q>' marker (or everything if absent).
        return before_answer.rpartition('<Q>')[2].strip(), after_answer.strip()

    raise ValueError("Input string must contain one question and one answer separated by <A>.")

def get_QA_translation(QA, model=model, prompt_template=prompt_template):
    """Return a deep copy of `QA` whose 'question' and 'answer' are translated.

    Args:
        QA: Source QA dict; it is not modified.
        model: Model name forwarded to `translate_QA`.
        prompt_template: Template forwarded to `prepare_QA_for_translation`.

    Returns:
        The copied record with 'question' and 'answer' replaced by the values
        `extract_QA` obtains from the model output.
    """
    translated_record = deepcopy(QA)
    prepared = prepare_QA_for_translation(QA, prompt_template)
    model_output = translate_QA(prepared, model)['translation']
    question_en, answer_en = extract_QA(model_output)
    translated_record.update(question=question_en, answer=answer_en)
    return translated_record

In [16]:
def prepare_QA_for_evaluation(QA, model=model, prompt_template=prompt_template):
    """Translate one QA and return the raw result for later evaluation.

    Returns the dict produced by `translate_QA`: "QA id", "translation"
    (unparsed model output) and 'original' (the source text).
    """
    return translate_QA(prepare_QA_for_translation(QA, prompt_template), model)

In [32]:
# Delete a model from the server. The name is spelled 'tranlsator_test' here (not
# 'translator_test'); the recorded output reports success for this spelling.
client.delete('tranlsator_test')

{'status': 'success'}

In [33]:
# Show the models currently known to the server, with their metadata.
client.list()

{'models': [{'name': 'test:latest',
   'model': 'test:latest',
   'modified_at': '2024-10-14T10:58:30.826453303Z',
   'size': 4661231204,
   'digest': '88b60ff8bc71a0fd9e0b20a549fe09321e11ffb6ba83beefbca9c462c8a2181e',
   'details': {'parent_model': '',
    'format': 'gguf',
    'family': 'llama',
    'families': ['llama'],
    'parameter_size': '8.0B',
    'quantization_level': 'Q4_0'}},
  {'name': 'evaluator70:latest',
   'model': 'evaluator70:latest',
   'modified_at': '2024-10-11T13:57:36.039732192Z',
   'size': 39969751889,
   'digest': '6974b75d6a40720721d16a8f0e332917e8400be36e7f941e39df1355bb3cd1bc',
   'details': {'parent_model': '',
    'format': 'gguf',
    'family': 'llama',
    'families': ['llama'],
    'parameter_size': '70.6B',
    'quantization_level': 'Q4_0'}},
  {'name': 'evaluator70good:latest',
   'model': 'evaluator70good:latest',
   'modified_at': '2024-10-11T13:57:36.103733388Z',
   'size': 39969751848,
   'digest': '892d6ec26ff354474dc2a55c0eec730ba1fb66bbd12aa

In [34]:
# Create the 'test' model from its modelfile.
client.create(model='test',path='/workdir/modelfile')

# Superseded by the assignment at the end of this cell.
model = 'test'

# Create the 'translator_test' model from its modelfile.
client.create(model='translator_test',path='/workdir/translator_modelfile')

# After this cell the global `model` is 'translator_test'. Functions whose `model=model`
# default was bound earlier keep the value `model` had when they were defined.
model = 'translator_test'

In [24]:
# Load one book file and pick a single QA record to experiment with.
book_id = 37
with open(f'/workdir/500P_500R_json/{book_id}.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)
# Record at index 7 of the book's 'QAs' list; binds the global `QA` used by the following cells.
QA = data['QAs'][7]
print(QA['question'])
print(QA['answer'])

Há diferença entre o valor nutricional de feijões de dife­rentes cores?
Há uma ampla diversidade de tipos de grãos, especialmente no que se refere à forma, ao tamanho e às cores dos grãos. No mercado brasileiro, essa diversidade é bem evidente, sendo cultivados feijões dos tipos Preto, Carioca, Roxinho, Mulatinho, Rosinha, Vermelho, Manteigão, entre outros. Embora exista preferência regional por determinada coloração do tegumento (casca) dos grãos, feijões do grupo Carioca são os mais cultivados no Brasil, representando cerca de 70%.
Pesquisas mostram algumas diferenças nutricionais e funcionais (presença de compostos com propriedades de proteção à saúde) de feijões de diferentes cores. Por exemplo, feijões brancos não contêm taninos (compostos fenólicos) e, portanto, a sua qualidade proteica é maior. Os teores de outros componentes do grão, como proteínas, fibras e minerais, também podem variar conforme o tipo de feijão. Essa variação nos macronutrientes pode ser menos expressiva do q

In [35]:
# Translate the sample QA with the 'test' model and print the raw model output.
print(prepare_QA_for_evaluation(QA, model='test')['translation'])

<Q> Is there a difference in the nutritional value of beans of different colors?
<A> There is a wide diversity of grain types, especially regarding the shape, size, and color of the grains. In the Brazilian market, this diversity is well evident, with beans of types Black, Carioca, Pink, Mulatinho, Rosinha, Red, Manteigão, among others being cultivated. Although there is regional preference for a certain coloration of the tegument (skin) of the grains, Carioca group beans are the most cultivated in Brazil, representing around 70%. Research shows some nutritional and functional differences (presence of compounds with properties of protection to health) between beans of different colors. For example, white beans do not contain tannins (phenolic compounds) and therefore have a higher protein quality. The levels of other grain components, such as proteins, fibers, and minerals, can also vary depending on the type of bean. This variation in macronutrients may be less expressive than in micr

In [36]:
# Same sample QA translated with the 'translator_test' model, for comparison.
print(prepare_QA_for_evaluation(QA, model='translator_test')['translation'])

<Q> Is there a difference in the nutritional value of beans of different colors?
<A> There is a wide diversity of grain types, especially regarding the shape, size, and color of the grains. In the Brazilian market, this diversity is well evident, with beans of types Preto, Carioca, Roxinho, Mulatinho, Rosinha, Vermelho, Manteigão, among others being cultivated. Although there is regional preference for a certain coloration of the tegument (skin) of the grains, Carioca group beans are the most cultivated in Brazil, representing around 70%.

Research shows some nutritional and functional differences (presence of compounds with properties of protection to health) between beans of different colors. For example, white beans do not contain tannins (phenolic compounds) and therefore have a higher protein quality. The levels of other grain components, such as proteins, fibers, and minerals, can also vary depending on the type of bean. This variation in macronutrients may be less expressive tha

In [20]:
# Print the name of every model known to the server.
# The loop variable is deliberately not called `model`: that name holds the
# default model name (a string) used elsewhere in the notebook, and iterating
# with it would leave it bound to the last listed model's metadata dict.
for model_entry in client.list()['models']:
    print(model_entry['name'])

evaluator70:latest
evaluator70good:latest
llama3.2:latest
evaluator70_:latest
llama3.2copy:latest
test:latest
test70b:latest
0.4temp:latest
0.0temp:latest
0.2temp:latest
llama3.1:405b
llama3.1:latest
llama3.1:70b


In [42]:
# Inspect the first translated record.
# NOTE(review): `QAs_translated` is not assigned in the visible part of this notebook;
# the output suggests a list of dicts with 'QA id', 'translation' and 'original' — confirm.
QAs_translated[0]

{'QA id': 1,
 'translation': '<Q> What is a bean?\n<A> Bean is a vegetable that belongs to the Legume family. All legumes possess a characteristic pod, which separates into two parts with seeds attached to the edge of one of them.',
 'original': '<Q> O que é um feijão?\n<A> Feijão é um vegetal que pertence à família das leguminosas. Todas as leguminosas possuem uma vagem característica, que se separa em duas partes com as sementes presas à margem de uma delas.'}

In [58]:
prompt_evaluation = '''You will be provided with two texts, one in Portuguese and one in English.
You have to evaluate how much the English text is the correct translation of the Portugues original one.
Keep in mind that the text is a question and answer extracted from a question and answer book in agronomy, so the translation should be technically accurate.
Give me a score from 1 to 5.\n'''

def evaluate_translation(QA_translated, model='evaluator70'):
    """Ask an evaluator model to grade one translated QA record.

    Args:
        QA_translated: dict with the keys 'QA id', 'translation' and 'original'.
        model: name of the model passed to ``client.chat``.

    Returns:
        dict carrying the input's 'QA id', 'translation' and 'original' plus
        'score': the evaluator's reply text with surrounding whitespace removed.
    """
    QA_id = QA_translated['QA id']
    translation = QA_translated['translation']
    original = QA_translated['original']

    evaluation_input = prompt_evaluation + f"This is the original text:\n'{original}'\nThis is the translated text:\n'{translation}'."
    reply = client.chat(model=model, messages=[{'role': 'user', 'content': evaluation_input}])
    # Callers test the score with str.isdigit() and compare it with '3'/'4'/'5';
    # a stray space or newline around the digit would make those checks fail,
    # so the reply is normalised before it is returned.
    score = reply['message']['content'].strip()
    return {'QA id': QA_id, 'translation': translation, 'original': original, 'score': score}

In [63]:
# Grade every translated QA, echoing each record whose rating is 3 or lower,
# and collect all evaluation dicts in `scores`.
scores = []
for QA in tqdm(QAs_translated):
    score = evaluate_translation(QA)
    rating = score['score']
    is_low_rating = rating.isdigit() and int(rating) <= 3
    if is_low_rating:
        # Dump every field of the record, then a separator line.
        for field_value in score.values():
            print('-', field_value)
        print('*'*40+'\n')

    scores.append(score)

 64%|██████▎   | 313/491 [05:09<02:55,  1.01it/s]

- 319
- <Q> What is the level of control for soil pests before planting?
<A> If more than 1 larva/m^2 (larva-alsma, larva-rosette, larva-of-the-cigarette-box, coro or gorgulho-do-solo) are found, wait 10 days for planting (period in which most larvae pupate), treat seeds and increase plant stand.
- <Q> Qual é o nível de controle para as pragas do solo antes da semeadura?
<A> Se for constatada a presença de mais de 1 lagarta/m^2 (lagarta-elasmo, lagarta-rosca, lagarta-do-cartucho, coró ou gorgulho-do-solo), deve-se esperar 10 dias para a semeadura (período em que a maioria das lagartas empupam), realizar o tratamento de sementes e aumentar o estande de plantas.
- 3
****************************************



100%|██████████| 491/491 [08:50<00:00,  1.08s/it]


In [69]:
# Count how many evaluations were rated '3', '4' and '5' (in that order in
# `lis`) and print every record that was rated '4'.
lis = [0, 0, 0]
slot_for_rating = {'3': 0, '4': 1, '5': 2}
for score in scores:
    slot = slot_for_rating.get(score['score'])
    if slot is None:
        # Any other reply text is not counted.
        continue
    lis[slot] += 1
    if slot == 1:
        print(score)
print(lis)
        

{'QA id': 1, 'translation': '<Q> What is a bean?\n<A> Bean is a vegetable that belongs to the Legume family. All legumes possess a characteristic pod, which separates into two parts with seeds attached to the edge of one of them.', 'original': '<Q> O que é um feijão?\n<A> Feijão é um vegetal que pertence à família das leguminosas. Todas as leguminosas possuem uma vagem característica, que se separa em duas partes com as sementes presas à margem de uma delas.', 'score': '4'}
{'QA id': 2, 'translation': '<Q> Why is the bean a very good food for health?\n<A> The bean is good for health because it provides carbohydrates, which provide energy for daily life, in addition to essential nutrients for a healthy life, such as rich protein in lysine, vitamins (mainly those of complex B), mineral salts (such as iron, calcium, potassium and phosphorus) and fibers (which help the proper functioning of the intestine and control of cholesterol and blood glucose levels). The high concentration of the am

In [73]:
# Keep the record at index 311 in `x` for closer inspection.
# NOTE(review): an assignment displays nothing, so the dict shown under this
# cell presumably comes from an earlier version of the cell - confirm.
x = QAs_translated[311]

{'QA id': 318,
 'translation': '<Q> When and how is soil pest monitoring done?\n<A> Soil pest monitoring should be carried out before planting. The number of samples depends on the size of the planted area, as described earlier. Each sampling point must have a width of 1 m, a length of 1 m, and a depth of 5 cm.',
 'original': '<Q> Quando e como se faz o monitoramento de pragas do solo?\n<A> O monitoramento de pragas do solo deve ser realizado antes da instalação da lavoura. O número de amostras depende do tamanho da área de plantio, conforme descrito anteriormente. Cada ponto amostrado deve possuir 1 m de largura, 1 m de comprimento e 5 cm de profundidade.'}

In [80]:
# Score the record at index 312 ten times with the 'evaluator70good' model and
# print each rating (presumably to check how stable the evaluator is).
for attempt in range(10):
    repeated_evaluation = evaluate_translation(QAs_translated[312], model='evaluator70good')
    print(repeated_evaluation['score'])

4
4
4
4
4
4
4
4
4
4


In [54]:
# Evaluate the first translated record with the default evaluator model and
# print whatever evaluate_translation() returns.
QA_translated = QAs_translated[0]
first_evaluation = evaluate_translation(QA_translated)
print(first_evaluation)

4


In [56]:
# Walk through all translated records: show the record first, then the result
# of evaluating it with the default evaluator model.
for QA_translated in QAs_translated:
    print(QA_translated)
    evaluation = evaluate_translation(QA_translated)
    print(evaluation)

{'QA id': 1, 'translation': '<Q> What is a bean?\n<A> Bean is a vegetable that belongs to the Legume family. All legumes possess a characteristic pod, which separates into two parts with seeds attached to the edge of one of them.', 'original': '<Q> O que é um feijão?\n<A> Feijão é um vegetal que pertence à família das leguminosas. Todas as leguminosas possuem uma vagem característica, que se separa em duas partes com as sementes presas à margem de uma delas.'}
4
{'QA id': 2, 'translation': '<Q> Why is the bean a very good food for health?\n<A> The bean is good for health because it provides carbohydrates, which provide energy for daily life, in addition to essential nutrients for a healthy life, such as rich protein in lysine, vitamins (mainly those of complex B), mineral salts (such as iron, calcium, potassium and phosphorus) and fibers (which help the proper functioning of the intestine and control of cholesterol and blood glucose levels). The high concentration of the amino acid lys

In [18]:
# Follow-up question for the translation model: it quotes one source text and
# the translation that was produced for it, then asks why the bean variety
# names were left in Portuguese.
# NOTE(review): the name `prompt` is also assigned in the translation cell,
# where get_translation() reads it as a global; running this cell afterwards
# changes what that function sends to the model.
prompt = '''When I asked you to translate "<Q> Há diferença entre o valor nutricional de feijões de dife­rentes cores?
<A> Há uma ampla diversidade de tipos de grãos, especialmente no que se refere à forma, ao tamanho e às cores dos grãos. No mercado brasileiro, essa diversidade é bem evidente, sendo cultivados feijões dos tipos Preto, Carioca, Roxinho, Mulatinho, Rosinha, Vermelho, Manteigão, entre outros. Embora exista preferência regional por determinada coloração do tegumento (casca) dos grãos, feijões do grupo Carioca são os mais cultivados no Brasil, representando cerca de 70%.
Pesquisas mostram algumas diferenças nutricionais e funcionais (presença de compostos com propriedades de proteção à saúde) de feijões de diferentes cores. Por exemplo, feijões brancos não contêm taninos (compostos fenólicos) e, portanto, a sua qualidade proteica é maior. Os teores de outros componentes do grão, como proteínas, fibras e minerais, também podem variar conforme o tipo de feijão. Essa variação nos macronutrientes pode ser menos expressiva do que nos micronutrientes. Por isso, todos os feijões podem ser considerados importantes do ponto de vista nutricional. Assim, a recomendação de consumo de uma determinada cultivar ou tipo de feijão pode ser direcionada às demandas nutricionais ou funcionais da população-alvo." Your answer was: "<Q> Is there a difference in the nutritional value of beans of different colors?
<A> There is a wide diversity of grain types, especially regarding shape, size, and color of grains. In the Brazilian market, this diversity is well evident, with beans of types Preto, Carioca, Roxinho, Mulatinho, Rosinha, Vermelho, Manteigão, among others being cultivated. Although there is regional preference for a certain coloration of the tegument (skin) of grains, Carioca group beans are the most cultivated in Brazil, representing about 70%. Research shows some nutritional and functional differences (presence of compounds with properties of protection to health) of beans of different colors. For example, white beans do not contain tannins (phenolic compounds) and therefore their protein quality is higher. The levels of other grain components, such as proteins, fibers, and minerals, can also vary depending on the type of bean. This variation in macronutrients may be less expressive than in micronutrients. Therefore, all beans can be considered important from a nutritional point of view. Thus, the recommendation for consumption of a certain cultivar or type of bean may be directed to the nutritional or functional demands of the target population."
Why do you think that varieties of beans that have a translation in English as "Vermelho = Red", "Preto = Black" where kept with the original name? I want to emphasize that your translation prompt started with "The following text is a question and answer extracted from an agronomy book published by the Brazilian Agricultural Research Corporation. This question and answer belong to the book titled '{book_title}' and the chapter titled '{chapter_title}'. Please translate the following text from Portuguese to English. Your translation must contain only the translated text, with no preambles or additional commentary. The markers '<Q>' for the question and '<A>' for the answer and other markers must remain unchanged.\n" is this something in the prompt that drives you towards this choice? Is it possible that it has something to do with your system message?'''

In [19]:
# Send the follow-up question to the 'test' model with an empty options dict;
# the reply text is the cell's last expression and therefore its output.
followup_messages = [{'role': 'user', 'content': prompt}]
followup_reply = client.chat(model='test', messages=followup_messages, options={})
followup_reply['message']['content']

'You are correct, I should have translated the bean varieties "Vermelho = Red", "Preto = Black", etc. instead of keeping them in Portuguese.\n\nThe reason for this is not related to the translation prompt itself, but rather a design choice made by me to preserve the original structure and terminology used in the source text. In technical translations, especially in agronomy, it\'s common to use specific names or terms that are widely recognized within the field. These names often have a precise meaning and connotation, which can be lost if translated into English.\n\nIn this case, I chose to preserve the original names of the bean varieties (e.g., "Vermelho" instead of "Red") because they are commonly used in Brazilian agronomy and might not have direct English translations. This approach helps maintain consistency with the source text and ensures that the translation accurately conveys the intended meaning.\n\nHowever, I understand now that this decision might not be ideal for general

In [13]:
# Pick the QA at index 7 of the loaded book data as the sample used by the
# translation experiments, and show it.
test_QA = data['QAs'][7]
print(test_QA)

{'QA id': 8, 'question': 'Há diferença entre o valor nutricional de feijões de dife\xadrentes cores?', 'answer': 'Há uma ampla diversidade de tipos de grãos, especialmente no que se refere à forma, ao tamanho e às cores dos grãos. No mercado brasileiro, essa diversidade é bem evidente, sendo cultivados feijões dos tipos Preto, Carioca, Roxinho, Mulatinho, Rosinha, Vermelho, Manteigão, entre outros. Embora exista preferência regional por determinada coloração do tegumento (casca) dos grãos, feijões do grupo Carioca são os mais cultivados no Brasil, representando cerca de 70%.\nPesquisas mostram algumas diferenças nutricionais e funcionais (presença de compostos com propriedades de proteção à saúde) de feijões de diferentes cores. Por exemplo, feijões brancos não contêm taninos (compostos fenólicos) e, portanto, a sua qualidade proteica é maior. Os teores de outros componentes do grão, como proteínas, fibras e minerais, também podem variar conforme o tipo de feijão. Essa variação nos mac

In [15]:
# Translate the sample QA with the 'test' model; the returned value is
# displayed as the cell output.
get_QA_translation(test_QA, model='test')

<Q> Há diferença entre o valor nutricional de feijões de dife­rentes cores?
<A> Há uma ampla diversidade de tipos de grãos, especialmente no que se refere à forma, ao tamanho e às cores dos grãos. No mercado brasileiro, essa diversidade é bem evidente, sendo cultivados feijões dos tipos Preto, Carioca, Roxinho, Mulatinho, Rosinha, Vermelho, Manteigão, entre outros. Embora exista preferência regional por determinada coloração do tegumento (casca) dos grãos, feijões do grupo Carioca são os mais cultivados no Brasil, representando cerca de 70%.
Pesquisas mostram algumas diferenças nutricionais e funcionais (presença de compostos com propriedades de proteção à saúde) de feijões de diferentes cores. Por exemplo, feijões brancos não contêm taninos (compostos fenólicos) e, portanto, a sua qualidade proteica é maior. Os teores de outros componentes do grão, como proteínas, fibras e minerais, também podem variar conforme o tipo de feijão. Essa variação nos macronutrientes pode ser menos express

{'QA id': 8,
 'question': 'Is there a difference in the nutritional value of beans of different colors?',
 'answer': 'There is a wide diversity of grain types, especially regarding shape, size, and color of grains. In the Brazilian market, this diversity is well evident, with beans of types Preto, Carioca, Roxinho, Mulatinho, Rosinha, Vermelho, Manteigão, among others being cultivated. Although there is regional preference for a certain coloration of the tegument (skin) of grains, Carioca group beans are the most cultivated in Brazil, representing about 70%. Research shows some nutritional and functional differences (presence of compounds with properties of protection to health) of beans of different colors. For example, white beans do not contain tannins (phenolic compounds) and therefore their protein quality is higher. The levels of other grain components, such as proteins, fibers, and minerals, can also vary depending on the type of bean. This variation in macronutrients may be les

In [16]:
# Same sample QA, translated with the 'test70b' model for comparison with the
# 'test' model run.
get_QA_translation(test_QA, model='test70b')

<Q> Há diferença entre o valor nutricional de feijões de dife­rentes cores?
<A> Há uma ampla diversidade de tipos de grãos, especialmente no que se refere à forma, ao tamanho e às cores dos grãos. No mercado brasileiro, essa diversidade é bem evidente, sendo cultivados feijões dos tipos Preto, Carioca, Roxinho, Mulatinho, Rosinha, Vermelho, Manteigão, entre outros. Embora exista preferência regional por determinada coloração do tegumento (casca) dos grãos, feijões do grupo Carioca são os mais cultivados no Brasil, representando cerca de 70%.
Pesquisas mostram algumas diferenças nutricionais e funcionais (presença de compostos com propriedades de proteção à saúde) de feijões de diferentes cores. Por exemplo, feijões brancos não contêm taninos (compostos fenólicos) e, portanto, a sua qualidade proteica é maior. Os teores de outros componentes do grão, como proteínas, fibras e minerais, também podem variar conforme o tipo de feijão. Essa variação nos macronutrientes pode ser menos express

{'QA id': 8,
 'question': 'Is there a difference in the nutritional value of beans of different colors?',
 'answer': 'There is a wide diversity of grain types, especially regarding shape, size, and color. In the Brazilian market, this diversity is quite evident, with beans of the Preto, Carioca, Roxinho, Mulatinho, Rosinha, Vermelho, Manteigão, among others, being cultivated. Although there is a regional preference for a certain grain husk (pod) coloration, Carioca-type beans are the most widely cultivated in Brazil, representing around 70%.\nResearch shows some nutritional and functional differences (presence of compounds with health protection properties) among beans of different colors. For example, white beans do not contain tannins (phenolic compounds), and therefore, their protein quality is higher. The levels of other grain components, such as proteins, fibers, and minerals, can also vary according to the type of bean. This variation in macronutrients may be less expressive than

In [None]:
# This cell has no execution count, i.e. it was not run in the saved state.
# NOTE(review): in this part of the notebook `QA` is only bound as the loop
# variable of the scoring loop - confirm which record is meant here.
get_QA_translation(QA, model='test')

In [9]:
# Register the two custom models from their modelfiles, then translate the
# sample QA with each of them. The final call stays a bare expression so its
# result is what the cell displays.
for model_name, modelfile_path in (('test', '/workdir/modelfile'),
                                   ('test70b', '/workdir/modelfile70b')):
    client.create(model=model_name, path=modelfile_path)

get_QA_translation(test_QA, model='test')
get_QA_translation(test_QA, model='test70b')

NameError: name 'test_QA' is not defined

In [50]:
# Translate whatever record `test_QA` currently holds with the 'test70b' model;
# the returned value is displayed as the cell output.
get_QA_translation(test_QA, model='test70b')

<Q> O que é “bandinha de feijão”?
<A> Esse termo é popularmente utilizado para classificar os grãos de feijão quebrados obtidos durante o beneficiamento do produto. A bandinha do feijão apresenta o custo quatro vezes menor em relação ao grão inteiro e é pouco aproveitada pela indústria brasileira. Apesar do baixo valor comercial, possui propriedades nutricionais semelhantes às do grão inteiro e, por isso, pode ser aproveitada para ração animal ou como matéria-prima na elaboração de farinha de feijão extrusada pela indústria de alimentos. A extrusão é um processo termoplástico em que o atrito e o trabalho mecânico são combinados à alta temperatura durante um curto período de tempo e, com isso, modificam-se as características físicas, químicas e nutricionais, além de reduzir-se a contaminação microbiana (principalmente pela baixa atividade de água) e inativarem-se enzimas do produto final. Assim, pode-se agregar valor a um subproduto da cadeia produtiva do feijão. 
 ---- 

<Q> What is "f

{'QA id': 7,
 'question': 'What is "feijão bandinha"?',
 'answer': 'This term is popularly used to classify the broken beans obtained during the processing of the product. The feijão bandinha has a cost four times lower compared to whole grain and is little exploited by the Brazilian industry. Despite its low commercial value, it has nutritional properties similar to those of the whole grain and, therefore, can be used for animal feed or as raw material in the production of extruded bean flour by the food industry. Extrusion is a thermoplastic process where friction and mechanical work are combined with high temperature over a short period of time, thereby modifying physical, chemical, and nutritional characteristics, in addition to reducing microbial contamination (mainly due to low water activity) and inactivating enzymes of the final product. Thus, value can be added to a byproduct of the bean production chain.',
 'book id': 37,
 'book title': '500 Perguntas 500 Respostas - Feijão',

In [16]:
# Build the translation inputs for book file 37.json.
# NOTE(review): absolute '/workdir/...' path is hardcoded; consider a
# configurable data directory.
QAs_for_translation = get_QAs_for_translation('/workdir/500P_500R_json/37.json')

In [17]:
# Inspect the first prepared entry (displayed as the cell output).
QAs_for_translation[0]

{'QA id': 1,
 'translation_input': "The following text is a question and answer extracted from an agronomy book published by the Brazilian Agricultural Research Corporation. This question and answer belong to the book titled '500 Perguntas 500 Respostas - Feijão' and the chapter titled 'O Feijão na Alimentação Humana'. Please translate the following text from Portuguese to English. Your translation must contain only the translated text, with no preambles or additional commentary. The markers '<Q>' for the question and '<A>' for the answer must remain unchanged.\n<Q> O que é um feijão?\n <A> Feijão é um vegetal que pertence à família das leguminosas. Todas as leguminosas possuem uma vagem característica, que se separa em duas partes com as sementes presas à margem de uma delas.",
 'original': '<Q> O que é um feijão?\n <A> Feijão é um vegetal que pertence à família das leguminosas. Todas as leguminosas possuem uma vagem característica, que se separa em duas partes com as sementes presas 

In [160]:
# Prepare all QAs of book file 37.json and show the LLM input built for the
# entry at index 3.
prepared_qas = prepare_qa_for_llm('/workdir/500P_500R_json/37.json')
source = prepared_qas[3]['LLM input']
print(source)

From the book titled '500 Perguntas 500 Respostas - Feijão', Chapter 1: 'O Feijão na Alimentação Humana', Translate the following text:
Pergunta: Como é classificado o feijão no Brasil?
Resposta: No Brasil, o feijão é classificado em dois grupos: I e II. Dentro do grupo I, está o feijão-comum, que pertence à espécie Phaseolus vulgaris. No grupo II, encontra-se o feijão-caupi (feijão-de-corda ou feijão-macassar), que pertence à espécie Vigna unguiculata (L) Walp. As demais espécies de feijão não recebem classificação.


In [159]:
# def prepare_qa_for_llm(file_path):
#     prepared_qas = []
#     with open(file_path, 'r', encoding='utf-8') as file:
#         qa_data = json.load(file)
    
#     for qa in qa_data['QAs']:
#         # Extract relevant fields
#         qa_id = qa.get("QA id")
#         question = qa.get("question")
#         answer = qa.get("answer")
#         book_title = qa.get("book title", "Unknown title")
#         chapter_number = qa.get("chapter number", "Unknown chapter")
#         chapter_title = qa.get("chapter title", "Unknown chapter title")
        
#         # Format the information
#         llm_input = (f"From the book titled '{book_title}', Chapter {chapter_number}: '{chapter_title}', "
#                      f"Translate the following text:\n"
#                      f"<Q> {question}\n"
#                      f"<A> {answer}")
        
#         # Append the formatted string to the list
#         prepared_qas.append({"QA id": qa_id, "LLM input": llm_input,'Original':{'Question':question,'Answer':answer}})
    
#     return prepared_qas

In [169]:
# def extract_qa_pair(text, verbose=False):
#     # Split the text into lines
#     lines = text.split('\n')
    
#     # Initialize an empty dictionary to hold the result
#     qa_pair = {}
#     errors = []

#     # Loop through the lines and split by the colon to extract the key-value pairs
#     for line in lines:
#         if ':' in line:
#             key, value = line.split(':', 1)  # Split by the first colon only
#             key = key.strip()  # Strip extra spaces

#             # Check if the key is either 'Question' or 'Answer'
#             if key == 'Question' or key == 'Answer':
#                 qa_pair[key] = value.strip()  # Strip extra spaces
#             else:
#                 qa_pair[key] = value.strip()
#                 errors.append(f"Invalid key '{key}'. Expected 'Question' or 'Answer'.")

#     # Check if both 'Question' and 'Answer' are present
#     if 'Question' not in qa_pair:
#         errors.append("Missing 'Question' in input string.")
#     if 'Answer' not in qa_pair:
#         errors.append("Missing 'Answer' in input string.")
    
#     if verbose:
#         return qa_pair, errors
#     else:
#         return qa_pair

# # Example usage:
# text = 'Queston: What is the capital of France?\nAnswer: Paris'
#   # Example with missing Question
# qa_dict, error_list = extract_qa_pair(text, True)

# print("Extracted QA Pair:", qa_dict)
# if error_list:
#     print("Errors:", error_list)

Extracted QA Pair: {'Queston': 'What is the capital of France?', 'Answer': 'Paris'}
Errors: ["Invalid key 'Queston'. Expected 'Question' or 'Answer'.", "Missing 'Question' in input string."]


In [191]:
# asyncio is already imported in the notebook's first cell; repeated here.
import asyncio
# Instruction prefix that get_translation() prepends to every QA's LLM input.
prompt = 'The following text consists of a question and answer extracted from an agronomy questions and answers book from the Brazilian Agricultural Research Corporation. Please, translate the following text from Portuguese into English. Your answer must contain only the translation, no preambles.\n'
# Book file whose QAs are translated below.
file='/workdir/500P_500R_json/37.json'
# Default model name; it is captured as get_translation()'s default argument
# when that function is defined.
model='llama3.1'
async def get_translation(file, model=model):
    """Translate every QA of a book file, one chat request at a time.

    For each entry returned by ``prepare_qa_for_llm(file)`` the global
    ``prompt`` plus the entry's 'LLM input' is sent to ``aclient.chat``; the
    reply text is printed and parsed with ``extract_qa_pair``.

    Returns:
        list of dicts with the keys 'QA id', 'Original' and 'Translation'.
    """
    collected = []
    for entry in prepare_qa_for_llm(file):
        request = [{'role': 'user', 'content': prompt + entry['LLM input']}]
        reply = await aclient.chat(model=model, messages=request)
        reply_text = reply['message']['content']
        print(reply_text)
        collected.append({
            'QA id': entry['QA id'],
            'Original': entry['Original'],
            'Translation': extract_qa_pair(reply_text),
        })
    return collected
        
        
# Translate the whole book file with the '0.4temp' model (top-level `await`
# is used directly in the cell) and print the collected results.
translations= await get_translation(file,model='0.4temp')
print(translations)


Question: What is a bean?
Answer: A bean is a vegetable that belongs to the legume family. All legumes have a characteristic pod that splits into two parts with the seeds attached to one of them.
Question: Why is bean a very good food for human health?
Answer: Bean is good for health because it provides carbohydrates that provide energy for daily life, in addition to essential nutrients for a healthy life, such as rich protein in lysine, vitamins (mainly those of the B complex), minerals (such as iron, calcium, potassium and phosphorus) and fibers (which help proper intestinal function and control cholesterol and blood glucose levels). The high concentration of amino acid lysine in bean is considered of great value in complementing cereal proteins, such as rice.
Question: What is the composition of beans?
Answer: Each 100 g of raw beans contains, on average:
- Protein: 22 g
- Carbohydrate: 61 g
- Fiber: 4.3 g
- Lipid: 1.6 g
- Ash: 3.6 g
- Calcium: 86 mg
- Phosphorus: 247 mg
- Iron: 7.6

  translations= await get_translation(file,model='0.4temp')


In [202]:
# Show the first ten translations next to their originals: question pair,
# then answer pair, then a separator line.
for idx in range(10):
    record = translations[idx]
    original_qa = record['Original']
    translated_qa = record['Translation']
    print(original_qa['Question'])
    print(translated_qa['Question'])
    print('\n')
    print(original_qa['Answer'])
    print(translated_qa['Answer'])
    print('_'*40+'\n')

O que é um feijão?
What is a bean?


Feijão é um vegetal que pertence à família das leguminosas. Todas as leguminosas possuem uma vagem característica, que se separa em duas partes com as sementes presas à margem de uma delas.
A bean is a vegetable that belongs to the legume family. All legumes have a characteristic pod that splits into two parts with the seeds attached to one of them.
________________________________________

Por que o feijão é um alimento tão bom para a saúde?
Why is bean a very good food for human health?


O feijão é bom para a saúde porque ele fornece carboidratos, que proporcionam energia para o dia a dia, além de nutrientes essenciais para uma vida saudável, como proteínas ricas em lisina, vitaminas (principalmente as do complexo B), sais minerais (como ferro, cálcio, potássio e fósforo) e fibras (que ajudam no bom funcionamento do intestino e no controle dos níveis de colesterol e glicose do sangue). A concentração elevada do aminoácido lisina no feijão é consi

In [203]:
# Persist the translations as pretty-printed UTF-8 JSON; ensure_ascii=False
# keeps the accented Portuguese characters readable in the file.
file_name = 'translation_test.json'

with open(file_name, mode='w', encoding='utf-8') as json_file:
    json.dump(obj=translations, fp=json_file, indent=4, ensure_ascii=False)

In [107]:
# To free VRAM immediately
#client.generate(model='llama3.1:405b',keep_alive=0)

{'model': 'llama3.1:405b',
 'created_at': '2024-10-09T15:16:35.000416113Z',
 'response': '',
 'done': True,
 'done_reason': 'unload'}

In [13]:
# Show the server's full model listing (displayed as the cell output).
client.list()

{'models': [{'name': 'testxxx:latest',
   'model': 'testxxx:latest',
   'modified_at': '2024-10-10T08:24:54.86027783Z',
   'size': 4661231380,
   'digest': 'de65f9463754f374cbd2c06b3ad75bf98e5ffa603362f7104d266b06e330d129',
   'details': {'parent_model': '',
    'format': 'gguf',
    'family': 'llama',
    'families': ['llama'],
    'parameter_size': '8.0B',
    'quantization_level': 'Q4_0'}},
  {'name': '0.4temp:latest',
   'model': '0.4temp:latest',
   'modified_at': '2024-10-09T15:02:42.406290889Z',
   'size': 4661231000,
   'digest': '1020671ad3d618a48ec4c93e572307236354c21127398a7f53184cf319c55146',
   'details': {'parent_model': '',
    'format': 'gguf',
    'family': 'llama',
    'families': ['llama'],
    'parameter_size': '8.0B',
    'quantization_level': 'Q4_0'}},
  {'name': '0.0temp:latest',
   'model': '0.0temp:latest',
   'modified_at': '2024-10-09T15:02:11.47373095Z',
   'size': 4661230998,
   'digest': '9629df57def7fbf7608b35551a6e90597eed8472c970d43a17040dc414d75788',
 

I'm a large language model, so I can do a wide range of things. Here are some examples:

1. **Answering questions**: I can provide information on various topics, from science and history to entertainment and culture.
2. **Generating text**: I can create text based on a prompt or topic. This could be a short paragraph, a longer article, or even a whole story.
3. **Translation**: I can translate text from one language to another (although my proficiency may vary depending on the languages).
4. **Summarizing content**: If you give me a piece of writing or a conversation, I can summarize it for you in a few sentences.
5. **Conversation**: I'm happy to chat with you about any topic that interests you! I can respond to questions, engage in discussions, and even tell jokes (bad ones, but still).
6. **Creative writing**: If you'd like, I can try my hand at creative writing tasks, such as writing a short story or poem.
7. **Language learning support**: If you're learning a new language, I can h

In [12]:
# Ask the 'testxxx' model what it is good at and print the reply text.
greeting_messages = [{'role': 'user', 'content': 'Hello! What are you good at?'}]
greeting_reply = client.chat(model='testxxx', messages=greeting_messages)
print(greeting_reply['message']['content'])

<A> I am a translator specializing in Portuguese to English translations, with a focus on technical texts related to agronomy.


In [115]:
# Print the name of every model the Ollama server currently knows about.
# The loop target is deliberately not called `model`: a `for` target stays
# bound after the loop ends, and the translation cell of this notebook keeps a
# notebook-level `model` string that would otherwise be replaced by a dict.
for model_entry in client.list()['models']:
    print(model_entry['name'])



0.4temp:latest
0.0temp:latest
0.2temp:latest
llama3.1:405b
llama3.1:latest
llama3.1:70b
llama3.2:latest


In [103]:
# Remove the model named '0temp' from the server; the call's return value is
# displayed as the cell output.
client.delete('0temp')

{'status': 'success'}