## YandexGPT Prompting

In [1]:
%pip install sacrebleu unbabel-comet rouge_score
%pip uninstall jwt
%pip install PyJWT -U

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
[0mDefaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [2]:
import os
import json
import time
import requests
import jwt
import sqlite3
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import sacrebleu
from comet import download_model, load_from_checkpoint
from tqdm import tqdm
import warnings

# Constants for Yandex API
SERVICE_ACCOUNT_ID = "ajee5dv037e9gjovng3h"
KEY_ID = "aje4k7jpuo283qj9kt9d"
PRIVATE_KEY = os.environ['private-key']
CATALOGUE_ID = "b1g29e4i9l2uqmlp2s3u"

# Function to obtain IAM token
def get_iam_token():
    now = int(time.time())
    payload = {
        'aud': 'https://iam.api.cloud.yandex.net/iam/v1/tokens',
        'iss': SERVICE_ACCOUNT_ID,
        'iat': now,
        'exp': now + 360
    }
    encoded_token = jwt.encode(
        payload,
        PRIVATE_KEY,
        algorithm='PS256',
        headers={'kid': KEY_ID}
    )
    url = 'https://iam.api.cloud.yandex.net/iam/v1/tokens'
    response = requests.post(url, headers={'Content-Type': 'application/json'}, json={'jwt': encoded_token}).json()
    return response['iamToken']

# Function to generate a translation using Yandex API
def generate_translation(system_text, user_text):
    token = get_iam_token()
    url = 'https://llm.api.cloud.yandex.net/foundationModels/v1/completion'

    data = {
        'modelUri': f'gpt://{CATALOGUE_ID}/yandexgpt/latest',
        'completionOptions': {
            'stream': False,
            'temperature': 0.3,
            'maxTokens': 200
        },
        'messages': [
            {
                "role": "system",
                "text": system_text
            },
            {
                "role": "user",
                "text": user_text
            }
        ]
    }

    response = requests.post(url, headers={'Authorization': 'Bearer ' + token}, json=data).json()
    translated_text = response['result']['alternatives'][0]['message']['text']
    return translated_text

def generate_translation(system_text, user_text):
    token = get_iam_token()
    url = 'https://llm.api.cloud.yandex.net/foundationModels/v1/completion'

    data = {
        'modelUri': f'gpt://{CATALOGUE_ID}/yandexgpt/latest',
        'completionOptions': {
            'stream': False,
            'temperature': 0.3,  # Set temperature here
            'maxTokens': 200
        },
        'messages': [
            {
                "role": "system",
                "text": system_text
            },
            {
                "role": "user",
                "text": user_text
            }
        ]
    }

    response = requests.post(url, headers={'Authorization': 'Bearer ' + token}, json=data).json()
    
    # Check if 'result' is in the response
    if 'result' in response:
        if 'alternatives' in response['result'] and len(response['result']['alternatives']) > 0:
            translated_text = response['result']['alternatives'][0]['message']['text']
            return translated_text
        else:
            raise ValueError("No alternatives found in the result.")
    elif 'error' in response:
        error_message = response['error']['message']
        if "rate quota limit exceed" in error_message or "gauge quota limit exceed" in error_message or "ai.textGenerationCompletionSessionsCount.count gauge quota limit exceed" in error_message:
            raise RuntimeError("Rate quota limit exceeded.")
        else:
            raise ValueError(f"API Error: {error_message}")
    else:
        raise ValueError("Unexpected response structure: " + str(response))

# Function to generate a translation prompt
def generate_translation_prompt(sentence, target_language):
    return f"Переведи специализированный текст на {target_language} язык.", sentence

# Function to generate a prompt with term context for translation
def generate_context_prompt_translation(sentence, topic, term, translation, target_language):
    system_text = f"Переведи специализированный текст на {target_language} язык."
    user_text = f"Тематика: {topic}. Термин: {term}. Перевод термина: {translation}. Текст: {sentence}"
    return system_text, user_text

# Function to generate a prompt with term context for definition
def generate_context_prompt_definition(sentence, topic, term, definition, target_language):
    definition = definition.split('.')[0]
    system_text = f"Переведи специализированный текст на {target_language} язык."
    user_text = f"Тематика: {topic}. Термин: {term}. Определение термина: {definition}. Текст: {sentence}"
    return system_text, user_text

# Function to compute evaluation metrics
def compute_metrics(reference, hypothesis):
    smoothie = SmoothingFunction().method4
    bleu = sentence_bleu([reference.split()], hypothesis.split(), smoothing_function=smoothie)
    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True).score(reference, hypothesis)['rougeL'].fmeasure
    chrf = sacrebleu.corpus_chrf([hypothesis], [[reference]]).score
    comet_data = [{"src": reference, "mt": hypothesis, "ref": reference}]
    comet_score = comet_model.predict(comet_data, batch_size=1, gpus=0)['scores'][0]
    return bleu, rouge, chrf, comet_score



### Prompt Examples

In [8]:
generate_translation('Переведи специализированный текст на английский язык.', 'В русском языке есть два союза, ЧТО и ЧТОБЫ. ')

'In Russian, there are two conjunctions: ЧТО and ЧТОБЫ.'

In [4]:
generate_translation('Переведи специализированный текст на английский язык.', 'Тематика: лингвистика. Термин: союз. Перевод термина: conjunction. Текст: В русском языке есть два союза, ЧТО и ЧТОБЫ. ')

'In Russian, there are two conjunctions: ЧТО and ЧТОБЫ.'

In [5]:
generate_translation('Переведи текст на английский язык.', 'Тематика: лингвистика. Термин: союз. Определение термина: служебная часть речи, с помощью которой связывают между собой простые предложения в составе сложного или однородные члены предложения. Не склоняется и не спрягается, и не является членом предложения. Выражает смысловые отношения между синтаксическими единицами. Текст: В русском языке есть два союза, ЧТО и ЧТОБЫ. ')

'In Russian, there are two conjunctions: ЧТО and ЧТОБЫ.\n\n*ЧТО* is a conjunction that connects two clauses, and it expresses a semantic relationship between these clauses.\n*ЧТОБЫ* is also a conjunction, but it is used to express a purpose.'

In [6]:
generate_translation('Переведи текст на английский язык.', 'Тематика: физика. Термин: диффузия. Определение термина: взаимное проникновение соприкасающихся веществ друг в друга вследствие теплового движения частиц. Текст: И если, благодаря сравнительной свободе движущихся частиц, диффузия газообразных тел совершается в промежуток времени, измеряемый секундами и минутами, то то же явление в жидкостях требует часов и дней, а в твердых телах, где молекулы пробираются лишь с величайшим трудом, преодолевая на своем пути тысячи препятствий, скорость диффузии так мала, что требуются для сколько-нибудь заметного результата недели и месяцы. ')

'And if, due to the relative freedom of moving particles, the diffusion of gaseous bodies occurs within a period of time measured in seconds and minutes, then the same phenomenon in liquids requires hours and days, and in solids, where molecules make their way only with great difficulty, overcoming thousands of obstacles on their way, the rate of diffusion is so slow that it takes weeks and months for any noticeable result.'

## Evaluation of results

In [3]:
db_path = 'phys_cyberleninka.db'
conn = sqlite3.connect(db_path)
cur = conn.cursor()

In [4]:
# Extract data from the tables
contexts_df = pd.read_sql_query("SELECT * FROM contexts", conn)
sentences_df = pd.read_sql_query("SELECT * FROM sentences", conn)
terms_df = pd.read_sql_query("SELECT * FROM terms", conn)

# Add a new table for translation results if it doesn't exist
cur.execute('''
CREATE TABLE IF NOT EXISTS gpt_results (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    sentence_id INTEGER,
    prompt TEXT,
    generated_translation TEXT,
    verified_translation TEXT,
    bleu_score REAL,
    rouge_score REAL,
    chrf_score REAL,
    comet_score REAL
)
''')

<sqlite3.Cursor at 0x7eff83472940>

In [5]:
# Load COMET model
comet_model_path = download_model("Unbabel/wmt22-comet-da")
comet_model = load_from_checkpoint(comet_model_path)

Fetching 5 files: 100%|██████████| 5/5 [00:26<00:00,  5.32s/it]
Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../tmp/xdg_cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Created a temporary directory at /tmp/tmpz91mfa5f
Writing /tmp/tmpz91mfa5f/_remote_module_non_scriptable.py
Encoder model frozen.
/home/jupyter/.local/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [6]:
# Sample up to 3 contexts per term and limit to 1000 contexts
sampled_contexts = []
for term_id in terms_df['id'].unique():
    term_contexts = contexts_df[contexts_df['term_id'] == term_id]
    sampled_contexts.extend(term_contexts.sample(min(3, len(term_contexts))).to_dict('records'))
    if len(sampled_contexts) >= 1000:
        break

sampled_contexts = sampled_contexts[:1000]

In [7]:
# Save the sampled contexts to a file
with open('sampled_contexts_cl_phys.json', 'w') as file:
    json.dump(sampled_contexts, file)

In [6]:
# Load the sampled contexts from the file
with open('sampled_contexts_cl_phys.json', 'r') as file:
    sampled_contexts = json.load(file)

In [None]:
conn = sqlite3.connect('phys_cyberleninka.db')
cur = conn.cursor()
query = "SELECT * FROM gpt_results ORDER BY id DESC LIMIT 1"
cur.execute(query)
last_row = cur.fetchone()
last_row

In [22]:
sampled_contexts[97]

{'id': 1095, 'term_id': 191, 'sentence_id': 1469}

In [23]:
import sys

# topic = 'лингвистика'
topic = 'физика'

start_index = 98
end_index = start_index + 16

processed_count = 0
# Generate prompts and save results
for i, row in tqdm(enumerate(sampled_contexts[start_index:end_index], start=start_index), total=len(sampled_contexts) - start_index):
    sentence_row = sentences_df[sentences_df['id'] == row['sentence_id']].iloc[0]
    term_row = terms_df[terms_df['id'] == row['term_id']].iloc[0]
    
    if not term_row['excerpt_en']:
        continue

    # Generate prompts
    translation_prompt_ru = generate_translation_prompt(sentence_row['context_ru'], 'английский')
    translation_prompt_en = generate_translation_prompt(sentence_row['context_en'], 'русский')
    context_prompt_translation_ru = generate_context_prompt_translation(sentence_row['context_ru'], topic, term_row['term'], term_row['translation_en'], 'английский')
    context_prompt_translation_en = generate_context_prompt_translation(sentence_row['context_en'], topic, term_row['translation_en'], term_row['term'], 'русский')
    context_prompt_definition_ru = generate_context_prompt_definition(sentence_row['context_ru'], topic, term_row['term'], term_row['definition_ru'], 'английский')
    context_prompt_definition_en = generate_context_prompt_definition(sentence_row['context_en'], topic, term_row['translation_en'], term_row['excerpt_en'], 'русский')

    prompts = [
        translation_prompt_ru, 
        translation_prompt_en, 
        context_prompt_translation_ru, 
        context_prompt_translation_en,
        context_prompt_definition_ru,
        context_prompt_definition_en
    ]
    
    for system_text, user_text in prompts:
        try:
            generated_translation = generate_translation(system_text, user_text).split('\n')[0]
            verified_translation = sentence_row['context_en'] if 'английский' in system_text else sentence_row['context_ru']
            
            # Compute metrics
            bleu, rouge, chrf, comet_score = compute_metrics(verified_translation, generated_translation)
            
            # Save results to the database
            conn.execute('''
            INSERT INTO gpt_results (sentence_id, prompt, generated_translation, verified_translation, bleu_score, rouge_score, chrf_score, comet_score)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            ''', (row['sentence_id'], system_text + " " + user_text, generated_translation, verified_translation, bleu, rouge, chrf, comet_score))
        except ValueError as e:
            print("Error during translation:", e)
        except RuntimeError as e:
            if "Rate quota limit exceeded" in str(e):
                print("Rate quota limit exceeded. Stopping script.")
                conn.commit()
                conn.close()
                sys.exit(e)
                
        processed_count += 1
        
        # Commit every 16 sentences
        if processed_count % 16 == 0:
            conn.commit()

# Final commit
conn.commit()
conn.close()
print(f'Processed {i} sentences')

  0%|          | 0/844 [00:00<?, ?it/s]Using default tokenizer.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  1.97it/s]
Using default tokenizer.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  1.86it/s]
Using default tokenizer.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  1.95it/s]
Using default tokenizer.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 

Processed 113 sentences



