# **Setup**

In [None]:
!git clone https://github.com/dtania/ua-gec-workshop.git

In [None]:
!pip install -r ua-gec-workshop/requirements.txt

# **Data processing functions**

In [None]:
import logging
import subprocess
import sys
import time
from collections import Counter

import numpy as np
from google import genai
from google.genai import errors
from langchain_core.prompts.prompt import PromptTemplate
from Levenshtein import distance as levenshtein_distance
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [None]:
# Workshop-wide logger: every record goes both to `workshop_run.log` and to the
# notebook output. Propagation is disabled so records are not passed on to
# ancestor loggers.
logger = logging.getLogger("workshop_logger")
logger.setLevel(logging.INFO)
logger.propagate = False

# Re-running this cell must not stack duplicate handlers. Close each old handler
# before dropping it, otherwise the previous FileHandler keeps the log file open.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s")

# mode="w" starts a fresh log file each time the cell is executed.
file_handler = logging.FileHandler("workshop_run.log", mode="w", encoding="utf-8")
file_handler.setFormatter(formatter)

stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.addHandler(stream_handler)

In [None]:
def get_data(input_file):
    """
    Reads a text file and returns a list of stripped sentences.

    The file is decoded as UTF-8 explicitly (matching how `run_batch` writes its
    output) so that the result does not depend on the platform's default encoding.

    Args:
        input_file: Path of the text file, one sentence per line.

    Returns:
        A list with one entry per line of the file, each with leading and
        trailing whitespace removed. Blank lines are kept as empty strings, so
        the list index always equals the line index.
    """
    with open(input_file, "r", encoding="utf-8") as fp:
        return [sentence.strip() for sentence in fp]

In [None]:
def generate_response(prompt, max_retries=7, model="gemma-3-27b-it"):
    """
    Runs the prompt and returns the response from the API, retrying on API errors.

    Failed attempts are retried with an exponential back-off of 3 ** attempt
    seconds (1, 3, 9, ...). No sleep happens after the final attempt.

    Args:
        prompt: Text sent to the model as `contents`.
        max_retries: Maximum number of attempts before giving up.
        model: Name of the model to query.

    Returns:
        The response text with surrounding whitespace removed, or an empty
        string when the API answered without any text.

    Raises:
        RuntimeError: If every attempt failed with `errors.APIError`.
    """
    client = genai.Client(api_key="<add_your_API_key_here>")
    last_error = None

    for attempt in range(max_retries):
        try:
            response = client.models.generate_content(
                model=model,
                contents=prompt,
            )
        except errors.APIError as e:
            last_error = e
            if attempt == max_retries - 1:
                # Nothing left to retry: skip the (longest) back-off sleep.
                break
            wait_time = 3 ** attempt
            # Use the workshop logger so the warning also lands in the log file.
            logger.warning("Error: %s. Retrying in %s seconds...", e.message, wait_time)
            time.sleep(wait_time)
            continue

        text = response.text
        if text is None:
            # The response carried no text; return an empty string so callers
            # still get exactly one result per prompt.
            logger.warning("The API returned a response without text; using an empty string.")
            return ""
        return text.strip()

    raise RuntimeError("Max retries reached. Failed to get a response.") from last_error

In [None]:
def run_batch(output_file, data, prompt):
    """
    Processes a batch of text inputs and writes the results both to a new file and a list.

    Each input is inserted into `prompt` through its `{data}` placeholder and sent
    to the model with `generate_response`. Responses are flattened to a single
    line so that line i of `output_file` always corresponds to `data[i]`.

    Args:
        output_file: Path of the UTF-8 text file to (over)write.
        data: Iterable of input texts.
        prompt: Template object whose `format(data=...)` builds the final prompt.

    Returns:
        The list of single-line responses, in input order.
    """
    corrected_sentences = []
    with open(output_file, "w", encoding="utf-8") as file:
        for text in tqdm(data, desc="Processing"):
            response = generate_response(prompt.format(data=text))
            # A multi-line answer would shift every following line of the output
            # file; join its non-empty lines while keeping spacing inside a line.
            response = " ".join(
                line.strip() for line in response.splitlines() if line.strip()
            )
            corrected_sentences.append(response)
            file.write(response + "\n")

    return corrected_sentences

In [None]:
def compute_errant_metric(pred_file, m2_file, script_path="ua-gec-workshop/utils/evaluate.py"):
    """
    Runs the errant evaluation script and returns stdout as a string.

    The script is started with the interpreter that runs this code
    (`sys.executable`), so it uses the same installed packages rather than
    whichever `python` happens to be first on PATH.

    Args:
        pred_file: Path to the file with predicted (corrected) sentences.
        m2_file: Path to the reference file passed via `--m2`.
        script_path: Path to the evaluation script.

    Returns:
        Everything the script printed to stdout.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status; its stderr is logged before the error is re-raised.
    """
    command = [
        sys.executable or "python",  # fall back if the interpreter path is unknown
        script_path,
        pred_file,
        "--m2",
        m2_file,
    ]

    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        logger.error("Evaluation script failed with error:\n%s", e.stderr)
        raise
    return result.stdout

In [None]:
# Create the sentence-embedding model a single time at notebook level (the first
# run presumably downloads the weights). The same instance is then passed to
# compute_cosine_similarity on every call instead of being re-created.
transformers_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

In [None]:
def compute_cosine_similarity(references, hypotheses, model: SentenceTransformer):
    """
    Scores every reference/hypothesis pair (matched by position) with the cosine
    similarity of their sentence embeddings.

    Args:
        references: Reference sentences.
        hypotheses: Hypothesis sentences, aligned one-to-one with `references`.
        model: Embedding model whose `encode` method turns sentences into vectors.

    Returns:
        A dict with the mean similarity ("average"), the lowest and highest
        similarity ("min", "max") and the positions of the pairs that produced
        them ("min_index", "max_index").

    Raises:
        ValueError: If the inputs differ in length or are empty.
    """
    if len(references) != len(hypotheses):
        raise ValueError("The number of references and hypotheses must be the same.")
    if not references:
        raise ValueError("The input lists must not be empty.")

    # Encode each side in one call rather than sentence by sentence.
    reference_vectors = model.encode(references)
    hypothesis_vectors = model.encode(hypotheses)

    # cosine_similarity works on 2-D inputs, so each vector becomes a 1 x dim row
    # and the single score is read from the resulting 1 x 1 matrix.
    pair_scores = np.array([
        cosine_similarity(ref_vec.reshape(1, -1), hyp_vec.reshape(1, -1))[0][0]
        for ref_vec, hyp_vec in zip(reference_vectors, hypothesis_vectors)
    ])

    lowest = int(np.argmin(pair_scores))
    highest = int(np.argmax(pair_scores))

    return {
        "average": np.mean(pair_scores),
        "min": pair_scores[lowest],
        "min_index": lowest,
        "max": pair_scores[highest],
        "max_index": highest
    }


In [None]:
def compute_levenshtein_distance(references, hypotheses):
    """
    Computes Levenshtein distance between corresponding pairs of reference and hypothesis sentences.

    Inputs are validated the same way as in `compute_cosine_similarity`, so a
    length mismatch is reported instead of being silently truncated and an empty
    input no longer ends in a division by zero.

    Args:
        references: Reference sentences.
        hypotheses: Hypothesis sentences, aligned one-to-one with `references`.

    Returns:
        A dict with the average, minimum and maximum distance ("average", "min",
        "max") and the indices of the first pairs that produced the minimum and
        maximum ("min_index", "max_index").

    Raises:
        ValueError: If the inputs differ in length or are empty.
    """
    # Materialise first so that generators and other one-shot iterables still work.
    references = list(references)
    hypotheses = list(hypotheses)

    if len(references) != len(hypotheses):
        raise ValueError("The number of references and hypotheses must be the same.")

    if not references:
        raise ValueError("The input lists must not be empty.")

    lev_distances = [
        levenshtein_distance(reference, hypothesis)
        for reference, hypothesis in zip(references, hypotheses)
    ]

    min_distance = min(lev_distances)
    max_distance = max(lev_distances)
    return {
        "average": sum(lev_distances) / len(lev_distances),
        "min": min_distance,
        "min_index": lev_distances.index(min_distance),
        "max": max_distance,
        "max_index": lev_distances.index(max_distance)
    }

In [None]:
def compute_llm_based_precision(llm_eval_results):
    """
    Parses LLM evaluation answers and computes a precision from them.

    An answer counts as "yes" when it starts with "так" and as "no" when it
    starts with "ні". Matching ignores letter case and any leading whitespace,
    quotes or Markdown emphasis characters (for example `"Так"` or `**Ні**`).
    Answers that match neither are left out of the score.

    Args:
        llm_eval_results: Iterable of raw answer strings.

    Returns:
        yes / (yes + no) as a float, or 0.0 when no answer could be parsed as
        "yes" or "no" (including empty input).
    """
    # Characters a model commonly wraps its one-word verdict in.
    leading_noise = " \t\n\"'«»“”„`*_"

    labels = []
    for result in llm_eval_results:
        verdict = result.lstrip(leading_noise).casefold()
        if verdict.startswith("так"):
            labels.append("yes")
        elif verdict.startswith("ні"):
            labels.append("no")
        else:
            labels.append("none")

    count_labels = Counter(labels)
    judged = count_labels["yes"] + count_labels["no"]
    if judged == 0:
        # Nothing parsable: avoid ZeroDivisionError.
        return 0.0

    return count_labels["yes"] / judged

# **Prompt iteration**

## Tasks:
1. Improve `correction_prompt`. Compare metrics. When Errant improves, how do other metrics behave?
  * Try specifying the types of errors you want fixed: spelling, grammar, punctuation, etc.
  * Try a few-shot approach, i.e. providing examples.
  * Your ideas (you can try approaches/techniques from the presentation).
2. Improve `eval_prompt`. E.g., provide input and output and ask if the correction was done well. Note: you'll have to modify `run_batch` for this task.
3. ***Ask the model to provide its reasoning/an explanation for each correction. Use JSON output. Remember to parse the JSON and extract only the corrected sentence before running the metrics.

**Instructions**!
* Make a copy of this notebook to your Drive and work on it from there.
* Use `dev_set` for prompt iteration.
* Use `eval_set` only once at the end of the workshop to evaluate your final solution.
* At the end of the workshop, we’ll discuss the results—what worked for you and what didn’t. Feel free to share your experience with others 🙏. You can keep an eye on your results in the `workshop_run.log` file.


In [None]:
# Prompt used for the correction step. `{data}` is the only placeholder and is
# filled with one source sentence at a time via `prompt.format(data=...)` in
# `run_batch`. The Ukrainian text asks the model to analyse the given sentence,
# fix its grammatical errors and return only the corrected sentence.
correction_prompt = PromptTemplate(
    input_variables=["data"],
    template="""
    Дано таке речення:
    {data}

    Завдання - проаналізувати його і виправити граматичні помилки. Поверни лише виправлене речення.
    """
)

In [None]:
# Prompt used for the LLM-based evaluation step. `{data}` is the only placeholder
# and is filled with one sentence at a time. The Ukrainian text asks the model to
# judge whether the sentence is grammatical and to answer "так" (yes) or "ні" (no);
# `compute_llm_based_precision` relies on answers starting with those words.
eval_prompt = PromptTemplate(
    input_variables=["data"],
    template="""
    Дано таке речення:
    {data}

    Оціни чи граматичне воно. Поверни "так" або "ні".
    """
)

## Dev set

In [None]:
# Prepare data
dev_source_data = get_data("ua-gec-workshop/data/dev/dev_src_data.txt")
dev_target_data = get_data("ua-gec-workshop/data/dev/dev_tgt_data.txt")

In [None]:
# Run the correction prompt on the dev source sentences (GEC = grammatical error correction)
logger.info("Prompt: %s", correction_prompt.template)
dev_corrected_data = run_batch("dev_corrected_data.txt", dev_source_data, correction_prompt)

In [None]:
# Compute Errant metric
errant_stats = compute_errant_metric("dev_corrected_data.txt", "ua-gec-workshop/data/dev/dev.m2")
logger.info("Errant results:\n%s", errant_stats)

In [None]:
# Compute Levenshtein distance metric
lev_stats = compute_levenshtein_distance(dev_target_data, dev_corrected_data)
logger.info(
    f"""Levenshtein Distance - Avg: {lev_stats['average']:.2f},
     Min: {lev_stats['min']}, Min sentence index: {lev_stats["min_index"]},
     Max: {lev_stats['max']}, Max sentence index: {lev_stats["max_index"]}"""
)

In [None]:
# Compute cosine similarity metric
sim_stats = compute_cosine_similarity(dev_target_data, dev_corrected_data, transformers_model)
logger.info(
    f"""Cosine Similarity - Avg: {sim_stats['average']:.4f},
     Min: {sim_stats['min']:.4f}, Min sentence index: {sim_stats["min_index"]},
     Max: {sim_stats['max']:.4f}, Max sentence index: {sim_stats["max_index"]}"""
)

In [None]:
# Run LLM-based evaluation
dev_llm_eval_results = run_batch("dev_llm_eval_results.txt", dev_corrected_data, eval_prompt)

In [None]:
# Compute LLM-based evaluation
llm_eval = compute_llm_based_precision(dev_llm_eval_results)
logger.info(f"LLM eval - {llm_eval}")

In [None]:
def check_one_example(
    id,
    dev_source_data=None, dev_target_data=None,
    dev_corrected_data=None, model=None
):
    """
    Prints the source, target and corrected sentence at one index together with
    their Levenshtein distance, cosine similarity and the LLM verdict.

    Arguments left as None are looked up in the notebook's global variables at
    call time. Defaults bound in the signature would be frozen when this cell
    runs and would keep showing old corrections after the correction cell is
    re-run with a new prompt.

    Args:
        id: Index of the example to inspect.
        dev_source_data: Source sentences; defaults to the global `dev_source_data`.
        dev_target_data: Target sentences; defaults to the global `dev_target_data`.
        dev_corrected_data: Model corrections; defaults to the global `dev_corrected_data`.
        model: Embedding model; defaults to the global `transformers_model`.
    """
    # The parameters shadow the globals of the same name, hence the explicit lookup.
    notebook_globals = globals()
    if dev_source_data is None:
        dev_source_data = notebook_globals["dev_source_data"]
    if dev_target_data is None:
        dev_target_data = notebook_globals["dev_target_data"]
    if dev_corrected_data is None:
        dev_corrected_data = notebook_globals["dev_corrected_data"]
    if model is None:
        model = notebook_globals["transformers_model"]

    target_sentence = dev_target_data[id]
    corrected_sentence = dev_corrected_data[id]

    print(f"Original sentence: {dev_source_data[id]}")
    print(f"Target sentence: {target_sentence}")
    print(f"Corrected sentence: {corrected_sentence}")

    # With a single pair the "average" is simply that pair's score.
    levenstein = compute_levenshtein_distance([target_sentence], [corrected_sentence])["average"]
    similarity = compute_cosine_similarity([target_sentence], [corrected_sentence], model)["average"]
    print(f"Levenstein distance: {levenstein}")
    print(f"Cosine similarity: {similarity}")

    llm_eval = generate_response(eval_prompt.format(data=corrected_sentence))
    print(f"LLM eval: {llm_eval}")

In [None]:
check_one_example(5)

## Eval set

In [None]:
# Prepare data
eval_source_data = get_data("ua-gec-workshop/data/eval/eval_src_data.txt")
eval_target_data = get_data("ua-gec-workshop/data/eval/eval_tgt_data.txt")

In [None]:
# Run the correction prompt on the eval source sentences (GEC = grammatical error correction)
logger.info("[Eval data] Prompt: %s", correction_prompt.template)
eval_corrected_data = run_batch("eval_corrected_data.txt", eval_source_data, correction_prompt)

In [None]:
# Compute Errant metric
errant_stats = compute_errant_metric("eval_corrected_data.txt", "ua-gec-workshop/data/eval/eval.m2")
logger.info("Errant results:\n%s", errant_stats)

In [None]:
# Compute Levenshtein distance metric
lev_stats = compute_levenshtein_distance(eval_target_data, eval_corrected_data)
logger.info(
    f"""Levenshtein Distance - Avg: {lev_stats['average']:.2f},
     Min: {lev_stats['min']}, Min sentence index: {lev_stats["min_index"]},
     Max: {lev_stats['max']}, Max sentence index: {lev_stats["max_index"]}"""
)

In [None]:
# Compute cosine similarity metric
sim_stats = compute_cosine_similarity(eval_target_data, eval_corrected_data, transformers_model)
logger.info(
    f"""Cosine Similarity - Avg: {sim_stats['average']:.4f},
     Min: {sim_stats['min']:.4f}, Min sentence index: {sim_stats["min_index"]},
     Max: {sim_stats['max']:.4f}, Max sentence index: {sim_stats["max_index"]}"""
)

In [None]:
# Run LLM-based evaluation
eval_llm_eval_results = run_batch("eval_llm_eval_results.txt", eval_corrected_data, eval_prompt)

In [None]:
# Compute LLM-based evaluation
llm_eval = compute_llm_based_precision(eval_llm_eval_results)
logger.info(f"LLM eval - {llm_eval}")