# Create simplifications

### Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from tqdm.notebook import tqdm
from llama_cpp import Llama
from _streamlit_app.utils_prompts import (
    SYSTEM_MESSAGE_EASIER,
    SYSTEM_MESSAGE_LS,
    SYSTEM_MESSAGE_ES,
    REWRITE_COMPLETE,
    RULES_EASIER,
    RULES_ES,
    RULES_LS,
    OPENAI_TEMPLATE_EASIER,
    OPENAI_TEMPLATE_ES,
    OPENAI_TEMPLATE_LS,
)

# Each bundle below is a 4-tuple that the simplification loops unpack as
# (language_level, system_message, rules, base_prompt). The first element is
# a label that is also embedded in the result file names.
TEMPLATES_EASIER = (
    "verständliche_sprache",
    SYSTEM_MESSAGE_EASIER,
    RULES_EASIER,
    OPENAI_TEMPLATE_EASIER,
)
TEMPLATES_ES = ("einfache_sprache", SYSTEM_MESSAGE_ES, RULES_ES, OPENAI_TEMPLATE_ES)
TEMPLATES_LS = ("leichte_sprache", SYSTEM_MESSAGE_LS, RULES_LS, OPENAI_TEMPLATE_LS)

# All bundles, in the order in which the loops iterate over them.
TEMPLATES = (TEMPLATES_EASIER, TEMPLATES_ES, TEMPLATES_LS)

import os
import time
from dotenv import load_dotenv
from anthropic import Anthropic
from mistralai import Mistral
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor


### Constants and helper functions

In [3]:
# Load the API keys from the machine-specific .env file. load_dotenv() returns
# False when the file is missing or defines no variables (e.g. on another
# machine); in that case fall back to python-dotenv's default lookup of a
# `.env` file so the notebook stays runnable elsewhere.
if not load_dotenv("/Volumes/1TB Home SSD/GitHub/.env_stat"):
    load_dotenv()

MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Report missing keys right away instead of at the first failing API request.
_missing_keys = [
    key_name
    for key_name, key_value in (
        ("MISTRAL_API_KEY", MISTRAL_API_KEY),
        ("ANTHROPIC_API_KEY", ANTHROPIC_API_KEY),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
    )
    if not key_value
]
if _missing_keys:
    print(f"Warning: missing API keys: {', '.join(_missing_keys)}")

# Sampling temperature used as the default for every simplification request.
TEMPERATURE_SIMPLIFICATION = 0.15
# Seconds each API helper sleeps before sending its request.
SLEEP = 1

# Default upper bound on generated tokens per API response.
MAX_OUTPUT_TOKENS = 4096

In [4]:
# Pass the key explicitly, mirroring how the Anthropic and Mistral clients are
# created. When the key is None the client falls back to the OPENAI_API_KEY
# environment variable, which is what a bare OpenAI() call relies on.
openai_client = OpenAI(api_key=OPENAI_API_KEY)

GPT4o = "gpt-4o"


def call_openai(
    prompt,
    system_message,
    model_id=GPT4o,
    temperature=TEMPERATURE_SIMPLIFICATION,
    max_tokens=MAX_OUTPUT_TOKENS,
):
    """Send one chat completion request to the OpenAI API.

    Args:
        prompt: Content of the user message.
        system_message: Content of the system message.
        model_id: Name of the OpenAI model to query.
        temperature: Sampling temperature.
        max_tokens: Maximum number of tokens to generate.

    Returns:
        The text of the first choice with surrounding whitespace removed, or
        None if the response carries no text or the request failed. Failures
        are printed rather than raised so that a batch run keeps going.
    """
    # Pause before every request (simple client-side throttling).
    time.sleep(SLEEP)
    try:
        completion = openai_client.chat.completions.create(
            model=model_id,
            temperature=temperature,
            max_tokens=max_tokens,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": prompt},
            ],
        )
        content = completion.choices[0].message.content
        # Strip like call_mistral and the OSS loop do, so that the results of
        # all models are normalized the same way.
        return content.strip() if content is not None else None

    except Exception as e:
        print(f"Error: {e}")
        return None


# print(call_openai("Was ist die Hauptstadt der Schweiz?", SYSTEM_MESSAGE_EASIER))

In [5]:
anthropic_client = Anthropic(api_key=ANTHROPIC_API_KEY)

SONNET = "claude-3-5-sonnet-latest"


def call_anthropic(
    prompt,
    system_message,
    model_id=SONNET,
    temperature=TEMPERATURE_SIMPLIFICATION,
    max_tokens=MAX_OUTPUT_TOKENS,
):
    """Send one message request to the Anthropic API.

    Args:
        prompt: Content of the user message.
        system_message: System prompt passed via the `system` parameter.
        model_id: Name of the Anthropic model to query.
        temperature: Sampling temperature.
        max_tokens: Maximum number of tokens to generate.

    Returns:
        The text of the first content block with surrounding whitespace
        removed, or None if the request failed. Failures are printed rather
        than raised so that a batch run keeps going.
    """
    # Pause before every request (simple client-side throttling).
    time.sleep(SLEEP)
    try:
        message = anthropic_client.messages.create(
            model=model_id,
            max_tokens=max_tokens,
            temperature=temperature,
            system=system_message,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
        )
        # Strip like call_mistral and the OSS loop do, so that the results of
        # all models are normalized the same way.
        return message.content[0].text.strip()

    except Exception as e:
        print(f"Error: {e}")
        return None


# print(call_anthropic("Was ist die Hauptstadt der Schweiz?", SYSTEM_MESSAGE_EASIER))

In [6]:
mistral_client = Mistral(api_key=MISTRAL_API_KEY)

# The "latest" alias; v3 was the latest version, released January 2025.
MISTRAL_SMALL_V3 = "mistral-small-latest"


def call_mistral(
    prompt,
    system_message,
    model_id=MISTRAL_SMALL_V3,
    temperature=TEMPERATURE_SIMPLIFICATION,
    max_tokens=MAX_OUTPUT_TOKENS,
):
    """Send one chat completion request to the Mistral API.

    Args:
        prompt: Content of the user message.
        system_message: Content of the system message.
        model_id: Name of the Mistral model to query.
        temperature: Sampling temperature.
        max_tokens: Maximum number of tokens to generate.

    Returns:
        The stripped text of the first choice, or None if anything went wrong
        (the error is printed instead of being raised).
    """
    # Pause before every request.
    time.sleep(SLEEP)
    try:
        completion = mistral_client.chat.complete(
            model=model_id,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": prompt},
            ],
            temperature=temperature,
            max_tokens=max_tokens,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content.strip()

    except Exception as e:
        print(f"Error: {e}")
        return None


# print(call_mistral("Was ist die Hauptstadt der Schweiz?", SYSTEM_MESSAGE_EASIER))

# Simplify texts

### OSS models

In [None]:
def call_llm(llm, prompt, system_message=SYSTEM_MESSAGE_EASIER):
    """Run a single system + user chat turn on a local model.

    Args:
        llm: Model object exposing `create_chat_completion`.
        prompt: Content of the user message.
        system_message: Content of the system message.

    Returns:
        The message content of the first choice in the completion.
    """
    chat = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt},
    ]
    completion = llm.create_chat_completion(
        temperature=TEMPERATURE_SIMPLIFICATION,
        messages=chat,
    )
    return completion["choices"][0]["message"]["content"]


# Local GGUF checkpoints to evaluate. The keys are the labels that are printed
# while processing and embedded in the result file names.
models = {
    "Phi-4": "/home/stat/huggingface_models/Phi-4-Q6_K_L.gguf",
    "Phi-4-Unsloth": "/home/stat/huggingface_models/Phi-4-Q5_K_M_Unsloth.gguf",
    "Gemma-2-27B": "/home/stat/huggingface_models/Gemma-2-27b-it-Q5_K_M.gguf",
    "Qwen-2.5-32B": "/home/stat/huggingface_models/Qwen2.5-32B-Instruct-Q5_K_M.gguf",
    "Qwen-2.5-72B": "/home/stat/huggingface_models/Qwen2.5-72B/Qwen2.5-72B-Instruct-Q5_K_M-00001-of-00002.gguf",
    "Llama-3.2-3B": "/home/stat/huggingface_models/Llama-3.2-3B.gguf",
    "Llama-3.1-Nemotron": "/home/stat/huggingface_models/Llama-3.1-Nemotron-70B/Llama-3.1-Nemotron-70B-Instruct-HF-Q5_K_M-00001-of-00002.gguf",
    "Llama-3.3-70B": "/home/stat/huggingface_models/Llama-3.3-70B/Llama-3.3-70B-Instruct-Q5_K_M-00001-of-00002.gguf",
    "Deepseek-R1-Distill_Llama_8B": "/home/stat/huggingface_models/DeepSeek-R1-Distill-Llama-8B-Q5_K_M.gguf",
    "Deepseek-R1-Distill_Llama_70B": "/home/stat/huggingface_models/DeepSeek-R1-Distill-Llama-70B-Q4_K_M.gguf",
}

# Test data; the simplification loop reads the texts from its `source_text` column.
df = pd.read_parquet("_data/testdata_50_final.parq")

# Make sure as many layers of the large models as possible are processed on the A10 GPUs.
# 70B+ == 70 layers with 1k context, 60 layers with 4k context.
# Smaller models fit entirely on the GPU and we can set n_gpu_layers to -1.
# NOTE(review): the guidance above gives 60 layers for a 4k context, yet this is
# set to 70 while n_ctx below is 4096 — confirm the 70B+ models still fit.
n_gpu_layers = 70

# Set context size to 4k to fit user input and longest prompts.
n_ctx = 4096

# Run every local model over every text at every simplification level and
# write one parquet file per (model, level) combination.
for model_name, model_path in models.items():
    print(f"Processing with: {model_name}")

    llm = Llama(
        model_path=model_path,
        n_gpu_layers=n_gpu_layers,
        n_ctx=n_ctx,
        n_threads=16,  # Set according to the number of threads available.
        flash_attn=True,  # Experimental feature.
        verbose=True,  # Set to False for production.
    )

    try:
        # Iterate through all three modes of simplification.
        for language_level, system_message, rules, base_prompt in TEMPLATES:
            # Iterate over all texts.
            results = []
            for idx, text in enumerate(df.source_text.values):
                print(idx)
                final_prompt = base_prompt.format(
                    prompt=text,
                    rules=rules,
                    completeness=REWRITE_COMPLETE,
                )
                response = call_llm(
                    llm,
                    final_prompt,
                    system_message=system_message,
                )
                response = response.strip()
                results.append((idx, response))
                print(response)
                print()
            print(results)
            pd.DataFrame(results).to_parquet(
                f"testdata_50_{model_name}_{language_level}.parq"
            )
    finally:
        # Release the model before the next one is loaded. Without this, the
        # next Llama(...) is constructed while `llm` still references the
        # current model, so two large models are resident at the same time.
        llm.close()

### Proprietary models

In [8]:
df = pd.read_parquet("_data/testdata_50_final.parq")

# Model label (also used in the result file names) -> helper that queries it.
PROPRIETARY_CALLERS = {
    "GPT-4o": call_openai,
    "Sonnet": call_anthropic,
    "Mistral-Small-v3": call_mistral,
}

# Create the output folder up front so that a missing directory cannot fail
# the write after all API requests have already been made.
RESULTS_DIR = "_data/_results"
os.makedirs(RESULTS_DIR, exist_ok=True)

# for model_name in ["GPT-4o", "Sonnet", "Mistral-Small-v3"]:
for model_name in ["Mistral-Small-v3"]:
    # An unknown label must not fall through: otherwise the responses of a
    # previous iteration could be saved under the wrong model name.
    if model_name not in PROPRIETARY_CALLERS:
        raise ValueError(f"Unknown model name: {model_name}")
    call_model = PROPRIETARY_CALLERS[model_name]

    # Iterate through all three modes of simplification.
    for language_level, system_message, rules, base_prompt in TEMPLATES:
        # Only this language level is processed in the current run.
        if language_level != "verständliche_sprache":
            continue
        print(f"Processing with: {model_name} and {language_level}")

        # Build one prompt per source text.
        prompts = [
            base_prompt.format(
                prompt=text,
                rules=rules,
                completeness=REWRITE_COMPLETE,
            )
            for text in df.source_text.values
        ]

        # Query the API concurrently; executor.map yields the results in
        # input order, so position i belongs to text i.
        with ThreadPoolExecutor(max_workers=5) as executor:
            responses = list(
                executor.map(call_model, prompts, [system_message] * len(prompts))
            )

        # The API helpers return None on failure; make that visible instead of
        # silently persisting empty rows.
        n_failed = sum(response is None for response in responses)
        if n_failed:
            print(f"Warning: {n_failed} of {len(responses)} requests returned no result.")

        # Columns 0 (text index) and 1 (response) align with the results from
        # the OSS models.
        results = pd.DataFrame(list(enumerate(responses)), columns=[0, 1])
        results.to_parquet(
            f"{RESULTS_DIR}/testdata_50_{model_name}-t15_{language_level}.parq"
        )

Processing with: Mistral-Small-v3 and verständliche_sprache
