In [None]:
# Use the %pip magic so the packages land in the environment of the running kernel.
%pip install -q torch torchvision torchaudio transformers

In [2]:
# Use the %pip magic so the package lands in the environment of the running kernel.
%pip install torchcodec

Collecting torchcodec
  Downloading torchcodec-0.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (11 kB)
Downloading torchcodec-0.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (2.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.1/2.1 MB[0m [31m65.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchcodec
Successfully installed torchcodec-0.9.1


In [None]:
# Use the %pip magic so the package lands in the environment of the running kernel.
%pip install google-genai

In [None]:
import torch
import numpy as np
from PIL import Image
from typing import Dict, List, Tuple, Optional
import warnings

import tempfile
import cv2
from transformers import CLIPModel, CLIPProcessor
import json
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import numpy as np
from PIL import Image
import cv2
import torch
from transformers import CLIPModel, CLIPProcessor
from transformers import AutoProcessor, LlavaForConditionalGeneration
import librosa

import io
import os
import time
import json
from pathlib import Path
from typing import Tuple
from PIL import Image
from tqdm import tqdm
import pandas as pd

from datasets import load_dataset
import torchcodec

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the imported libraries.
warnings.filterwarnings('ignore')

In [None]:

# Open the source dataset in streaming mode and take only its first record.
ds = load_dataset(
    "arood0/mmm_project_with_audio_ru_final",
    split='train',
    streaming=True,
)
item = next(iter(ds))

# Inspect how the audio field is represented at runtime.
audio = item['audio']
print(f"Type: {type(audio)}")
print(f"Dir: {dir(audio)}")

# If the object can be called, call it without arguments and show what comes back.
if hasattr(audio, '__call__'):
    print("Callable!")
    result = audio()
    print(f"Result type: {type(result)}")
    print(f"Result: {result}")

# Probe the optional attributes in turn; each one is reported only when present.
_absent = object()
_array = getattr(audio, 'array', _absent)
if _array is not _absent:
    print(f"Has array: {_array[:10]}")
_path = getattr(audio, 'path', _absent)
if _path is not _absent:
    print(f"Has path: {_path}")

In [10]:

# Instruction prompt for the Gemini judge. GeminiEvaluator._evaluate_once sends it
# as the last request part, after the audio clip and the before/after images, and
# the reply is parsed as JSON. The keys requested under "RESPOND ONLY IN JSON" are
# read elsewhere in this file (alignment_score, instruction_execution,
# transcription, explanation), so keep them in sync when editing the prompt.
EVAL_PROMPT = """
You are a strict but nuanced evaluator of an image editing system.

INPUT:
1. AUDIO: Spoken instruction in Russian describing an edit
2. IMAGE BEFORE: Original image
3. IMAGE AFTER: Edited image

TASK:
Evaluate how well the edit matches the instruction.

IMPORTANT RULES:
- Do NOT treat this as a binary yes/no task
- Most real edits are imperfect
- Scores of exactly 0.0 or 1.0 should be VERY RARE
- Prefer values like 0.55, 0.68, 0.82, etc.

STEP 1: Understand the instruction
- Transcribe the audio in Russian
- Describe what change was requested

STEP 2: Compare BEFORE vs AFTER
- What changes actually happened?
- What parts of the image stayed the same?
- Are there artifacts or quality issues?

STEP 3: Score multiple aspects independently

Use the following scale for EACH sub-score:
- 0.0–0.2: Not present or completely wrong
- 0.3–0.4: Weak or incorrect
- 0.5–0.6: Partially correct
- 0.7–0.8: Mostly correct with issues
- 0.9–1.0: Almost perfect (rare)

SUB-SCORES:
- instruction_match: How well the edit matches the requested change
- visual_correctness: Is the change visually correct and localized
- preservation: Were unrelated regions preserved
- artifact_free: Lack of artifacts, realism, natural look

STEP 4: Compute final score
- alignment_score = average of the four sub-scores
- Round to 2 decimal places

RESPOND ONLY IN JSON:
{
  "transcription": "",
  "instruction_understood": "",
  "instruction_execution": "not_attempted | partial | complete",
  "scores": {
    "instruction_match": 0.0-1.0,
    "visual_correctness": 0.0-1.0,
    "preservation": 0.0-1.0,
    "artifact_free": 0.0-1.0
  },
  "alignment_score": 0.0-1.0,
  "changes_detected": "",
  "quality_assessment": "poor | acceptable | good",
  "issues": [],
  "explanation": ""
}
"""

class GeminiEvaluator:
    """Judge an audio-instructed image edit with a Gemini model.

    A single vote sends the spoken instruction, the image before the edit and
    the image after the edit, followed by ``EVAL_PROMPT``, and parses the JSON
    reply. ``evaluate`` collects ``n_votes`` such votes and averages their
    ``alignment_score``.

    Args:
        api_key: Gemini API key passed to ``genai.Client``.
        model: Model name used for every request.
        delay_between_requests: Minimum number of seconds between two
            consecutive API requests issued by this instance.
        n_votes: Number of independent votes per sample (at least 1).
        max_retries: How many times a request rejected with HTTP 429 is
            retried (with exponential back-off) before the error is re-raised.

    Raises:
        ImportError: If the ``google-genai`` package is not installed.
        ValueError: If ``n_votes`` is smaller than 1.
    """

    def __init__(self, api_key: str, model: str = "gemini-2.5-flash",
                 delay_between_requests: float = 10.0, n_votes: int = 3,
                 max_retries: int = 3):
        try:
            from google import genai
            from google.genai import types
        except ImportError as exc:
            raise ImportError("pip install google-genai") from exc
        if n_votes < 1:
            raise ValueError("n_votes must be at least 1")
        self.client = genai.Client(api_key=api_key)
        self.types = types
        self.model = model
        self.delay = delay_between_requests
        self.n_votes = n_votes
        self.max_retries = max_retries
        # Monotonic timestamp of the most recent API request (None = no request yet).
        self._last_request_at = None

    def _image_to_bytes(self, img: "Image.Image") -> bytes:
        """Encode ``img`` as JPEG (quality 85) and return the raw bytes."""
        # JPEG cannot store alpha or palette data (RGBA, LA, P, ...), so every
        # mode other than plain RGB / greyscale is converted first.
        if img.mode not in ("RGB", "L"):
            img = img.convert("RGB")
        buf = io.BytesIO()
        img.save(buf, format="JPEG", quality=85)
        return buf.getvalue()

    @staticmethod
    def _parse_response_text(text: Optional[str]) -> dict:
        """Extract the JSON object from a model reply.

        Tolerates Markdown code fences (in any letter case) and prose around
        the object by parsing only the span between the first ``{`` and the
        last ``}``.

        Raises:
            ValueError: If the reply is empty, is not valid JSON
                (``json.JSONDecodeError`` is a ``ValueError``), or is not a
                JSON object.
        """
        if not text or not text.strip():
            raise ValueError("Gemini returned an empty response")
        candidate = text.strip()
        start, end = candidate.find("{"), candidate.rfind("}")
        if start != -1 and end > start:
            candidate = candidate[start:end + 1]
        parsed = json.loads(candidate)
        if not isinstance(parsed, dict):
            raise ValueError(
                f"Expected a JSON object from Gemini, got {type(parsed).__name__}")
        return parsed

    @staticmethod
    def _coerce_score(value, default: float = 0.5) -> float:
        """Turn a reported score into a float in ``[0, 1]``.

        Missing, ``None``, non-numeric and NaN values fall back to ``default``
        so that one malformed vote cannot break the averaging.
        """
        try:
            score = float(value)
        except (TypeError, ValueError):
            return default
        if score != score:  # NaN is the only value not equal to itself
            return default
        return min(1.0, max(0.0, score))

    def _wait_for_slot(self) -> None:
        """Sleep until ``self.delay`` seconds have passed since the last request."""
        last = getattr(self, "_last_request_at", None)
        if last is None:
            return
        remaining = self.delay - (time.monotonic() - last)
        if remaining > 0:
            time.sleep(remaining)

    def _generate_with_retry(self, parts):
        """Call ``generate_content``, spacing requests and retrying on HTTP 429.

        Any other error, and a 429 that persists after ``max_retries`` retries,
        is re-raised unchanged.
        """
        attempt = 0
        while True:
            self._wait_for_slot()
            try:
                response = self.client.models.generate_content(
                    model=self.model, contents=parts)
            except Exception as exc:
                self._last_request_at = time.monotonic()
                # Only rate-limit errors are worth retrying; the HTTP status is
                # read from the exception's ``code`` attribute when it has one.
                if getattr(exc, "code", None) != 429 or attempt >= self.max_retries:
                    raise
                time.sleep(self.delay * 2 ** attempt)
                attempt += 1
            else:
                self._last_request_at = time.monotonic()
                return response

    def _evaluate_once(self, audio_bytes: bytes, image_before: "Image.Image",
                       image_after: "Image.Image", audio_mime: str) -> dict:
        """Run one vote and return the parsed JSON reply as a dict."""
        parts = [
            self.types.Part.from_bytes(data=audio_bytes, mime_type=audio_mime),
            self.types.Part.from_bytes(data=self._image_to_bytes(image_before), mime_type="image/jpeg"),
            self.types.Part.from_bytes(data=self._image_to_bytes(image_after), mime_type="image/jpeg"),
            EVAL_PROMPT
        ]
        response = self._generate_with_retry(parts)
        return self._parse_response_text(response.text)

    def evaluate(self, audio_bytes: bytes, image_before: "Image.Image",
                 image_after: "Image.Image", audio_mime: str = "audio/wav") -> Tuple[float, dict]:
        """Collect ``n_votes`` votes and aggregate them.

        Returns:
            ``(final_score, details)`` where ``final_score`` is the mean
            ``alignment_score`` rounded to 3 decimals and ``details`` is the
            reply of the median vote with ``alignment_score`` replaced by the
            mean and ``all_scores`` holding every individual score.

        Raises:
            ValueError: If ``n_votes`` is smaller than 1 or a reply cannot be parsed.
        """
        # Request pacing lives in _generate_with_retry, so no sleep is needed
        # here (and none is wasted after the final vote).
        results = [
            self._evaluate_once(audio_bytes, image_before, image_after, audio_mime)
            for _ in range(self.n_votes)
        ]
        if not results:
            raise ValueError("n_votes must be at least 1")

        scores = [self._coerce_score(r.get("alignment_score")) for r in results]
        final_score = round(sum(scores) / len(scores), 3)
        # Index of the vote whose score is the (upper) median.
        median_idx = sorted(range(len(scores)), key=scores.__getitem__)[len(scores) // 2]
        final_result = results[median_idx]
        final_result["alignment_score"] = final_score
        final_result["all_scores"] = scores
        return final_score, final_result


def _guess_audio_mime(data: bytes, path: Optional[str], default: str) -> str:
    """Pick a MIME type from the payload's magic bytes, then the file extension."""
    head = bytes(data[:12])
    if head[:4] == b"RIFF" and head[8:12] == b"WAVE":
        return "audio/wav"
    # MP3: either an ID3 tag or an MPEG frame sync with a non-zero layer field.
    if head[:3] == b"ID3" or (
        len(head) >= 2 and head[0] == 0xFF
        and (head[1] & 0xE0) == 0xE0 and (head[1] & 0x06) != 0
    ):
        return "audio/mpeg"
    if head[:4] == b"fLaC":
        return "audio/flac"
    if head[:4] == b"OggS":
        return "audio/ogg"
    suffix = os.path.splitext(path or "")[1].lower()
    by_suffix = {
        ".mp3": "audio/mpeg",
        ".wav": "audio/wav",
        ".flac": "audio/flac",
        ".ogg": "audio/ogg",
    }
    return by_suffix.get(suffix, default)


def extract_audio(audio) -> Tuple[bytes, str]:
    """Convert a dataset audio value into ``(encoded_bytes, mime_type)``.

    Accepted inputs:
      * raw ``bytes`` / ``bytearray`` (MIME sniffed, ``audio/mpeg`` if unknown);
      * a dict with a non-empty ``"bytes"`` entry (MIME sniffed from the
        content, then from ``"path"``, ``audio/wav`` if unknown);
      * a dict with an ``"array"`` (and optional ``"sampling_rate"``, default
        16000), which is encoded as WAV;
      * a dict with only a ``"path"``, whose file is read from disk;
      * a decoder object exposing ``get_all_samples()``, which is encoded as WAV.

    Raises:
        ValueError: If ``audio`` matches none of the supported forms.
    """
    if isinstance(audio, (bytes, bytearray)):
        data = bytes(audio)
        return data, _guess_audio_mime(data, None, "audio/mpeg")

    if isinstance(audio, dict):
        # "path" is frequently present but None, so never call str methods on it directly.
        path = audio.get("path") or None
        if audio.get("bytes"):
            return audio["bytes"], _guess_audio_mime(audio["bytes"], path, "audio/wav")
        if audio.get("array") is not None:
            import soundfile as sf  # only needed when samples must be encoded

            buf = io.BytesIO()
            sf.write(buf, audio["array"], audio.get("sampling_rate") or 16000, format="WAV")
            return buf.getvalue(), "audio/wav"
        if path:
            with open(path, "rb") as f:
                data = f.read()
            return data, _guess_audio_mime(data, path, "audio/wav")

    if hasattr(audio, "get_all_samples"):
        import numpy as np
        import soundfile as sf

        samples = audio.get_all_samples()
        frames = samples.data.cpu().numpy()
        # Decoded samples are laid out (channels, samples) while soundfile
        # expects (frames, channels); flattening would concatenate the channels
        # one after another instead of interleaving them.
        if frames.ndim == 2:
            frames = np.ascontiguousarray(frames.T)
        buf = io.BytesIO()
        sf.write(buf, frames, samples.sample_rate, format="WAV")
        return buf.getvalue(), "audio/wav"

    raise ValueError(f"Unknown audio format: {type(audio)}")

def to_pil(img) -> "Image.Image":
    """Return ``img`` as a PIL image.

    Accepted inputs are encoded image bytes (``bytes`` / ``bytearray`` /
    ``memoryview``), a dict carrying the image under ``"bytes"`` or
    ``"path"``, or an object that already is a PIL image.

    Raises:
        ValueError: If ``img`` is of an unsupported type, or is a dict whose
            ``"bytes"`` and ``"path"`` entries are both missing or empty.
    """
    if isinstance(img, (bytes, bytearray, memoryview)):
        return Image.open(io.BytesIO(img))
    if isinstance(img, dict):
        # The keys may exist with a None value, so test the values rather than
        # key presence; otherwise a {"bytes": None, "path": ...} record never
        # reaches the path fallback.
        if img.get("bytes"):
            return Image.open(io.BytesIO(img["bytes"]))
        if img.get("path"):
            return Image.open(img["path"])
        raise ValueError("Unknown image format: dict without usable 'bytes' or 'path'")
    if isinstance(img, Image.Image):
        return img
    raise ValueError(f"Unknown image format: {type(img)}")

def save_debug_sample(image_id: str, audio_bytes: bytes, image_before: "Image.Image",
                      image_after: "Image.Image", result: dict, out_dir: str = "debug_samples"):
    """Write one evaluated sample to ``<out_dir>/<image_id>/`` for manual inspection.

    Files written: ``audio.wav`` (the audio bytes as given), ``before.jpg``,
    ``after.jpg`` and ``result.json`` (UTF-8, non-ASCII text kept readable).

    Args:
        image_id: Sample identifier; converted with ``str`` so numeric ids work.
        audio_bytes: Encoded audio payload, written unchanged.
        image_before: Image prior to the edit.
        image_after: Image after the edit.
        result: JSON-serialisable evaluation result.
        out_dir: Root directory for debug samples.
    """
    sample_dir = os.path.join(out_dir, str(image_id))
    os.makedirs(sample_dir, exist_ok=True)

    with open(os.path.join(sample_dir, "audio.wav"), "wb") as f:
        f.write(audio_bytes)

    for filename, image in (("before.jpg", image_before), ("after.jpg", image_after)):
        # JPEG has no alpha/palette support; convert so RGBA/P/LA inputs do not fail on save.
        if image.mode not in ("RGB", "L"):
            image = image.convert("RGB")
        image.save(os.path.join(sample_dir, filename))

    with open(os.path.join(sample_dir, "result.json"), "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

def process_sample(evaluator: GeminiEvaluator, image_id: str,
                   source_item: dict, result_item: dict, debug: bool = False) -> dict:
    """Evaluate one (source, result) pair and return a flat record.

    The original image is looked up in ``source_item`` under ``INPUT_IMG``,
    ``input_img`` or ``image``; the edited image in ``result_item`` under
    ``result_image``, ``output_image`` or ``image``. The audio instruction
    comes from ``source_item["audio"]``. With ``debug=True`` a summary is
    printed and the sample is written out via ``save_debug_sample``.

    Returns:
        A dict with ``IMAGE_ID``, ``score``, ``image_before``, ``image_after``
        plus every key of the evaluator's details dict.
    """
    def first_truthy(record: dict, keys):
        # Same result as chaining the lookups with ``or``: the first truthy
        # value wins, otherwise the value of the last lookup is returned.
        found = None
        for key in keys:
            found = record.get(key)
            if found:
                break
        return found

    image_before = to_pil(first_truthy(source_item, ("INPUT_IMG", "input_img", "image")))
    image_after = to_pil(first_truthy(result_item, ("result_image", "output_image", "image")))
    audio_bytes, audio_mime = extract_audio(source_item["audio"])

    score, details = evaluator.evaluate(audio_bytes, image_before, image_after, audio_mime)

    if debug:
        rule = "=" * 50
        print("\n" + rule)
        print(f"DEBUG SAMPLE: {image_id}")
        print(f"SCORE: {score}")
        print(f"EXECUTED: {details.get('instruction_execution')}")
        print(f"TRANSCRIPTION: {details.get('transcription')}")
        print(f"EXPLANATION: {details.get('explanation')}")
        print(rule)
        debug_payload = {"score": score, **details}
        save_debug_sample(image_id, audio_bytes, image_before, image_after, debug_payload)

    record = {
        "IMAGE_ID": image_id,
        "score": score,
        "image_before": image_before,
        "image_after": image_after,
    }
    # Keys coming from the evaluator are merged last, exactly as before.
    record.update(details)
    return record

if __name__ == "__main__":
    import getpass

    from datasets import load_dataset

    # Never store the key in the notebook: read it from the environment and
    # fall back to an interactive prompt when the variable is not set.
    GEMINI_API_KEY = (
        os.environ.get("GEMINI_API_KEY", "").strip()
        or getpass.getpass("Gemini API key: ").strip()
    )
    if not GEMINI_API_KEY:
        raise RuntimeError("GEMINI_API_KEY is not set")
    evaluator = GeminiEvaluator(GEMINI_API_KEY, n_votes=3, delay_between_requests=5)

    ds_source = load_dataset("arood0/mmm_project_with_audio_ru_final", split="train")
    ds_result = load_dataset("gab1k/mmm_project_with_ru_intermdata", split="train")

    # Smoke test on a single pair.
    # NOTE(review): assumes row 0 of both datasets describes the same sample — TODO confirm.
    first_sample_src = ds_source[0]
    first_sample_res = ds_result[0]

    result = process_sample(evaluator, first_sample_src["IMAGE_ID"], first_sample_src, first_sample_res, debug=True)
    print("\nProcessed result:")
    print(result)



ClientError: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 20, model: gemini-2.5-flash\nPlease retry in 49.606470077s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'model': 'gemini-2.5-flash', 'location': 'global'}, 'quotaValue': '20'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '49s'}]}}