<a href="https://colab.research.google.com/github/deltorobarba/sciences/blob/master/ai_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# <font color="blue">**Observability & Evaluation**</font>

##### *Gecko for Multimodal Evaluation*

https://cloud.google.com/blog/products/ai-machine-learning/evaluate-your-gen-media-models-on-vertex-ai

In [None]:
# Configure the Gecko metric in three parts: a rubric-generation config,
# a pointwise rubric-validation metric, and the rubric-based metric that
# combines the two.

# Part 1 - rubric generation: the prompt template plus the function that
# parses the generation output.
rubric_generation_config = RubricGenerationConfig(
    parsing_fn=parse_json_to_qa_records,
    prompt_template=RUBRIC_GENERATION_PROMPT,
)

# Part 2 - rubric validation: a pointwise metric whose raw output is kept
# and handed to a custom parsing function.
gecko_output_config = CustomOutputConfig(
    parsing_fn=parse_rubric_results,
    return_raw_output=True,
)
pointwise_metric = PointwiseMetric(
    metric="gecko_metric",
    metric_prompt_template=RUBRIC_VALIDATOR_PROMPT,
    custom_output_config=gecko_output_config,
)

# Part 3 - the rubric-based metric ties generation and validation together;
# this is the object the later cells use.
rubric_based_gecko = RubricBasedMetric(
    critique_metric=pointwise_metric,
    generation_config=rubric_generation_config,
)

In [None]:
# Prepare the evaluation dataset: one row per prompt, paired with the
# media file (referenced by URI) that is evaluated against that prompt.


def _file_data_request(file_uri, mime_type="image/png"):
    """Return the JSON request string that references one media file.

    Args:
        file_uri: Location of the media file (e.g. a ``gs://`` URI).
        mime_type: MIME type of the file; defaults to ``"image/png"``.

    Returns:
        A JSON string of the form
        ``{"contents": [{"parts": [{"file_data": {...}}]}]}``.
    """
    import json  # local import so this cell does not rely on another cell

    # json.dumps with default separators reproduces the hand-written literal
    # format (", " and ": ") exactly, while guaranteeing valid JSON.
    return json.dumps(
        {
            "contents": [
                {"parts": [{"file_data": {"mime_type": mime_type, "file_uri": file_uri}}]}
            ]
        }
    )


prompts = [
    "steaming cup of coffee and a croissant on a table",
    "steaming cup of coffee and toast in a cafe",
    # ... more prompts
]

# One URI per prompt, in the same order as `prompts`.
# NOTE(review): both rows point at the same sample image - confirm intended.
image_uris = [
    "gs://cloud-samples-data/generative-ai/evaluation/images/coffee.png",
    "gs://cloud-samples-data/generative-ai/evaluation/images/coffee.png",
    # ... more image URIs
]
images = [_file_data_request(uri) for uri in image_uris]

# Fail early with a readable message instead of pandas' generic
# "All arrays must be of the same length" error.
if len(prompts) != len(images):
    raise ValueError(
        f"prompts ({len(prompts)}) and images ({len(images)}) must have the same length"
    )

eval_dataset = pd.DataFrame(
    {
        "prompt": prompts,
        "image": images,  # or "video": videos for video evaluation
    }
)

In [None]:
# Ask the configured rubric_based_gecko metric to generate rubrics for the
# evaluation dataset; the result feeds the evaluation run below.

dataset_with_rubrics = rubric_based_gecko.generate_rubrics(
    eval_dataset,
)

In [None]:
# Evaluate the dataset (with its generated rubrics) using the Gecko metric.

# Name of the dataset column that holds the content being evaluated.
response_column = "image"  # or "video"

eval_task = EvalTask(
    metrics=[rubric_based_gecko],
    dataset=dataset_with_rubrics,
)
eval_result = eval_task.evaluate(response_column_name=response_column)

In [None]:
# With the evaluation finished, compute the final scores from the metrics
# table and average them to see how well the generated content aligns with
# the detailed criteria derived from the prompts.

dataset_with_final_scores = compute_scores(eval_result.metrics_table)

# The mean is the cell's last expression, so the notebook displays it.
final_scores = dataset_with_final_scores["final_score"]
np.mean(final_scores)