# In [None]:
# Evaluation_Dashboard.ipynb

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

# Mock evaluation set: each row is one (query, model prediction, reference answer) triple.
data = pd.DataFrame(
    [
        (
            "What is RAG?",
            "Retrieval-Augmented Generation combines retrieval and LLM.",
            "RAG augments language models with retrieval.",
        ),
        (
            "Define LangChain",
            "LangChain orchestrates LLM workflows.",
            "LangChain builds contextual LLM applications.",
        ),
        (
            "Use of Spark in AI",
            "Spark enables distributed data pipelines for AI.",
            "Spark handles distributed compute for ML tasks.",
        ),
    ],
    columns=["query", "pred", "true"],
)

# BLEU computation
# Each prediction is scored against its single reference with the default
# 4-gram BLEU weights. One-sentence answers rarely share higher-order n-grams
# with the reference; without smoothing NLTK warns about the zero-count orders
# and the score degenerates to (near) zero, so a smoothing function is applied
# to keep the per-query scores and their average meaningful.
# NOTE: str.split() tokenizes on whitespace only, so punctuation stays attached
# to the neighbouring word ("retrieval." does not match "retrieval").
bleu_smoothing = SmoothingFunction().method1
data["BLEU"] = [
    sentence_bleu(
        [reference.split()],  # sentence_bleu expects a list of tokenized references
        hypothesis.split(),
        smoothing_function=bleu_smoothing,
    )
    for reference, hypothesis in zip(data["true"], data["pred"])
]
avg_bleu = data["BLEU"].mean()

# Latency simulation
# Mock per-query latencies drawn uniformly from [0.2, 0.6) seconds. A seeded
# Generator is used so the simulated values (and therefore the plotted line)
# are identical on every run of the notebook.
latency_rng = np.random.default_rng(seed=42)
data["Latency"] = latency_rng.uniform(0.2, 0.6, size=len(data))

# Visualize: BLEU per query as bars on the left y-axis, latency as a line on a
# second y-axis that shares the same x-axis.
fig, ax1 = plt.subplots()

ax1.bar(data["query"], data["BLEU"], color="green")
ax1.set_ylabel("BLEU Score", color="green")

ax2 = ax1.twinx()
ax2.plot(data["query"], data["Latency"], marker="o", color="orange")
ax2.set_ylabel("Latency (s)", color="orange")

# The twin axes are the current axes at this point, so the title is set on them explicitly.
ax2.set_title(f"RAG Evaluation Dashboard | Avg BLEU: {avg_bleu:.2f}")
plt.show()
