In [1]:
!pip install -q litellm langchain faiss-cpu sentence-transformers ragas langchain-groq groq datasets langchain-huggingface

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.1/131.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m83.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [13]:
# Cell 2: Import required libraries
import os
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import Faithfulness, AnswerRelevancy, AnswerCorrectness
from ragas.llms import LangchainLLMWrapper
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
import litellm

In [3]:
from google.colab import userdata

# Set API keys here
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY_')
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

In [4]:
# Cell 4: Define model mapping for each use case
llm_map = {
    "summary": ("openai/gpt-4o-mini", os.environ["OPENAI_API_KEY"]),
    "extract": ("groq/llama3-70b-8192", os.environ["GROQ_API_KEY"]),
    "root_cause": ("groq/llama-3.1-8b-instant", os.environ["GROQ_API_KEY"]),
    "recommendation": ("gemini/gemini-1.5-flash", os.environ["GOOGLE_API_KEY"]),
    "announcement": ("groq/llama3-70b-8192", os.environ["GROQ_API_KEY"]),
    "internal_memo": ("groq/llama-3.1-8b-instant", os.environ["GROQ_API_KEY"]),
}

In [5]:
# Cell 5: Define helper function to get LiteLLM response
def get_llm_response(model_key, incident_text):
    model_name, api_key = llm_map[model_key]
    litellm.api_key = api_key

    prompt_templates = {
        "summary": f"Summarize the following incident: {incident_text}",
        "extract": f"Extract structured information (Train ID, Location, Incident Type, Severity, Affected Services) as JSON from: {incident_text}",
        "root_cause": f"Suggest a plausible root cause based on this incident: {incident_text}",
        "recommendation": f"Suggest immediate actions for this railway incident: {incident_text}",
        "announcement": f"Write a short public announcement for passengers about this incident: {incident_text}",
        "internal_memo": f"Draft an internal memo for departments based on this incident: {incident_text}",
    }

    response = litellm.completion(
        model=model_name,
        messages=[
            {"role": "system", "content": "You are a railway incident management assistant."},
            {"role": "user", "content": prompt_templates[model_key]}
        ]
    )
    return response["choices"][0]["message"]["content"]

In [6]:
# Cell 6: Sample incident reports for testing
incidents = [
    "Train #123 delayed due to signal fault near Junction X.",
    "Passenger reported a medical emergency on platform 5 at Central Station.",
    "Minor track damage detected near KM 150."
]

In [7]:
# Cell 7: Run the assistant for each use case and collect results
results = {
    "question": [],
    "response": [],
    "reference": []  # Must be a string (not list) per sample for RAGAS
}

for incident in incidents:
    for use_case in llm_map:
        try:
            print(f"Running {use_case} on incident: {incident}")
            output = get_llm_response(use_case, incident)
            question = f"{use_case.upper()} for: {incident}"

            results["question"].append(question)
            results["response"].append(output)
            results["reference"].append("Expected reference answer (mock for now)")
        except Exception as e:
            print(f"Error during {use_case}: {e}")
            results["question"].append(question)
            results["response"].append(str(e))
            results["reference"].append("N/A")

Running summary on incident: Train #123 delayed due to signal fault near Junction X.
Running extract on incident: Train #123 delayed due to signal fault near Junction X.
Running root_cause on incident: Train #123 delayed due to signal fault near Junction X.
Running recommendation on incident: Train #123 delayed due to signal fault near Junction X.
Running announcement on incident: Train #123 delayed due to signal fault near Junction X.
Running internal_memo on incident: Train #123 delayed due to signal fault near Junction X.
Running summary on incident: Passenger reported a medical emergency on platform 5 at Central Station.
Running extract on incident: Passenger reported a medical emergency on platform 5 at Central Station.
Running root_cause on incident: Passenger reported a medical emergency on platform 5 at Central Station.
Running recommendation on incident: Passenger reported a medical emergency on platform 5 at Central Station.
Running announcement on incident: Passenger reporte

In [8]:
# Cell 8: Convert results to Ragas Dataset
dataset = Dataset.from_dict(results)
dataset

Dataset({
    features: ['question', 'response', 'reference'],
    num_rows: 18
})

In [9]:
# Cell 9: Prepare RAGAS evaluation LLM and embeddings
groq_llm_for_ragas = LangchainLLMWrapper(
    ChatGroq(
        api_key=os.environ["GROQ_API_KEY"],
        model_name="llama3-8b-8192",
        temperature=0.0
    )
)
ragas_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
# Cell 10: Attach LLM to metrics
answer_correctness = AnswerCorrectness()
answer_relevancy = AnswerRelevancy()
answer_correctness.llm = groq_llm_for_ragas
answer_relevancy.llm = groq_llm_for_ragas

In [15]:
# Cell 11: Evaluate responses using Ragas
print("Evaluating responses using RAGAS...")
eval_result = evaluate(
    dataset,
    metrics=[
        answer_relevancy,
        answer_correctness
    ],
    embeddings=ragas_embeddings
)

Evaluating responses using RAGAS...


Evaluating:   0%|          | 0/36 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[10]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01j3302wb3e99bxkkghndsye1b` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Used 5876, Requested 479. Please try again in 3.55s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}})
ERROR:ragas.executor:Exception raised in Job[13]: AssertionError(Error: 'answer_relevancy' requires embeddings to be set.)
ERROR:ragas.executor:Exception raised in Job[17]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01j3302wb3e99bxkkghndsye1b` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Used 6488, Requested 847. Please try again in 13.352s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'typ

In [17]:
# Cell 12: Print evaluation results
print("\nRAGAS Evaluation Summary:")
print(eval_result)

print("\nPer-sample breakdown:")
eval_result.to_pandas().head()


RAGAS Evaluation Summary:
{'answer_relevancy': 0.5865, 'answer_correctness': nan}

Per-sample breakdown:


Unnamed: 0,user_input,response,reference,answer_relevancy,answer_correctness
0,SUMMARY for: Train #123 delayed due to signal ...,Train #123 experienced a delay caused by a sig...,Expected reference answer (mock for now),0.710638,
1,EXTRACT for: Train #123 delayed due to signal ...,Here is the extracted information in JSON form...,Expected reference answer (mock for now),0.513838,
2,ROOT_CAUSE for: Train #123 delayed due to sign...,"Based on the given information, a plausible ro...",Expected reference answer (mock for now),,
3,RECOMMENDATION for: Train #123 delayed due to ...,Immediate Actions for Train #123 Delay (Signal...,Expected reference answer (mock for now),,
4,ANNOUNCEMENT for: Train #123 delayed due to si...,"""Attention all passengers,\n\nWe regret to inf...",Expected reference answer (mock for now),0.674454,
