### Importing required libraries and modules

In [1]:
import os
import time
import psutil
import pandas as pd
from datetime import datetime
from pathlib import Path

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.ollama import Ollama

from dotenv import load_dotenv
# Read key/value pairs from a local .env file (if one exists) into the process environment.
load_dotenv() 

# Duplicate of the `import os` at the top of this cell; harmless, kept as-is.
import os
# os.getenv returns None when OPENAI_API_KEY is not set.
# NOTE(review): `api_key` is not referenced anywhere else in the visible notebook —
# presumably the key is consumed from the environment by a library; confirm before removing.
api_key = os.getenv("OPENAI_API_KEY")

### Listing desired models

In [2]:
# Display label -> model tag handed to `Ollama(model=...)`.
# The label is what appears in console output, in model_evaluation.csv and in the
# per-run text report, so it should name the model family rather than the runtime.
AVAILABLE_MODELS = {
    "Mistral:7B-q4": "mistral:7b-instruct-q4_0",
    "Gemma": "gemma",
    # NOTE(review): this tag carries no explicit size; the "7B" in the label assumes
    # the default tag resolves to a 7B build — confirm.
    "DeepSeek:7B": "deepseek-r1",
    # Label corrected from "Ollama3.2:3B-q8": the model is Llama 3.2, Ollama is the runtime.
    "Llama3.2:3B-q8": "llama3.2:3b-instruct-q8_0",
}

### Model Evaluations

In [3]:
def _timed_query(query_engine, prompt_text):
    """Run one query and return ``(response_text, elapsed_seconds)``.

    The elapsed time is rounded to two decimals.
    """
    # perf_counter is a monotonic clock, so the measurement is not skewed by
    # system clock adjustments that can affect time.time().
    started = time.perf_counter()
    response = query_engine.query(prompt_text)
    elapsed = round(time.perf_counter() - started, 2)
    return str(response), elapsed


def evaluate_responses(prompt_text, documents_path="files"):
    """Ask every model in ``AVAILABLE_MODELS`` the same question and record the results.

    The documents under ``documents_path`` are loaded and indexed on each call,
    then each model is queried through that index. Successful results are
    appended to the metrics CSV and written to a timestamped text report.

    A model that raises while being set up or queried is reported and skipped,
    so the results of the other models are still saved.

    Args:
        prompt_text: The question sent to every model.
        documents_path: Directory read by ``SimpleDirectoryReader``.
            Defaults to ``"files"``.

    Returns:
        A dict with ``"message"`` and ``"performance_metrics"`` (a list of
        per-model dicts with the keys ``"Model"``, ``"Execution Time (s)"``
        and ``"Response"``), or ``None`` when loading/indexing/saving fails or
        when no model produced a response.
    """
    try:
        documents = SimpleDirectoryReader(documents_path).load_data()
        index = VectorStoreIndex.from_documents(documents)

        model_responses = []
        performance_metrics = []

        print(f"\nEvaluating prompt: '{prompt_text}'\n")
        print("=" * 80)

        for model_name, model_alias in AVAILABLE_MODELS.items():
            print(f"\nProcessing model: {model_name}: ")

            try:
                llm = Ollama(model=model_alias, request_timeout=300, temperature=0.7)
                query_engine = index.as_query_engine(llm=llm)
                response_text, execution_time = _timed_query(query_engine, prompt_text)
            except Exception as e:
                # One unavailable or timing-out model must not discard the others' results.
                print(f"  Error: {str(e)}")
                continue

            model_responses.append({
                "model": model_name,
                "response": response_text,
                "execution_time": execution_time
            })

            performance_metrics.append({
                "Model": model_name,
                "Execution Time (s)": execution_time,
                "Response": response_text
            })

            print(f"  Execution time: {execution_time} seconds")
            print(f"  Response: {response_text[:150]}...")

        if not performance_metrics:
            # Nothing to persist; avoid writing empty output files.
            print("Error: no model produced a response; nothing was saved.")
            return None

        save_metrics_to_csv(performance_metrics)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        save_detailed_output(prompt_text, model_responses, timestamp)

        print(f"\nResults saved to model_evaluation.csv and model_responses_{timestamp}.txt")

        return {
            "message": "Evaluation completed. Metrics saved.",
            "performance_metrics": performance_metrics,
        }

    except Exception as e:
        # Covers document loading, index construction and saving; callers get None.
        print(f"Error: {str(e)}")
        return None

def save_metrics_to_csv(data, filename="model_evaluation.csv"):
    """Append metric rows to a CSV file, writing the header row only when needed.

    Args:
        data: Records convertible by ``pd.DataFrame`` (the caller passes a list
            of dicts, one per model).
        filename: Target CSV path. Defaults to ``"model_evaluation.csv"``.

    Returns:
        None. Nothing is written when ``data`` contains no rows.
    """
    df = pd.DataFrame(data)
    if df.empty:
        # An empty frame carries no column names, so writing it would leave a
        # file without a usable header row that later appends never repair.
        return

    # A missing or zero-byte file has no header yet; an existing non-empty file does.
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0

    # Append mode creates the file when it does not exist.
    df.to_csv(filename, mode='a', header=needs_header, index=False)
        
def save_detailed_output(prompt, model_responses, timestamp):
    """Write the prompt and every model's full response to a text report.

    The report is written to ``model_responses_<timestamp>.txt`` in the current
    working directory, replacing any existing file of that name.

    Args:
        prompt: The question that was asked.
        model_responses: Iterable of dicts with the keys ``"model"``,
            ``"response"`` and ``"execution_time"``.
        timestamp: String embedded in the output filename.
    """
    filename = f"model_responses_{timestamp}.txt"

    # Explicit UTF-8: model output often contains non-ASCII characters, which
    # the platform default encoding may be unable to represent.
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(f"PROMPT: {prompt}\n\n")
        f.write("=" * 80 + "\n\n")

        for response in model_responses:
            model_name = response["model"]
            f.write(f"MODEL: {model_name}\n")
            f.write("-" * 40 + "\n")
            f.write(f"Execution Time: {response['execution_time']} seconds\n")
            f.write("\nRESPONSE:\n")
            f.write(response["response"])
            f.write("\n\n" + "=" * 80 + "\n\n")

def main():
    """Run an interactive loop that evaluates each entered question.

    Questions are read from standard input until the user enters ``q``
    (case-insensitive, surrounding whitespace ignored) or input ends via
    end-of-file or a keyboard interrupt. Blank input is rejected without
    running an evaluation. Every other entry is passed, stripped, to
    ``evaluate_responses``.
    """
    print("=" * 80)
    print("MODEL EVALUATION")
    print("=" * 80)

    while True:
        try:
            # Strip so that " q" quits and whitespace-only input counts as blank.
            prompt = input("\nEnter your question (or 'q' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closing or interrupting the input prompt is treated as a request to quit.
            print("\nExiting program. Goodbye!")
            break

        if prompt.lower() == 'q':
            print("\nExiting program. Goodbye!")
            break

        if not prompt:
            # Avoid querying every model with an empty question.
            print("Please enter a question.")
            continue

        evaluate_responses(prompt)

        print("\n" + "-" * 80)

# Start the interactive loop when this code runs as the main module.
# NOTE(review): the recorded cell output below suggests this also fires when the
# cell is executed in the notebook, so running the cell blocks on input() — confirm.
if __name__ == "__main__":
    main()

MODEL EVALUATION



Enter your question (or 'q' to quit):  q



Exiting program. Goodbye!



Enter your question (or 'q' to quit):  hi



Evaluating prompt: 'hi'


Processing model: Mistral: 
