In [None]:
import os
from src.llama import llama
from src.postgres_db import VectorDB

# Service locations: taken from the environment, with fallbacks when unset.
OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "http://ollama:11434")
POSTGRES_HOST = os.environ.get("POSTGRES_HOST", "vector-postgres")

# Model tags for generation and for embeddings; the catalogue of available
# models is at https://ollama.com/library
MODEL = "llama3.2:1b"
EMBEDDING_MODEL = "granite-embedding:30m"

# Shared clients used by the cells below.
vectordb = VectorDB(host=POSTGRES_HOST)
llm = llama(OLLAMA_HOST, model=MODEL, embedding_model=EMBEDDING_MODEL)

## Generate Response

In [5]:
# Question to answer; the same text is embedded and used for retrieval.
prompt = "How much do github actions cost for linux 4-core running for 3000 mins per month?"

# Embed the question and fetch the top three matches from the vector store.
embedding = llm.create_embedding(prompt)
similar_docs = vectordb.search_similar(embedding, top_k=3)

# Index 1 of each hit is used as its text (presumably rows shaped like
# (id, text, ...) -- TODO confirm against VectorDB.search_similar). Every
# text is followed by a newline, so an empty result gives an empty context.
context = "".join(doc[1] + "\n" for doc in similar_docs)

prompt_with_context = f"Context: {context}\n\nQuestion: {prompt}\n"

# Optional: answer with the `llm` client instead of the chat-completions
# call in the next cell.
# response = llm.generate_response(prompt_with_context)
# print(response if response else "No response generated.")

In [None]:
from openai import OpenAI

# The hosted model is reached through the OpenAI client pointed at the
# endpoint below. GH_TOKEN must be set in the environment; a missing value
# raises KeyError on the next line.
token = os.environ["GH_TOKEN"]
endpoint = "https://models.github.ai/inference"
model = "openai/gpt-5-mini"

client = OpenAI(base_url=endpoint, api_key=token)

# One system message fixes the persona; the user message carries the
# retrieved context together with the question.
response = client.chat.completions.create(
    model=model,
    messages=[
        {"role": "system", "content": "You are a consultant who is an expert in GitHub."},
        {"role": "user", "content": prompt_with_context},
    ],
)

# Show the text of the first returned choice.
print(response.choices[0].message.content)

It depends on the runner type:

- x64 "larger" runner (Linux 4-core): $0.016/min × 3,000 min = $48.00
- arm64 "larger" runner (Linux 4-core): $0.01/min × 3,000 min = $30.00
- GPU Linux 4‑core (if using GPU): $0.07/min × 3,000 min = $210.00

Note: included/free minutes do not apply to larger runners, so these costs would be billed in full. If you meant a different runner type, tell me which one and I’ll recalc.
