In [9]:
import requests

API_URL = "http://172.16.30.126:4000/eval"  # Replace with actual VM IP if needed

def call_remote_api(question: str) -> dict:
    try:
        response = requests.post(
            API_URL, json={"question": question}, 
            headers={"Content-Type": "application/json"}, 
            timeout=10)

        response.raise_for_status()
        return response.json()
    except Exception as e:
        print(f"❌ API call failed: {e}")
        return {"answer": "", "raw_output": ""}

In [1]:
# Box 1: Imports and config

import csv
import pandas as pd
import os
from difflib import get_close_matches
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings, OllamaLLM
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import PyMuPDFLoader

# --- Configuration ---

csv_files = [
    "knowledge/finance-llama2-1k.csv",
    "knowledge\Beliance_news_sentiment.csv",
    "knowledge\\all_stocks_5yr.csv"
    # add more CSV filenames here
]    # Your CSV knowledge base file
pdf_files = [
    "knowledge\Context and HRM_ Theory, Evidence, and Proposals_25_06_25_23_15_52.pdf",
    "knowledge/rehumanising.pdf",
    "knowledge\Fundamental of management.pdf",
    "knowledge\Principles of Economics_5.pdf",
    "knowledge\OO. Gregory Mankiw. Macroeconomics  9th edition.pdf",
    "knowledge\zq_reynolds_microeconomics_07.pdf",
    "knowledge\Transodapdf-converted.pdf","knowledge\FokMoney.pdf",
    "knowledge\TangKongtrust.pdf", 
    "knowledge\StockMarkeTVolatility_25_06_25_23_06_45.pdf"
    # add more PDF filenames here
] 

INPUT_QUESTIONS_FILE = "financial-analysis-agent/test.csv"
OUTPUT_SUBMISSION_FILE = "financial-analysis-agent\submission.csv"

ID_COLUMN = "id"
QUESTION_COLUMN = "query"
ANSWER_MODE = "abcd"  # or "rise"

LLM_MODEL = "scb10x/llama3.1-typhoon2-8b-instruct"
EMBED_MODEL = "nomic-embed-text"

print("Imports and config loaded.")


Imports and config loaded.


In [None]:
# Box 2: Load knowledge base, split text, embed, build vector DB (with cache)

PERSIST_DIR = "chroma_db"  # Directory to save/load Chroma DB
EMBED_MODEL = "nomic-embed-text"
embedder = OllamaEmbeddings(model=EMBED_MODEL)

# If DB exists, just load it
if os.path.exists(PERSIST_DIR) and len(os.listdir(PERSIST_DIR)) > 0:
    print("🔁 Loading existing vector DB...")
    vector_db = Chroma(persist_directory=PERSIST_DIR, embedding_function=embedder)
else:
    print("📄 Processing documents and creating vector DB...")

    # Load CSVs
    all_csv_docs = []
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            docs = [
                Document(page_content=str(row["text"]))
                for _, row in df.iterrows()
                if str(row["text"]).strip()
            ]
            all_csv_docs.extend(docs)
            print(f"Loaded {len(docs)} docs from {file}")
        except Exception as e:
            print(f"Failed to load CSV {file}: {e}")

    # Load PDFs
    all_pdf_docs = []
    for file in pdf_files:
        try:
            loader = PyMuPDFLoader(file)
            pdf_docs = loader.load()
            all_pdf_docs.extend(pdf_docs)
            print(f"Loaded {len(pdf_docs)} docs from {file}")
        except Exception as e:
            print(f"Failed to load PDF {file}: {e}")

    # Combine all docs
    all_docs = all_csv_docs + all_pdf_docs
    print(f"Total documents to process: {len(all_docs)}")

    # Split and filter
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    all_chunks = splitter.split_documents(all_docs)
    all_chunks = [chunk for chunk in all_chunks if chunk.page_content.strip()]

    valid_chunks = []
    for doc in all_chunks:
        try:
            vec = embedder.embed_query(doc.page_content)
            if vec:
                valid_chunks.append(doc)
        except Exception:
            continue

    if not valid_chunks:
        raise RuntimeError("No valid chunks after embedding.")

    vector_db = Chroma.from_documents(
        valid_chunks, 
        embedding=embedder, 
        persist_directory=PERSIST_DIR
    )
    vector_db.persist()
    print(f"✅ Vector DB created and saved with {len(valid_chunks)} chunks.")

Loaded 1000 docs from knowledge/finance-llama2-1k.csv
Failed to load CSV knowledge\Beliance_news_sentiment.csv: 'text'
Failed to load CSV knowledge\all_stocks_5yr.csv: 'text'
Loaded 17 docs from knowledge\Context and HRM_ Theory, Evidence, and Proposals_25_06_25_23_15_52.pdf
Loaded 39 docs from knowledge/rehumanising.pdf
Loaded 547 docs from knowledge\Fundamental of management.pdf
Loaded 330 docs from knowledge\Principles of Economics_5.pdf
Loaded 645 docs from knowledge\OO. Gregory Mankiw. Macroeconomics  9th edition.pdf
Loaded 301 docs from knowledge\zq_reynolds_microeconomics_07.pdf
Loaded 355 docs from knowledge\Transodapdf-converted.pdf
Loaded 10 docs from knowledge\FokMoney.pdf
Loaded 10 docs from knowledge\TangKongtrust.pdf
Loaded 654 docs from knowledge\StockMarkeTVolatility_25_06_25_23_06_45.pdf
Total documents to process: 3908
Vector DB created with 13072 chunks.


In [None]:
# Box 3: Setup LLM and RetrievalQA chain

llm = OllamaLLM(model=LLM_MODEL)
agent = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_db.as_retriever()
)
print("LLM and QA chain ready.")


LLM and QA chain ready.


In [2]:
def interactive_query_session_via_api(
    test_csv_path,
    submission_csv_path,
    id_column="id",
    query_column="query"
):
    import csv
    import os
    import pandas as pd
    from difflib import get_close_matches

    def detect_question_type(question: str) -> str:
        q = question.lower()
        has_abcd = any(opt in q for opt in [" a.", " a:", " b.", " b:", " c.", " c:", " d.", " d:"])
        has_rise_fall = "rise" in q or "fall" in q or "ขึ้น" in q or "ลง" in q
        if has_rise_fall and not has_abcd:
            return "rise"
        elif has_abcd:
            return "abcd"
        return "unknown"

    def save_answers_update(submission_file, new_results, id_col="id"):
        if os.path.exists(submission_file):
            df = pd.read_csv(submission_file)
        else:
            df = pd.DataFrame(columns=[id_col, "answer"])
        answers_dict = dict(zip(df[id_col], df["answer"]))
        for r in new_results:
            answers_dict[r[id_col]] = r["answer"]
        updated_df = pd.DataFrame([{id_col: k, "answer": v} for k, v in answers_dict.items()])
        updated_df.to_csv(submission_file, index=False)
        print(f"✅ Saved {len(new_results)} answers to {submission_file}")

    try:
        with open(test_csv_path, encoding="utf-8") as f:
            reader = list(csv.DictReader(f))
            query_to_id = {row[query_column].strip(): row[id_column] for row in reader}
    except Exception as e:
        print(f"❌ Failed to read test file: {e}")
        return

    print("🟢 Enter queries manually. Press Enter to stop.")
    results = []

    while True:
        user_input = input("❓ Query: ").strip()
        if not user_input:
            break

        matches = get_close_matches(user_input, list(query_to_id.keys()), n=1, cutoff=0.7)
        if not matches:
            print("❌ No matching query.\n")
            continue

        matched_query = matches[0]
        matched_id = query_to_id[matched_query]
        print(f"🔎 Matched: {matched_query} → ID: {matched_id}")

        try:
            response = call_remote_api(matched_query)
            answer = response.get("answer", "").strip()
            raw_output = response.get("raw_output", "").strip()

            if answer not in ["A", "B", "C", "D", "Rise", "Fall"]:
                print(f"❌ Invalid answer: '{answer}'\n📝 Raw Output: {raw_output}\n")
                continue

            results.append({id_column: matched_id, "answer": answer})
            print(f"✅ Final Answer: {answer}")
            print(f"📝 Explanation: {raw_output}\n")

        except Exception as e:
            print(f"❌ Error: {e}")

    print(f"📊 Collected {len(results)} answers.")
    if results:
        choice = input("💾 Save to submission.csv? (y/n): ").strip().lower()
        if choice == "y":
            save_answers_update(submission_csv_path, results, id_col=id_column)
        else:
            print("⚠️ Not saved.")

In [3]:
interactive_query_session_via_api(
    test_csv_path="financial-analysis-agent/test.csv",
    submission_csv_path="financial-analysis-agent/submission.csv"
)

🟢 Enter queries manually. Press Enter to stop.
🔎 Matched: ตอบคำถามด้วยตัวเลือกที่เหมาะสม A, B, C และ D โปรดตอบด้วยคำตอบที่ถูกต้อง A, B, C หรือ D เท่านั้น อย่าใช้คำฟุ่มเฟือยหรือให้ข้อมูลเพิ่มเติม

คำถาม: ______ สถานที่ทำงานเกี่ยวข้องกับการเสริมสร้างศักยภาพให้พนักงาน ตัวอย่างเช่น 'job enrichment' ที่พนักงานได้รับขอบเขตที่ใหญ่ขึ้นในการตัดสินใจว่าจะจัดระเบียบงานของตนอย่างไร หรือ 'job enlargement' ที่พนักงานได้รับมอบหมายงานที่หลากหลายมากขึ้น

ตัวเลือกคำตอบ: A: Re-invigorating, B: Re-flourishing, C: Revitalizing, D: Rehumanizing

คำตอบ: → ID: 36deab86-cfd3-48b5-9bea-a36c1b0e63a8
❌ API call failed: HTTPConnectionPool(host='172.16.30.126', port=4000): Max retries exceeded with url: /eval (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000023619297750>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))
❌ Invalid answer: ''
📝 Raw Output: 

🔎 Matched: Answer the question with the appr

In [None]:
# Batch answer all queries in test.csv with strict answer filtering

def run_batch_query(
    agent,
    test_csv_path,
    submission_csv_path,
    id_column="id",
    query_column="query",
    max_retries=3
):

    def detect_question_type(question: str) -> str:
        q = question.lower()
        has_abcd = any(opt in q for opt in [" a.", " a:", " b.", " b:", " c.", " c:", " d.", " d:"])
        has_rise_fall = "rise" in q or "fall" in q or "ขึ้น" in q or "ลง" in q
        if has_rise_fall and not has_abcd:
            return "rise"
        elif has_abcd:
            return "abcd"
        return "unknown"

    def postprocess(raw_answer, mode="abcd"):
        raw = raw_answer.strip().lower()
        if mode == "abcd":
            for ch in ["a", "b", "c", "d"]:
                if raw.startswith(ch + ":") or raw.startswith(ch + ".") or raw == ch:
                    return ch.upper()
        elif mode == "rise":
            if "rise" in raw:
                return "Rise"
            elif "fall" in raw:
                return "Fall"
        return ""

    def force_valid_answer(raw_ans, mode="abcd"):
        for _ in range(max_retries):
            answer = postprocess(raw_ans, mode)
            if answer in ["A", "B", "C", "D", "Rise", "Fall"]:
                return answer
            try:
                raw_ans = agent.run(raw_ans)
            except:
                break
        return postprocess(raw_ans[:1], mode)  # fallback to first char

    # Load test queries
    try:
        df = pd.read_csv(test_csv_path)
    except Exception as e:
        print(f"Failed to load {test_csv_path}: {e}")
        return

    results = []

    for i, row in df.iterrows():
        query = str(row[query_column]).strip()
        qid = row[id_column]
        mode = detect_question_type(query)
        print(f"[{i+1}/{len(df)}]")

        try:
            raw_ans = agent.run(query)
            final_ans = force_valid_answer(raw_ans, mode)
            results.append({id_column: qid, "answer": final_ans})
            print(f"✅ Answer: {final_ans}\n")
        except Exception as e:
            print(f"❌ Error on QID {qid}: {e}\n")

    # Save to CSV
    if results:
        if os.path.exists(submission_csv_path):
            existing = pd.read_csv(submission_csv_path)
            results_dict = dict(zip(existing[id_column], existing["answer"]))
        else:
            results_dict = {}

        for r in results:
            results_dict[r[id_column]] = r["answer"]

        df_save = pd.DataFrame([{id_column: k, "answer": v} for k, v in results_dict.items()])
        df_save.to_csv(submission_csv_path, index=False)
        print(f"\n💾 Saved {len(results)} answers to {submission_csv_path}")
    else:
        print("⚠️ No answers collected.")


In [7]:
# --- Run it here ---
run_batch_query(
    agent=agent,
    test_csv_path=INPUT_QUESTIONS_FILE,
    submission_csv_path=OUTPUT_SUBMISSION_FILE,
    id_column=ID_COLUMN,
    query_column=QUESTION_COLUMN
)

[1/499] QID=36deab86-cfd3-48b5-9bea-a36c1b0e63a8 Mode=abcd


  raw_ans = agent.run(query)


✅ Answer: C

[2/499] QID=2b5bbd26-45e8-4768-ab8a-b5dc1d153ab7 Mode=abcd
✅ Answer: B

[3/499] QID=8a722080-bc16-49db-89c9-100cd61cd28a Mode=abcd
✅ Answer: A

[4/499] QID=75316e95-88f4-4fef-83b9-dde0aa52889a Mode=abcd
✅ Answer: A

[5/499] QID=bcca13bc-2675-4645-82cc-7e4c412ed294 Mode=rise
✅ Answer: Fall

[6/499] QID=ff5b5d2e-5fa1-4709-a9a7-681e4d4585bd Mode=abcd
✅ Answer: C

[7/499] QID=d7a45917-d0f9-476e-912d-ebc5af9333a1 Mode=abcd
✅ Answer: B

[8/499] QID=e625dbc8-f448-4c53-9a78-6c3f351b49c3 Mode=rise
✅ Answer: 

[9/499] QID=9bea42e5-3c21-46dc-93f7-0017f382f7cf Mode=rise
✅ Answer: Fall

[10/499] QID=0925a4d7-8546-46a8-834f-20f58f16bc99 Mode=abcd
✅ Answer: A

[11/499] QID=dc0aa42f-569d-4277-8149-b645f3cf9888 Mode=abcd
✅ Answer: A

[12/499] QID=b9964445-c648-4661-ad85-7e5e4cd0feb4 Mode=rise
✅ Answer: Rise

[13/499] QID=a803daca-2cab-4d53-be68-c75fb71da84a Mode=rise
✅ Answer: Fall

[14/499] QID=1ca64702-d7d7-4a9a-987a-4e58938a3b96 Mode=rise
✅ Answer: 

[15/499] QID=6caca908-0f01-43b8-a2f4