# Gemini Approach

**TODO:** Run the pipeline several times to check how much the answers vary between runs (consider taking a majority vote over the runs for the final answers).

In [1]:
import os
import re
import json
from google import genai
from pydantic import BaseModel, Field
from typing import List, Optional, Literal, Union
from dotenv import load_dotenv
from utils import *

load_dotenv()

True

In [2]:
# load question json

# company = "reit"
# company = "TSX_Y_2022"
# company = "NASDAQ_CLXT_2022"

# with open(f"../data/questions_{company}.json", "r") as file:
#     questions = json.load(file)

# JSON files are UTF-8; pin the encoding so a different platform default
# (e.g. cp1252 on Windows) cannot garble non-ASCII question text.
with open("../data/questions/clxt-tsxy.json", "r", encoding="utf-8") as file:
    questions = json.load(file)

# open companies list
# The dataset location is machine-specific. Set COMPANIES_DATASET_PATH (for
# example in the .env file read by load_dotenv()) to point at your copy; the
# previously hard-coded location is kept as the fallback so existing setups
# keep working unchanged.
companies_dataset_path = os.getenv(
    "COMPANIES_DATASET_PATH",
    r"C:\Users\felix.krause\code\trustbit\enterprise-rag-challenge\dataset_v2.json",
)
# get_companies_dict comes from utils; the rest of this notebook uses its
# result as a mapping of company name -> metadata with "id" and "sha1" keys.
companies_dict = get_companies_dict(companies_dataset_path)

In [3]:
# open prompts
# Read explicitly as UTF-8 so non-ASCII characters in the prompt files
# (typographic quotes, dashes, ...) are not mis-decoded when the platform
# default encoding is something else.
with open("prompts/prompt_gemini.md", "r", encoding="utf-8") as file:
    system_prompt = file.read()

with open("prompts/prompt_company_extractor.md", "r", encoding="utf-8") as file:
    prompt_company_extractor = file.read()

In [4]:
# Gemini model used for every request in this notebook.
# Alternatives: "gemini-2.0-flash-lite-preview-02-05", "gemini-2.0-pro-exp-02-05".
# Whichever model is chosen needs at least a 1M-token input context window.
model_id = "gemini-2.0-flash"

# The API key is taken from the GEMINI_API_KEY environment variable.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))

In [5]:
# A single page reference as returned by the model.
# It carries only the page index; the submission cell adds the PDF's sha1 when
# it converts these into SourceReference objects.
# NOTE: documented with `#` comments rather than a class docstring on purpose:
# pydantic puts a model's docstring into its JSON schema, and this model is
# part of the response_schema sent with the request.
class SourceReferenceLLM(BaseModel):
    # pdf_sha1: str = Field(..., description="SHA1 hash of the PDF file")
    page_index: int = Field(..., description="Physical page number in the PDF file")

# Structured answer requested from the model; ask_gemini passes this class as
# the response_schema of the generate_content call.
# NOTE: documented with `#` comments rather than a class docstring on purpose:
# pydantic puts a model's docstring into its JSON schema, which would change
# the schema sent with the request.
class AnswerLLM(BaseModel):
    # question_text: str = Field(..., description="Text of the question")
    # kind: Literal["number", "name", "boolean", "names"] = Field(..., description="Kind of the question")
    # Free-text reasoning produced alongside the answer.
    chain_of_thought: str = Field(..., description="Chain of thought that led to the answer value")
    # Always a list of strings; the submission cell unwraps single-element lists
    # into a plain string.
    value: List[str] = Field(..., description="Answer to the question, according to the question schema")
    # Pages the answer is based on (page index only, see SourceReferenceLLM).
    references: List[SourceReferenceLLM] = Field(..., description="References to the source material in the PDF file")

In [6]:
# Structured result of the company-extraction request; get_company_name passes
# this class as the response_schema of the generate_content call.
# NOTE: documented with `#` comments rather than a class docstring on purpose:
# pydantic puts a model's docstring into its JSON schema, which would change
# the schema sent with the request.
class Company(BaseModel):
    # Free-text reasoning produced alongside the extracted name.
    chain_of_thought: str = Field(..., description="Chain of thought that led to the answer value")
    # get_company_name treats the value "skip" (any casing) as "no company
    # found" and otherwise uses it as a key into companies_dict.
    value: str = Field(..., description="Identified company name as in list of companies.")


def get_company_name(query, companies_dict, verbose=True):
    """Resolve which known company a question refers to.

    Every key of ``companies_dict`` is first searched for in ``query`` as a
    case-insensitive literal substring. If exactly one name matches, it is used
    directly. Otherwise (no match, or more than one match) the model is asked
    to pick a company from the list using the company-extractor prompt.

    Uses the notebook-level ``client``, ``model_id``, ``prompt_company_extractor``
    and the ``Company`` response schema.

    Args:
        query: Question text.
        companies_dict: Mapping of company name -> metadata dict providing the
            keys ``"id"`` and ``"sha1"``.
        verbose: If true, print the regex match or the parsed model response.

    Returns:
        ``(company_meta["id"], company_meta["sha1"])`` for the resolved company,
        or ``("N/A", None)`` when no company could be resolved. That covers the
        model answering "skip", returning a name that is not in
        ``companies_dict``, or returning a reply that yields no parsed object.
    """
    companies = list(companies_dict.keys())
    company_meta = None

    # Check if query contains a company name directly (literal, case-insensitive).
    candidates = [
        company
        for company in companies
        if re.search(re.escape(company), query, re.IGNORECASE)
    ]

    if len(candidates) == 1:
        if verbose:
            print(f"Found company name with re: {candidates[0]}")
        company_meta = companies_dict[candidates[0]]

    # If no unambiguous company name was found, ask the model to extract it.
    if not company_meta:
        prompt = prompt_company_extractor.replace("<<COMPANIES>>", ", ".join(companies))
        prompt += "\n\nQuery: " + query

        response = client.models.generate_content(
            model=model_id,
            contents=[prompt],
            config={'response_mime_type': 'application/json', 'response_schema': Company},
        )
        parsed = response.parsed

        if verbose:
            print(parsed)

        # `parsed` can be None (e.g. the reply could not be turned into a
        # Company object); treat that like "no company found" instead of
        # failing with an AttributeError.
        if parsed is None:
            return "N/A", None

        extracted_name = parsed.value.strip()
        if extracted_name.lower() == "skip":
            return "N/A", None

        # Prefer the exact key; fall back to a case-insensitive comparison so a
        # differently-cased reply still resolves.
        company_meta = companies_dict.get(extracted_name)
        if company_meta is None:
            names_by_folded = {name.casefold(): name for name in companies}
            matched_name = names_by_folded.get(extracted_name.casefold())
            if matched_name is not None:
                company_meta = companies_dict[matched_name]

        # The model answered with a name that is not in the list: report
        # "not found" rather than raising KeyError.
        if company_meta is None:
            if verbose:
                print(f"Model returned a company name that is not in the list: {extracted_name!r}")
            return "N/A", None

    return company_meta["id"], company_meta["sha1"]

In [7]:
# Sanity check with a company name that does not literally appear in the list
query = "Did Backpowder Inc mention any mergers or acquisitions in the annual report?"
response_0, sha1 = get_company_name(query=query, companies_dict=companies_dict)
response_0

chain_of_thought="The query mentions 'Backpowder Inc', which is not in the provided list of companies. There are also no similar company names which could be corrected by matching partial names. Therefore, the output is 'SKIP'." value='SKIP'


'N/A'

In [80]:
# Resolve the company for the first loaded question
print(questions[0])
response_1, sha1 = get_company_name(query=questions[0]["text"], companies_dict=companies_dict)
response_1

{'text': 'Did Calyxt, Inc. mention any mergers or acquisitions in the annual report?', 'kind': 'boolean'}
Found company name with re: Calyxt, Inc.


'NASDAQ_CLXT_2022.pdf'

In [7]:
def get_document(company, verbose=True):
    """Return the uploaded annual report for ``company``, uploading it if missing.

    The files already known to the notebook-level ``client`` are indexed by
    display name. If one is named ``"<company>_annual_report"`` it is returned;
    otherwise ``../data/pdfs/<company>`` is uploaded under that display name
    and the upload result is returned.

    Args:
        company: File name of the report inside ``../data/pdfs/``.
        verbose: If true, print a message before uploading.
    """
    display_name = f"{company}_annual_report"
    uploaded_by_name = {doc.display_name: doc for doc in client.files.list()}

    # Already uploaded: reuse it.
    if display_name in uploaded_by_name:
        return uploaded_by_name[display_name]

    if verbose:
        print(f"Uploading {company} annual report...")
    return client.files.upload(
        file=f"../data/pdfs/{company}",
        config={'display_name': display_name},
    )

# Upload PDF manually
# company_document = client.files.upload(file=test_pdf_path, config={'display_name': f'{company}_annual_report'})

In [8]:
def ask_gemini(query, companies_dict=None, verbose=True):
    """Answer one question against the matching company's annual report.

    The company is resolved from ``query["text"]`` via ``get_company_name``.
    Its report is then fetched or uploaded via ``get_document`` and sent to the
    model together with the system prompt and the stringified question,
    requesting an ``AnswerLLM``-shaped JSON reply at temperature 0.

    Uses the notebook-level ``client``, ``model_id`` and ``system_prompt``.

    Args:
        query: Question dict; ``query["text"]`` is used for company lookup and
            ``str(query)`` is appended to the prompt.
        companies_dict: Mapping of company name -> metadata, forwarded to
            ``get_company_name``. Despite the ``None`` default it is required
            in practice, because ``get_company_name`` calls ``.keys()`` on it.
        verbose: Forwarded to the helpers; also prints the parsed answer.

    Returns:
        ``(answer, sha1)``. ``answer`` is the parsed ``AnswerLLM``. If no
        company could be resolved, the result is an "N/A" answer with
        ``sha1=None``. If the model reply yields no parsed object, the result
        is an "N/A" answer with the company's ``sha1``, so callers can always
        read ``answer.value`` and ``answer.references``.
    """
    company_id, sha1 = get_company_name(query["text"], companies_dict, verbose)

    if company_id == "N/A":
        return AnswerLLM(chain_of_thought="Company name not found", value=["N/A"], references=[]), None

    document = get_document(company_id, verbose)

    # Built only once we know a request will actually be sent.
    prompt = system_prompt + "\n\nQuery: " + str(query)

    response = client.models.generate_content(
        model=model_id,
        contents=[prompt, document],
        config={'response_mime_type': 'application/json', 'response_schema': AnswerLLM, "temperature": 0.0},
    )
    answer = response.parsed
    if verbose:
        print(answer)

    # `parsed` can be None (e.g. the reply could not be turned into an
    # AnswerLLM object). Returning None would crash callers that read
    # `answer.value`, so report it as an unanswered question instead.
    if answer is None:
        return AnswerLLM(chain_of_thought="Model response could not be parsed", value=["N/A"], references=[]), sha1

    return answer, sha1

In [83]:
# End-to-end check on the first loaded question
print(questions[0])
response_0, sha1 = ask_gemini(query=questions[0], companies_dict=companies_dict)

{'text': 'Did Calyxt, Inc. mention any mergers or acquisitions in the annual report?', 'kind': 'boolean'}
Found company name with re: Calyxt, Inc.
chain_of_thought='The report mentions a proposed merger with Cibus Global, LLC.' value=['yes'] references=[SourceReference(page_index=4), SourceReference(page_index=6)]


## Create final submission

In [9]:
# Build the submission object from the model answers (it is written to disk in
# a separate cell further down).
answer_items = []
for question in questions[:3]:  # FIXME: only the first 3 questions are processed
    answer, sha1 = ask_gemini(question, companies_dict)

    print("")
    print(answer)
    print(sha1)
    print(question)

    # The model always answers with a list of strings. Collapse it:
    #   - empty list or leading "N/A" -> "N/A"
    #   - exactly one entry           -> that entry
    #   - several entries             -> the list itself
    # (Previously the "N/A" branch was a separate `if` that the following
    # if/else could overwrite, and an empty list raised IndexError.)
    if not answer.value or answer.value[0] == "N/A":
        value = "N/A"
    elif len(answer.value) == 1:
        value = answer.value[0]
    else:
        value = answer.value

    # Attach the document hash to every page reference returned by the model.
    references = [
        SourceReference(pdf_sha1=sha1, page_index=reference.page_index)
        for reference in (answer.references or [])
    ]

    answer_item = Answer(question_text=question["text"], kind=question["kind"], value=value, references=references)
    answer_items.append(answer_item)

final_submission = AnswerSubmission(answers=answer_items, team_email="felix.krause@timetoact.at", submission_name="gemini")

chain_of_thought="The query contains a misspelling of a company name. 'Backpowder Inc' is not in the list of companies, and its most likely intention 'BlackRock' is too distant, thus it is classified as SKIP." value='SKIP'

chain_of_thought='Company name not found' value=['N/A'] references=[]
None
{'text': 'Did Backpowder Inc mention any mergers or acquisitions in the annual report?', 'kind': 'boolean'}
Found company name with re: Yellow Pages Limited
Uploading TSX_Y_2022.pdf annual report...
chain_of_thought='The annual report states that the Adjusted EBITDA margin for the year ended December 31, 2022, was 36.0% of revenues. Adjusted EBITDA margin is used as a proxy for operating margin.' value=['36.0'] references=[SourceReferenceLLM(page_index=8)]

chain_of_thought='The annual report states that the Adjusted EBITDA margin for the year ended December 31, 2022, was 36.0% of revenues. Adjusted EBITDA margin is used as a proxy for operating margin.' value=['36.0'] references=[SourceRefer

In [14]:
# Write the submission to disk as indented JSON
final_submission_path = "../data/submissions/gemini_test_submission.json"

with open(final_submission_path, mode="w") as file:
    submission_payload = final_submission.model_dump()
    json.dump(submission_payload, fp=file, indent=4)