# QA SQL query generator using DeepSeek


**Authors**
1. Alfan Dinda Rahmawan (alfan.d.rahmawan@gdplabs.id)

## Install dependencies

In [1]:
%pip install -q langchain=="0.3.0"
%pip install -q langchain-aws=="0.2.7"
%pip install -q langchain_openai=="0.2.14"
%pip install -q boto3=="1.35.71"
%pip install -q pandas=="2.2.2"
%pip install -q tqdm=="4.66.4"
%pip install google-api-python-client==2.100.0 gspread==5.10.0
%pip install python-dotenv
%pip install pymysql==1.0.2
%pip install -q mysql-connector-python==9.2.0

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting pymysql==1.0.2
  Downloading PyMySQL-1.0.2-py3-none-any.whl.metadata (5.1 kB)
Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
Installing collected packages: pymysql
  Attempting uninstall: pymysql
    Found existing installation: PyMySQL 1.1.1
    Uninstalling PyMySQL-1.1.1:
      Successfully uninstalled PyMySQL-1.1.1
Successfully installed pymysql-1.0.2
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the 

## Setting Environment

### Database Connection

In [38]:
from dotenv import load_dotenv
import mysql.connector
from mysql.connector import Error
from typing import Dict, List, Any, Optional


def connect_to_mariadb(database_name: str) -> Optional[mysql.connector.connection.MySQLConnection]:
    """
    Connect to the MariaDB database.

    Connection settings default to the local demo instance and can be overridden
    with the MARIADB_HOST, MARIADB_PORT, MARIADB_USER and MARIADB_PASSWORD
    environment variables, so credentials do not have to live in the notebook.

    Args:
        database_name (str): Name of the database (schema) to select on connect.

    Returns:
        Optional[mysql.connector.connection.MySQLConnection]: A connection object if successful, None otherwise.
    """
    import os  # local import: this cell does not import os itself

    try:
        connection = mysql.connector.connect(
            host=os.getenv("MARIADB_HOST", "localhost"),
            port=int(os.getenv("MARIADB_PORT", "33062")),
            user=os.getenv("MARIADB_USER", "app_user_demo"),
            # NOTE(review): the literal fallback keeps existing local runs working;
            # prefer setting MARIADB_PASSWORD and removing it from the notebook.
            password=os.getenv("MARIADB_PASSWORD", "StrongPassw0rd!"),
            database=database_name
        )

        if connection.is_connected():
            db_info = connection.get_server_info()
            print(f"Connected to MariaDB Server version {db_info}")
            return connection

        # connect() returned without raising but the link is not usable.
        print("Error connecting to MariaDB: connection is not active")
        return None

    except (Error, ValueError) as e:
        # ValueError covers a non-numeric MARIADB_PORT override.
        print(f"Error connecting to MariaDB: {e}")
        return None

def execute_query(connection: mysql.connector.connection.MySQLConnection, query: str) -> List[Dict[str, Any]]:
    """
    Execute a query on the MariaDB database.

    Args:
        connection: The database connection object.
        query: The SQL query to execute.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing the query results.

    Raises:
        Whatever the driver raises from ``execute``/``fetchall``; the cursor is
        closed before the exception propagates.
    """
    cursor = connection.cursor(dictionary=True)
    try:
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Always release the cursor, even when the query fails.
        cursor.close()

# Open one connection per database; connect_to_mariadb returns None when the attempt fails.
time_management_db = connect_to_mariadb("ru4f_time_management")
core_employee_db = connect_to_mariadb("ru4f_core_employee")


Connected to MariaDB Server version 5.5.5-10.5.28-MariaDB-ubu2004
Connected to MariaDB Server version 5.5.5-10.5.28-MariaDB-ubu2004


### Google Auth

In [2]:
import os
import pandas as pd
import json

from typing import List
from dotenv import load_dotenv

# Load variables from a .env file into the process environment before reading them below.
load_dotenv()

# Target spreadsheet and worksheet for the test data.
GOOGLE_SPREADSHEET_ID: str = "1dDMqrol_DrEMjvLy88IRu2WdHN7T5BU0LrD8ORLuNPI" # put your spreadsheet id here
GOOGLE_SPREADSHEET_URL: str = f"https://docs.google.com/spreadsheets/d/{GOOGLE_SPREADSHEET_ID}/edit?usp=sharing" # put your spreadsheet link here
DATA_TEST_SHEET_NAME: str = "catapa_syntetics_time_management_augment3_prompt2"

# Credentials come from the environment; os.getenv returns None when a variable is not set.
GOOGLE_SHEETS_CLIENT_EMAIL: str = os.getenv('GOOGLE_SHEETS_CLIENT_EMAIL')
GOOGLE_SHEETS_PRIVATE_KEY: str = os.getenv('GOOGLE_SHEETS_PRIVATE_KEY')

In [3]:
# Google Authentication
from modules.google_sheets_writer import GoogleUtil

# Build the Google helper from the credentials read from the environment.
# NOTE(review): presumably GoogleUtil authenticates with these two values — confirm in modules.google_sheets_writer.
PRIVATE_KEY = GOOGLE_SHEETS_PRIVATE_KEY
google: GoogleUtil = GoogleUtil(PRIVATE_KEY, GOOGLE_SHEETS_CLIENT_EMAIL)


## SQL Query fixer

In [4]:
import pandas as pd
from pydantic import BaseModel, Field
from typing import List
from langchain_core.output_parsers import JsonOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from tqdm import tqdm

from modules.database_info.schema import employee_schema, time_management_schema
from modules.database_info.master_data import employee_master_data, time_management_master_data
from modules.database_info.relation import employee_relations, time_management_relations
from modules.database_info.trustee_tables import data_trustee_employee, data_trustee_time_management
from modules.database_info.anonymize_entities import anonymized_entities_description

In [5]:
# System prompt for the SQL-fixing chain. It is later passed through
# SystemMessagePromptTemplate.from_template, so any literal braces added here
# would be interpreted as template variables.
SYSTEM_MESSAGE = """<instructions>
You are a specialized SQL query fixer and generator for an HR database. Your task is to analyze SQL queries that have errors, fix them, and generate multiple candidate solutions that correctly answer business questions in Indonesian language based on the provided HR database schema.

ADDITIONAL GUIDELINES
- Generate optimized, correct SQL queries that follow best practices
- Consider privacy concerns with data trustee fields and anonymized entities
- Include comments in complex SQL queries to explain the logic
- Format queries appropriately for readability
- Use appropriate joins, filters, and aggregations based on the question
- Make sure the output is in valid, properly formatted JSON
- Think step-by-step before generating the final SQL query
- When fixing errors, carefully analyze the error message and identify the root cause
- Generate multiple candidate solutions with different approaches when possible
</instructions>"""

# Human-message template for the SQL-fixing chain. Single-brace names are template
# variables filled at invoke time; doubled braces render as literal braces.
# The JSON example must itself be valid JSON (no trailing comma), because the
# model is told to reproduce exactly this structure.
USER_MESSAGE = """
**General SQL Generation Guidelines**:
The generated SQL must be directly executable in MariaDB 10.5.23.
- Do NOT select identifiers (e.g., `id`, `employee_id`) except when used within aggregate functions like `MAX`, `SUM`, or `AVG`.
- Always display human-readable names rather than IDs (e.g., use `religions.name` instead of `religions.id`).
- Prefix column names with the table name in the SELECT clause to avoid ambiguity.
- Use snake_case for column aliases in the SELECT clause. Do NOT use aliases in the FROM clause.
- All date literals must be wrapped in `STR_TO_DATE()`, and when comparing dates, ensure the comparator is of DATE type using `CAST()`.
- Use aggregate functions only in the SELECT or HAVING clauses, not within the WHERE clause.
- JOIN conditions must reference the correct foreign keys, and if an expected display name is missing, add an extra JOIN to the appropriate master table.
- Handle division carefully by ensuring denominators are non-zero.
- Rename output column names based on the user's instruction language for clarity.

**Dynamic Schema Handling**:
- Use the provided placeholders dynamically:

**Database Schema**:
- The schema for the database is as follows:
  {schema}

**Table Relationships**:
- The relationships between these tables are described below:
  {relations}

**Master Data**:
- Master data in the database are listed here:
  {master_data}

**Data Trustee Enabled Tables**:
- List of tables requiring a data trustee and specific columns for JOIN operations. Entries may indicate a prerequisite join with another table before joining to the `employment_statuses` table, denoted by `table_name.column_name`. A join must first occur with `table_name` then followed by a join to `employment_statuses` using `column_name` from `table_name`:
  {data_trustee_tables}

**Anonymized Entities**:
- List of anonymized entities along with their descriptions that may be used in the `WHERE` clause:
  {anonymized_entities_description}

- If new tables or columns are introduced in the schema, apply the same rules without hardcoding table-specific logic.
- If a referenced table or column is missing a display name, join to the corresponding master table to retrieve it.

**Data Trustee Requirements**:
- If any table in the query appears in the `Data Trustee Enabled Tables` list, then:
  - Add a JOIN with the `employment_statuses` table using the `employee_id` column.
  - Add filters:
    `employment_statuses.organization_id IN ('[ORGANIZATION_IDS]')`
    `AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]')`
    `AND employment_statuses.location_id IN ('[LOCATION_IDS]')`.
  - If the table requires a prerequisite join (as indicated in the list), perform that JOIN before joining with `employment_statuses`.
- If no table from the list is used, do not include any data trustee-specific JOINs or filters.

**Date Handling**:
- Use the provided `{current_date}` as the reference for any date-related calculations.
- Always wrap date strings in `STR_TO_DATE()` and use `CAST()` where necessary to ensure proper date comparisons.

**Query Examples** (for reference):
- *Turnover Rate*:
  Use a WITH clause to generate a series of years, and then calculate the turnover rate using appropriate JOINs and conditions.
- *Quarterly New Employee Count*:
  Count employees hired per quarter, ensuring that only active employees are counted by including `WHERE employees.active = TRUE`.
- *Employee Managerial Status*:
  Use a CASE statement to determine if an employee is a manager by checking if their `id` appears as a `manager_id` in the employees table.

**User Instruction**:
- The business question is provided in the placeholder `{business_question}`.
- The current SQL query with errors is provided in the placeholder `{current_sql_query}`.
- The error output from executing the current SQL query is provided in the placeholder `{error_output}`.

**Task**:
1. **Analyze Error**:
   - Carefully analyze the error message to identify the root cause of the issue.

2. **Fix SQL Query**:
   - Generate three candidate SQL queries that fix the identified issues while correctly answering the business question.
   - Each candidate should use a slightly different approach when possible.

3. **Ensure Data Trustee Compliance**:
   - If any table from the `Data Trustee Enabled Tables` is used, add the necessary JOINs and filters as specified.

**Output Format**:
- Your response must be a valid JSON object with the following structure:
```json
{{
  "sql_queries": {{
    "business_question": "<user_business_question>",
    "sql_query_candidate_1": "<sql_query_candidate_1>",
    "sql_query_candidate_2": "<sql_query_candidate_2>",
    "sql_query_candidate_3": "<sql_query_candidate_3>"
  }}
}}
```
- Do NOT output any additional text besides this JSON object.
"""


In [19]:
# Pydantic model with two string fields: a paraphrased input and its SQL query.
# (Documented with comments rather than a docstring so the model's JSON schema is unchanged.)
class QnAPair(BaseModel):
    paraphrased_input: str = Field(description="paraphrased_input")
    test_to_sql_query: str = Field(description="test_to_sql_query")

# Container model holding a list of QnAPair items; passed to JsonOutputParser below.
class QnAPairs(BaseModel):
    qna_pairs: List[QnAPair] = Field(description="list of qna pairs")

## OpenAI / DeepSeek
# Model name, endpoint and key are read from the environment; each is None when unset.
DEEPSEEK_MODEL_NAME = os.getenv("DEEPSEEK_MODEL")
DEEPSEEK_ENDPOINT = os.getenv("DEEPSEEK_ENDPOINT")
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")

# ChatOpenAI client pointed at the endpoint configured above.
llm = ChatOpenAI(
    model_name=DEEPSEEK_MODEL_NAME,
    temperature=0.7,  # Higher temperature (0.7-0.9) for more creative variations
    openai_api_base=DEEPSEEK_ENDPOINT,
    openai_api_key=DEEPSEEK_API_KEY,
    top_p=0.95,  # Keep high top_p for diverse outputs while filtering unlikely tokens
    seed=42  # Optional: set seed for reproducibility
)

# Create prompt template
system_message_prompt = SystemMessagePromptTemplate.from_template(SYSTEM_MESSAGE)
human_message_prompt = HumanMessagePromptTemplate.from_template(USER_MESSAGE)
prompt = ChatPromptTemplate.from_messages([
    system_message_prompt,
    human_message_prompt
])
# NOTE(review): QnAPairs describes a `qna_pairs` list, while USER_MESSAGE asks for a
# `sql_queries` object and later cells read response['sql_queries'] — confirm whether
# pydantic_object should describe that structure instead.
parser = JsonOutputParser(pydantic_object=QnAPairs)
# Pipeline: render the prompt, call the model, parse the reply as JSON.
chain = prompt | llm | parser

### Sanity Check

#### Check Generate Data

In [34]:
from datetime import datetime

# Sample inputs for one manual run of the chain.
current_date = datetime.now().strftime("%d %B %Y")  # e.g. "11 April 2025"
database_type = "employee"
business_question = "Berapa persentase karyawan yang mengajukan pengunduran diri dengan alasan 'Pensiun' selama tahun 2023?"

# Failing query to be repaired; the bracketed placeholders are left unexpanded here.
current_sql_query = "SELECT (COUNT(DISTINCT termination_entries.employee_id) * 100.0 / (SELECT COUNT(DISTINCT employees.id) FROM employees JOIN employment_statuses ON employees.id = employment_statuses.employee_id WHERE employees.active = TRUE AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]') AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]') AND employment_statuses.location_id IN ('[LOCATION_IDS]')) AS 'persentase_pengunduran_diri' FROM termination_entries JOIN termination_reasons ON termination_entries.termination_reason_id = termination_reasons.id JOIN employees ON termination_entries.employee_id = employees.id JOIN employment_statuses ON employees.id = employment_statuses.employee_id WHERE termination_reasons.name = 'Pensiun' AND termination_entries.approval_status = 'APPROVED' AND CAST(termination_entries.effective_date AS DATE) BETWEEN STR_TO_DATE('2023-01-01', '%Y-%m-%d') AND STR_TO_DATE('2023-12-31', '%Y-%m-%d') AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]') AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]') AND employment_statuses.location_id IN ('[LOCATION_IDS]');"

# Error text handed to the model alongside the query.
# NOTE(review): the fragment quoted in this error (", 0) AS persentase_pengunduran_diri")
# does not appear verbatim in current_sql_query above — presumably captured from a
# different revision of the query; confirm.
error_output = """
Error executing query: (pymysql.err.ProgrammingError) (1064, "You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near ', 0) AS persentase_pengunduran_diri FROM termination_entries JOIN termination_re' at line 1")
"""

# Pick the prompt context (schema, relations, master data, data-trustee tables)
# matching the database under test; anything other than "employee" selects the
# time-management set.
if database_type == "employee":
    schema = employee_schema
    relations = employee_relations
    master_data = employee_master_data
    data_trustee_tables = data_trustee_employee
else:
    schema = time_management_schema
    relations = time_management_relations
    master_data = time_management_master_data
    data_trustee_tables = data_trustee_time_management

# Fill every template variable used by the prompt and run the chain once.
response = chain.invoke({
    "schema": schema,
    "relations": relations,
    "master_data": master_data,
    "data_trustee_tables": data_trustee_tables,
    "anonymized_entities_description": anonymized_entities_description,
    "current_date": current_date,
    "business_question": business_question,
    "current_sql_query": current_sql_query,
    "error_output": error_output
})
# Show the parsed response for manual inspection.
display(response)

2025-04-11 09:14:28,655 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


{'sql_queries': {'business_question': "Berapa persentase karyawan yang mengajukan pengunduran diri dengan alasan 'Pensiun' selama tahun 2023?",
  'sql_query_candidate_1': "SELECT \n  (COUNT(DISTINCT termination_entries.employee_id) * 100.0 / \n  (SELECT COUNT(DISTINCT employees.id) \n  FROM employees \n  JOIN employment_statuses ON employees.id = employment_statuses.employee_id \n  WHERE employees.active = TRUE \n  AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]') \n  AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]') \n  AND employment_statuses.location_id IN ('[LOCATION_IDS]')\n  ) AS persentase_pengunduran_diri \nFROM termination_entries \nJOIN termination_reasons ON termination_entries.termination_reason_id = termination_reasons.id \nJOIN employees ON termination_entries.employee_id = employees.id \nJOIN employment_statuses ON employees.id = employment_statuses.employee_id \nWHERE termination_reasons.name = 'Pensiun' \nAND termination_entries.approval_statu

#### Check database

In [8]:
# Manual smoke test: run one representative query on the time-management connection.
connection = time_management_db
query = """
SELECT
  organizations.name AS department,
  COUNT(attendances.id) AS absence_count
FROM
  employees
  LEFT JOIN employment_statuses ON employees.id = employment_statuses.employee_id
  LEFT JOIN organizations ON employment_statuses.organization_id = organizations.id
  LEFT JOIN attendances ON employees.id = attendances.employee_id
  LEFT JOIN attendance_statuses ON attendances.attendance_status_in_id = attendance_statuses.id
WHERE
  employees.active = TRUE
  AND attendance_statuses.attendance_type = 'ABSENT'
  AND attendances.date BETWEEN STR_TO_DATE('2025-02-01', '%Y-%m-%d') AND STR_TO_DATE('2025-02-28', '%Y-%m-%d')
  AND organization_id IN ('428b67d8-aa38-43b3-9fbd-714e424e78b4', '48bee117-5c8e-4dd9-ad3a-ea0691fa311b', '5bd621f0-fd6e-445c-9c65-f12e0ed97e76', '9d1e8e05-50c8-45ee-8aaf-840da1e2b2ba', '9f293790-0f46-4800-bda0-f700d51af3de', 'bdb90554-027a-43c2-abc5-5c26993b15a7')
  AND job_level_id IN ('4471d89c-7d48-460d-8d92-7b5b4e5fbea0', 'a9e8af86-fbd8-4cd3-a477-b3cff496bf40', 'aa1c7990-ac32-4522-842b-09ac7c6d04a6', 'b2ec711e-0848-4e32-8888-2bf279e3febd', 'cc1e8b6b-bdb9-4500-8d33-3afbe2f9f462')
  AND location_id IN ('12a76682-d541-4847-ba9f-e850eb519a76', '60090d71-af4f-47a3-a6fc-962218b1f4d1', '8bdd9889-4247-46f8-86a7-56fd75aa9c40')
GROUP BY
  organizations.name
ORDER BY
  absence_count DESC;
"""
cursor = connection.cursor(dictionary=True)
try:
    cursor.execute(query)
    result = cursor.fetchall()
finally:
    # Release the cursor on the shared connection even if the query fails.
    cursor.close()
print(result)

[{'department': 'Information Technology', 'absence_count': 3}, {'department': 'Human Resources', 'absence_count': 1}, {'department': 'Board of Directors', 'absence_count': 1}]


### Run SQL query fixing

In [39]:
import logging
import re
from datetime import datetime, time
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd
from sqlalchemy import text
from sqlalchemy.orm import Session
from tqdm import tqdm

from modules.constants import ColumnName

class SQLQueryTester:
    """Run SQL queries against named database connections and normalise the outcome.

    Placeholders such as ``[ORGANIZATION_IDS]`` are expanded with the distinct
    values found in ``employment_statuses`` before a query runs, and
    ``execute_query`` reports failures as an error string instead of raising.
    """

    def __init__(self, databases: Dict[str, Any], default_db: str = 'core', refresh_interval: Optional[int] = None):
        """Store the connections and load the initial placeholder values.

        The constructor queries ``default_db`` immediately to populate the filters.

        Args:
            databases (Dict[str, Any]): Mapping of database name to an open
                connection object exposing ``cursor(dictionary=True)``.
            default_db (str): Key in ``databases`` used for every query.
            refresh_interval (Optional[int]): Seconds between filter refreshes.
                None (or 0) means no auto-refresh.
        """
        self.databases = databases
        self.default_db = default_db
        self.refresh_interval = refresh_interval
        self.last_refresh = datetime.now()

        # Setup logging
        self.logger = logging.getLogger(__name__)

        # Cache of prepared queries, keyed by the original query text
        self._query_cache = {}
        self.filters = self._get_employment_filters()

    def _get_distinct_values(self, db: Any, table: str, column: str) -> List[str]:
        """Get distinct values from a specified column in a table.

        Args:
            db (Any): Database connection exposing ``cursor(dictionary=True)``
            table (str): Table name
            column (str): Column name

        Returns:
            List[str]: List of distinct non-null values, converted to strings
        """
        # Identifiers cannot be bound as query parameters, so they are interpolated;
        # only trusted, hard-coded table/column names may be passed in.
        query = f"SELECT DISTINCT {column} FROM {table} WHERE {column} IS NOT NULL"
        cursor = db.cursor(dictionary=True)
        try:
            cursor.execute(query)
            rows = cursor.fetchall()
        finally:
            # Release the cursor even when the query fails.
            cursor.close()
        return [str(row[column]) for row in rows]

    def _get_filters(self, db_name: str, filters_config: List[Dict[str, str]]) -> Dict[str, List[str]]:
        """Get filter values based on provided configuration.

        Args:
            db_name (str): Name of the database to query (e.g., 'core', 'time')
            filters_config (List[Dict[str, str]]): List of filter configurations.
                Each config should have:
                - 'key': Key for the returned dictionary
                - 'table': Table name to query
                - 'column': Column name to get distinct values from

        Returns:
            Dict[str, List[str]]: Dictionary containing filter values mapped to their keys

        Raises:
            ValueError: If ``db_name`` has no (truthy) entry in ``self.databases``.
        """
        db = self.databases.get(db_name)
        if not db:
            raise ValueError(f"Database '{db_name}' not found")

        result = {}
        for config in filters_config:
            values = self._get_distinct_values(db, config['table'], config['column'])
            if not values:
                self.logger.warning(f"No values found for {config['table']}.{config['column']}")
            result[config['key']] = values

        return result

    def _get_employment_filters(self) -> Dict[str, List[str]]:
        """Get all filter values needed for employment status queries.

        Returns:
            Dict[str, List[str]]: Distinct ``organization_id``, ``job_level_id`` and
                ``location_id`` values read from ``employment_statuses`` in the
                default database.
        """
        filters_config = [
            {'key': 'organization_id', 'table': 'employment_statuses', 'column': 'organization_id'},
            {'key': 'job_level_id', 'table': 'employment_statuses', 'column': 'job_level_id'},
            {'key': 'location_id', 'table': 'employment_statuses', 'column': 'location_id'}
        ]

        return self._get_filters(self.default_db, filters_config)

    def refresh_filters_if_needed(self) -> None:
        """Refresh filters if the refresh interval has passed."""
        if not self.refresh_interval:
            return

        now = datetime.now()
        elapsed = (now - self.last_refresh).total_seconds()

        if elapsed >= self.refresh_interval:
            self.logger.info("Refreshing filters due to interval expiration")
            self.filters = self._get_employment_filters()
            self.last_refresh = now
            # Cached queries embed the old filter values, so drop them.
            self._query_cache.clear()

    def prepare_query(self, sql_query: str) -> str:
        """Replace placeholder values in SQL query with actual filter values.

        Args:
            sql_query (str): Original SQL query with placeholders like [ORGANIZATION_IDS]
                or '[ORGANIZATION_IDS]'

        Returns:
            str: SQL query with placeholders replaced by actual filter values

        Raises:
            ValueError: If ``sql_query`` is empty.
        """
        # Input validation
        if not sql_query:
            raise ValueError("SQL query cannot be empty")

        # Refresh BEFORE consulting the cache: a refresh clears the cache, so doing
        # it afterwards would keep serving queries built from stale filter values.
        self.refresh_filters_if_needed()

        if sql_query in self._query_cache:
            return self._query_cache[sql_query]

        # Render each filter list as a comma-separated list of quoted SQL literals
        filter_values = {
            key: ", ".join(f"'{value}'" for value in self.filters[key])
            for key in ('organization_id', 'job_level_id', 'location_id')
        }

        # Quoted placeholders must be replaced first; otherwise the unquoted
        # replacement would leave the surrounding quotes behind.
        replacements = {
            "'[ORGANIZATION_IDS]'": filter_values['organization_id'],
            "'[JOB_LEVEL_IDS]'": filter_values['job_level_id'],
            "'[LOCATION_IDS]'": filter_values['location_id'],
            "[ORGANIZATION_IDS]": filter_values['organization_id'],
            "[JOB_LEVEL_IDS]": filter_values['job_level_id'],
            "[LOCATION_IDS]": filter_values['location_id']
        }

        prepared_query = sql_query
        for placeholder, value in replacements.items():
            prepared_query = prepared_query.replace(placeholder, value)

        self._query_cache[sql_query] = prepared_query

        return prepared_query

    def format_query_results(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Format query results by converting special data types to standard formats.

        Args:
            results (List[Dict[str, Any]]): List of dictionaries containing query results

        Returns:
            List[Dict[str, Any]]: Formatted results. Values with a ``strftime``
                method become ``'YYYY-MM-DD'`` strings (any time-of-day part is
                dropped); ``Decimal`` values become ``int`` when whole, otherwise
                ``float``; everything else is passed through unchanged.
        """
        formatted_results = []
        for row in results:
            formatted_row = {}
            for key, value in row.items():
                if hasattr(value, 'strftime'):
                    formatted_row[key] = value.strftime('%Y-%m-%d')
                elif isinstance(value, Decimal):
                    try:
                        is_whole = value % 1 == 0
                    except (ArithmeticError, ValueError, TypeError):
                        # e.g. Infinity raises InvalidOperation on `%`; fall back to float
                        is_whole = False
                    formatted_row[key] = int(value) if is_whole else float(value)
                else:
                    formatted_row[key] = value
            formatted_results.append(formatted_row)

        return formatted_results

    def execute_query(self, sql_query: str) -> Tuple[List[Dict[str, Any]], Any]:
        """Execute SQL query on the default database.

        Args:
            sql_query (str): SQL query to execute; placeholders are expanded first.

        Returns:
            Tuple[List[Dict[str, Any]], Any]: ``(rows, error)``. On success ``rows``
                holds the formatted result rows and ``error`` is an empty list; on
                failure ``rows`` is an empty list and ``error`` is the error message
                string.

        Raises:
            ValueError: If the default database has no (truthy) entry in ``self.databases``.
        """
        db_name = self.default_db

        connection = self.databases.get(db_name)
        if not connection:
            raise ValueError(f"Database '{db_name}' not found")
        cursor = connection.cursor(dictionary=True)
        try:
            final_query = self.prepare_query(sql_query)
            cursor.execute(final_query)
            results = cursor.fetchall()
            return self.format_query_results(results), []
        except Exception as e:
            # Failures are reported to the caller as data so a batch run can continue.
            error_msg = f"Error executing query: {str(e)}"
            return [], self.extract_error_message(error_msg)
        finally:
            # Always release the cursor; a failure while closing must not mask the result.
            try:
                cursor.close()
            except Exception:
                self.logger.debug("Failed to close cursor", exc_info=True)

    def extract_error_message(self, error_text: str) -> str:
        """Extract the error message from SQL error text.

        Args:
            error_text (str): The full error text containing SQL error information

        Returns:
            str: The first line of ``error_text``. "Table ... doesn't exist" errors
                are rewritten into a fixed ``(1146, "Table 'db.table' doesn't exist")``
                layout that keeps the database and table names from the original error.
        """
        lines = error_text.strip().split('\n')
        error_line = lines[0] if lines else ""

        if "Table" in error_line and "doesn't exist" in error_line:
            table_match = re.search(r"Table '([^']+)\.([^']+)' doesn't exist", error_line)

            if table_match:
                database = table_match.group(1)
                table = table_match.group(2)
                # Report the table that is actually missing rather than a fixed name.
                return f"Error executing query: (pymysql.err.ProgrammingError) (1146, \"Table '{database}.{table}' doesn't exist\")"

        return error_line


# Usage example without a main function

# Root logger configuration: INFO level with timestamp, logger name and level in each record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Registry of open connections, keyed by the short names used for `default_db`.
databases = {
    'core': core_employee_db,
    'time': time_management_db
}


# Input dataset location and the database it targets.
sysntetics_data_dir = "augmented_sql_data_2"
file_name = "augmented_data_time_management_196_questions_complete.csv"
# database_type = "core"
database_type = "time"  # must be a key of `databases`

# Load the questions/queries to validate and build the tester; constructing
# SQLQueryTester queries the selected database to load its filter values.
test_data = pd.read_csv(f"{sysntetics_data_dir}/{file_name}")
query_tester = SQLQueryTester(databases, default_db=database_type)

# Reference date passed to the prompt, formatted like "11 April 2025".
current_date = datetime.now().strftime("%d %B %Y")

# Pick the prompt context (schema, relations, master data, data-trustee tables)
# matching the database under test; anything other than "core" selects the
# time-management set.
if database_type == "core":
    schema = employee_schema
    relations = employee_relations
    master_data = employee_master_data
    data_trustee_tables = data_trustee_employee
else:
    schema = time_management_schema
    relations = time_management_relations
    master_data = time_management_master_data
    data_trustee_tables = data_trustee_time_management


# print(qna_pair)
# Output files: one for queries that execute successfully, one for recorded failures.
save_correct_sql_datasets = os.path.join(sysntetics_data_dir, f"fix_{file_name}")
save_failed_sql_datasets = os.path.join(sysntetics_data_dir, f"failed_{file_name}")
final_result = []

# Empty frames with the expected columns; replaced below when a previous run left files behind.
df_failed_queries = pd.DataFrame(
    columns=[
        ColumnName.NO,
        ColumnName.BASE_PROMPT,
        ColumnName.PROMPT,
        ColumnName.FAILED_SQL_QUERY, ColumnName.ERROR
    ]
)
df_query_result = pd.DataFrame(
    columns=[
        ColumnName.NO,
        ColumnName.BASE_PROMPT,
        ColumnName.PROMPT,
        ColumnName.EXPECTED_SQL_QUERY,
        ColumnName.EXPECTED_QUERY_RESULT,
        ColumnName.TIME_TAKEN
    ]
)
# Row numbers already handled by an earlier run; the main loop skips them.
processed_nos = set()
failed_nos = set()

# Resume support: reload earlier outputs when the files exist.
# NOTE(review): the column is read as the literal 'No' here while the frames above use
# ColumnName.NO — presumably the same value; confirm in modules.constants.
if os.path.exists(save_correct_sql_datasets):
    df_query_result = pd.read_csv(save_correct_sql_datasets)
    processed_nos = set(df_query_result['No'].tolist())

if os.path.exists(save_failed_sql_datasets):
    df_failed_queries = pd.read_csv(save_failed_sql_datasets)
    failed_nos = set(df_failed_queries['No'].tolist())

# Iterate through test data rows with index
for idx, (_, row) in enumerate(test_data.iterrows()):
    # Only process first row for testing
    # if idx != 0:
    #     continue

    # Extract SQL query and question from row
    if ColumnName.BASE_PROMPT in row:
        base_prompt = row[ColumnName.BASE_PROMPT]
    else:
        base_prompt = row[ColumnName.PROMPT]
    no = row[ColumnName.NO]
    question = row[ColumnName.PROMPT]
    sql_query = row[ColumnName.EXPECTED_SQL_QUERY]

    # Skip already processed items
    if no in processed_nos or no in failed_nos:
        logging.info({"Status": f"Skipped {no}"})
        # print(base_prompt)
        continue

    # Execute query and get results/error
    db_query_result, db_error_msg = query_tester.execute_query(sql_query)

    # If error occurs, print debug information
    if db_error_msg:
        # Skip query if the error is about a table not found
        table_not_found_match = re.search(r"Table '([^']+)\.([^']+)' doesn't exist", db_error_msg)
        if table_not_found_match:
            print(f"{no}: ⚠️ Skipped due to table not found error.")
            # Log the failure or handle it as needed
            failed_query = {
                ColumnName.NO: no,
                ColumnName.BASE_PROMPT: base_prompt,
                ColumnName.PROMPT: question,
                ColumnName.FAILED_SQL_QUERY: sql_query,
                ColumnName.ERROR: db_error_msg
            }
            # Optionally save failed queries to a separate file
            df_failed_queries = pd.concat([df_failed_queries, pd.DataFrame([failed_query])], ignore_index=True)
            df_failed_queries.to_csv(save_failed_sql_datasets, index=False)
            continue
        
        # Skip query if the error is about a column not found
        column_not_found_match = re.search(r"Unknown column '(([^']+)\.)?([^']+)' in", db_error_msg)
        if column_not_found_match:
            print(f"{no}: ⚠️ Skipped due to column not found error.")
            # Log the failure or handle it as needed
            failed_query = {
                ColumnName.NO: no,
                ColumnName.BASE_PROMPT: base_prompt,
                ColumnName.PROMPT: question,
                ColumnName.FAILED_SQL_QUERY: sql_query,
                ColumnName.ERROR: db_error_msg
            }
            # Optionally save failed queries to a separate file
            df_failed_queries = pd.concat([df_failed_queries, pd.DataFrame([failed_query])], ignore_index=True)
            df_failed_queries.to_csv(save_failed_sql_datasets, index=False)
            continue

        found_best_sql_query = False
        max_retries = 3  # Maximum number of LLM retries
        retry_count = 0
        start_time = datetime.now()
        while not found_best_sql_query and retry_count < max_retries:
            print(f"Processing... question {no} Retry attempt: {retry_count + 1}/{max_retries}")

            try:
                # Invoke the LLM chain to get SQL query candidates
                response = chain.invoke({
                    "schema": schema,
                    "relations": relations,
                    "master_data": master_data,
                    "data_trustee_tables": data_trustee_tables,
                    "anonymized_entities_description": anonymized_entities_description,
                    "current_date": current_date,
                    "business_question": question,
                    "current_sql_query": sql_query,
                    "error_output": db_error_msg  # Pass the current error message
                })

                print(f"LLM Response (Attempt {retry_count + 1}):")

                # Try each SQL query candidate
                for i in range(3):
                    candidate_name = f'sql_query_candidate_{i+1}'
                    if candidate_name in response['sql_queries']:
                        sql_query = response['sql_queries'][candidate_name]
                        print(sql_query)
                        print(f"\nTrying candidate {i+1}")

                        # Test the candidate query
                        db_query_result, db_error_msg = query_tester.execute_query(sql_query)

                        if not db_error_msg:
                            # We found a working query!
                            print(f"✅ Candidate {i+1} executed successfully!")
                            found_best_sql_query = True
                            end_time = datetime.now()
                            execution_time = (end_time - start_time).total_seconds()  # Calculate time taken and round to 2 decimal places for seconds
                            # Create result row
                            update_query = {
                                ColumnName.NO: no,
                                ColumnName.BASE_PROMPT: base_prompt,
                                ColumnName.PROMPT: question,
                                ColumnName.EXPECTED_SQL_QUERY: sql_query,
                                ColumnName.GENERATED_QUERY_RESULT: db_query_result,
                                ColumnName.TIME_TAKEN: execution_time
                            }
                            # Add to results dataframe
                            df_query_result = pd.concat([df_query_result, pd.DataFrame([update_query])], ignore_index=True)
                            df_query_result.to_csv(save_correct_sql_datasets, index=False)
                            break
                        else:
                            print(f"❌ Candidate {i+1} failed with error: {db_error_msg}")
            except Exception as e:
                print(f"❌ Error during LLM chain invocation: {str(e)}")
                db_error_msg = f"LLM chain invocation failed: {str(e)}"
                retry_count += 1
                continue

            # If we haven't found a working query, update the error message for the next retry
            if not found_best_sql_query:
                retry_count += 1
                # Update the error message to include information about failed attempts
                db_error_msg = f"Previous attempts failed. Last error: {db_error_msg}"
                print(f"No working candidates found. Retrying... ({retry_count}/{max_retries})")

        if not found_best_sql_query:
            print(f"⚠️ Failed to find working SQL query after {max_retries} attempts.")
            # Log the failure or handle it as needed
            failed_query = {
                ColumnName.NO: no,
                ColumnName.BASE_PROMPT: base_prompt,
                ColumnName.PROMPT: question,
                ColumnName.FAILED_SQL_QUERY: sql_query,
                ColumnName.ERROR: db_error_msg
            }
            # Optionally save failed queries to a separate file
            df_failed_queries = pd.concat([df_failed_queries, pd.DataFrame([failed_query])], ignore_index=True)
            df_failed_queries.to_csv(save_failed_sql_datasets, index=False)
    else:
        # Query executed successfully, no need for fixes
        print(f"{no}: ✅ Original query executed successfully!")
        success_query = {
            ColumnName.NO: no,
            ColumnName.BASE_PROMPT: base_prompt,
            ColumnName.PROMPT: question,
            ColumnName.EXPECTED_SQL_QUERY: sql_query,
            ColumnName.EXPECTED_QUERY_RESULT: db_query_result,
            ColumnName.TIME_TAKEN: 0
        }
        # Add to results dataframe
        df_query_result = pd.concat([df_query_result, pd.DataFrame([success_query])], ignore_index=True)
        df_query_result.to_csv(save_correct_sql_datasets, index=False)


2025-04-11 10:02:16,660 - INFO - {'Status': 'Skipped 51_1'}
2025-04-11 10:02:16,661 - INFO - {'Status': 'Skipped 51_2'}
2025-04-11 10:02:16,662 - INFO - {'Status': 'Skipped 51_3'}
2025-04-11 10:02:16,663 - INFO - {'Status': 'Skipped 52_1'}
2025-04-11 10:02:16,664 - INFO - {'Status': 'Skipped 52_2'}
2025-04-11 10:02:16,667 - INFO - {'Status': 'Skipped 52_3'}
2025-04-11 10:02:16,668 - INFO - {'Status': 'Skipped 53_1'}
2025-04-11 10:02:16,669 - INFO - {'Status': 'Skipped 53_2'}
2025-04-11 10:02:16,671 - INFO - {'Status': 'Skipped 53_3'}
2025-04-11 10:02:16,675 - INFO - {'Status': 'Skipped 54_1'}
2025-04-11 10:02:16,676 - INFO - {'Status': 'Skipped 54_2'}
2025-04-11 10:02:16,679 - INFO - {'Status': 'Skipped 54_3'}
2025-04-11 10:02:16,682 - INFO - {'Status': 'Skipped 55_1'}
2025-04-11 10:02:16,684 - INFO - {'Status': 'Skipped 55_2'}
2025-04-11 10:02:16,687 - INFO - {'Status': 'Skipped 55_3'}
2025-04-11 10:02:16,689 - INFO - {'Status': 'Skipped 56_1'}
2025-04-11 10:02:16,694 - INFO - {'Statu

105_1: ✅ Original query executed successfully!
105_2: ✅ Original query executed successfully!
105_3: ✅ Original query executed successfully!
106_1: ✅ Original query executed successfully!
Processing... question 106_2 Retry attempt: 1/3


2025-04-11 10:02:18,668 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH managers AS (
  SELECT DISTINCT manager_id 
  FROM employees 
  WHERE manager_id IS NOT NULL AND active = TRUE
),
department_stats AS (
  SELECT 
    organizations.name AS department,
    COUNT(DISTINCT CASE WHEN employees.id IN (SELECT manager_id FROM managers) THEN employees.id END) AS manager_count,
    COUNT(DISTINCT employees.id) AS total_employees
  FROM employees
  JOIN employment_statuses ON employees.id = employment_statuses.employee_id
  JOIN organizations ON employment_statuses.organization_id = organizations.id
  WHERE employees.active = TRUE
    AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]')
    AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]')
    AND employment_statuses.location_id IN ('[LOCATION_IDS]')
  GROUP BY organizations.name
)
SELECT 
  department,
  manager_count,
  total_employees - manager_count AS non_manager_count,
  ROUND(manager_count / NULLIF((total_employees - manager_count), 0), 2) AS manager_t

2025-04-11 10:03:10,507 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH company_avg AS (
    SELECT AVG(attendance_detail_recapitulations.paid_overtime/3600000) AS avg_overtime
    FROM attendance_detail_recapitulations
    JOIN employment_statuses ON attendance_detail_recapitulations.employee_id = employment_statuses.employee_id
    WHERE attendance_detail_recapitulations.date BETWEEN DATE_SUB(STR_TO_DATE('11 April 2025', '%d %M %Y'), INTERVAL 3 MONTH) AND STR_TO_DATE('11 April 2025', '%d %M %Y')
    AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]')
    AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]')
    AND employment_statuses.location_id IN ('[LOCATION_IDS]')
),
dept_stats AS (
    SELECT 
        organizations.name AS department,
        COUNT(DISTINCT attendance_detail_recapitulations.employee_id) AS total_employees,
        SUM(CASE WHEN (attendance_detail_recapitulations.paid_overtime/3600000) > (SELECT avg_overtime FROM company_avg) THEN 1 ELSE 0 END) AS above_avg_employees
    FROM 
      

2025-04-11 10:04:21,819 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 2):
WITH employee_overtime AS (
    SELECT 
        attendance_detail_recapitulations.employee_id,
        employment_statuses.organization_id,
        SUM(attendance_detail_recapitulations.paid_overtime)/3600000 AS total_overtime_hours
    FROM 
        attendance_detail_recapitulations
    JOIN 
        employment_statuses ON attendance_detail_recapitulations.employee_id = employment_statuses.employee_id
    WHERE 
        attendance_detail_recapitulations.date BETWEEN DATE_SUB(STR_TO_DATE('11 April 2025', '%d %M %Y'), INTERVAL 3 MONTH) AND STR_TO_DATE('11 April 2025', '%d %M %Y')
        AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]')
        AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]')
        AND employment_statuses.location_id IN ('[LOCATION_IDS]')
    GROUP BY 
        attendance_detail_recapitulations.employee_id, employment_statuses.organization_id
),
company_avg AS (
    SELECT AVG(total_overtime_hours) AS avg_overtime
  

2025-04-11 10:05:29,699 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 3):
WITH employee_overtime AS (
    SELECT 
        attendance_detail_recapitulations.employee_id,
        employment_statuses.organization_id,
        SUM(attendance_detail_recapitulations.paid_overtime)/3600000 AS total_overtime_hours
    FROM 
        attendance_detail_recapitulations
    JOIN 
        employment_statuses ON attendance_detail_recapitulations.employee_id = employment_statuses.employee_id
    WHERE 
        attendance_detail_recapitulations.date BETWEEN DATE_SUB(STR_TO_DATE('11 April 2025', '%d %M %Y'), INTERVAL 3 MONTH) AND STR_TO_DATE('11 April 2025', '%d %M %Y')
        AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]')
        AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]')
        AND employment_statuses.location_id IN ('[LOCATION_IDS]')
    GROUP BY 
        attendance_detail_recapitulations.employee_id, employment_statuses.organization_id
),
company_avg AS (
    SELECT AVG(total_overtime_hours) AS avg_overtime
  

2025-04-11 10:06:37,666 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH status_changes AS (
    SELECT 
        esh.employee_id,
        esh.effective_date,
        esh.job_level_id,
        esh.organization_id,
        o.name AS department,
        ed.gender,
        LAG(esh.job_level_id) OVER (PARTITION BY esh.employee_id ORDER BY esh.effective_date) AS prev_job_level,
        LAG(esh.organization_id) OVER (PARTITION BY esh.employee_id ORDER BY esh.effective_date) AS prev_organization
    FROM employment_status_histories esh
    JOIN organizations o ON esh.organization_id = o.id
    JOIN employee_details ed ON esh.employee_id = ed.employee_id
    JOIN employment_statuses es ON esh.employee_id = es.employee_id
    WHERE esh.effective_date >= DATE_SUB(CURRENT_DATE, INTERVAL 2 YEAR)
    AND es.organization_id IN ('[ORGANIZATION_IDS]')
    AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
    AND es.location_id IN ('[LOCATION_IDS]')
)
SELECT 
    department,
    gender,
    CASE 
        WHEN job_level_id > prev_job_level THEN 'Promos

2025-04-11 10:07:46,740 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 2):
WITH employee_changes AS (
    SELECT 
        esh.employee_id,
        e.name AS employee_name,
        esh.effective_date,
        jl_current.name AS current_level,
        jl_previous.name AS previous_level,
        org_current.name AS current_dept,
        org_previous.name AS previous_dept,
        ed.gender AS jenis_kelamin,
        ROW_NUMBER() OVER (PARTITION BY esh.employee_id ORDER BY esh.effective_date DESC) AS rn
    FROM employment_status_histories esh
    JOIN employment_status_histories esh_prev ON 
        esh.employee_id = esh_prev.employee_id AND 
        esh.effective_date > esh_prev.effective_date
    JOIN organizations org_current ON esh.organization_id = org_current.id
    JOIN organizations org_previous ON esh_prev.organization_id = org_previous.id
    JOIN job_levels jl_current ON esh.job_level_id = jl_current.id
    JOIN job_levels jl_previous ON esh_prev.job_level_id = jl_previous.id
    JOIN employees e ON esh.employee_id = e.id
    

2025-04-11 10:09:04,881 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 3):
WITH employee_status_changes AS (
    SELECT 
        esh.employee_id,
        employees.name AS employee_name,
        esh.effective_date,
        jl_current.name AS current_level,
        jl_previous.name AS previous_level,
        org_current.name AS current_dept,
        org_previous.name AS previous_dept,
        employee_details.gender AS jenis_kelamin,
        ROW_NUMBER() OVER (PARTITION BY esh.employee_id ORDER BY esh.effective_date DESC) AS rn
    FROM employment_status_histories esh
    JOIN (
        SELECT employee_id, MAX(effective_date) AS max_prev_date
        FROM employment_status_histories
        WHERE effective_date < (
            SELECT MAX(effective_date) 
            FROM employment_status_histories esh2 
            WHERE esh2.employee_id = employment_status_histories.employee_id
        )
        GROUP BY employee_id
    ) prev_dates ON esh.employee_id = prev_dates.employee_id
    JOIN employment_status_histories esh_prev ON esh_prev

2025-04-11 10:10:28,329 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH promotions AS (
    SELECT 
        employee_id,
        COUNT(*) AS promotion_count
    FROM (
        SELECT 
            employee_id,
            job_level_id,
            LAG(job_level_id) OVER (PARTITION BY employee_id ORDER BY effective_date) AS prev_job_level
        FROM employment_status_histories
        WHERE effective_date >= DATE_SUB(CURRENT_DATE, INTERVAL 3 YEAR)
        AND job_level_id IN ('[JOB_LEVEL_IDS]')
    ) t
    WHERE job_level_id > prev_job_level
    GROUP BY employee_id
),
education_levels AS (
    SELECT 
        e.employee_id,
        el.name AS education_level
    FROM educations e
    JOIN education_levels el ON e.education_level_id = el.id
    JOIN employment_statuses es ON e.employee_id = es.employee_id
    WHERE e.employee_id IN (SELECT employee_id FROM promotions)
SELECT 
    el.education_level,
    AVG(p.promotion_count) AS avg_promotions,
    COUNT(DISTINCT p.employee_id) AS employee_count
FROM promotions p
JOIN educati

2025-04-11 10:11:15,566 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 2):
WITH promotion_data AS (
    SELECT 
        esh.employee_id,
        esh.job_level_id,
        LAG(esh.job_level_id) OVER (PARTITION BY esh.employee_id ORDER BY esh.effective_date) AS prev_job_level,
        esh.effective_date
    FROM employment_status_histories esh
    JOIN employment_statuses es ON esh.employee_id = es.employee_id
    WHERE esh.effective_date >= DATE_SUB(STR_TO_DATE('11 April 2025', '%d %M %Y'), INTERVAL 3 YEAR)
    AND es.organization_id IN ('[ORGANIZATION_IDS]')
    AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
    AND es.location_id IN ('[LOCATION_IDS]')
),
promotions AS (
    SELECT 
        employee_id,
        COUNT(*) AS promotion_count
    FROM promotion_data
    WHERE job_level_id > prev_job_level
    GROUP BY employee_id
),
education_levels AS (
    SELECT 
        e.employee_id,
        el.name AS education_level
    FROM educations e
    JOIN education_levels el ON e.education_level_id = el.id
    JOIN employment_statuses es ON e.

2025-04-11 10:12:22,517 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 3):
WITH date_range AS (
    SELECT 
        STR_TO_DATE('11 April 2025', '%d %M %Y') AS end_date,
        DATE_SUB(STR_TO_DATE('11 April 2025', '%d %M %Y'), INTERVAL 3 YEAR) AS start_date
),
employee_promotions AS (
    SELECT 
        esh.employee_id,
        COUNT(CASE WHEN esh.job_level_id > LAG(esh.job_level_id) OVER (PARTITION BY esh.employee_id ORDER BY esh.effective_date) THEN 1 END) AS promotion_count
    FROM employment_status_histories esh
    JOIN employment_statuses es ON esh.employee_id = es.employee_id
    CROSS JOIN date_range dr
    WHERE esh.effective_date BETWEEN dr.start_date AND dr.end_date
    AND es.organization_id IN ('[ORGANIZATION_IDS]')
    AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
    AND es.location_id IN ('[LOCATION_IDS]')
    GROUP BY esh.employee_id
),
education_levels AS (
    SELECT 
        e.employee_id,
        el.name AS education_level
    FROM educations e
    JOIN education_levels el ON e.education_level_id = el.id
    JOI

2025-04-11 10:13:32,010 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH monthly_promotions AS (
  SELECT 
    YEAR(employment_status_histories.effective_date) AS year,
    MONTH(employment_status_histories.effective_date) AS month,
    COUNT(*) AS promotion_count
  FROM employment_status_histories
  JOIN employment_statuses ON employment_status_histories.employee_id = employment_statuses.employee_id
  WHERE employment_status_histories.effective_date >= DATE_SUB(STR_TO_DATE('11 April 2025', '%d %M %Y'), INTERVAL 2 YEAR)
  AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]')
  AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]')
  AND employment_statuses.location_id IN ('[LOCATION_IDS]')
  GROUP BY YEAR(employment_status_histories.effective_date), MONTH(employment_status_histories.effective_date)
)
SELECT 
  current.year,
  current.month,
  current.promotion_count,
  prev.promotion_count AS prev_year_count,
  ROUND((current.promotion_count - prev.promotion_count) * 100.0 / NULLIF(prev.promotion_count, 0), 2)

2025-04-11 10:14:30,392 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH manager_stats AS (
  SELECT 
    jl.name AS job_level,
    COUNT(e.id) AS direct_reports,
    m.id AS manager_id
  FROM employees m
  JOIN employees e ON e.manager_id = m.id
  JOIN employment_statuses es ON m.id = es.employee_id
  JOIN job_levels jl ON es.job_level_id = jl.id
  WHERE m.active = TRUE 
    AND e.active = TRUE
    AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
    AND es.location_id IN ('[LOCATION_IDS]')
    AND es.organization_id IN ('[ORGANIZATION_IDS]')
  GROUP BY m.id, jl.name
)
SELECT 
  job_level,
  COUNT(manager_id) AS manager_count,
  AVG(direct_reports) AS avg_reports,
  MIN(direct_reports) AS min_reports,
  MAX(direct_reports) AS max_reports,
  STDDEV(direct_reports) AS stddev_reports
FROM manager_stats
GROUP BY job_level
ORDER BY stddev_reports DESC;

Trying candidate 1
✅ Candidate 1 executed successfully!
113_2: ✅ Original query executed successfully!
113_3: ⚠️ Skipped due to column not found error.
114_1: ✅ Original query executed s

2025-04-11 10:15:20,227 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
SELECT 
    YEAR(e.join_date) AS tahun_bergabung, 
    jl.name AS level_jabatan, 
    FLOOR(AVG(DATEDIFF(e.join_date, ed.date_of_birth)/365) AS usia_rata_rata_saat_bergabung, 
    FLOOR(AVG(DATEDIFF(CURRENT_DATE, ed.date_of_birth)/365)) AS usia_rata_rata_sekarang, 
    COUNT(e.id) AS jumlah_karyawan 
FROM 
    employee_details ed 
    JOIN employees e ON ed.employee_id = e.id 
    JOIN employment_statuses es ON e.id = es.employee_id 
    JOIN job_levels jl ON es.job_level_id = jl.id 
WHERE 
    e.active = TRUE 
    AND es.organization_id IN ('[ORGANIZATION_IDS]') 
    AND es.job_level_id IN ('[JOB_LEVEL_IDS]') 
    AND es.location_id IN ('[LOCATION_IDS]') 
    AND e.join_date >= DATE_SUB(CURRENT_DATE, INTERVAL 5 YEAR) 
GROUP BY 
    YEAR(e.join_date), 
    jl.name 
ORDER BY 
    tahun_bergabung, 
    level_jabatan;

Trying candidate 1
❌ Candidate 1 failed with error: Error executing query: 1064 (42000): You have an error in your SQL syntax; check the manual th

2025-04-11 10:16:11,774 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 2):
SELECT 
    YEAR(e.join_date) AS tahun_bergabung, 
    jl.name AS level_jabatan, 
    FLOOR(AVG(DATEDIFF(e.join_date, ed.date_of_birth)/365)) AS usia_rata_rata_saat_bergabung, 
    FLOOR(AVG(DATEDIFF(STR_TO_DATE('11 April 2025', '%d %M %Y'), ed.date_of_birth)/365)) AS usia_rata_rata_sekarang, 
    COUNT(e.id) AS jumlah_karyawan 
FROM 
    employee_details ed 
    JOIN employees e ON ed.employee_id = e.id 
    JOIN employment_statuses es ON e.id = es.employee_id 
    JOIN job_levels jl ON es.job_level_id = jl.id 
WHERE 
    e.active = TRUE 
    AND es.organization_id IN ('[ORGANIZATION_IDS]') 
    AND es.job_level_id IN ('[JOB_LEVEL_IDS]') 
    AND es.location_id IN ('[LOCATION_IDS]') 
    AND e.join_date BETWEEN DATE_SUB(STR_TO_DATE('11 April 2025', '%d %M %Y'), INTERVAL 5 YEAR) AND STR_TO_DATE('11 April 2025', '%d %M %Y') 
GROUP BY 
    YEAR(e.join_date), 
    jl.name 
ORDER BY 
    tahun_bergabung, 
    level_jabatan;

Trying candidate 1
❌ Candidate 1 failed

2025-04-11 10:17:02,040 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 3):
SELECT 
    YEAR(employees.start_date) AS tahun_bergabung, 
    job_levels.name AS level_jabatan, 
    FLOOR(AVG(DATEDIFF(employees.start_date, employee_details.date_of_birth)/365)) AS usia_rata_rata_saat_bergabung, 
    FLOOR(AVG(DATEDIFF(STR_TO_DATE('11 April 2025', '%d %M %Y'), employee_details.date_of_birth)/365)) AS usia_rata_rata_sekarang, 
    COUNT(employees.id) AS jumlah_karyawan 
FROM 
    employee_details 
    JOIN employees ON employee_details.employee_id = employees.id 
    JOIN employment_statuses ON employees.id = employment_statuses.employee_id 
    JOIN job_levels ON employment_statuses.job_level_id = job_levels.id 
WHERE 
    employees.active = TRUE 
    AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]') 
    AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]') 
    AND employment_statuses.location_id IN ('[LOCATION_IDS]') 
    AND YEAR(employees.start_date) BETWEEN YEAR(DATE_SUB(STR_TO_DATE('11 April 2025', '%d %M %Y'),

2025-04-11 10:18:01,545 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
SELECT 
    job_levels.name AS job_level, 
    FLOOR(AVG(DATEDIFF(CURRENT_DATE, employee_details.date_of_birth)/365)) AS avg_age, 
    FLOOR(AVG(DATEDIFF(CURRENT_DATE, employees.join_date)/365)) AS avg_tenure, 
    COUNT(employees.id) AS employee_count, 
    (SUM((DATEDIFF(CURRENT_DATE, employee_details.date_of_birth) * DATEDIFF(CURRENT_DATE, employees.join_date)) - (SUM(DATEDIFF(CURRENT_DATE, employee_details.date_of_birth)) * SUM(DATEDIFF(CURRENT_DATE, employees.join_date)) / COUNT(employees.id)) / 
    (SQRT((SUM(POW(DATEDIFF(CURRENT_DATE, employee_details.date_of_birth), 2)) - (POW(SUM(DATEDIFF(CURRENT_DATE, employee_details.date_of_birth)), 2) / COUNT(employees.id))) * 
    SQRT((SUM(POW(DATEDIFF(CURRENT_DATE, employees.join_date), 2)) - (POW(SUM(DATEDIFF(CURRENT_DATE, employees.join_date)), 2) / COUNT(employees.id))))) AS age_tenure_correlation 
FROM employee_details 
JOIN employees ON employee_details.employee_id = employees.id 
JOIN employment_statuses

2025-04-11 10:18:56,177 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 2):
SELECT 
    job_levels.name AS job_level, 
    FLOOR(AVG(DATEDIFF(CURRENT_DATE, employee_details.date_of_birth)/365) AS avg_age, 
    FLOOR(AVG(DATEDIFF(CURRENT_DATE, employees.join_date)/365) AS avg_tenure, 
    COUNT(employees.id) AS employee_count, 
    (COUNT(employees.id) * SUM(DATEDIFF(CURRENT_DATE, employee_details.date_of_birth) * DATEDIFF(CURRENT_DATE, employees.join_date) - 
    SUM(DATEDIFF(CURRENT_DATE, employee_details.date_of_birth)) * SUM(DATEDIFF(CURRENT_DATE, employees.join_date)) / 
    SQRT((COUNT(employees.id) * SUM(POW(DATEDIFF(CURRENT_DATE, employee_details.date_of_birth), 2)) - POW(SUM(DATEDIFF(CURRENT_DATE, employee_details.date_of_birth)), 2)) * 
    (COUNT(employees.id) * SUM(POW(DATEDIFF(CURRENT_DATE, employees.join_date), 2)) - POW(SUM(DATEDIFF(CURRENT_DATE, employees.join_date)), 2))) AS age_tenure_correlation 
FROM employee_details 
JOIN employees ON employee_details.employee_id = employees.id 
JOIN employment_statuses ON employee

2025-04-11 10:19:50,442 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 3):
WITH employee_stats AS (
    SELECT 
        job_levels.name AS job_level, 
        DATEDIFF(CURRENT_DATE, employee_details.date_of_birth) AS age_days, 
        DATEDIFF(CURRENT_DATE, employees.join_date) AS tenure_days
    FROM employee_details 
    JOIN employees ON employee_details.employee_id = employees.id 
    JOIN employment_statuses ON employees.id = employment_statuses.employee_id 
    JOIN job_levels ON employment_statuses.job_level_id = job_levels.id 
    WHERE employees.active = TRUE 
    AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]') 
    AND job_levels.id IN ('[JOB_LEVEL_IDS]')
),
correlation_data AS (
    SELECT 
        job_level, 
        FLOOR(AVG(age_days)/365) AS avg_age, 
        FLOOR(AVG(tenure_days)/365) AS avg_tenure, 
        COUNT(*) AS employee_count,
        (SUM(age_days * tenure_days) - SUM(age_days) * SUM(tenure_days) / COUNT(*)) / 
        (SQRT(SUM(age_days * age_days) - SUM(age_days) * SUM(age_days) / COUNT

2025-04-11 10:20:49,684 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH current_subordinates AS (
  SELECT 
    m.id AS manager_id,
    m.name AS manager_name,
    COUNT(e.id) AS current_count
  FROM employees e
  JOIN employees m ON e.manager_id = m.id
  JOIN employment_status_histories esh ON m.id = esh.employee_id
  WHERE esh.effective_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 3 MONTH)
    AND esh.organization_id IN ('[ORGANIZATION_IDS]')
    AND esh.job_level_id IN ('[JOB_LEVEL_IDS]')
    AND esh.location_id IN ('[LOCATION_IDS]')
  GROUP BY m.id, m.name
),
past_subordinates AS (
  SELECT 
    m.id AS manager_id,
    COUNT(e.id) AS past_count
  FROM employees e
  JOIN employees m ON e.manager_id = m.id
  JOIN employment_status_histories esh ON m.id = esh.employee_id
  WHERE esh.effective_date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 15 MONTH) AND DATE_SUB(CURRENT_DATE(), INTERVAL 12 MONTH)
    AND esh.organization_id IN ('[ORGANIZATION_IDS]')
    AND esh.job_level_id IN ('[JOB_LEVEL_IDS]')
    AND esh.location_id IN ('[LO

2025-04-11 10:22:00,492 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH company_avg AS (
    SELECT AVG(attendance_detail_recapitulations.paid_overtime/3600000) AS avg_overtime
    FROM attendance_detail_recapitulations
    WHERE attendance_detail_recapitulations.date >= DATE_SUB(STR_TO_DATE('11 April 2025', '%d %M %Y'), INTERVAL 3 MONTH)
),
employee_overtime AS (
    SELECT 
        attendance_detail_recapitulations.employee_id,
        AVG(attendance_detail_recapitulations.paid_overtime/3600000) AS emp_avg_overtime,
        organizations.name AS department
    FROM 
        attendance_detail_recapitulations
    JOIN 
        employment_statuses ON attendance_detail_recapitulations.employee_id = employment_statuses.employee_id
    JOIN 
        organizations ON employment_statuses.organization_id = organizations.id
    WHERE 
        attendance_detail_recapitulations.date >= DATE_SUB(STR_TO_DATE('11 April 2025', '%d %M %Y'), INTERVAL 3 MONTH)
        AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]')
        A

2025-04-11 10:23:08,738 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 2):
WITH company_avg AS (
    SELECT AVG(attendance_detail_recapitulations.paid_overtime/3600000) AS avg_overtime
    FROM attendance_detail_recapitulations
    WHERE attendance_detail_recapitulations.date >= DATE_SUB(STR_TO_DATE('11 April 2025', '%d %M %Y'), INTERVAL 3 MONTH)
),
employee_overtime AS (
    SELECT 
        employment_statuses.organization_id,
        organizations.name AS department,
        attendance_detail_recapitulations.employee_id,
        AVG(attendance_detail_recapitulations.paid_overtime/3600000) AS emp_avg_overtime
    FROM 
        attendance_detail_recapitulations
    JOIN 
        employment_statuses ON attendance_detail_recapitulations.employee_id = employment_statuses.employee_id
    JOIN 
        organizations ON employment_statuses.organization_id = organizations.id
    WHERE 
        attendance_detail_recapitulations.date >= DATE_SUB(STR_TO_DATE('11 April 2025', '%d %M %Y'), INTERVAL 3 MONTH)
        AND employment_statuses.organi

2025-04-11 10:24:21,203 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
SELECT 
    job_levels.name AS level_jabatan,
    COUNT(*) AS jumlah_rekaman,
    MIN(attendance_detail_recapitulations.paid_overtime/3600000) AS durasi_minimum_lembur,
    ROUND(AVG(attendance_detail_recapitulations.paid_overtime/3600000), 2) AS durasi_rata_rata_lembur,
    MAX(attendance_detail_recapitulations.paid_overtime/3600000) AS durasi_maksimum_lembur,
    (SELECT paid_overtime/3600000 FROM attendance_detail_recapitulations 
     JOIN employment_statuses ON attendance_detail_recapitulations.employee_id = employment_statuses.employee_id
     WHERE employment_statuses.job_level_id = job_levels.id 
     ORDER BY paid_overtime LIMIT 1 OFFSET FLOOR(COUNT(*)*0.25)) AS persentil_25,
    (SELECT paid_overtime/3600000 FROM attendance_detail_recapitulations 
     JOIN employment_statuses ON attendance_detail_recapitulations.employee_id = employment_statuses.employee_id
     WHERE employment_statuses.job_level_id = job_levels.id 
     ORDER BY paid_overtime LIMI

2025-04-11 10:26:17,782 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH overtime_by_level AS (
  SELECT 
    job_levels.name AS job_level,
    attendance_detail_recapitulations.paid_overtime/3600000 AS overtime_hours
  FROM attendance_detail_recapitulations
  JOIN employment_statuses ON attendance_detail_recapitulations.employee_id = employment_statuses.employee_id
  JOIN job_levels ON employment_statuses.job_level_id = job_levels.id
  WHERE attendance_detail_recapitulations.date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 3 MONTH) AND CURRENT_DATE()
    AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]')
    AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]')
    AND employment_statuses.location_id IN ('[LOCATION_IDS]')
)

SELECT 
  job_level,
  COUNT(*) AS jumlah_karyawan,
  AVG(overtime_hours) AS rata_rata_lembur_jam,
  MAX(overtime_hours) AS maksimal_lembur_jam,
  MIN(overtime_hours) AS minimal_lembur_jam
FROM overtime_by_level
GROUP BY job_level
ORDER BY rata_rata_lembur_jam DESC;

Trying candidate 1


2025-04-11 10:27:07,822 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH yearly_data AS (
  SELECT 
    YEAR(employment_status_histories.effective_date) AS year,
    organizations.name AS department,
    job_levels.name AS job_level,
    COUNT(*) AS employee_count
  FROM employment_status_histories
  JOIN organizations ON employment_status_histories.organization_id = organizations.id
  JOIN job_levels ON employment_status_histories.job_level_id = job_levels.id
  JOIN employment_statuses ON employment_status_histories.employee_id = employment_statuses.employee_id
  WHERE 
    employment_status_histories.effective_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 3 YEAR)
    AND organizations.id IN ('[ORGANIZATION_IDS]')
    AND job_levels.id IN ('[JOB_LEVEL_IDS]')
    AND employment_statuses.location_id IN ('[LOCATION_IDS]')
  GROUP BY 
    YEAR(employment_status_histories.effective_date),
    organizations.name,
    job_levels.name
),
prev_year_data AS (
  SELECT 
    year,
    department,
    job_level,
    employee_count,
    LAG(em

2025-04-11 10:28:23,484 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH RECURSIVE org_hierarchy AS (
    SELECT e.id, e.name, e.manager_id, e.join_date, 0 AS level 
    FROM employees e 
    WHERE e.manager_id IS NULL
    UNION ALL 
    SELECT e.id, e.name, e.manager_id, e.join_date, h.level + 1 
    FROM employees e 
    JOIN org_hierarchy h ON e.manager_id = h.id
)
SELECT 
    level AS management_level,
    YEAR(join_date) AS join_year,
    AVG(DATEDIFF(CURRENT_DATE, join_date)/365 AS avg_tenure_years,
    COUNT(id) AS employee_count
FROM org_hierarchy
WHERE join_date IS NOT NULL
GROUP BY level, YEAR(join_date)
HAVING COUNT(id) > 5
ORDER BY level, join_year;

Trying candidate 1
❌ Candidate 1 failed with error: Error executing query: 1064 (42000): You have an error in your SQL syntax; check the manual that corresponds to your MariaDB server version for the right syntax to use near 'AS avg_tenure_years,
WITH RECURSIVE org_hierarchy AS (
    SELECT e.id, e.name, e.manager_id, e.join_date, 0 AS level 
    FROM employees e 
    

2025-04-11 10:29:01,567 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 2):
WITH employee_levels AS (
    SELECT 
        e.id,
        e.name,
        e.join_date,
        CASE 
            WHEN e.id IN (SELECT DISTINCT manager_id FROM employees WHERE manager_id IS NOT NULL) THEN 'Manager'
            ELSE 'Non-Manager'
        END AS management_level
    FROM employees e
    JOIN employment_statuses es ON e.id = es.employee_id
    WHERE es.organization_id IN ('[ORGANIZATION_IDS]')
      AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
      AND es.location_id IN ('[LOCATION_IDS]')
)
SELECT 
    management_level,
    YEAR(join_date) AS join_year,
    AVG(DATEDIFF(STR_TO_DATE('11 April 2025', '%d %M %Y'), join_date)/365) AS avg_tenure_years,
    COUNT(id) AS employee_count
FROM employee_levels
WHERE join_date IS NOT NULL
GROUP BY management_level, YEAR(join_date)
HAVING COUNT(id) > 5
ORDER BY management_level, join_year;

Trying candidate 1
❌ Candidate 1 failed with error: Error executing query: 1054 (42S22): Unknown column 'e.join_date' in

2025-04-11 10:29:42,927 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 3):
WITH managers AS (
    SELECT DISTINCT manager_id 
    FROM employees 
    WHERE manager_id IS NOT NULL
),
employee_levels AS (
    SELECT 
        e.id,
        e.name,
        e.start_date AS join_date,
        CASE 
            WHEN m.manager_id IS NOT NULL THEN 'Manager'
            ELSE 'Non-Manager'
        END AS management_level
    FROM employees e
    LEFT JOIN managers m ON e.id = m.manager_id
    JOIN employment_statuses es ON e.id = es.employee_id
    WHERE es.organization_id IN ('[ORGANIZATION_IDS]')
      AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
      AND es.location_id IN ('[LOCATION_IDS]')
)
SELECT 
    management_level,
    YEAR(join_date) AS join_year,
    ROUND(AVG(DATEDIFF(STR_TO_DATE('11 April 2025', '%d %M %Y'), join_date)/365, 2) AS avg_tenure_years,
    COUNT(id) AS employee_count
FROM employee_levels
WHERE join_date IS NOT NULL
GROUP BY management_level, YEAR(join_date)
HAVING COUNT(id) > 5
ORDER BY management_level, join_year;

Try

2025-04-11 10:30:28,293 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH late_attendance AS (
    SELECT 
        adr.employee_id,
        (adr.attendance_time_in - adr.shift_time_in) / (1000 * 60) AS late_minutes
    FROM 
        attendance_detail_recapitulations adr
    JOIN 
        attendances a ON adr.attendance_id = a.id
    JOIN 
        attendance_statuses ast ON a.attendance_status_in_id = ast.id
    WHERE 
        adr.date BETWEEN DATE_SUB(STR_TO_DATE('10 April 2025', '%d %M %Y'), INTERVAL 3 MONTH) 
        AND STR_TO_DATE('10 April 2025', '%d %M %Y')
        AND ast.name = 'Terlambat > 5 menit'
)
SELECT 
    o.name AS department,
    COUNT(*) AS total_late_occurrences,
    ROUND(AVG(la.late_minutes), 2) AS avg_late_minutes,
    MIN(la.late_minutes) AS min_late_minutes,
    MAX(la.late_minutes) AS max_late_minutes
FROM 
    late_attendance la
JOIN 
    employees e ON la.employee_id = e.id
JOIN 
    employment_statuses es ON e.id = es.employee_id
    AND es.organization_id IN ('[ORGANIZATION_IDS]')
    AND es.job_lev

2025-04-11 10:31:28,754 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH employee_tenure AS (
    SELECT 
        e.id AS employee_id,
        et.name AS employment_type,
        TIMESTAMPDIFF(MONTH, e.join_date, STR_TO_DATE('10 April 2025', '%d %M %Y')) AS tenure_months
    FROM 
        employees e
    JOIN 
        employment_statuses es ON e.id = es.employee_id
        AND es.organization_id IN ('[ORGANIZATION_IDS]')
        AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
        AND es.location_id IN ('[LOCATION_IDS]')
    JOIN 
        employment_types et ON es.employment_type_id = et.id
    WHERE 
        e.active = TRUE
),
late_counts AS (
    SELECT 
        adr.employee_id,
        COUNT(*) AS late_count
    FROM 
        attendance_detail_recapitulations adr
    JOIN 
        employment_statuses es ON adr.employee_id = es.employee_id
        AND es.organization_id IN ('[ORGANIZATION_IDS]')
        AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
        AND es.location_id IN ('[LOCATION_IDS]')
    JOIN 
        attendances a O

2025-04-11 10:33:18,094 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 2):
WITH employee_tenure AS (
    SELECT 
        e.id AS employee_id,
        et.name AS employment_type,
        TIMESTAMPDIFF(MONTH, e.join_date, STR_TO_DATE('10 April 2025', '%d %M %Y')) AS tenure_months
    FROM 
        employees e
    JOIN 
        employment_statuses es ON e.id = es.employee_id
        AND es.organization_id IN ('[ORGANIZATION_IDS]')
        AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
        AND es.location_id IN ('[LOCATION_IDS]')
    JOIN 
        employment_types et ON es.employment_type_id = et.id
    WHERE 
        e.active = TRUE
),
late_counts AS (
    SELECT 
        adr.employee_id,
        COUNT(*) AS late_count
    FROM 
        attendance_detail_recapitulations adr
    JOIN 
        employment_statuses es ON adr.employee_id = es.employee_id
        AND es.organization_id IN ('[ORGANIZATION_IDS]')
        AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
        AND es.location_id IN ('[LOCATION_IDS]')
    JOIN 
        attendances a O

2025-04-11 10:34:55,984 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 3):
WITH employee_tenure AS (
    SELECT 
        e.id AS employee_id,
        et.name AS employment_type,
        TIMESTAMPDIFF(MONTH, e.start_date, STR_TO_DATE('11 April 2025', '%d %M %Y')) AS tenure_months
    FROM 
        employees e
    JOIN 
        employment_statuses es ON e.id = es.employee_id
        AND es.organization_id IN ('[ORGANIZATION_IDS]')
        AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
        AND es.location_id IN ('[LOCATION_IDS]')
    JOIN 
        employment_types et ON es.employment_type_id = et.id
    WHERE 
        e.active = TRUE
),
late_counts AS (
    SELECT 
        adr.employee_id,
        COUNT(*) AS late_count
    FROM 
        attendance_detail_recapitulations adr
    JOIN 
        employment_statuses es ON adr.employee_id = es.employee_id
        AND es.organization_id IN ('[ORGANIZATION_IDS]')
        AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
        AND es.location_id IN ('[LOCATION_IDS]')
    JOIN 
        attendances a 

2025-04-11 10:36:20,666 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
SELECT 
    organizations.name AS departemen, 
    DATE_FORMAT(attendance_detail_recapitulations.date, '%Y-%m') AS bulan, 
    SUM(attendance_detail_recapitulations.paid_overtime/3600000 * 50000) AS total_biaya_lembur 
FROM 
    attendance_detail_recapitulations 
    JOIN employment_statuses ON attendance_detail_recapitulations.employee_id = employment_statuses.employee_id 
    JOIN organizations ON employment_statuses.organization_id = organizations.id 
WHERE 
    attendance_detail_recapitulations.date BETWEEN DATE_SUB(CAST(STR_TO_DATE('10 April 2025', '%d %M %Y') AS DATE, INTERVAL 6 MONTH) AND CAST(STR_TO_DATE('10 April 2025', '%d %M %Y') AS DATE) 
    AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]') 
    AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]') 
    AND employment_statuses.location_id IN ('[LOCATION_IDS]') 
GROUP BY 
    organizations.name, 
    DATE_FORMAT(attendance_detail_recapitulations.date, '%Y-%m') 
ORDER BY 
    d

2025-04-11 10:37:08,409 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
SELECT 
    DATE_FORMAT(attendance_detail_recapitulations.date, '%Y-%m') AS bulan,
    SUM(attendance_detail_recapitulations.paid_overtime/3600000) AS jam_lembur_dibayarkan,
    SUM(attendance_detail_recapitulations.requested_overtime/3600000) AS jam_lembur_diajukan,
    CASE 
        WHEN SUM(attendance_detail_recapitulations.requested_overtime) = 0 THEN 0 
        ELSE SUM(attendance_detail_recapitulations.paid_overtime)/SUM(attendance_detail_recapitulations.requested_overtime) 
    END AS rasio_persetujuan
FROM 
    attendance_detail_recapitulations
JOIN 
    employment_statuses ON attendance_detail_recapitulations.employee_id = employment_statuses.employee_id
WHERE 
    attendance_detail_recapitulations.date >= DATE_SUB(STR_TO_DATE('11 April 2025', '%d %M %Y'), INTERVAL 1 YEAR)
    AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]')
    AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]')
    AND employment_statuses.location_id IN ('[L

2025-04-11 10:38:00,140 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH dept_overtime AS (
  SELECT 
    organizations.name AS department, 
    SUM(attendance_detail_recapitulations.paid_overtime/3600000) AS total_dept_overtime 
  FROM attendance_detail_recapitulations 
  JOIN employment_statuses ON attendance_detail_recapitulations.employee_id = employment_statuses.employee_id 
  JOIN organizations ON employment_statuses.organization_id = organizations.id 
  WHERE attendance_detail_recapitulations.date >= DATE_SUB(CURDATE(), INTERVAL 3 MONTH) 
    AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]') 
    AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]') 
    AND employment_statuses.location_id IN ('[LOCATION_IDS]') 
  GROUP BY organizations.name
),
employee_overtime AS (
  SELECT 
    employees.name AS employee_name, 
    organizations.name AS department, 
    SUM(attendance_detail_recapitulations.paid_overtime/3600000) AS overtime_hours
  FROM attendance_detail_recapitulations
  JOIN employment_status

2025-04-11 10:39:07,192 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
SELECT 
    job_levels.name AS job_level,
    MIN(manager_reports.report_count) AS min_span,
    MAX(manager_reports.report_count) AS max_span,
    AVG(manager_reports.report_count) AS avg_span,
    COUNT(DISTINCT employees.id) AS manager_count
FROM (
    SELECT manager_id, COUNT(*) AS report_count 
    FROM employees 
    WHERE manager_id IS NOT NULL 
    GROUP BY manager_id
) AS manager_reports
JOIN employees ON employees.id = manager_reports.manager_id
JOIN employment_statuses ON employment_statuses.employee_id = employees.id
JOIN job_levels ON job_levels.id = employment_statuses.job_level_id
WHERE employment_statuses.location_id IN ('[LOCATION_IDS]')
  AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]')
  AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]')
GROUP BY job_levels.name
ORDER BY job_levels.name;

Trying candidate 1
✅ Candidate 1 executed successfully!
137_2: ⚠️ Skipped due to column not found error.
Processing... question 1

2025-04-11 10:39:48,401 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH manager_spans AS (
    SELECT 
        e.id AS manager_id,
        e.name AS manager_name,
        o.name AS department,
        COUNT(reports.id) AS span_of_control
    FROM employees e
    JOIN employees reports ON reports.manager_id = e.id
    JOIN employment_statuses es ON es.employee_id = e.id
    JOIN organizations o ON o.id = es.organization_id
    WHERE es.location_id IN ('[LOCATION_IDS]')
      AND es.organization_id IN ('[ORGANIZATION_IDS]')
      AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
    GROUP BY e.id, e.name, o.name
),
ranked_managers AS (
    SELECT 
        manager_id,
        manager_name,
        department,
        span_of_control,
        RANK() OVER (PARTITION BY department ORDER BY span_of_control DESC) AS high_rank,
        RANK() OVER (PARTITION BY department ORDER BY span_of_control ASC) AS low_rank
    FROM manager_spans
)
SELECT 
    department,
    MAX(CASE WHEN high_rank = 1 THEN manager_name END) AS highest_span_manager,
 

2025-04-11 10:40:43,707 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH promotion_history AS (
  SELECT 
    esh.employee_id, 
    e.name AS employee_name, 
    jl.name AS job_level, 
    o.name AS department,
    et.name AS employment_type,
    esh.effective_date, 
    LAG(jl.name) OVER (PARTITION BY esh.employee_id ORDER BY esh.effective_date) AS previous_job_level
  FROM 
    employment_status_histories esh 
    JOIN job_levels jl ON esh.job_level_id = jl.id 
    JOIN employees e ON esh.employee_id = e.id
    JOIN organizations o ON esh.organization_id = o.id
    JOIN employment_types et ON esh.employment_type_id = et.id
    JOIN employment_statuses es ON esh.employee_id = es.employee_id
  WHERE 
    es.organization_id IN ('[ORGANIZATION_IDS]')
    AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
    AND es.location_id IN ('[LOCATION_IDS]')
)
SELECT 
  department,
  employment_type,
  COUNT(*) AS promotion_count
FROM 
  promotion_history 
WHERE 
  job_level > previous_job_level
GROUP BY 
  department, employment_type
ORDER BY 
 

2025-04-11 10:41:35,946 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH promotion_events AS (
  SELECT 
    esh.employee_id,
    esh.effective_date,
    jl.name AS job_level_name,
    LAG(jl.name) OVER (PARTITION BY esh.employee_id ORDER BY esh.effective_date) AS prev_job_level,
    LAG(esh.effective_date) OVER (PARTITION BY esh.employee_id ORDER BY esh.effective_date) AS prev_effective_date
  FROM 
    employment_status_histories esh
    JOIN job_levels jl ON esh.job_level_id = jl.id
    JOIN employment_statuses es ON esh.employee_id = es.employee_id
  WHERE 
    es.organization_id IN ('[ORGANIZATION_IDS]')
),
filtered_promotions AS (
  SELECT 
    employee_id,
    effective_date,
    job_level_name,
    DATEDIFF(effective_date, prev_effective_date) AS days_to_promotion
  FROM 
    promotion_events
  WHERE 
    job_level_name > prev_job_level
    AND prev_job_level IS NOT NULL
)
SELECT 
  YEAR(effective_date) AS promotion_year,
  AVG(days_to_promotion) AS avg_days_to_promotion,
  COUNT(*) AS promotion_count
FROM 
  filtered_

2025-04-11 10:42:32,851 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH permanent_transitions AS (
  SELECT 
    esh.employee_id,
    esh.effective_date,
    esh.location_id,
    LAG(esh.employment_type_id) OVER (PARTITION BY esh.employee_id ORDER BY esh.effective_date) AS prev_employment_type_id,
    LAG(esh.effective_date) OVER (PARTITION BY esh.employee_id ORDER BY esh.effective_date) AS prev_effective_date
  FROM employment_status_histories esh
  JOIN employment_types et ON esh.employment_type_id = et.id
  WHERE et.name = 'Permanent'
),
contract_durations AS (
  SELECT 
    pt.employee_id,
    pt.location_id,
    DATEDIFF(pt.effective_date, pt.prev_effective_date)/30 AS duration_months
  FROM permanent_transitions pt
  JOIN employees e ON pt.employee_id = e.id
  JOIN employment_types et ON pt.prev_employment_type_id = et.id
  WHERE e.active = TRUE
    AND et.name != 'Permanent'
)
SELECT 
  l.name AS lokasi,
  COUNT(*) AS jumlah_karyawan,
  AVG(cd.duration_months) AS rata_rata_durasi_bulan,
  MIN(cd.duration_months) AS dur

2025-04-11 10:43:32,955 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH contract_changes AS (
  SELECT 
    employee_id,
    employment_type_id,
    effective_date,
    LAG(employment_type_id) OVER (PARTITION BY employee_id ORDER BY effective_date) AS prev_employment_type_id,
    LAG(effective_date) OVER (PARTITION BY employee_id ORDER BY effective_date) AS prev_effective_date
  FROM employment_status_histories
  WHERE employment_type_id != LAG(employment_type_id) OVER (PARTITION BY employee_id ORDER BY effective_date)
),
attendance_stats AS (
  SELECT 
    cc.employee_id,
    COUNT(CASE WHEN adr.attendance_time_in IS NOT NULL THEN 1 END) AS present_days,
    COUNT(adr.id) AS total_working_days
  FROM contract_changes cc
  JOIN attendance_detail_recapitulations adr ON cc.employee_id = adr.employee_id
    AND adr.date BETWEEN DATE_SUB(cc.effective_date, INTERVAL 6 MONTH) AND cc.effective_date
  GROUP BY cc.employee_id
)
SELECT 
  e.name AS employee_name,
  et.name AS previous_contract_type,
  ROUND((as1.present_days / as1.tota

2025-04-11 10:44:28,246 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 2):
WITH contract_changes AS (
  SELECT 
    esh1.employee_id,
    esh1.employment_type_id,
    esh1.effective_date,
    esh2.employment_type_id AS prev_employment_type_id
  FROM employment_status_histories esh1
  JOIN employment_status_histories esh2 ON esh1.employee_id = esh2.employee_id
    AND esh2.effective_date = (
      SELECT MAX(effective_date)
      FROM employment_status_histories
      WHERE employee_id = esh1.employee_id
        AND effective_date < esh1.effective_date
    )
  WHERE esh1.employment_type_id != esh2.employment_type_id
),
attendance_stats AS (
  SELECT 
    cc.employee_id,
    SUM(CASE WHEN adr.attendance_time_in IS NOT NULL THEN 1 ELSE 0 END) AS present_days,
    COUNT(adr.id) AS total_working_days
  FROM contract_changes cc
  JOIN attendance_detail_recapitulations adr ON cc.employee_id = adr.employee_id
    AND adr.date BETWEEN DATE_SUB(cc.effective_date, INTERVAL 6 MONTH) AND cc.effective_date
  GROUP BY cc.employee_id
)
SELECT 
  e.n

2025-04-11 10:45:41,546 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH RECURSIVE hierarchy_depth AS (
  SELECT 
    e.id,
    e.manager_id,
    o.name AS department,
    YEAR(esh.effective_date) AS year,
    1 AS depth
  FROM employees e
  JOIN employment_status_histories esh ON e.id = esh.employee_id
  JOIN organizations o ON esh.organization_id = o.id
  WHERE e.manager_id IS NULL
    AND esh.effective_date >= DATE_SUB(STR_TO_DATE('11 April 2025', '%d %M %Y'), INTERVAL 3 YEAR)
    AND esh.organization_id IN ('[ORGANIZATION_IDS]')
    AND esh.job_level_id IN ('[JOB_LEVEL_IDS]')
    AND esh.location_id IN ('[LOCATION_IDS]')
  UNION ALL
  SELECT 
    e.id,
    e.manager_id,
    hd.department,
    hd.year,
    hd.depth + 1
  FROM employees e
  JOIN hierarchy_depth hd ON e.manager_id = hd.id
  JOIN employment_status_histories esh ON e.id = esh.employee_id
  WHERE esh.effective_date >= DATE_SUB(STR_TO_DATE('11 April 2025', '%d %M %Y'), INTERVAL 3 YEAR)
    AND esh.organization_id IN ('[ORGANIZATION_IDS]')
    AND esh.job_level_id

2025-04-11 10:46:53,717 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH employee_attendance AS (
    SELECT 
        e.id AS employee_id,
        FLOOR(DATEDIFF(CURDATE(), ed.date_of_birth)/365) AS age,
        o.name AS department,
        COUNT(CASE WHEN ast.attendance_type IN ('ABSENT', 'LEAVE') THEN 1 END) AS absent_days,
        COUNT(*) AS total_days
    FROM employees e
    JOIN employee_details ed ON e.id = ed.employee_id
    JOIN employment_statuses es ON e.id = es.employee_id
    JOIN organizations o ON es.organization_id = o.id
    JOIN attendances a ON e.id = a.employee_id
    JOIN attendance_statuses ast ON a.attendance_status_in_id = ast.id
    WHERE es.organization_id IN ('[ORGANIZATION_IDS]')
      AND es.location_id IN ('[LOCATION_IDS]')
      AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
      AND a.date BETWEEN STR_TO_DATE('2023-01-01', '%Y-%m-%d') AND CURDATE()
    GROUP BY e.id, ed.date_of_birth, o.name
)
SELECT 
    department,
    age,
    AVG(absent_days*100.0/total_days) AS avg_absence_percentage,
    CO

2025-04-11 10:47:46,165 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH overtime_stats AS (
    SELECT 
        jl.name AS job_level,
        COUNT(DISTINCT e.id) AS employee_count,
        AVG(adr.paid_overtime/3600000) AS avg_overtime,
        MAX(adr.paid_overtime/3600000) AS max_overtime,
        MIN(adr.paid_overtime/3600000) AS min_overtime
    FROM attendance_detail_recapitulations adr
    JOIN employees e ON adr.employee_id = e.id
    JOIN employment_statuses es ON e.id = es.employee_id
    JOIN job_levels jl ON es.job_level_id = jl.id
    WHERE adr.date BETWEEN STR_TO_DATE('2023-10-01', '%Y-%m-%d') AND STR_TO_DATE('2023-10-31', '%Y-%m-%d')
        AND adr.paid_overtime > 0
        AND es.organization_id IN ('[ORGANIZATION_IDS]')
        AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
        AND es.location_id IN ('[LOCATION_IDS]')
    GROUP BY jl.name
)
SELECT * FROM overtime_stats
ORDER BY avg_overtime DESC;

Trying candidate 1
✅ Candidate 1 executed successfully!
164_3: ✅ Original query executed successfully!
165_1: ✅ 

2025-04-11 10:48:33,092 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH promotion_data AS (
  SELECT 
    esh1.employee_id,
    o.name AS department,
    jl1.name AS previous_level,
    DATEDIFF(MIN(esh2.effective_date), esh1.effective_date) AS days_in_position
  FROM employment_status_histories esh1
  JOIN employment_status_histories esh2 ON 
    esh1.employee_id = esh2.employee_id AND 
    esh2.effective_date > esh1.effective_date
  JOIN job_levels jl1 ON esh1.job_level_id = jl1.id
  JOIN job_levels jl2 ON esh2.job_level_id = jl2.id
  JOIN employees e ON esh1.employee_id = e.id
  JOIN organizations o ON esh1.organization_id = o.id
  JOIN employment_statuses es ON esh1.employee_id = es.employee_id
  WHERE jl2.id > jl1.id 
    AND e.active = TRUE
    AND es.organization_id IN ('[ORGANIZATION_IDS]')
    AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
    AND es.location_id IN ('[LOCATION_IDS]')
  GROUP BY esh1.employee_id, esh1.effective_date, o.name, jl1.name
)
SELECT 
  department,
  previous_level,
  COUNT(*) AS jumlah_promosi,


2025-04-11 10:49:36,808 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH manager_reports AS (
  SELECT 
    e.manager_id,
    es.job_level_id,
    COUNT(*) AS report_count
  FROM employees e
  JOIN employment_statuses es ON e.manager_id = es.employee_id
  WHERE e.manager_id IS NOT NULL 
    AND e.active = TRUE
    AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
    AND es.organization_id IN ('[ORGANIZATION_IDS]')
    AND es.location_id IN ('[LOCATION_IDS]')
  GROUP BY e.manager_id, es.job_level_id
)
SELECT 
  jl.name AS level_jabatan,
  MIN(mr.report_count) AS jumlah_bawahan_minimum,
  MAX(mr.report_count) AS jumlah_bawahan_maksimum,
  AVG(mr.report_count) AS rata_rata_bawahan
FROM manager_reports mr
JOIN job_levels jl ON mr.job_level_id = jl.id
GROUP BY jl.name
ORDER BY jl.name;

Trying candidate 1
✅ Candidate 1 executed successfully!
171_2: ⚠️ Skipped due to column not found error.
171_3: ⚠️ Skipped due to column not found error.
172_1: ✅ Original query executed successfully!
Processing... question 172_2 Retry attempt: 1/3


2025-04-11 10:50:23,889 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH company_avg AS (
    SELECT AVG(paid_overtime)/3600000 AS avg_hours
    FROM attendance_detail_recapitulations
    WHERE date BETWEEN DATE_SUB(STR_TO_DATE('2023-01-01', '%Y-%m-%d'), INTERVAL 3 MONTH) AND STR_TO_DATE('2023-01-01', '%Y-%m-%d')
),
employee_overtime AS (
    SELECT 
        adr.employee_id,
        SUM(adr.paid_overtime)/3600000 AS total_hours,
        jl.name AS job_level,
        o.name AS department
    FROM 
        attendance_detail_recapitulations adr
    JOIN 
        employment_statuses es ON adr.employee_id = es.employee_id
    JOIN 
        job_levels jl ON es.job_level_id = jl.id
    JOIN 
        organizations o ON es.organization_id = o.id
    WHERE 
        adr.date BETWEEN DATE_SUB(STR_TO_DATE('2023-01-01', '%Y-%m-%d'), INTERVAL 3 MONTH) AND STR_TO_DATE('2023-01-01', '%Y-%m-%d')
        AND es.organization_id IN ('[ORGANIZATION_IDS]')
        AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
        AND es.location_id IN ('[LOCATION_I

2025-04-11 10:51:34,829 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 2):
WITH company_avg AS (
    SELECT AVG(paid_overtime_hours) AS avg_hours
    FROM (
        SELECT 
            employee_id, 
            SUM(paid_overtime)/3600000 AS paid_overtime_hours
        FROM 
            attendance_detail_recapitulations
        WHERE 
            date BETWEEN DATE_SUB(STR_TO_DATE('2023-01-01', '%Y-%m-%d'), INTERVAL 3 MONTH) AND STR_TO_DATE('2023-01-01', '%Y-%m-%d')
        GROUP BY 
            employee_id
    ) AS employee_hours
),
employee_overtime AS (
    SELECT 
        adr.employee_id,
        SUM(adr.paid_overtime)/3600000 AS total_hours,
        jl.name AS job_level,
        o.name AS department
    FROM 
        attendance_detail_recapitulations adr
    JOIN 
        employment_statuses es ON adr.employee_id = es.employee_id
    JOIN 
        job_levels jl ON es.job_level_id = jl.id
    JOIN 
        organizations o ON es.organization_id = o.id
    WHERE 
        adr.date BETWEEN DATE_SUB(STR_TO_DATE('2023-01-01', '%Y-%m-%d')

2025-04-11 10:52:49,649 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 3):
WITH employee_hours AS (
    SELECT 
        employee_id,
        SUM(paid_overtime)/3600000 AS paid_overtime_hours
    FROM 
        attendance_detail_recapitulations
    WHERE 
        date BETWEEN DATE_SUB(STR_TO_DATE('2023-01-01', '%Y-%m-%d'), INTERVAL 3 MONTH) AND STR_TO_DATE('2023-01-01', '%Y-%m-%d')
    GROUP BY 
        employee_id
),
company_avg AS (
    SELECT AVG(paid_overtime_hours) AS avg_hours
    FROM employee_hours
),
employee_overtime AS (
    SELECT 
        eh.employee_id,
        eh.paid_overtime_hours,
        jl.name AS job_level,
        o.name AS department,
        (SELECT avg_hours FROM company_avg) AS company_avg_hours
    FROM 
        employee_hours eh
    JOIN 
        employment_statuses es ON eh.employee_id = es.employee_id
    JOIN 
        job_levels jl ON es.job_level_id = jl.id
    JOIN 
        organizations o ON es.organization_id = o.id
    WHERE 
        es.organization_id IN ('[ORGANIZATION_IDS]')
        AND es.job_lev

2025-04-11 11:01:35,031 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH overtime_data AS (
    SELECT 
        adr.employee_id,
        es.organization_id,
        o.name AS department,
        SUM(adr.paid_overtime)/3600000 AS total_overtime_hours
    FROM 
        attendance_detail_recapitulations adr
    JOIN 
        employment_statuses es ON adr.employee_id = es.employee_id
    JOIN 
        organizations o ON es.organization_id = o.id
    WHERE 
        adr.date BETWEEN DATE_SUB(STR_TO_DATE('2023-01-01', '%Y-%m-%d'), INTERVAL 3 MONTH) AND STR_TO_DATE('2023-01-01', '%Y-%m-%d')
        AND es.organization_id IN ('[ORGANIZATION_IDS]')
        AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
        AND es.location_id IN ('[LOCATION_IDS]')
    GROUP BY 
        adr.employee_id, es.organization_id, o.name
),
lateness_data AS (
    SELECT 
        a.employee_id,
        es.organization_id,
        SUM(CASE 
            WHEN ast.name = 'Terlambat <= 5 menit' THEN 5
            WHEN ast.name = 'Terlambat > 5 menit' THEN 
            

2025-04-11 11:02:55,193 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH RECURSIVE org_hierarchy AS (
  SELECT id, name, manager_id, 0 AS level 
  FROM employees 
  WHERE manager_id IS NULL AND active = TRUE
  
  UNION ALL 
  
  SELECT e.id, e.name, e.manager_id, oh.level + 1 
  FROM employees e 
  JOIN org_hierarchy oh ON e.manager_id = oh.id 
  WHERE e.active = TRUE
),
manager_counts AS (
  SELECT 
    oh.level AS management_level,
    o.id AS organization_id,
    o.name AS department,
    COUNT(DISTINCT oh.id) AS number_of_managers,
    COUNT(e.id) AS total_direct_reports
  FROM org_hierarchy oh
  LEFT JOIN employees e ON oh.id = e.manager_id AND e.active = TRUE
  LEFT JOIN employment_statuses es ON oh.id = es.employee_id
  LEFT JOIN organizations o ON es.organization_id = o.id
  WHERE o.id IN ('[ORGANIZATION_IDS]')
  GROUP BY oh.level, o.id, o.name
),
span_stats AS (
  SELECT 
    management_level,
    department,
    number_of_managers,
    total_direct_reports,
    ROUND(total_direct_reports / NULLIF(number_of_managers, 

2025-04-11 11:03:59,788 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH monthly_counts AS (
    SELECT 
        m.name AS manager_name,
        DATE_FORMAT(esh.effective_date, '%Y-%m') AS month,
        COUNT(e.id) AS direct_reports
    FROM 
        employees e
        JOIN employees m ON e.manager_id = m.id
        JOIN employment_status_histories esh ON e.id = esh.employee_id
        JOIN employment_statuses es ON e.id = es.employee_id
    WHERE 
        e.active = TRUE
        AND esh.effective_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 12 MONTH)
        AND es.organization_id IN ('[ORGANIZATION_IDS]')
        AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
        AND es.location_id IN ('[LOCATION_IDS]')
    GROUP BY 
        m.name, DATE_FORMAT(esh.effective_date, '%Y-%m')
)
SELECT 
    manager_name, 
    month, 
    direct_reports,
    LAG(direct_reports, 1) OVER (PARTITION BY manager_name ORDER BY month) AS prev_month_count,
    direct_reports - LAG(direct_reports, 1) OVER (PARTITION BY manager_name ORDER BY month) AS chang

2025-04-11 11:05:08,959 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH dept_stats AS (
    SELECT 
        o.id AS org_id,
        o.name AS department,
        COUNT(e.id) AS total_employees
    FROM 
        employees e
        JOIN employment_statuses es ON e.id = es.employee_id
        JOIN organizations o ON es.organization_id = o.id
    WHERE 
        e.active = TRUE
        AND es.organization_id IN ('[ORGANIZATION_IDS]')
        AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
        AND es.location_id IN ('[LOCATION_IDS]')
    GROUP BY 
        o.id, o.name
),
manager_stats AS (
    SELECT 
        m.id AS manager_id,
        m.name AS manager_name,
        o.id AS org_id,
        o.name AS department,
        COUNT(e.id) AS direct_reports
    FROM 
        employees e
        JOIN employees m ON e.manager_id = m.id
        JOIN employment_statuses es ON e.id = es.employee_id
        JOIN organizations o ON es.organization_id = o.id
    WHERE 
        e.active = TRUE
        AND es.organization_id IN ('[ORGANIZATION_IDS]

2025-04-11 11:06:46,040 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


LLM Response (Attempt 1):
WITH employee_join_age AS (
    SELECT 
        e.id,
        o.name AS department,
        YEAR(e.join_date) AS join_year,
        FLOOR(DATEDIFF(e.join_date, ed.date_of_birth)/365) AS age_at_join
    FROM 
        employees e
    JOIN 
        employee_details ed ON e.id = ed.employee_id
    JOIN 
        employment_statuses es ON e.id = es.employee_id
    JOIN 
        organizations o ON es.organization_id = o.id
    WHERE 
        e.join_date >= DATE_SUB(STR_TO_DATE('11 April 2025', '%d %M %Y'), INTERVAL 5 YEAR)
        AND es.organization_id IN ('[ORGANIZATION_IDS]')
        AND es.job_level_id IN ('[JOB_LEVEL_IDS]')
        AND es.location_id IN ('[LOCATION_IDS]')
)
SELECT 
    join_year,
    department,
    AVG(age_at_join) AS average_join_age,
    COUNT(*) AS employee_count
FROM 
    employee_join_age
GROUP BY 
    join_year, department
ORDER BY 
    join_year, department;

Trying candidate 1
❌ Candidate 1 failed with error: Error executing query: 1054

2025-04-11 11:07:40,678 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


KeyboardInterrupt: 

In [9]:
# Ad-hoc check of one generated query: for every department, find the
# termination reason(s) with the highest number of approved terminations in
# the year leading up to 1 April 2025.
#
# NOTE(review): the '[ORGANIZATION_IDS]' / '[JOB_LEVEL_IDS]' / '[LOCATION_IDS]'
# filters are sent to the server as literal strings here — confirm whether
# they are meant to be substituted with real IDs before execution.
# NOTE(review): the recorded run of this cell failed with error 1146 because
# `termination_reasons` does not exist in the connected schema — confirm that
# `connection` points at the database that holds the termination tables.
query = """WITH termination_counts AS (
  SELECT 
    organizations.name AS department,
    termination_reasons.name AS reason,
    COUNT(*) AS count
  FROM termination_entries
  JOIN termination_reasons ON termination_entries.termination_reason_id = termination_reasons.id
  JOIN employment_statuses ON termination_entries.employee_id = employment_statuses.employee_id
  JOIN organizations ON employment_statuses.organization_id = organizations.id
  WHERE 
    termination_entries.effective_date >= DATE_SUB(STR_TO_DATE('01 April 2025', '%d %M %Y'), INTERVAL 1 YEAR)
    AND termination_entries.approval_status = 'APPROVED'
    AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]')
    AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]')
    AND employment_statuses.location_id IN ('[LOCATION_IDS]')
  GROUP BY organizations.name, termination_reasons.name
)
SELECT 
  tc1.department,
  tc1.reason,
  tc1.count
FROM termination_counts tc1
WHERE tc1.count = (
  SELECT MAX(tc2.count)
  FROM termination_counts tc2
  WHERE tc2.department = tc1.department
)
ORDER BY tc1.count DESC;
"""

# dictionary=True makes each fetched row a dict keyed by column name.
cursor = connection.cursor(dictionary=True)
try:
    cursor.execute(query)
    result = cursor.fetchall()
finally:
    # Always release the cursor, even when the query fails, so repeated
    # experiments in this notebook do not leave open cursors on the
    # shared connection.
    cursor.close()
print(result)

ProgrammingError: 1146 (42S02): Table 'ru4f_time_management.termination_reasons' doesn't exist

In [26]:
# Display only the rows whose 'Generated SQL Query' cell is populated.
df_query_result.loc[df_query_result['Generated SQL Query'].notna()]

Unnamed: 0,No,Base Prompt,Prompt,Expected SQL Query,Expected Query Result,Time Taken,Generated SQL Query,Generated Query Result
14,15,Bagaimana distribusi pendapatan tambahan karya...,Bagaimana distribusi pendapatan tambahan karya...,,,78.362395,"SELECT organizations.name AS departemen, addit...","[{'departemen': 'Board of Directors', 'jenis_p..."
17,18,Berapa rata-rata pengalaman kerja sebelumnya u...,Berapa rata-rata pengalaman kerja sebelumnya u...,,,29.858527,"SELECT job_levels.name AS level_jabatan, AVG(D...",[]
19,20,Berapa lama waktu yang dibutuhkan karyawan unt...,Berapa lama waktu yang dibutuhkan karyawan unt...,,,30.06606,WITH first_promotions AS (\n SELECT \n e.i...,"[{'rata_rata_hari_untuk_promosi': None, 'rata_..."
34,35,Berapa rata-rata lama pengalaman kerja sebelum...,Berapa rata-rata lama pengalaman kerja sebelum...,,,31.453215,WITH employee_experience AS (\n SELECT \n ...,[]
37,38,Bagaimana distribusi pendapatan tambahan karya...,Bagaimana distribusi pendapatan tambahan karya...,,,43.87055,"SELECT \n job_levels.name AS job_level, \n ...","[{'job_level': 'Staff', 'income_type': 'THR', ..."
52,53,Bagaimana tingkat turnover tahunan per departe...,Bagaimana tingkat turnover tahunan per departe...,,,70.11947,WITH year_range AS (\n SELECT YEAR(CURRENT_DA...,"[{'department': 'Information Technology', 'yea..."
67,68,Hitung tingkat turnover tahunan per departemen...,Hitung tingkat turnover tahunan per departemen...,,,74.662263,WITH termination_counts AS (\n SELECT \n o...,"[{'department': 'Board of Directors', 'year': ..."
88,91,Bagaimana distribusi usia karyawan saat ini be...,Bagaimana distribusi usia karyawan saat ini be...,,,27.519528,"SELECT organizations.name AS department, FLOOR...","[{'department': 'Board of Commissioners', 'ave..."


# Upload to Google Sheets

In [25]:
from modules.google_sheets_writer import GoogleSheetsWriter
import logging

# Name of the worksheet that receives the query-generation results.
SYNTETIC_DATA_SHEET_NAME = "catapa_syntetics_db_employee_2"

# Build the writer for the target spreadsheet. `google` and
# GOOGLE_SPREADSHEET_ID come from earlier cells.
# NOTE(review): batch_delay is presumably in seconds — confirm against
# modules.google_sheets_writer.
writer = GoogleSheetsWriter(
    sheet_id=GOOGLE_SPREADSHEET_ID,
    worksheet_name=SYNTETIC_DATA_SHEET_NAME,
    google_util=google,
    batch_size=10,
    max_retries=5,
    batch_delay=2,
)

# Push the whole results frame to the worksheet.
result = writer.write_dataframe(df_query_result)

# Summarise the outcome; when any rows failed, list each failure.
logging.info(f"Successfully wrote {result.successful_rows} rows")
if result.failed_rows > 0:
    logging.error(f"Failed to write {result.failed_rows} rows")
    for error in result.errors:
        logging.error(f"Row {error['row_number']}: {error['error']}")

  0%|          | 0/10 [00:00<?, ?it/s]2025-03-27 09:46:13,426 - INFO - Successfully wrote row 1/98
2025-03-27 09:46:15,460 - INFO - Successfully wrote row 2/98
2025-03-27 09:46:17,318 - INFO - Successfully wrote row 3/98
2025-03-27 09:46:19,135 - INFO - Successfully wrote row 4/98
2025-03-27 09:46:20,972 - INFO - Successfully wrote row 5/98
2025-03-27 09:46:21,905 - INFO - Successfully wrote row 6/98
2025-03-27 09:46:23,817 - INFO - Successfully wrote row 7/98
2025-03-27 09:46:25,670 - INFO - Successfully wrote row 8/98
2025-03-27 09:46:27,516 - INFO - Successfully wrote row 9/98
2025-03-27 09:46:29,361 - INFO - Successfully wrote row 10/98
 10%|█         | 1/10 [00:20<03:00, 20.10s/it]2025-03-27 09:46:33,197 - INFO - Successfully wrote row 11/98
2025-03-27 09:46:35,086 - INFO - Successfully wrote row 12/98
2025-03-27 09:46:36,919 - INFO - Successfully wrote row 13/98
2025-03-27 09:46:38,783 - INFO - Successfully wrote row 14/98
2025-03-27 09:46:40,628 - INFO - Successfully wrote row 1