# QA question generator using Deepseek


**Authors**
1. Alfan Dinda Rahmawan (alfan.d.rahmawan@gdplabs.id)

## Install dependencies

In [1]:
%pip install -q langchain=="0.3.0"
%pip install -q langchain-aws=="0.2.7"
%pip install -q langchain_openai=="0.2.14"
%pip install -q boto3=="1.35.71"
%pip install -q pandas=="2.2.2"
%pip install -q tqdm=="4.66.4"
%pip install google-api-python-client==2.100.0 gspread==5.10.0
%pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting google-api-python-client==2.100.0
  Downloading google_api_python_client-2.100.0-py2.py3-none-any.whl.metadata (6.6 kB)
Collecting gspread==5.10.0
  Downloading gspread-5.10.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httplib2<1.dev0,>=0.15.0 (from google-api-python-client==2.100.0)
  Downloading httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting google-auth<3.0.0.dev0,>=1.19.0 (from google-api-python-client==2.100.0)
  Downloading google_auth-2.38.0-py2.py3-none-any.whl.metadata (4.8 kB)
Collecting google-auth-httplib2>=0.1.0 (from google-api-python-client==2.100.0)
  Do

## Google Auth

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

GOOGLE_SPREADSHEET_ID: str = "1dDMqrol_DrEMjvLy88IRu2WdHN7T5BU0LrD8ORLuNPI" # put your spreadsheet id here
GOOGLE_SPREADSHEET_URL: str = f"https://docs.google.com/spreadsheets/d/{GOOGLE_SPREADSHEET_ID}/edit?usp=sharing" # put your spreadsheet link here
DATA_TEST_SHEET_NAME: str = "catapa_syntetics_question"

GOOGLE_SHEETS_CLIENT_EMAIL: str = os.getenv('GOOGLE_SHEETS_CLIENT_EMAIL')
GOOGLE_SHEETS_PRIVATE_KEY: str = os.getenv('GOOGLE_SHEETS_PRIVATE_KEY')

In [2]:
# Google Authentication
from modules.google_sheets_writer import GoogleUtil

PRIVATE_KEY = GOOGLE_SHEETS_PRIVATE_KEY
google: GoogleUtil = GoogleUtil(PRIVATE_KEY, GOOGLE_SHEETS_CLIENT_EMAIL)


## Database Information

In [10]:
import json

employee_schema_path = "schema/employee_table.json"
payroll_schema_path = "schema/payroll_table.json"
time_management_schema_path = "schema/time_management_table.json"

master_data_attendance_path = "master_data/attendance_statuses.csv"
master_data_employment_status_path = "master_data/employment_status_types.csv"
master_data_employment_path = "master_data/employment_types.csv"


### Table Schema

In [11]:
def format_column_info(column: dict) -> str:
    """Format column information with type and description.

    Args:
        column (dict): Dictionary containing column information with at least a 'name' key.
            May also contain 'type' and 'description' keys.

    Returns:
        str: Formatted column information string.
    """
    parts = [column['name']]

    # Add type if present
    if 'type' in column:
        parts.append(f"[{column['type']}]")

    # Add description if present
    if 'description' in column:
        parts.append(f"({column['description']})")

    return " ".join(parts)

def create_schema_dictionary(schema_data: dict) -> str:
    """Create formatted schema information for all tables.

    Args:
        schema_data (dict): Dictionary containing table schema information.
            Each key is a table name, and each value is a dictionary with table information.

    Returns:
        str: Formatted schema information for all tables.
    """
    formatted_schemas = []

    for table_name, table_info in schema_data.items():
        # Get columns with a default empty list if not present
        columns = table_info.get('columns', [])

        # Format column information using list comprehension
        column_info = [f"- {format_column_info(column)}" for column in columns]

        # Join column information with newlines
        formatted_schemas.append(f"{table_name}:\n" + "\n".join(column_info))

    return "\n\n".join(formatted_schemas)

# Initialize the main dictionary with three domains
table_schema = {domain: "" for domain in ['employee', 'payroll', 'time_management']}

# Map files to their domains
file_domain_mapping = {
    employee_schema_path: 'employee',
    payroll_schema_path: 'payroll',
    time_management_schema_path: 'time_management'
}

# Process each schema file
for schema_file, domain in file_domain_mapping.items():
    try:
        with open(schema_file) as f:
            schema_data = json.load(f)
            table_schema[domain] = create_schema_dictionary(schema_data)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error processing schema file {schema_file}: {e}")

## Show Schema
# print(table_schema['employee'])
# print(table_schema['payroll'])
# print(table_schema['time_management'])

### Table Relations

In [12]:
def create_relations_dictionary(schema_data: dict) -> str:
    """Create formatted relations information for all tables.

    Args:
        schema_data (dict): Dictionary containing table schema information.
            Each key is a table name, and each value is a dictionary with table information.

    Returns:
        str: Formatted relations information for all tables.
    """
    formatted_relations = []

    for table_name, table_info in schema_data.items():
        # Skip if no foreign keys
        if 'foreign_keys' not in table_info or not table_info['foreign_keys']:
            continue

        # Start with table name
        relations = [f"{table_name}:"]

        # Add each foreign key relation
        relations.extend([
            f"- {fk['ref_table']} referenced by {fk['column']}"
            for fk in table_info['foreign_keys']
        ])

        formatted_relations.append("\n".join(relations))

    return "\n".join(formatted_relations)

# Initialize the relations dictionary for all domains
domains = ['employee', 'payroll', 'time_management']
table_relations = {domain: "" for domain in domains}

# Map schema files to their domains
file_domain_mapping = {
    employee_schema_path: 'employee',
    payroll_schema_path: 'payroll',
    time_management_schema_path: 'time_management'
}

# Process each schema file
for schema_file, domain in file_domain_mapping.items():
    try:
        with open(schema_file) as f:
            schema_data = json.load(f)
            table_relations[domain] = create_relations_dictionary(schema_data)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error processing relations in {schema_file}: {e}")

# show relations
# print(table_relations['employee'])
# print(table_relations['payroll'])
# print(table_relations['time_management'])

### Master Data


In [13]:
import json
import csv
import os
from typing import Dict, List, Any, Optional
from pathlib import Path

class MasterDataLoader:
    """Class to load and process master data from CSV files and schema files.

    This class handles loading master data from CSV files and extracting master data
    information from schema files, with proper error handling and logging.
    """

    def __init__(self, master_data_paths: Dict[str, str], schema_paths: Dict[str, str]):
        """Initialize the MasterDataLoader with paths to master data and schema files.

        Args:
            master_data_paths: Dictionary mapping data types to file paths
            schema_paths: Dictionary mapping domains to schema file paths
        """
        self.master_data_paths = master_data_paths
        self.schema_paths = schema_paths
        self.master_data_dict = {}
        self.db_master_data_dict = {domain: "" for domain in schema_paths.keys()}

    def read_csv_master_data(self, file_path: str) -> Optional[List[str]]:
        """Read CSV file and return list of values from 'name' column.

        Args:
            file_path: Path to the CSV file containing master data

        Returns:
            List of values from the 'name' column or None if file not found or invalid
        """
        path = Path(file_path)
        if not path.exists():
            print(f"Warning: Master data file not found: {file_path}")
            return None

        try:
            with path.open('r', encoding='utf-8') as f:
                csv_reader = csv.DictReader(f)
                # Verify 'name' column exists
                if 'name' not in csv_reader.fieldnames:
                    print(f"Warning: CSV file {file_path} does not contain a 'name' column")
                    return None
                return [row['name'] for row in csv_reader]
        except Exception as e:
            print(f"Error reading CSV file {file_path}: {e}")
            return None

    @staticmethod
    def extract_schema_master_data(schema_data: Dict[str, Any]) -> str:
        """Extract master_data information from schema.

        Args:
            schema_data: Dictionary containing schema data

        Returns:
            Formatted string of master data information
        """
        master_data_info = []

        for table_name, table_info in schema_data.items():
            if isinstance(table_info, dict) and 'master_data' in table_info:
                for field, values in table_info['master_data'].items():
                    master_data_info.append(f"{table_name} - {field} = {values}")

        return "\n".join(master_data_info)

    def load_master_data(self) -> Dict[str, str]:
        """Load master data from CSV files.

        Returns:
            Dictionary mapping data types to formatted master data strings
        """
        for key, file_path in self.master_data_paths.items():
            values = self.read_csv_master_data(file_path)
            if values:  # Only add if we got values
                self.master_data_dict[key] = f"{key} - name = {values}"

        return self.master_data_dict

    def load_schema_master_data(self) -> Dict[str, str]:
        """Load master data from schema files.

        Returns:
            Dictionary mapping domains to formatted master data strings
        """
        for schema_file, domain in self.schema_paths.items():
            path = Path(schema_file)
            if not path.exists():
                print(f"Warning: Schema file not found: {schema_file}")
                continue

            try:
                with path.open('r', encoding='utf-8') as f:
                    try:
                        schema_data = json.load(f)
                        self.db_master_data_dict[domain] = self.extract_schema_master_data(schema_data)
                    except json.JSONDecodeError:
                        print(f"Warning: Invalid JSON in schema file: {schema_file}")
            except Exception as e:
                print(f"Error reading schema file {schema_file}: {e}")

        return self.db_master_data_dict

    def load_all(self) -> tuple[Dict[str, str], Dict[str, str]]:
        """Load all master data from both CSV and schema files.

        Returns:
            Tuple containing (master_data_dict, db_master_data_dict)
        """
        self.load_master_data()
        self.load_schema_master_data()
        return self.master_data_dict, self.db_master_data_dict

# Usage example:
# Define master data files
master_data_files = {
    'attendance_statuses': master_data_attendance_path,
    'employment_status_types': master_data_employment_status_path,
    'employment_types': master_data_employment_path
}

# Map schema files to their domains
schema_file_mapping = {
    employee_schema_path: 'employee',
    payroll_schema_path: 'payroll',
    time_management_schema_path: 'time_management'
}

# Create loader and load all data
loader = MasterDataLoader(master_data_files, schema_file_mapping)
master_data_dict, db_master_data_dict = loader.load_all()

# Show result
# print("Master Data Dictionary:")
# print(master_data_dict)
# print("\nDB Master Data Dictionary:")
# print(db_master_data_dict.keys())
# print(db_master_data_dict['employee'])
employee_master_data = master_data_dict['employment_status_types'] + '\n' + master_data_dict['employment_types'] + '\n' + db_master_data_dict['employee']

### Data Trustee Tables

In [14]:
import json
from pathlib import Path
from typing import Dict, Optional

def extract_data_trustee(schema_data: Dict) -> str:
    """Extract data_trustee information from schema.

    Args:
        schema_data: Dictionary containing table schema information

    Returns:
        str: Formatted string of data trustee information with each entry on a new line
    """
    trustee_info = []

    for table_name, table_info in schema_data.items():
        if isinstance(table_info, dict) and 'data_trustee' in table_info:
            trustee_info.append(f"> {table_name}: {table_info['data_trustee']}")

    return "\n".join(trustee_info)

def load_data_trustee_info(schema_paths: Dict[str, str]) -> Dict[str, str]:
    """Load data trustee information from schema files.

    Args:
        schema_paths: Dictionary mapping schema file paths to domain names

    Returns:
        Dict[str, str]: Dictionary mapping domains to formatted data trustee information
    """
    # Initialize the data trustee dictionary with empty strings for each domain
    data_trustee_dict = {domain: "" for domain in set(schema_paths.values())}

    # Process each schema file
    for schema_file, domain in schema_paths.items():
        path = Path(schema_file)
        if not path.exists():
            print(f"Warning: Schema file not found: {schema_file}")
            continue

        try:
            with path.open('r', encoding='utf-8') as f:
                schema_data = json.load(f)
                data_trustee_dict[domain] = extract_data_trustee(schema_data)
        except json.JSONDecodeError:
            print(f"Warning: Invalid JSON in schema file: {schema_file}")
        except Exception as e:
            print(f"Error processing schema file {schema_file}: {e}")

    return data_trustee_dict

# Map files to their domains
file_domain_mapping = {
    employee_schema_path: 'employee',
    payroll_schema_path: 'payroll',
    time_management_schema_path: 'time_management'
}

# Load data trustee information
data_trustee_dict = load_data_trustee_info(file_domain_mapping)

# Example usage:
# print(data_trustee_dict['employee'])
# print(data_trustee_dict['payroll'])
# print(data_trustee_dict['time_management'])

### Anonymized Entities

In [15]:
class Entities:
    PERSON = "PERSON"
    URL = "URL"
    EMAIL_ADDRESS = "EMAIL_ADDRESS"
    KTP = "ID_KTP"
    NPWP = "ID_NPWP"
    PHONE_NUMBER = "PHONE_NUMBER"
    FACEBOOK_ACCOUNT = "FACEBOOK_ACCOUNT"
    FAMILY_CARD_NUMBER = "FAMILY_CARD_NUMBER"
    BANK_ACCOUNT = "BANK_ACCOUNT"

class CatapaEntities:
    BIRTHDATE = "BIRTHDATE"
    EMPLOYEE_IDENTIFICATION_NUMBER = "EMPLOYEE_IDENTIFICATION_NUMBER"

ENTITIES_DESCRIPTION = {
    Entities.PERSON: "Represents an individual human being, identified by a name, which can be a full name or partial "
    "name (e.g., 'John', 'Doe', 'Maria', 'Gomez'). This entity strictly refers to proper names and excludes job titles,"
    " roles, or organizational terms (e.g., 'Jumlah Karyawan Baru').",
    Entities.URL: "A Uniform Resource Locator (URL) is the address used to access a resource on the internet, "
    "typically pointing to a website or document.",
    Entities.EMAIL_ADDRESS: "A unique identifier for electronic mail communication, usually in the format "
    "'user@example.com'.",
    Entities.KTP: "An Indonesian term for 'Kartu Tanda Penduduk', referring to a National Identity Card issued "
    "to citizens of Indonesia.",
    Entities.NPWP: "An Indonesian term for 'Nomor Pokok Wajib Pajak', referring to a Taxpayer Identification "
    "Number issued by the Indonesian tax authority to individuals and entities for tax reporting purposes.",
    Entities.PHONE_NUMBER: "A unique sequence of digits assigned to a telecommunications line, typically used for "
    "voice or text communication, and often associated with an individual or business for contact purposes.",
    Entities.FACEBOOK_ACCOUNT: "A social media account associated with Facebook, typically identified by a username "
    "or profile URL, and used for communication, social networking, and sharing content.",
    Entities.FAMILY_CARD_NUMBER: "A unique identification number assigned to an Indonesian family, typically "
    "consisting of 16 digits. It is associated with the Family Card (Kartu Keluarga), which contains information "
    "about the family members and is used for various administrative and legal purposes in Indonesia.",
    Entities.BANK_ACCOUNT: "A unique identifier assigned to a bank account, used to facilitate financial transactions"
    " such as deposits, withdrawals, and transfers.",
    CatapaEntities.BIRTHDATE: "A date representing the birthdate of an individual",
    CatapaEntities.EMPLOYEE_IDENTIFICATION_NUMBER: "A unique identifier assigned to an employee within an organization."
    " The term 'identification number' is commonly used",
}

In [16]:
def format_entities_description(entities_dict: dict) -> str:
    """Format entities description dictionary into a JSON-like string format.

    This function takes a dictionary of entity descriptions and formats it into a
    properly indented, JSON-like string with escaped quotes for use in prompts.

    Args:
        entities_dict: Dictionary mapping entity names to their descriptions

    Returns:
        str: Formatted string representation of the entities dictionary
    """
    # Create list of formatted key-value pairs
    formatted_entries = []

    for key, value in entities_dict.items():
        # Escape any double quotes in the description
        escaped_value = value.replace('"', '\\"')
        # Format the key-value pair with proper JSON syntax
        formatted_entries.append(f'"{key}": "{escaped_value}"')

    # Join all entries with proper formatting (comma and newline)
    joined_entries = ",\n    ".join(formatted_entries)

    # Add consistent indentation for the entire block
    return "    " + joined_entries

# Generate the formatted string
anonymized_entities_description = format_entities_description(ENTITIES_DESCRIPTION)

# Print the result
# print(anonymized_entities_description)

## Sets up the qa generator prompt templates

### Prompt question generator with sql query

In [17]:
from pydantic import BaseModel, Field
from typing import List
from langchain_core.output_parsers import JsonOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate

from modules.database_info.schema import employee_schema, time_management_schema
from modules.database_info.master_data import employee_master_data, time_management_master_data
from modules.database_info.relation import employee_relations, time_management_relations
from modules.database_info.trustee_tables import data_trustee_employee, data_trustee_time_management
from modules.database_info.anonymize_entities import anonymized_entities_description

In [18]:
SYSTEM_MESSAGE=  """<instructions>
You are a specialized SQL question generator for an HR database. Your task is to create diverse, realistic business questions AND their corresponding SQL queries against the provided HR database schema.

ADDITIONAL GUIDELINES
- Ensure questions are realistic for HR analysis scenarios
- Be specific about time periods, criteria, and expected outputs
- Consider privacy concerns with data trustee fields and anonymized entities
- Create questions that would be valuable for HR decision-making
- Ensure overall coverage of all major tables and SQL concepts
- Make sure the output is in valid, properly formatted JSON
- SQL queries should be correct, optimized, and follow best practices
- Include comments in complex SQL queries to explain the logic
</instructions>"""

USER_MESSAGE = """I need you to generate {total_questions} SQL business questions along with their corresponding SQL queries based on this HR database information:

    ## DATABASE SCHEMA
    {schema}

    ## DATABASE RELATIONS
    {relations}

    ## MASTER DATA
    {master_data}

    ## DATA TRUSTEE (Sensitive Fields)
    {data_trustee_tables}

    ## ANONYMIZED ENTITIES
    {anonymized_entities_description}


    ## SPECIFIC HR ANALYSIS CASES TO INCLUDE
    Your questions should cover these specific HR analysis cases, in addition to others you create:

    1. TURNOVER ANALYSIS:
    - Annual turnover rates over multiple years
    - Turnover by department, job level, or location
    - Reasons for termination analysis

    2. EMPLOYEE HIRING PATTERNS:
    - Quarterly new employee count
    - Hiring trends by department or location
    - Seasonal hiring patterns

    3. MANAGEMENT HIERARCHY:
    - Employee managerial status (Manager vs. Non-Manager)
    - Reporting structure analysis
    - Span of control (number of direct reports per manager)

    4. ATTENDANCE AND TIME TRACKING:
    - Attendance after holidays
    - Absence patterns by department or job level
    - Attendance compliance analysis

    5. EMPLOYMENT STATUS CHANGES:
    - Promotion/demotion rates
    - Contract renewals and conversions
    - Department transfer patterns

    6. DEMOGRAPHIC ANALYSIS:
    - Age distribution
    - Gender representation by department or job level
    - Diversity metrics across locations

    7. EDUCATION AND EXPERIENCE:
    - Education level distribution
    - Qualification analysis by job title
    - Prior experience correlation with job placement

    8. FAMILY AND PERSONAL:
    - Family composition analysis
    - Marital status distribution
    - Family relation type patterns

    9. EMPLOYMENT CONTRACTS:
    - Contract type distribution
    - Contract duration analysis
    - Employment type trends over time

    10. ORGANIZATIONAL STRUCTURE:
        - Department size comparison
        - Location distribution analysis
        - Job level distribution by organization

    11. FINANCIAL ANALYSIS:
        - Salary payment tracking
        - Bank transfer analysis
        - Bonus and additional income distribution
        - Total compensation analysis by organizational unit

    12. CUSTOM DATA FIELDS:
        - Custom metrics tracking and comparison
        - Employee variable analysis
        - Trend analysis of custom data fields

    13. TIME-BASED ANALYSIS:
        - Year-to-date comparisons
        - Month-over-month trends
        - Quarterly analysis
        - Custom date range reporting

    14. DATA VISUALIZATION SUPPORT:
        - Queries formatted for chart generation
        - Time-series data for trend visualization
        - Aggregated data for dashboard displays

    ## TASK
    IMPORTANT: You MUST generate EXACTLY {total_questions} distinct business questions with their corresponding SQL queries. No more, no less. The questions should:

    1. Cover all major tables and relationships in the database
    2. Range across different complexity levels:
    - Basic level (33 percent of questions):
        * Simple queries with 1-2 tables
        * Simple WHERE conditions
        * Basic aggregations (COUNT, SUM, AVG)
        * Simple ORDER BY and GROUP BY
    - Intermediate level (33 percent of questions):
        * 2-4 tables with multiple joins
        * Subqueries
        * Window functions
        * More complex filtering
    - Advanced level (34 percent of questions):
        * 3-4+ tables with complex joins
        * Nested subqueries
        * Complex queries with CTEs
        * Hierarchical data queries
        * Multiple subqueries
        * Advanced SQL features
    3. Include various SQL concepts such as:
        - Simple selection and filtering
        - Multiple table joins (INNER, LEFT, RIGHT, FULL OUTER)
        - Aggregation functions (COUNT, SUM, AVG, MIN, MAX)
        - Grouping and having clauses
        - Subqueries (correlated and non-correlated)
        - Common Table Expressions (CTEs) and recursive CTEs
        - Window functions (ROW_NUMBER, RANK, DENSE_RANK, NTILE, LAG, LEAD)
        - Date-based operations and filtering
        - Conditional logic (CASE statements, COALESCE, NULLIF)
        - Hierarchical data handling (manager-employee relationships)
        - Historical data analysis (using employment status histories)
        - Set operations (UNION, INTERSECT, EXCEPT)
        - String functions and pattern matching (LIKE, REGEXP)
        - Pivot and unpivot operations
        - Temporary tables and table variables
        - Data type conversions and casting
        - Advanced grouping (ROLLUP, CUBE, GROUPING SETS)
    4. Format the queries appropriately for your database, using the following standards:
        - Use STR_TO_DATE for date formatting and comparison
        - Always include standard organization, job level, and location filters where applicable
        - Use appropriate date functions for date calculations
        - Format query output in Bahasa Indonesia where needed
        - Include the organization_id, job_level_id, and location_id filtering using the placeholder format: IN ([ORGANIZATION_IDS]), IN ([JOB_LEVEL_IDS]), IN ([LOCATION_IDS])

    ## LINGUISTIC VARIATION REQUIREMENTS
    To ensure diverse question phrasing, use a wide variety of question structures such as:
    1. Direct questions: "Siapa karyawan dengan..."
    2. Comparative questions: "Bagaimana perbandingan antara..."
    3. Trend analysis: "Bagaimana tren perekrutan selama..."
    4. Ranking questions: "Urutkan departemen berdasarkan..."
    5. Percentage-based: "Berapa persentase karyawan yang..."
    6. Ratio questions: "Apa rasio antara karyawan..."
    7. Time-based analysis: "Kapan terjadi lonjakan..."
    8. Conditional questions: "Dalam kondisi apa departemen..."
    9. Outlier identification: "Identifikasi karyawan yang..."
    10. Distribution questions: "Bagaimana distribusi gaji..."
    11. Correlation questions: "Adakah korelasi antara..."
    12. Projection questions: "Proyeksikan jumlah karyawan..."
    13. Threshold questions: "Temukan departemen dengan tingkat turnover di atas..."
    14. Pattern identification: "Pola apa yang terlihat dalam..."
    15. Anomaly detection: "Temukan anomali dalam data..."

    ## OUTPUT FORMAT
    Provide the output in valid JSON format as follows. The examples below are just for reference - YOUR QUESTIONS MUST BE COMPLETELY DIFFERENT:

    ```json
    {{
    "sql_questions": [
        {{
            "question_id": 1,
            "category": "EMPLOYMENT ANALYSIS",
            "complexity": "Intermediate",
            "business_question": "Bagaimana perbandingan jumlah karyawan berdasarkan organisasi? Munculkan nama organisasi dan total karyawan.",
            "required_tables": ["employees", "employment_statuses", "organizations"],
            "sql_concepts": ["JOIN", "COUNT", "GROUP BY", "WHERE"],
            "sql_query": "SELECT organizations.name AS \"organization_name\", COUNT(employees.id) AS \"total_employees\" FROM employees JOIN employment_statuses ON employees.id = employment_statuses.employee_id JOIN organizations ON employment_statuses.organization_id = organizations.id WHERE employees.active = TRUE AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]') AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]') AND employment_statuses.location_id IN ('[LOCATION_IDS]') GROUP BY organizations.name;"
        }},
        {{
            "question_id": 2,
            "category": "TURNOVER ANALYSIS",
            "complexity": "Intermediate",
            "business_question": "Bagaimana data termination berdasarkan nama jabatan?",
            "required_tables": ["termination_entries", "employees", "employment_statuses", "job_titles"],
            "sql_concepts": ["JOIN", "COUNT", "GROUP BY", "WHERE"],
            "sql_query": "SELECT job_titles.name AS \"job_title_name\", COUNT(termination_entries.id) AS \"termination_count\" FROM termination_entries JOIN employees ON termination_entries.employee_id = employees.id JOIN employment_statuses ON employees.id = employment_statuses.employee_id JOIN job_titles ON employment_statuses.job_title_id = job_titles.id WHERE termination_entries.approval_status = 'APPROVED' AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]') AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]') AND employment_statuses.location_id IN ('[LOCATION_IDS]') GROUP BY job_titles.name;"
        }},
        {{
            "question_id": 3,
            "category": "TURNOVER ANALYSIS",
            "complexity": "Basic",
            "business_question": "Berikan nama employee beserta termination reasonnya untuk data termination yang belum di approve",
            "required_tables": ["termination_entries", "employees", "termination_reasons", "employment_statuses"],
            "sql_concepts": ["JOIN", "LEFT JOIN", "WHERE"],
            "sql_query": "SELECT employees.name AS \"employee_name\", termination_reasons.name AS \"Termination Reason\" FROM termination_entries JOIN employees ON termination_entries.employee_id = employees.id LEFT JOIN termination_reasons ON termination_entries.termination_reason_id = termination_reasons.id JOIN employment_statuses ON employees.id = employment_statuses.employee_id WHERE termination_entries.approval_status <> 'APPROVED' AND employment_statuses.organization_id IN ('[ORGANIZATION_IDS]') AND employment_statuses.job_level_id IN ('[JOB_LEVEL_IDS]') AND employment_statuses.location_id IN ('[LOCATION_IDS]');"
        }},
        {{
            "question_id": 4,
            "category": "FINANCIAL ANALYSIS",
            "complexity": "Intermediate",
            "business_question": "Berikan total transfer untuk bank 'ALLO BANK' di bulan Oktober 2023. Tampilkan total transfer nya saja.",
            "required_tables": ["salary_payment_summaries", "company_bank_accounts", "bank_branches", "banks"],
            "sql_concepts": ["JOIN", "SUM", "WHERE", "BETWEEN", "STR_TO_DATE"],
            "sql_query": "SELECT SUM(salary_payment_summaries.transferred_amount) AS 'total_transfer' FROM salary_payment_summaries JOIN company_bank_accounts ON salary_payment_summaries.company_bank_account_id = company_bank_accounts.id JOIN bank_branches ON company_bank_accounts.bank_branch_id = bank_branches.id JOIN banks ON bank_branches.bank_id = banks.id WHERE banks.name = 'ALLO BANK' AND salary_payment_summaries.payment_date BETWEEN STR_TO_DATE('2023-10-01', '%Y-%m-%d') AND STR_TO_DATE('2023-10-31', '%Y-%m-%d');"
        }},
        {{
            "question_id": 5,
            "category": "FINANCIAL ANALYSIS",
            "complexity": "Intermediate",
            "business_question": "Berapa total transfer dari bank 'ALLO BANK' dan 'BCA' selama tahun 2023?",
            "required_tables": ["salary_payment_summaries", "company_bank_accounts", "bank_branches", "banks"],
            "sql_concepts": ["JOIN", "SUM", "WHERE", "IN", "BETWEEN", "STR_TO_DATE", "GROUP BY"],
            "sql_query": "SELECT banks.name AS 'bank_name', SUM(salary_payment_summaries.transferred_amount) AS 'total_transferred_amount' FROM salary_payment_summaries JOIN company_bank_accounts ON salary_payment_summaries.company_bank_account_id = company_bank_accounts.id JOIN bank_branches ON company_bank_accounts.bank_branch_id = bank_branches.id JOIN banks ON bank_branches.bank_id = banks.id WHERE banks.name IN ('ALLO BANK', 'BCA') AND salary_payment_summaries.payment_date BETWEEN STR_TO_DATE('2023-01-01', '%Y-%m-%d') AND STR_TO_DATE('2023-12-31', '%Y-%m-%d') GROUP BY banks.name;"
        }},
        {{
            "question_id": 6,
            "category": "FINANCIAL ANALYSIS",
            "complexity": "Intermediate",
            "business_question": "Berapa total uang yang ditransfer untuk karyawan yang aktif di bulan Oktober 2023?",
            "required_tables": ["salary_payment_summaries", "salary_payments", "employees", "employment_statuses"],
            "sql_concepts": ["JOIN", "SUM", "WHERE", "BETWEEN", "STR_TO_DATE"],
            "sql_query": "SELECT SUM(salary_payment_summaries.transferred_amount) AS 'total_transferred_amount' FROM salary_payment_summaries JOIN salary_payments ON salary_payment_summaries.id = salary_payments.salary_payment_summary_id JOIN employees ON salary_payments.employee_id = employees.id JOIN employment_statuses ON employees.id = employment_statuses.employee_id WHERE employees.active = TRUE AND employment_statuses.organization_id IN ([ORGANIZATION_IDS]) AND employment_statuses.job_level_id IN ([JOB_LEVEL_IDS]) AND employment_statuses.location_id IN ([LOCATION_IDS]) AND salary_payment_summaries.payment_date BETWEEN STR_TO_DATE('2023-10-01', '%Y-%m-%d') AND STR_TO_DATE('2023-10-31', '%Y-%m-%d');"
        }},
        // The rest questions here...
    ]
}}

IMPORTANT REQUIREMENTS:
1. DO NOT copy the example questions - create completely new ones
2. EVERY question must include ALL required fields (question_id, category, complexity, business_question, required_tables, sql_concepts, sql_query)
3. Ensure exactly {total_questions} questions are generated
4. Maintain the exact JSON structure shown above
5. Verify that each question has the correct complexity level to maintain the 33/33/34 percent distribution
"""

In [19]:
class QnAPair(BaseModel):
    paraphrased_input: str = Field(description="paraphrased_input")
    test_to_sql_query: str = Field(description="test_to_sql_query")

class QnAPairs(BaseModel):
    qna_pairs: List[QnAPair] = Field(description="list of qna pairs")

## OpenAI / DeepSeek
DEEPSEEK_MODEL_NAME = os.getenv("DEEPSEEK_MODEL")
DEEPSEEK_ENDPOINT = os.getenv("DEEPSEEK_ENDPOINT")
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
print(DEEPSEEK_MODEL_NAME, DEEPSEEK_ENDPOINT, DEEPSEEK_API_KEY)

llm = ChatOpenAI(
    model_name=DEEPSEEK_MODEL_NAME,
    temperature=0.7,  # Higher temperature (0.7-0.9) for more creative variations
    openai_api_base=DEEPSEEK_ENDPOINT,
    openai_api_key=DEEPSEEK_API_KEY,
    top_p=0.95,  # Keep high top_p for diverse outputs while filtering unlikely tokens
    seed=42  # Optional: set seed for reproducibility
)

# Create prompt template
system_message_prompt = SystemMessagePromptTemplate.from_template(SYSTEM_MESSAGE)
human_message_prompt = HumanMessagePromptTemplate.from_template(USER_MESSAGE)
prompt = ChatPromptTemplate.from_messages([
    system_message_prompt,
    human_message_prompt
])
parser = JsonOutputParser(pydantic_object=QnAPairs)
chain = prompt | llm | parser

deepseek-chat https://api.deepseek.com/v1 sk-672c188728734ae5bd09d1b6b30139a2


### Sanity check

In [20]:
from datetime import datetime

current_date = datetime.now().strftime("%d %B %Y")
total_questions = 1
database_type = "employee"

if database_type == "employee":
    schema = employee_schema
    relations = employee_relations
    master_data = employee_master_data
    data_trustee_tables = data_trustee_employee
    master_data = employee_master_data
else:
    schema = time_management_schema
    relations = time_management_relations
    master_data = time_management_master_data
    data_trustee_tables = data_trustee_time_management
    master_data = time_management_master_data

response = chain.invoke({
    "schema": schema,
    "relations": relations,
    "master_data": master_data,
    "data_trustee_tables": data_trustee_tables,
    "anonymized_entities_description": anonymized_entities_description,
    "current_date": current_date,
    "total_questions": total_questions
})
response

2025-04-11 00:32:55,396 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


{'sql_questions': [{'question_id': 1,
   'category': 'DEMOGRAPHIC ANALYSIS',
   'complexity': 'Advanced',
   'business_question': 'Bagaimana distribusi usia karyawan berdasarkan jenis kelamin dan tingkat pekerjaan? Tampilkan kelompok usia (20-29, 30-39, 40-49, 50+), jenis kelamin, dan nama tingkat pekerjaan.',
   'required_tables': ['employees',
    'employee_details',
    'job_levels',
    'employment_statuses'],
   'sql_concepts': ['JOIN',
    'CASE',
    'GROUP BY',
    'DATE functions',
    'Multiple table joins'],
   'sql_query': "WITH employee_age AS (\n    SELECT \n        e.id,\n        ed.gender,\n        jl.name AS job_level_name,\n        TIMESTAMPDIFF(YEAR, ed.date_of_birth, CURDATE()) AS age\n    FROM employees e\n    JOIN employee_details ed ON e.id = ed.employee_id\n    JOIN employment_statuses es ON e.id = es.employee_id\n    JOIN job_levels jl ON es.job_level_id = jl.id\n    WHERE e.active = TRUE\n    AND es.organization_id IN ([ORGANIZATION_IDS])\n    AND es.job_level

### Generate SQL Question and SQL Query


In [21]:
class ColumnName:
    NO = "No"
    PROMPT = "Prompt"
    COMPLEXITY = "Complexity"
    CATEGORY = "Category"
    REQUIRED_TABLES = "Required Tables"
    SQL_CONCEPTS = "SQL Concepts"
    EXPECTED_SQL_QUERY = "Expected SQL Query"
    GENERATED_SQL_QUERY = "Generated SQL Query"

In [22]:
from datetime import datetime
import pandas as pd

def generate_questions_in_batches(total_questions: int, batch_size: int = 25, database_type: str = "employee") -> list:
    """Generate SQL questions in batches with deduplication to avoid token limit issues.

    Args:
        total_questions (int): Total number of questions to generate
        batch_size (int): Number of questions to generate in each batch

    Returns:
        list: Combined list of all generated questions with no duplicates
    """
    all_questions = []
    generated_question_texts = set()  # Track question text to avoid duplicates

    # Calculate initial number of batches
    remaining_questions = total_questions
    batch_num = 0

    # Continue until we have enough unique questions
    while len(all_questions) < total_questions:
        batch_num += 1
        # Calculate questions for this batch
        questions_in_batch = min(batch_size, remaining_questions)

        # Update the prompt to request a specific range of question IDs
        start_id = len(all_questions) + 1

        # Create a list of already generated questions to avoid duplicates
        existing_questions_summary = ""
        if all_questions:
            # Create a summary of existing questions (limited to avoid token issues)
            sample_questions = all_questions[-min(10, len(all_questions)):]  # Last 10 questions
            existing_questions_summary = "PREVIOUSLY GENERATED QUESTIONS (DO NOT DUPLICATE THESE):\n" + "\n".join([
                f"{q['question_id']}. {q['business_question']}" for q in sample_questions
            ])

        # Modify the USER_MESSAGE to include batch information and anti-duplication instructions
        batch_user_message = USER_MESSAGE.replace(
            "{total_questions}", str(questions_in_batch)
        )

        if existing_questions_summary:
            batch_user_message += f"\n\n{existing_questions_summary}\n\nIMPORTANT: Generate COMPLETELY NEW questions that are different from the above. Do not duplicate concepts, metrics, or specific analysis approaches."

        # Create batch-specific prompt
        batch_system_message = SystemMessagePromptTemplate.from_template(SYSTEM_MESSAGE)
        batch_human_message = HumanMessagePromptTemplate.from_template(batch_user_message)
        batch_prompt = ChatPromptTemplate.from_messages([
            batch_system_message,
            batch_human_message
        ])

        # Create chain and invoke
        batch_chain = batch_prompt | llm | parser
        current_date = datetime.now().strftime("%d %B %Y")

        if database_type == "employee":
            schema = employee_schema
            relations = employee_relations
            master_data = employee_master_data
            data_trustee_tables = data_trustee_employee
            master_data = employee_master_data
        else:
            schema = time_management_schema
            relations = time_management_relations
            master_data = time_management_master_data
            data_trustee_tables = data_trustee_time_management
            master_data = time_management_master_data

        try:
            response = batch_chain.invoke({
                "schema": schema,
                "relations": relations,
                "master_data": master_data,
                "data_trustee_tables": data_trustee_tables,
                "anonymized_entities_description": anonymized_entities_description,
                "current_date": current_date,
                "total_questions": questions_in_batch
            })

            # Filter out duplicate questions
            unique_questions = []
            for question in response['sql_questions']:
                # Normalize the question text for comparison (remove spaces, lowercase)
                normalized_text = question['business_question'].lower().replace(" ", "")

                # Check if this is a new unique question
                if normalized_text not in generated_question_texts:
                    # Adjust question ID
                    question['question_id'] = start_id + len(unique_questions)
                    unique_questions.append(question)
                    generated_question_texts.add(normalized_text)

            # Add to our collection
            all_questions.extend(unique_questions)

            print(f"Batch {batch_num}: Generated {len(unique_questions)} unique questions (Total: {len(all_questions)}/{total_questions})")

            # Update remaining questions
            remaining_questions = total_questions - len(all_questions)

            # If we didn't get any unique questions in this batch, we might be stuck
            if len(unique_questions) == 0 and batch_num > 10:
                print("Warning: Not generating any new unique questions. Breaking loop.")
                break

        except Exception as e:
            print(f"Error in batch {batch_num}: {e}")
            # Continue with next batch even if this one fails

    return all_questions[:total_questions]  # Ensure we don't return more than requested

# Generate all questions using the enhanced batching function
total_questions = 200
# database_type = "employee"
database_type = "time"
all_generated_questions = generate_questions_in_batches(total_questions, database_type=database_type)

2025-04-11 00:33:31,389 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


Error in batch 1: 'business_question'


2025-04-11 00:36:08,550 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


Batch 2: Generated 18 unique questions (Total: 18/200)


2025-04-11 00:38:47,504 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


Batch 3: Generated 17 unique questions (Total: 35/200)


2025-04-11 00:41:24,205 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


Batch 4: Generated 15 unique questions (Total: 50/200)


2025-04-11 00:44:03,033 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


Batch 5: Generated 18 unique questions (Total: 68/200)


2025-04-11 00:46:40,858 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


Batch 6: Generated 14 unique questions (Total: 82/200)


2025-04-11 00:49:18,418 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


Batch 7: Generated 14 unique questions (Total: 96/200)


2025-04-11 00:51:56,632 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


Batch 8: Generated 13 unique questions (Total: 109/200)


2025-04-11 00:54:36,514 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


Error in batch 9: 'business_question'


2025-04-11 00:57:15,795 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


Error in batch 10: 'business_question'


2025-04-11 01:00:03,950 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


Batch 11: Generated 15 unique questions (Total: 124/200)


2025-04-11 01:02:41,059 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-11 01:04:48,419 - INFO - Retrying request to /chat/completions in 0.402129 seconds
2025-04-11 01:04:49,499 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


Batch 12: Generated 15 unique questions (Total: 139/200)


2025-04-11 01:07:25,854 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


Batch 13: Generated 21 unique questions (Total: 160/200)


2025-04-11 01:10:05,067 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


Batch 14: Generated 19 unique questions (Total: 179/200)


2025-04-11 01:12:41,632 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


Batch 15: Generated 14 unique questions (Total: 193/200)


2025-04-11 01:15:17,479 - INFO - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"


Batch 16: Generated 7 unique questions (Total: 200/200)


In [23]:
# Create DataFrame from all questions
df_syntetic_data = pd.DataFrame(columns=[
    ColumnName.NO,
    ColumnName.CATEGORY,
    ColumnName.COMPLEXITY,
    ColumnName.PROMPT,
    ColumnName.REQUIRED_TABLES,
    ColumnName.SQL_CONCEPTS,
    ColumnName.GENERATED_SQL_QUERY
])


for qna_pair in all_generated_questions:
    try:
        new_row = {
            ColumnName.NO: qna_pair['question_id'],
            ColumnName.CATEGORY: qna_pair['category'],
            ColumnName.COMPLEXITY: qna_pair['complexity'],
            ColumnName.PROMPT: qna_pair['business_question'],
            ColumnName.REQUIRED_TABLES: qna_pair['required_tables'],
            ColumnName.SQL_CONCEPTS: qna_pair['sql_concepts'],
            ColumnName.GENERATED_SQL_QUERY: qna_pair['sql_query']
        }
        df_syntetic_data = pd.concat([df_syntetic_data, pd.DataFrame([new_row])], ignore_index=True)
    except:
        print(qna_pair)

# Save to CSV
from pathlib import Path
syntetic_data_dir = Path('sql_generator_result')
syntetic_data_dir.mkdir(parents=True, exist_ok=True)
df_syntetic_data.to_csv(syntetic_data_dir / 'time_200_batch_2.csv', index=False)
display(df_syntetic_data)
print(f"Successfully generated {len(df_syntetic_data)} unique questions")

{'question_id': 50, 'category': 'EMPLOYMENT ANALYSIS', 'complexity': 'Intermediate', 'business_question': 'Berapa lama rata-rata karyawan tetap di perusahaan berdasarkan jenis kontrak kerja awal mereka?'}
{'question_id': 82, 'category': 'EMPLOYEE HIRING PATTERNS', 'complexity': 'Advanced', 'business_question': 'Bagaimana pola rekrutmen karyawan baru berdasarkan jenis kontrak kerja dan departemen selama 3 tahun terakhir,'}
{'question_id': 109, 'category': 'EMPLOYMENT ANALYSIS', 'complexity': 'Intermediate', 'business_question': 'Berapa lama rata-rata karyawan bertahan di'}
{'question_id': 193, 'category': 'PRESENCE ANALYSIS', 'complexity': 'Advanced', 'business_question': 'Bagaimana tingkat kehadiran karyawan (persentase hadir vs tidak hadir) per minggu selama 3 bulan terakhir?', 'required_tables': ['presence_entries', 'attendance_statuses'], 'sql_concepts': ['CTE', 'DATE functions', 'C']}


Unnamed: 0,No,Category,Complexity,Prompt,Required Tables,SQL Concepts,Generated SQL Query
0,1,EMPLOYEE HIRING PATTERNS,Basic,Berapa jumlah karyawan baru yang direkrut seti...,[employees],"[COUNT, GROUP BY, YEAR, MONTH, WHERE]","SELECT MONTH(join_date) AS month, COUNT(*) AS ..."
1,2,ATTENDANCE AND TIME TRACKING,Basic,Berapa rata-rata keterlambatan karyawan (dalam...,"[attendance_detail_recapitulations, shifts]","[JOIN, AVG, GROUP BY, WHERE, DATE_FORMAT]","SELECT DATE_FORMAT(date, '%Y-%m') AS month, AV..."
2,3,EMPLOYMENT STATUS CHANGES,Intermediate,Berapa jumlah promosi (perubahan job level) ya...,"[employment_status_histories, job_levels]","[JOIN, COUNT, GROUP BY, QUARTER, WHERE]","SELECT QUARTER(effective_date) AS quarter, COU..."
3,4,DEMOGRAPHIC ANALYSIS,Intermediate,Bagaimana distribusi usia karyawan saat ini be...,"[employee_details, employment_statuses, organi...","[JOIN, FLOOR, GROUP BY, WHERE]","SELECT organizations.name AS department, FLOOR..."
4,5,EDUCATION AND EXPERIENCE,Intermediate,Apa distribusi tingkat pendidikan karyawan ber...,"[educations, education_levels, employment_stat...","[JOIN, COUNT, GROUP BY]","SELECT job_titles.name AS job_title, education..."
...,...,...,...,...,...,...,...
191,196,EMPLOYMENT CONTRACTS,Intermediate,Berapa jumlah karyawan kontrak yang akan berak...,"[employment_statuses, employment_types, employ...","[JOIN, COUNT, DATE_ADD, GROUP BY, WHERE, BETWEEN]","SELECT employment_types.name AS contract_type,..."
192,197,DEMOGRAPHIC ANALYSIS,Basic,Bagaimana distribusi karyawan berdasarkan golo...,"[employee_details, employees]","[JOIN, COUNT, GROUP BY, WHERE]","SELECT employee_details.blood_type, employee_d..."
193,198,MANAGEMENT HIERARCHY,Advanced,Berapa jumlah karyawan yang memiliki lebih dar...,"[employees, employment_statuses, locations]","[SUBQUERY, JOIN, COUNT, GROUP BY, HAVING, WHERE]","SELECT m.name AS manager_name, COUNT(e.id) AS ..."
194,199,EDUCATION AND EXPERIENCE,Advanced,Bagaimana korelasi antara tingkat pendidikan d...,"[educations, education_levels, job_experiences...","[JOIN, AVG, DATEDIFF, GROUP BY, WHERE, SUBQUERY]",SELECT education_levels.name AS education_leve...


Successfully generated 196 unique questions


## Upload to Google Sheets

In [14]:
df_syntetic_data

Unnamed: 0,source,paraphrased_input,ner_label,context
0,test_case,pemimpin perusahaan kami bernama Ahmad Fauzi,pemimpin perusahaan kami bernama <PER>Ahmad Fa...,nama bos saya adalah On Lee
1,test_case,direktur utama di kantor saya adalah Siti Rahayu,direktur utama di kantor saya adalah <PER>Siti...,nama bos saya adalah On Lee


In [17]:
from module.google_sheets_writer import GoogleSheetsWriter
import logging


writer = GoogleSheetsWriter(
    google_util=google,  # Your GoogleUtil instance
    sheet_id=GOOGLE_SPREADSHEET_ID,
    worksheet_name=SYNTETIC_DATA_SHEET_NAME,
    batch_size=10,  # Customize batch size
    max_retries=5,  # Customize retry attempts
    batch_delay=2  # Customize delay between batches
)
# Write the DataFrame
result = writer.write_dataframe(df_syntetic_data)

# Log results
logging.info(f"Successfully wrote {result.successful_rows} rows")
if result.failed_rows > 0:
    logging.error(f"Failed to write {result.failed_rows} rows")
    for error in result.errors:
        logging.error(f"Row {error['row_number']}: {error['error']}")

  0%|          | 0/1 [00:00<?, ?it/s]2025-01-17 20:24:02,563 - INFO - Successfully wrote row 1/2
2025-01-17 20:24:04,427 - INFO - Successfully wrote row 2/2
100%|██████████| 1/1 [00:03<00:00,  3.90s/it]
2025-01-17 20:24:04,441 - INFO - Successfully wrote 2 rows
