In [1]:
import re
import os
import json
import csv
import duckdb as db
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
# from pydantic import BaseModel
from typing import Any, Dict, List, Tuple, TypedDict, Annotated
from textwrap import dedent
# from crewai_tools import tool, FileWriterTool
from crewai import Agent, Crew, Process, Task
# from langchain_ollama.llms import OllamaLLM
from langchain_groq import ChatGroq
from langchain_google_genai import ChatGoogleGenerativeAI

  from .autonotebook import tqdm as notebook_tqdm
* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed


# CrewAI

In [2]:
# model = ChatGroq(
#     model="llama3-8b-8192",
#     temperature = 0,
# )

In [16]:
model = ChatGroq(
    model="llama-3.1-70b-versatile",
    temperature=0
)

In [17]:
# model = ChatGoogleGenerativeAI(
#     model='gemini-1.5-pro-exp-0801',
#     temperature=0
# )

In [18]:
from os import listdir
from os.path import isfile, join

In [19]:
def query_list(file_path: str) -> List:
    with open(file_path) as f:
        s = f.read()
    
    return s

In [27]:
def load_queries(folder_path: str):
    file_list = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]
    file_list.sort()
    queries = []  

    for sql_file in file_list:
        q = query_list(join(folder_path,sql_file))
        queries.append({
            'query_name': sql_file[:-4],
            'sql_code': q 
        })          

    return queries

In [28]:
path = 'sql_files/bn_beneficiario/'

In [29]:
queries = load_queries(path)

In [30]:
project_manager = Agent(
    role = "Project Manager",
    goal = "Ensure that the project has all queries mapped correctly for the success of the migration project.",
    backstory=dedent(
        """
        You're a seasoned Project Manager with over 15 years of experience in data management and IT transformations, 
        is known for her ability to navigate complex projects with precision and foresight. You began her career as a data analyst, 
        where developed a deep understanding of SQL and traditional ETL (Extract, Transform, Load) processes. 
        Over the years, transitioned into project management, where led numerous successful data migration and modernization projects.
        You are in charge of a project that aims to transition the company's aging ETL processes, heavily reliant on SQL, 
        to a modern data stack that leverages ELT (Extract, Load, Transform) processes using PySpark. 
        The goal is to enhance data processing efficiency, scalability, and maintainability.
        """
    ),
    llm=model,
    allow_delegation=False
)

In [31]:
sql_analyst = Agent(
    role = "Senior Data Analyst",
    goal = "Analyze complicate SQL queries and extract the relationship between table name and column name of all tables in queries.",
    backstory=dedent(
        """
        You're a highly specialized developed to dissect and understand complex SQL queries,
        you could quickly and accurately extract essential information from intricate SQL statements, 
        including that ones that has many sub queries.
        Your key traits are Analytical prowess, Attention to detail, Vast knowledge of SQL syntax 
        across multiple database systems.
        """
    ),
    llm=model,
    allow_delegation=False
)

In [32]:
get_table_names = Task(
    description=dedent(
        """ 
        Search this SQL query {sql_code} for all table names involved.
        It is very important not to ignore any tables. In complex queries, 
        there are some subqueries that must be observed carefully.
        Do it line by line, get all table names and their alias when they are present.
        """
    ),
    expected_output="List of distinct table names and alias present in the query.",
    agent=project_manager,
    allow_delegation=True
)

In [33]:
extract_tables_columns = Task(
    description=dedent(
        """
        Analyze the SQL query provided below: {sql_code}
        Use the list of table names and their aliases extracted in the previous step and find the columns for each of these tables.
        For each line of the code do:
        1. **Identify Table Names and Aliases**:
        Extract all table names along with their aliases used in the query.
        2. **Identify Columns and Aliases**:
        For each table identified, list all the columns and their corresponding aliases (if any). If no alias is provided, leave the alias field blank.
        3. **Output Format**:
        Present the information in the following format:
        table_name;alias;column_name;column_alias 
        table1;alias1;columnName_n1;columnAlias_n1 
        table1;alias1;columnName_n2;columnAlias_n2 
        table2;alias2;columnName_n1;columnAlias_n1

        4. **Special Instructions**:
        - If a column does not have an alias, repeat the column name in the `column_alias` field.
        - Ensure all extracted data adheres strictly to the format specified.        
        """
    ),
    expected_output="Formated CSV file.",
    agent=sql_analyst,
    context=[get_table_names]
    #callback=lambda result: result_collector.add_result(result)
)

In [34]:
revision = Task(
    description=dedent(
        """
        Analyze the SQL query provided: {sql_code}
        Use the csv file generated in the previous step and and revise if all tables and columnas were extracted,
        if not include the tables and column names that are missing.

        **Output Format**:
        Present the information in the following format:
        table_name;alias;column_name;column_alias 
        table1;alias1;columnName_n1;columnAlias_n1 
        table1;alias1;columnName_n2;columnAlias_n2 
        table2;alias2;columnName_n1;columnAlias_n1        

        """
    ),
    expected_output="Formated CSV file.",
    agent=project_manager,
    context=[extract_tables_columns]
    #callback=lambda result: result_collector.add_result(result)
)

In [35]:
extract_rules = Task(
    description=dedent(
        """
        Analyze the SQL query provided: {sql_code}
        Use the csv file generated in the previous step and find and extract following points:
        
        1. **Identify SQL Functions and Concatenations**:
        Locate all instances of SQL functions (e.g., `NVL`, `DECODE`, `CASE`, `SUM`, etc.) or concatenations used in the query. Refer to these as "rules."

        2. **Extract Corresponding Aliases**:
        For each rule identified, find the corresponding alias. The alias is the final name given to the column in the query.

        3. **Extract Column Names**:
        Identify the column name associated with each rule and alias.

        4. **Output Format**:
        Present the information in the following format:
        column_name;alias;rule 
        columnName_n1;alias_n1;"rule_n1" 
        columnName_n2;alias_n2;"rule_n2" 
        columnName_n3;alias_n3;"rule_n3"

        - **Note**: Only include columns where a rule is found.
        - If any element (e.g., alias) is missing, indicate it with `NULL`.

        5. **Final Instructions**:
        Ensure that all extracted data adheres strictly to the format specified.
        """
    ),
    expected_output="Formated CSV file.",
    agent=sql_analyst,
    context=[extract_tables_columns]
    #callback=lambda result: result_collector.add_result(result)
)

In [36]:
crew = Crew(
    agents = [project_manager,sql_analyst],
    tasks = [get_table_names, extract_tables_columns, revision, extract_rules],
    process = Process.sequential,
    verbose = 0,
    memory=False,
    output_log_file="crew.log",
)

In [37]:
def save_to_parquet(data, filename, header: list):
    # Split the string by newlines to get rows
    rows = data.split('\n')
    
    # Split each row by semicolons to get columns
    formatted_data = [row.split(';') for row in rows]
    
    # Output filename
    output_filename = f"{'output'}/{filename}.parquet"

    df = pd.DataFrame(formatted_data, columns=header)
    table = pa.Table.from_pandas(df)
    pq.write_table(table,output_filename)

In [38]:
def save_to_csv(data, filename, header: list):
    # Split the string by newlines to get rows
    rows = data.split('\n')
    
    # Split each row by semicolons to get columns
    formatted_data = [row.split(';') for row in rows]
    
    # Output filename
    output_filename = f"{'output'}/{filename}.csv"

    # Write the formatted data to a CSV file
    with open(output_filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',',quotechar =',',quoting=csv.QUOTE_MINIMAL)
        writer.writerow(header)  # Write header
        writer.writerows(formatted_data)

In [40]:
for query in queries:
    print(query['query_name'])
    file_name = query['query_name']
    result = crew.kickoff(inputs=query)
    task_output = revision.output
    save_to_csv(task_output.raw, file_name, ['table_name','alias','column_name','column_alias'])
    task_output = extract_rules.output
    save_to_csv(task_output.raw, file_name+'_rules', ['column_name','alias','rule'])

    #result_collector.add_result('export_tables_columns',output.raw,file_name)

01_beneficiario
02_sam_familia_teto_pf
03_1_busca_microsiga
03_2_sem_setor


# Processa CSVs

In [12]:
db.sql(
"""     
    select 
        *
    from 'output/03_2_sem_setor.csv'
""")

┌──────────────────┬─────────┬─────────────────────┬─────────────────────┐
│    table_name    │  alias  │     column_name     │    column_alias     │
│     varchar      │ varchar │       varchar       │       varchar       │
├──────────────────┼─────────┼─────────────────────┼─────────────────────┤
│ SAM_BENEFICIARIO │ BEN     │ HANDLE              │ HANDLE              │
│ SAM_BENEFICIARIO │ BEN     │ ENDERECORESIDENCIAL │ ENDERECORESIDENCIAL │
│ SAM_ENDERECO     │ ENDR    │ DDD1                │ DDD1                │
│ SAM_ENDERECO     │ ENDR    │ PREFIXO1            │ PREFIXO1            │
│ SAM_ENDERECO     │ ENDR    │ NUMERO1             │ NUMERO1             │
│ SAM_ENDERECO     │ ENDR    │ HANDLE              │ HANDLE              │
│ SAM_BENEFICIARIO │ NULL    │ SETOR_UNIMED        │ SETOR_UNIMED        │
└──────────────────┴─────────┴─────────────────────┴─────────────────────┘

In [180]:
db.sql(
"""     
    with ben_ori as (
        select 'beneficiario' as origem, table_name, alias as table_alias, column_name from 'output/01_beneficiario.csv' 
        union all
        select 'sam_familia_teto_pf' as origem, table_name, alias as table_alias, column_name from 'output/02_sam_familia_teto_pf.csv' 
        union all
        select 'busca_microsiga' as origem, table_name, alias as table_alias, column_name from 'output/03_1_busca_microsiga.csv'
        union all
        select 'sem_setor' as origem, table_name, alias as table_alias, column_name from 'output/03_2_sem_setor.csv'
    ),
    ben_rul as (
        select 
            SUBSTR(column_name, INSTR(column_name, '.') + 1) AS column_name,
            alias,
            rule
        from 'output/01_beneficiario_rules.csv'
        union all
        select 
            SUBSTR(column_name, INSTR(column_name, '.') + 1) AS column_name,
            alias,
            rule
        from 'output/02_sam_familia_teto_pf_rules.csv' 
        union all
        select 
            SUBSTR(column_name, INSTR(column_name, '.') + 1) AS column_name,
            alias,
            rule
        from 'output/03_1_busca_microsiga_rules.csv'
        union all
        select 
            SUBSTR(column_name, INSTR(column_name, '.') + 1) AS column_name,
            alias,
            rule        
        from 'output/03_2_sem_setor_rules.csv'
    ),
    ben_dest as (
        select * from 'output/99_bn_beneficiario.csv'
    )
    select 
        distinct 
        ben_dest.*,
        ben_ori.*,
        ben_rul.*
    from ben_dest
    left join ben_ori 
    on(ben_ori.column_name = ben_dest.column_name)
    left join ben_rul
    on(ben_dest.colum_name_renamed = ben_rul.alias)
    order by ben_ori.column_name
""").to_csv('duckdb_output.txt')

In [75]:
db.sql(
"""     
        select 
            SUBSTR(column_name, INSTR(column_name, '.') + 1) AS column_name,
            alias,
            rule
        from 'output/01_beneficiario_rules.csv'
        union all
        select 
            SUBSTR(column_name, INSTR(column_name, '.') + 1) AS column_name,
            alias,
            rule
        from 'output/02_sam_familia_teto_pf_rules.csv' 
        union all
        select 
            SUBSTR(column_name, INSTR(column_name, '.') + 1) AS column_name,
            alias,
            rule
        from 'output/03_1_busca_microsiga_rules.csv'
        union all
        select 
            SUBSTR(column_name, INSTR(column_name, '.') + 1) AS column_name,
            alias,
            rule        
        from 'output/03_2_sem_setor_rules.csv'
""")

BinderException: Binder Error: Column "column_name" referenced that exists in the SELECT clause - but this column cannot be referenced before it is defined

In [121]:
db.sql("select table from 'output/01_beneficiario.csv'")

ParserException: Parser Error: syntax error at or near "table"