In [92]:
import duckdb
import pandas as pd
import os


In [3]:
import duckdb

def get_duckdb_query_stats(conn: duckdb.DuckDBPyConnection, query: str) -> dict:
    """Executes a DuckDB query and returns its statistics."""
    try:
        conn.execute(query)
        #Get the latest query execution info.
        result = conn.execute("SELECT rows, execution_time FROM duckdb_queries ORDER BY finished DESC LIMIT 1").fetchone()
        if result:
            rows, execution_time = result
            return {"rows": rows, "execution_time": execution_time}
        else:
            return {"error": "Could not retrieve query statistics."}

    except duckdb.Error as e:
        return {"error": str(e)}

def get_duckdb_copy_stats(conn: duckdb.DuckDBPyConnection, copy_command: str) -> dict:
    """Executes a DuckDB COPY command and returns its statistics."""
    try:
        conn.execute(copy_command)
        #Get the latest copy command execution info.
        result = conn.execute("SELECT rows, bytes_written, execution_time FROM duckdb_queries ORDER BY finished DESC LIMIT 1").fetchone()
        if result:
            rows, bytes_written, execution_time = result
            return {"rows": rows, "bytes_written": bytes_written, "execution_time": execution_time}
        else:
            return {"error": "Could not retrieve copy statistics."}

    except duckdb.Error as e:
        return {"error": str(e)}


In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import os

def xml_to_dataframe(xml_file_path):
    """
    Parses an XML file and converts it into a pandas DataFrame.

    Args:
        xml_file_path (str): The path to the XML file.

    Returns:
        pandas.DataFrame: The DataFrame containing the XML data, or None if an error occurs.
    """
    try:
        tree = ET.parse(xml_file_path)
        root = tree.getroot()

        data = []
        for action in root:
            action_data = {}
            # Extract Action attributes
            for key, value in action.attrib.items():
                action_data["Action." + key.split("}")[-1]] = value

            for element in action:
                for key, column0 in element.attrib.items():
                    action_data["Customer." + key.split("}")[-1]] = value
                if len(element) == 0:  # Simple element
                    action_data[element.tag.split("}")[-1]] = element.text
                else:  # Nested element
                    for sub_element in element:
                        for key, value in sub_element.attrib.items():
                            action_data["Account." + key.split("}")[-1]] = value
                        if len(sub_element) == 0:
                            action_data[element.tag.split("}")[-1] + "." + sub_element.tag.split("}")[-1]] = sub_element.text
                        else:
                            for sub_sub_element in sub_element:
                                if len(sub_sub_element) == 0:
                                    action_data[element.tag.split("}")[-1] + "." + sub_element.tag.split("}")[-1] + "." + sub_sub_element.tag.split("}")[-1]] = sub_sub_element.text
                                else:
                                    for sub_sub_sub_element in sub_sub_element:
                                        action_data[element.tag.split("}")[-1] + "." + sub_element.tag.split("}")[-1] + "." + sub_sub_element.tag.split("}")[-1] + "." + sub_sub_sub_element.tag.split("}")[-1]] = sub_sub_sub_element.text
            data.append(action_data)

        df = pd.DataFrame(data)
        return df

    except Exception as e:
        print(f"Error processing XML file: {e}")
        return None

path_to_xml = "C:\lopu-kg-test\project\src\data\Batch1\CustomerMgmt.xml"

xml_dataframe = xml_to_dataframe(path_to_xml)



In [39]:
def rename_columns(df):
    """
    Renames DataFrame columns by extracting the last part after '.', 
    and appends a counter if duplicates are found.

    Args:
        df (pd.DataFrame): The DataFrame to rename columns.

    Returns:
        pd.DataFrame: The DataFrame with renamed columns.
    """
    new_columns = []
    seen_columns = {}  # Track seen column names and their counts

    for col in df.columns:
        parts = col.split('.')
        new_col = parts[-1]  # Extract the last part

        if new_col in seen_columns:
            seen_columns[new_col] += 1
            new_col = f"{new_col}_{seen_columns[new_col]}"  # Append a counter
        else:
            seen_columns[new_col] = 0

        new_columns.append(new_col)

    df.columns = new_columns
    return df


xml_dataframe = rename_columns(xml_dataframe)


In [85]:
def load_data_to_duckdb(db_path, src_folder):
    data_dict = {
        "Date.txt": "wh_db.DimDate",
        "Time.txt": "wh_db.DimTime",
        "StatusType.txt": "wh_db.StatusType",
        "TaxRate.txt": "wh_db.TaxRate",
        "TradeType.txt": "wh_db.TradeType",
        "HR.csv": "wh_db.DimBroker",
        "Industry.txt": "wh_db.industry"
        # "FinwireCMP.txt": "wh_db.DimCompany"
    }
    
    con = duckdb.connect(database=db_path)
    
    for file_name, table_name in data_dict.items():
        file_path = os.path.join(src_folder, file_name)
        
        if file_name == "HR.csv":
            query = f"""
                CREATE OR REPLACE TABLE {table_name} AS 
                SELECT * FROM read_csv('{file_path}', delim=',', columns={{
                    'employeeid': 'BIGINT',
                    'managerid': 'BIGINT',
                    'employeefirstname': 'STRING',
                    'employeelastname': 'STRING',
                    'employeemi': 'STRING',
                    'employeejobcode': 'STRING',
                    'employeebranch': 'STRING',
                    'employeeoffice': 'STRING',
                    'employeephone': 'STRING'
                }}, header=False) 
                WHERE employeejobcode = '314';
            """
        else:
            query = f"COPY {table_name} FROM '{file_path}' (DELIMITER '|');"
        
        con.sql(query)
        print(f"Loaded {file_name} into {table_name}")
        print(file_path)
    
    return con

# Example usage:
db_path = 'initial_db.duckdb'
src_folder = 'src/data/Batch1'
con = load_data_to_duckdb(db_path, src_folder)

# Test loading
#print(con.sql("SELECT * FROM wh_db.DimTime limit 10").fetchdf(10))


Loaded Date.txt into wh_db.DimDate
src/data/Batch1\Date.txt
Loaded Time.txt into wh_db.DimTime
src/data/Batch1\Time.txt
Loaded StatusType.txt into wh_db.StatusType
src/data/Batch1\StatusType.txt
Loaded TaxRate.txt into wh_db.TaxRate
src/data/Batch1\TaxRate.txt
Loaded TradeType.txt into wh_db.TradeType
src/data/Batch1\TradeType.txt
Loaded HR.csv into wh_db.DimBroker
src/data/Batch1\HR.csv
Loaded Industry.txt into wh_db.industry
src/data/Batch1\Industry.txt


In [6]:
con_path = r"initial_db.duckdb"
con = duckdb.connect(con_path)

In [28]:
print(con.execute("select COLUMN_COMMENT from information_schema.columns where table_name = 'DimBroker' limit 1").fetchdf())

  COLUMN_COMMENT
0           None


In [24]:
def get_table_schema(db_path, table_name):
    query = f"""
        SELECT 
            column_name AS name, 
            data_type AS type, 
            COLUMN_COMMENT AS description
        FROM information_schema.columns
        WHERE table_name = '{table_name}'
    """

    con = duckdb.connect(database=db_path)
    result = con.sql(query).fetchall()

    # Format result into the required structure
    schema_columns = [
        {"name": row[0], "type": row[1], "description": row[2] if row[2] else ""}
        for row in result
    ]

    return result

# Example usage
db_path = 'initial_db.duckdb'
table_name = 'DimBroker'
schema_columns = get_table_schema(db_path, table_name)

print(schema_columns)


[('employeeid', 'BIGINT', None), ('managerid', 'BIGINT', None), ('employeefirstname', 'VARCHAR', None), ('employeelastname', 'VARCHAR', None), ('employeemi', 'VARCHAR', None), ('employeejobcode', 'VARCHAR', None), ('employeebranch', 'VARCHAR', None), ('employeeoffice', 'VARCHAR', None), ('employeephone', 'VARCHAR', None)]


In [None]:


con.sql(""" 
        CREATE OR REPLACE TABLE wh_db_stage.CustomerMgmt  AS  
        SELECT
        try_cast(C_ID as BIGINT) customerid,
        try_cast(CA_ID as BIGINT) accountid,
        try_cast(CA_B_ID as BIGINT) brokerid,
        nullif(C_TAX_ID, '') taxid,
        nullif(CA_NAME, '') accountdesc,
        try_cast(CA_TAX_ST as TINYINT) taxstatus,
        CASE
            WHEN ActionType IN ('NEW', 'ADDACCT', 'UPDACCT', 'UPDCUST') THEN 'Active'
            WHEN ActionType IN ('CLOSEACCT', 'INACT') THEN 'Inactive'
            ELSE NULL
        END AS status,
        nullif(C_L_NAME, '') lastname,
        nullif(C_F_NAME, '') firstname,
        nullif(C_M_NAME, '') middleinitial,
        nullif(upper(C_GNDR), '') gender,
        try_cast(C_TIER as TINYINT) tier,
        try_cast(C_DOB as DATE) dob,
        nullif(C_ADLINE1, '') addressline1,
        nullif(C_ADLINE2, '') addressline2,
        nullif(C_ZIPCODE, '') postalcode,
        nullif(C_CITY, '') city,
        nullif(C_STATE_PROV, '') stateprov,
        nullif(C_CTRY, '') country,
        CASE
            WHEN nullif(C_LOCAL, '') IS NOT NULL THEN
                concat(
                    CASE WHEN nullif(C_CTRY_CODE, '') IS NOT NULL THEN '+' || C_CTRY_CODE || ' ' ELSE '' END,
                    CASE WHEN nullif(C_AREA_CODE, '') IS NOT NULL THEN '(' || C_AREA_CODE || ') ' ELSE '' END,
                    C_LOCAL,
                    COALESCE(C_EXT, '')
                )
            ELSE NULL
        END AS phone1,
        CASE
            WHEN nullif(C_LOCAL_1, '') IS NOT NULL THEN
                concat(
                    CASE WHEN nullif(C_CTRY_CODE_1, '') IS NOT NULL THEN '+' || C_CTRY_CODE_1 || ' ' ELSE '' END,
                    CASE WHEN nullif(C_AREA_CODE_1, '') IS NOT NULL THEN '(' || C_AREA_CODE_1 || ') ' ELSE '' END,
                    C_LOCAL_1,
                    COALESCE(C_EXT_1, '')
                )
            ELSE NULL
        END AS phone2,
        CASE
            WHEN nullif(C_LOCAL_2, '') IS NOT NULL THEN
                concat(
                    CASE WHEN nullif(C_CTRY_CODE_2, '') IS NOT NULL THEN '+' || C_CTRY_CODE_2 || ' ' ELSE '' END,
                    CASE WHEN nullif(C_AREA_CODE_2, '') IS NOT NULL THEN '(' || C_AREA_CODE_2 || ') ' ELSE '' END,
                    C_LOCAL_2,
                    COALESCE(C_EXT_2, '')
                )
            ELSE NULL
        END AS phone3,
        nullif(C_PRIM_EMAIL, '') email1,
        nullif(C_ALT_EMAIL, '') email2,
        nullif(C_LCL_TX_ID, '') lcl_tx_id,
        nullif(C_NAT_TX_ID, '') nat_tx_id,
        try_cast(ActionTS as TIMESTAMP) update_ts,
        ActionType
           
            FROM xml_dataframe""")

In [81]:
con.commit()

#con.close()

<duckdb.duckdb.DuckDBPyConnection at 0x1bdc3ca44b0>

In [99]:
con.sql("select * from wh_db.DimCompany")

┌──────────────┬───────────┬──────────┬──────────────────────────────────────────────────────┬───────────────────────────────┬──────────┬────────────┬─────────────┬───────────────────────┬──────────────┬────────────┬───────────────┬──────────────┬──────────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────────┬───────────┬─────────┬───────────────┬────────────┐
│ sk_companyid │ companyid │  status  │                         name                         │           industry            │ sprating │ islowgrade │     ceo     │     addressline1      │ addressline2 │ postalcode │     city      │  stateprov   │         country          │                                                                   description                                                                   │ foundingdate │ iscurrent │ batchid │ effectivedate │  enddate   │
│    int64     │   int64  

In [None]:
src_folder = 'src/data/Batch1'
file_path = os.path.join(src_folder, "FINWIRE2016Q3")

# õige
con.sql(f""" 
INSERT INTO wh_db_stage.FinWire
SELECT
    CASE
        WHEN SUBSTR(column0, 16, 3) = 'FIN' THEN
            CASE
                WHEN TRY_CAST(TRIM(SUBSTR(column0, 187, 60)) AS BIGINT) IS NOT NULL THEN 'FIN_COMPANYID'
                ELSE 'FIN_NAME'
            END
        ELSE SUBSTR(column0, 16, 3)
    END AS rectype,
    STRPTIME(SUBSTR(column0, 1, 8), '%Y%m%d') AS recdate,
    SUBSTR(column0, 19) AS value
FROM read_csv_auto('{file_path}', HEADER=FALSE, filename=false, all_varchar=true)
""")

In [None]:
# õige
con.sql("""INSERT INTO wh_db.DimCompany
WITH cmp AS (
    SELECT
        recdate,
        TRIM(SUBSTR(value, 1, 60)) AS CompanyName,
        TRIM(SUBSTR(value, 61, 10)) AS CIK,
        TRIM(SUBSTR(value, 71, 4)) AS Status,
        TRIM(SUBSTR(value, 75, 2)) AS IndustryID,
        TRIM(SUBSTR(value, 77, 4)) AS SPrating,
        TRY_CAST(TRY_CAST(SUBSTRING(value, 81, 8) AS TIMESTAMP) AS DATE) AS FoundingDate,
        TRIM(SUBSTR(value, 89, 80)) AS AddrLine1,
        TRIM(SUBSTR(value, 169, 80)) AS AddrLine2,
        TRIM(SUBSTR(value, 249, 12)) AS PostalCode,
        TRIM(SUBSTR(value, 261, 25)) AS City,
        TRIM(SUBSTR(value, 286, 20)) AS StateProvince,
        TRIM(SUBSTR(value, 306, 24)) AS Country,
        TRIM(SUBSTR(value, 330, 46)) AS CEOname,
        TRIM(SUBSTR(value, 376, 150)) AS Description
    FROM wh_db_stage.FinWire
    WHERE rectype = 'CMP'
)
SELECT
    CAST(strftime(effectivedate, '%Y%m%d') || companyid AS BIGINT) AS sk_companyid,
    companyid,
    status,
    name,
    industry,
    sprating,
    islowgrade,
    ceo,
    addressline1,
    addressline2,
    postalcode,
    city,
    stateprov,
    country,
    description,
    foundingdate,
    CASE WHEN enddate = '9999-12-31'::DATE THEN TRUE ELSE FALSE END AS iscurrent,
    batchid,
    effectivedate,
    enddate
FROM (
    SELECT
        CAST(cik AS BIGINT) AS companyid,
        CASE cmp.status
            WHEN 'ACTV' THEN 'Active'
            WHEN 'CMPT' THEN 'Completed'
            WHEN 'CNCL' THEN 'Canceled'
            WHEN 'PNDG' THEN 'Pending'
            WHEN 'SBMT' THEN 'Submitted'
            WHEN 'INAC' THEN 'Inactive'
            ELSE NULL -- or a default value, if needed
        END AS status,
        CompanyName AS name,
        ind.in_name AS industry,
        CASE
            WHEN SPrating IN ('AAA', 'AA', 'AA+', 'AA-', 'A', 'A+', 'A-', 'BBB', 'BBB+', 'BBB-', 'BB', 'BB+', 'BB-', 'B', 'B+', 'B-', 'CCC', 'CCC+', 'CCC-', 'CC', 'C', 'D') THEN SPrating
            ELSE NULL::VARCHAR
        END AS sprating,
        CASE
            WHEN SPrating IN ('AAA', 'AA', 'A', 'AA+', 'A+', 'AA-', 'A-', 'BBB', 'BBB+', 'BBB-') THEN FALSE
            WHEN SPrating IN ('BB', 'B', 'CCC', 'CC', 'C', 'D', 'BB+', 'B+', 'CCC+', 'BB-', 'B-', 'CCC-') THEN TRUE
            ELSE NULL::BOOLEAN
        END AS islowgrade,
        CEOname AS ceo,
        AddrLine1 AS addressline1,
        AddrLine2 AS addressline2,
        PostalCode AS postalcode,
        City AS city,
        StateProvince AS stateprov,
        Country AS country,
        Description AS description,
        FoundingDate AS foundingdate,
        1 AS batchid,
        recdate AS effectivedate,
        COALESCE(
            LEAD(try_cast(recdate AS DATE)) OVER (PARTITION BY cik ORDER BY recdate),
            try_cast('9999-12-31' AS DATE)
        ) AS enddate
    FROM cmp
    JOIN wh_db.industry ind ON cmp.industryid = ind.in_id
)
WHERE effectivedate < enddate;
""")

In [None]:
src_folder = 'src/data/Batch1'
file_path = os.path.join(src_folder, "Prospect.csv")
batch_number = int(''.join(filter(str.isdigit, os.path.basename(src_folder))))

con.sql(f"""
CREATE OR REPLACE TEMP TABLE temp_propect AS 
SELECT     *, 
    {batch_number} AS batchid 
    FROM read_csv_auto('{file_path}', columns={{
    "agencyid": "STRING",
    "lastname": "STRING",
    "firstname": "STRING",
    "middleinitial": "STRING",
    "gender": "STRING",
    "addressline1": "STRING",
    "addressline2": "STRING",
    "postalcode": "STRING",
    "city": "STRING",
    "state": "STRING",
    "country": "STRING",
    "phone": "STRING",
    "income": "INT",
    "numbercars": "INT",
    "numberchildren": "INT",
    "maritalstatus": "STRING",
    "age": "INT",
    "creditrating": "INT",
    "ownorrentflag": "STRING",
    "employer": "STRING",
    "numbercreditcards": "INT",
    "networth": "INT",
}}, header=False);
""")

In [145]:
con.sql("select * from temp_propect order by income DESC")


┌──────────┬────────────────────────────────┬───────────────────────────────┬───────────────┬─────────┬─────────────────────────────────────────────────────────────────────────────────┬──────────────────────────────────────────────────────────────────────────────┬──────────────┬─────────────────┬─────────────┬──────────────────────────┬────────────────┬────────┬────────────┬────────────────┬───────────────┬───────┬──────────────┬───────────────┬──────────────────────┬───────────────────┬──────────┬─────────┐
│ agencyid │            lastname            │           firstname           │ middleinitial │ gender  │                                  addressline1                                   │                                 addressline2                                 │  postalcode  │      city       │    state    │         country          │     phone      │ income │ numbercars │ numberchildren │ maritalstatus │  age  │ creditrating │ ownorrentflag │       employer       │ numbercreditc

In [140]:
con.sql("""
CREATE OR REPLACE TEMP TABLE temp_propect_marketingnameplate AS 
SELECT
    *,
    CASE 
        WHEN LENGTH(
            CONCAT(
                CASE WHEN networth > 1000000 OR income > 200000 THEN 'HighValue+' ELSE '' END,
                CASE WHEN numberchildren > 3 OR numbercreditcards > 5 THEN 'Expenses+' ELSE '' END,
                CASE WHEN age > 45 THEN 'Boomer+' ELSE '' END,
                CASE WHEN income < 50000 OR creditrating < 600 OR networth < 100000 THEN 'MoneyAlert+' ELSE '' END,
                CASE WHEN numbercars > 3 OR numbercreditcards > 7 THEN 'Spender+' ELSE '' END,
                CASE WHEN age < 25 AND networth > 1000000 THEN 'Inherited+' ELSE '' END
            )
        ) > 0 
        THEN LEFT(
            CONCAT(
                CASE WHEN networth > 1000000 OR income > 200000 THEN 'HighValue+' ELSE '' END,
                CASE WHEN numberchildren > 3 OR numbercreditcards > 5 THEN 'Expenses+' ELSE '' END,
                CASE WHEN age > 45 THEN 'Boomer+' ELSE '' END,
                CASE WHEN income < 50000 OR creditrating < 600 OR networth < 100000 THEN 'MoneyAlert+' ELSE '' END,
                CASE WHEN numbercars > 3 OR numbercreditcards > 7 THEN 'Spender+' ELSE '' END,
                CASE WHEN age < 25 AND networth > 1000000 THEN 'Inherited+' ELSE '' END
            ),
            LENGTH(
                CONCAT(
                    CASE WHEN networth > 1000000 OR income > 200000 THEN 'HighValue+' ELSE '' END,
                    CASE WHEN numberchildren > 3 OR numbercreditcards > 5 THEN 'Expenses+' ELSE '' END,
                    CASE WHEN age > 45 THEN 'Boomer+' ELSE '' END,
                    CASE WHEN income < 50000 OR creditrating < 600 OR networth < 100000 THEN 'MoneyAlert+' ELSE '' END,
                    CASE WHEN numbercars > 3 OR numbercreditcards > 7 THEN 'Spender+' ELSE '' END,
                    CASE WHEN age < 25 AND networth > 1000000 THEN 'Inherited+' ELSE '' END
                )
            ) - 1
        )
        ELSE NULL 
    END AS marketingnameplate
FROM temp_propect;

""")

In [None]:
## täpselt ei tea, mille järgi tuvastada ridu - st kas agency_id, firstname_lastname piisab
con.sql(""" 
ALTER TABLE wh_db_stage.ProspectIncremental
    ADD CONSTRAINT ProspectIncremental_pk PRIMARY KEY (agencyid, lastname, firstname);
""")

In [142]:
con.sql("""
INSERT INTO wh_db_stage.ProspectIncremental (
    agencyid, lastname, firstname, middleinitial, gender, addressline1, 
    addressline2, postalcode, city, state, country, phone, income, 
    numbercars, numberchildren, maritalstatus, age, creditrating, 
    ownorrentflag, employer, numbercreditcards, networth, 
    marketingnameplate, recordbatchid, batchid
)
SELECT
    tp.agencyid, tp.lastname, tp.firstname, tp.middleinitial, tp.gender, tp.addressline1, 
    tp.addressline2, tp.postalcode, tp.city, tp.state, tp.country, tp.phone, tp.income, 
    tp.numbercars, tp.numberchildren, tp.maritalstatus, tp.age, tp.creditrating, 
    tp.ownorrentflag, tp.employer, tp.numbercreditcards, tp.networth, 
    tp.marketingnameplate, tp.batchid, tp.batchid
FROM temp_propect_marketingnameplate AS tp
ON CONFLICT (agencyid, lastname, firstname) DO UPDATE SET
    middleinitial = excluded.middleinitial,
    gender = excluded.gender,
    addressline1 = excluded.addressline1,
    addressline2 = excluded.addressline2,
    postalcode = excluded.postalcode,
    city = excluded.city,
    state = excluded.state,
    country = excluded.country,
    phone = excluded.phone,
    income = excluded.income,
    numbercars = excluded.numbercars,
    numberchildren = excluded.numberchildren,
    maritalstatus = excluded.maritalstatus,
    age = excluded.age,
    creditrating = excluded.creditrating,
    ownorrentflag = excluded.ownorrentflag,
    employer = excluded.employer,
    numbercreditcards = excluded.numbercreditcards,
    networth = excluded.networth,
    marketingnameplate = excluded.marketingnameplate,
    recordbatchid = excluded.batchid;
""")

In [164]:
con.sql("select * from wh_db_stage.CustomerMgmt where customerid = 20")

┌────────────┬───────────┬──────────┬─────────────┬───────────────────────────────────────────────┬───────────┬──────────┬──────────┬───────────┬───────────────┬─────────┬──────┬────────────┬──────────────────┬──────────────┬────────────┬───────────────┬───────────┬──────────────────────────┬────────────────┬──────────┬───────────────────┬───────────────────────────┬───────────────────────────────┬───────────┬───────────┬─────────────────────┬────────────┐
│ customerid │ accountid │ brokerid │    taxid    │                  accountdesc                  │ taxstatus │  status  │ lastname │ firstname │ middleinitial │ gender  │ tier │    dob     │   addressline1   │ addressline2 │ postalcode │     city      │ stateprov │         country          │     phone1     │  phone2  │      phone3       │          email1           │            email2             │ lcl_tx_id │ nat_tx_id │      update_ts      │ ActionType │
│   int64    │   int64   │  int64   │   varchar   │                    varchar

In [169]:
con.sql(""" 
select distinct * from 
wh_db_stage.CustomerMgmt a
LEFT 
join wh_db.TaxRate b on
        a.nat_tx_id  = b.TX_ID
where a.customerid = 20
order by update_ts asc

""")

┌────────────┬───────────┬──────────┬─────────────┬───────────────────────────────────────────────┬───────────┬──────────┬──────────┬───────────┬───────────────┬─────────┬──────┬────────────┬──────────────────┬──────────────┬────────────┬───────────────┬───────────┬──────────────────────────┬────────────────┬──────────┬───────────────────┬───────────────────────────┬───────────────────────────────┬───────────┬───────────┬─────────────────────┬────────────┬─────────┬────────────────────────────────────────────┬─────────┐
│ customerid │ accountid │ brokerid │    taxid    │                  accountdesc                  │ taxstatus │  status  │ lastname │ firstname │ middleinitial │ gender  │ tier │    dob     │   addressline1   │ addressline2 │ postalcode │     city      │ stateprov │         country          │     phone1     │  phone2  │      phone3       │          email1           │            email2             │ lcl_tx_id │ nat_tx_id │      update_ts      │ ActionType │  tx_id  │    

In [199]:
# Cust historical pipeline
con.sql(""" 
CREATE TEMP TABLE customers AS
  SELECT
    customerid,
    taxid,
    status,
    lastname,
    firstname,
    middleinitial,
    gender,
    tier,
    dob,
    addressline1,
    addressline2,
    postalcode,
    city,
    stateprov,
    country,
    phone1,
    phone2,
    phone3,
    email1,
    email2,
    lcl_tx_id,
    nat_tx_id,
    1 batchid,
    update_ts
  FROM
    wh_db_stage.CustomerMgmt c
  WHERE
    ActionType in ('NEW', 'INACT', 'UPDCUST')
""")

In [197]:
con.sql(""" 
CREATE OR REPLACE TEMP TABLE customers_final AS
SELECT customerid, coalesce(taxid,
    last_value(taxid ORDER BY update_ts DESC) OVER w)

FROM Customers_hist
WINDOW w AS (PARTITION BY customerid ORDER BY update_ts)
ORDER BY ALL
;
""")

In [222]:
con.sql(""" 
select * from customers_final where customerid = 20
""")

┌────────────┬─────────────┬──────────┬──────────┬───────────┬───────────────┬─────────┬──────┬────────────┬──────────────────┬──────────────┬────────────┬───────────────┬───────────┬──────────────────────────┬────────────────┬──────────┬───────────────────┬───────────────────────────┬───────────────────────────────┬───────────┬───────────┬─────────┬───────────┬───────────────┬────────────┐
│ customerid │    taxid    │  status  │ lastname │ firstname │ middleinitial │ gender  │ tier │    dob     │   addressline1   │ addressline2 │ postalcode │     CITY      │ stateprov │         country          │     phone1     │  phone2  │      phone3       │          email1           │            email2             │ LCL_TX_ID │ NAT_TX_ID │ batchid │ iscurrent │ effectivedate │  enddate   │
│   int64    │   varchar   │ varchar  │ varchar  │  varchar  │    varchar    │ varchar │ int8 │    date    │     varchar      │   varchar    │  varchar   │    varchar    │  varchar  │         varchar          │  

In [221]:
con.sql("""CREATE OR REPLACE TEMP TABLE customers_final AS
SELECT
    customerid,
    COALESCE(taxid, last_value(taxid ORDER BY update_ts DESC) OVER w) AS taxid,
    status,
    COALESCE(lastname, last_value(lastname ORDER BY update_ts DESC) OVER w) AS lastname,
    COALESCE(firstname, last_value(firstname ORDER BY update_ts DESC) OVER w) AS firstname,
    COALESCE(middleinitial, last_value(middleinitial ORDER BY update_ts DESC) OVER w) AS middleinitial,
    COALESCE(gender, last_value(gender ORDER BY update_ts DESC) OVER w) AS gender,
    COALESCE(tier, last_value(tier ORDER BY update_ts DESC) OVER w) AS tier,
    COALESCE(dob, last_value(dob ORDER BY update_ts DESC) OVER w) AS dob,
    COALESCE(addressline1, last_value(addressline1 ORDER BY update_ts DESC) OVER w) AS addressline1,
    COALESCE(addressline2, last_value(addressline2 ORDER BY update_ts DESC) OVER w) AS addressline2,
    COALESCE(postalcode, last_value(postalcode ORDER BY update_ts DESC) OVER w) AS postalcode,
    COALESCE(CITY, last_value(CITY ORDER BY update_ts DESC) OVER w) AS CITY,
    COALESCE(stateprov, last_value(stateprov ORDER BY update_ts DESC) OVER w) AS stateprov,
    COALESCE(country, last_value(country ORDER BY update_ts DESC) OVER w) AS country,
    COALESCE(phone1, last_value(phone1 ORDER BY update_ts DESC) OVER w) AS phone1,
    COALESCE(phone2, last_value(phone2 ORDER BY update_ts DESC) OVER w) AS phone2,
    COALESCE(phone3, last_value(phone3 ORDER BY update_ts DESC) OVER w) AS phone3,
    COALESCE(email1, last_value(email1 ORDER BY update_ts DESC) OVER w) AS email1,
    COALESCE(email2, last_value(email2 ORDER BY update_ts DESC) OVER w) AS email2,
    COALESCE(LCL_TX_ID, last_value(LCL_TX_ID ORDER BY update_ts DESC) OVER w) AS LCL_TX_ID,
    COALESCE(NAT_TX_ID, last_value(NAT_TX_ID ORDER BY update_ts DESC) OVER w) AS NAT_TX_ID,
    batchid,
    CASE 
        WHEN NULLIF(lead(update_ts) OVER w, NULL) IS NULL THEN 'Y' 
        ELSE 'N' 
    END AS iscurrent,
    update_ts::DATE AS effectivedate,
    COALESCE(lead(update_ts::DATE) OVER w, '9999-12-31'::DATE) AS enddate
FROM
    customers
WINDOW w AS (PARTITION BY customerid ORDER BY update_ts);""")


In [None]:
# pooleli
con.sql("""select
SELECT 
  bigint(concat(date_format(c.effectivedate, 'yyyyMMdd'), customerid)) sk_customerid,
  c.customerid,
  c.taxid,
  c.status,
  c.lastname,
  c.firstname,
  c.middleinitial,
  if(c.gender IN ('M', 'F'), c.gender, 'U') gender,
  c.tier,
  c.dob,
  c.addressline1,
  c.addressline2,
  c.postalcode,
  c.city,
  c.stateprov,
  c.country,
  c.phone1,
  c.phone2,
  c.phone3,
  c.email1,
  c.email2,
  r_nat.TX_NAME as nationaltaxratedesc,
  r_nat.TX_RATE as nationaltaxrate,
  r_lcl.TX_NAME as localtaxratedesc,
  r_lcl.TX_RATE as localtaxrate,
  p.agencyid,
  p.creditrating,
  p.networth,
  p.marketingnameplate,
  c.iscurrent,
  c.batchid,
  c.effectivedate,
  c.enddate        

FROM customers_final c
JOIN wh_db.TaxRate r_lcl 
  ON c.lcl_tx_id = r_lcl.TX_ID
JOIN wh_db.TaxRate r_nat 
  ON c.nat_tx_id = r_nat.TX_ID
LEFT JOIN wh_db_stage.ProspectIncremental p 
  on 
    upper(p.lastname) = upper(c.lastname)
    and upper(p.firstname) = upper(c.firstname)
    and upper(p.addressline1) = upper(c.addressline1)
    and upper(nullif(p.addressline2, '')) = upper(nullif(c.addressline2, ''))
    and upper(p.postalcode) = upper(c.postalcode)
WHERE c.effectivedate < c.enddate

 """)

┌────────────┬─────────────┬──────────┬─────────────────┬───────────┬───────────────┬─────────┬──────┬────────────┬────────────────────────────┬──────────────┬────────────┬──────────────────┬──────────────┬──────────────────────────┬─────────────────────┬───────────────────┬─────────────────────┬────────────────────────────────────┬───────────────────────────────┬───────────┬───────────┬─────────┬───────────┬───────────────┬────────────┬─────────┬─────────────────────────────────────────────────┬─────────┬─────────┬───────────────────────────────────────────────────┬─────────┬──────────┬──────────────┬───────────┬───────────────┬─────────┬────────────────────────────┬──────────────┬────────────┬──────────────┬──────────────┬──────────────────────────┬────────────────┬─────────┬────────────┬────────────────┬───────────────┬───────┬──────────────┬───────────────┬────────────────────────────┬───────────────────┬──────────┬──────────────────────────────────────┬───────────────┬─────────┐
│

In [None]:
# Customer hakkab alles batch2-st?


src_folder = 'src/data/Batch2'
file_path = os.path.join(src_folder, "Customer.txt")
batch_number = int(''.join(filter(str.isdigit, os.path.basename(src_folder))))

columns = {
    "cdc_flag": "VARCHAR",
    "cdc_dsn": "BIGINT",
    "customerid": "BIGINT",
    "taxid": "VARCHAR",
    "status": "VARCHAR",
    "lastname": "VARCHAR",
    "firstname": "VARCHAR",
    "middleinitial": "VARCHAR",
    "gender": "VARCHAR",
    "tier": "TINYINT",
    "dob": "DATE",
    "addressline1": "VARCHAR",
    "addressline2": "VARCHAR",
    "postalcode": "VARCHAR",
    "city": "VARCHAR",
    "stateprov": "VARCHAR",
    "country": "VARCHAR",
    "c_ctry_1": "VARCHAR",
    "c_area_1": "VARCHAR",
    "c_local_1": "VARCHAR",
    "c_ext_1": "VARCHAR",
    "c_ctry_2": "VARCHAR",
    "c_area_2": "VARCHAR",
    "c_local_2": "VARCHAR",
    "c_ext_2": "VARCHAR",
    "c_ctry_3": "VARCHAR",
    "c_area_3": "VARCHAR",
    "c_local_3": "VARCHAR",
    "c_ext_3": "VARCHAR",
    "email1": "VARCHAR",
    "email2": "VARCHAR",
    "lcl_tx_id": "VARCHAR",
    "nat_tx_id": "VARCHAR"
}

df = con.sql(f"SELECT * FROM read_csv('{file_path}', columns = $columns)", params={"columns": columns}).df()

print(df.dtypes)
print(df.head())

In [153]:
# peale batch ingestionit täita ka wh_db.BatchDate

src_folder = 'src/data/Batch1'
file_path = os.path.join(src_folder, "BatchDate.txt")
batch_number = int(''.join(filter(str.isdigit, os.path.basename(src_folder))))

con.sql(f"""
INSERT INTO wh_db.BatchDate
SELECT batchdate::DATE,
    {batch_number} AS batchid 
    FROM read_csv_auto('{file_path}', columns={{
    "batchdate": "DATE"
}}, header=False);
""")

In [None]:

con.sql("""

""")

ParserException: Parser Error: syntax error at or near "MERGE"

In [124]:
con = duckdb.connect(database=db_path)
#con.sql("select * from sqlite_temp_schema ")
con.sql("select column_name from information_schema.columns where table_name = 'ProspectIncremental'").df()

Unnamed: 0,column_name
0,agencyid
1,lastname
2,firstname
3,middleinitial
4,gender
5,addressline1
6,addressline2
7,postalcode
8,city
9,state


In [30]:
print(con.execute("select COLUMN_COMMENT from information_schema.columns").fetchdf())

                              COLUMN_COMMENT
0                                 Batch date
1     Batch ID when this record was inserted
2                Surrogate key for AccountID
3                Customer account identifier
4           Surrogate key of managing broker
..                                       ...
250                             Credit cards
251                Estimated total net worth
252                      Marketing nameplate
253  Batch ID when this record last inserted
254                                     None

[255 rows x 1 columns]


1. Information schemast tabelid ja skeemad sisse
2. Data pipelineid ja nende logimine, et pärast võrrelda LLM-i tulemusi päris tulemustega.
3. 

Küsimused:
1. Kas a la risk metric summa on alati numbriline väärtus?
2. Meil oleks vaja luua kontroll, et statustype ei tohi olla D, milliseid samme peaksime muutma?