In [92]:
import duckdb
import pandas as pd
import os


In [93]:
import xml.etree.ElementTree as ET
import pandas as pd
import os

def xml_to_dataframe(xml_file_path):
    """
    Flatten an XML file into a pandas DataFrame, one row per child of the root.

    Column naming (a leading ``{namespace}`` part is stripped from every name):
      * attributes of a root child          -> ``Action.<attr>``
      * attributes of its children          -> ``Customer.<attr>``
      * attributes of its grandchildren     -> ``Account.<attr>``
      * text of elements without children   -> the tag names joined with ``.``,
        starting at the level below the root child.
    Elements four levels below the root child are always stored as text,
    whether or not they have children of their own.

    Args:
        xml_file_path (str): The path to the XML file.

    Returns:
        pandas.DataFrame: The flattened data, or None if any error occurs
        (the error is printed instead of being raised).
    """

    def local_name(qualified):
        # "{namespace}name" -> "name"; names without a namespace pass through.
        return qualified.split("}")[-1]

    try:
        document_root = ET.parse(xml_file_path).getroot()

        records = []
        for action_node in document_root:
            record = {}
            record.update(
                ("Action." + local_name(attr), attr_value)
                for attr, attr_value in action_node.attrib.items()
            )

            for child in action_node:
                child_path = local_name(child.tag)
                record.update(
                    ("Customer." + local_name(attr), attr_value)
                    for attr, attr_value in child.attrib.items()
                )
                if len(child) == 0:
                    # Childless element: store its text under the bare tag name.
                    record[child_path] = child.text
                    continue

                for grandchild in child:
                    grandchild_path = child_path + "." + local_name(grandchild.tag)
                    record.update(
                        ("Account." + local_name(attr), attr_value)
                        for attr, attr_value in grandchild.attrib.items()
                    )
                    if len(grandchild) == 0:
                        record[grandchild_path] = grandchild.text
                        continue

                    for great_grandchild in grandchild:
                        deep_path = grandchild_path + "." + local_name(great_grandchild.tag)
                        if len(great_grandchild) == 0:
                            record[deep_path] = great_grandchild.text
                            continue

                        # Deepest level handled: text is stored unconditionally.
                        for innermost in great_grandchild:
                            record[deep_path + "." + local_name(innermost.tag)] = innermost.text

            records.append(record)

        return pd.DataFrame(records)

    except Exception as e:
        print(f"Error processing XML file: {e}")
        return None

# Raw string: the Windows path is full of backslashes, and sequences such as
# "\l", "\p" or "\d" are invalid escape sequences that newer Python versions
# warn about. The resulting path value is unchanged.
path_to_xml = r"C:\lopu-kg-test\project\src\data\Batch1\CustomerMgmt.xml"

# xml_to_dataframe prints the error and returns None when parsing fails.
xml_dataframe = xml_to_dataframe(path_to_xml)



In [None]:
def rename_columns(df):
    """
    Shorten dotted column names to their last segment, keeping names unique.

    Each column label is reduced to the part after its last '.'. The first
    column that yields a given name keeps it; later ones get ``_1``, ``_2``, ...
    appended. If a generated name is itself already taken (for example an
    existing column called ``x_1``), the counter keeps advancing until an
    unused name is found, so the result never contains duplicate labels.

    The DataFrame is modified in place and also returned.

    Args:
        df (pd.DataFrame): The DataFrame to rename columns. Column labels are
            expected to be strings.

    Returns:
        pd.DataFrame: The same DataFrame object, with renamed columns.
    """
    new_columns = []
    used_names = set()   # every name already handed out
    suffix_counts = {}   # last counter tried per base name

    for col in df.columns:
        base_name = col.rsplit('.', 1)[-1]  # last dotted segment
        candidate = base_name

        # Advance the counter until the name does not clash with any name
        # emitted so far, whether original or generated.
        while candidate in used_names:
            suffix_counts[base_name] = suffix_counts.get(base_name, 0) + 1
            candidate = f"{base_name}_{suffix_counts[base_name]}"

        used_names.add(candidate)
        new_columns.append(candidate)

    df.columns = new_columns
    return df


# Reduce the dotted XML paths to their last segment so that later cells can
# refer to short column names. This rebinds xml_dataframe, so the cell relies
# on the parsing cell above having been run first.
xml_dataframe = rename_columns(xml_dataframe)


In [95]:
def load_data_to_duckdb(db_path, src_folder):
    """
    Load the reference files from ``src_folder`` into the ``wh_db`` tables.

    The pipe-delimited ``.txt`` files are loaded with ``COPY ... FROM`` into
    their target tables (which presumably must already exist in the database;
    confirm against the schema setup). ``HR.csv`` is read with an explicit
    column list and ``wh_db.DimBroker`` is created or replaced from the rows
    whose ``employeejobcode`` is '314'.

    Args:
        db_path (str): Path of the DuckDB database file to connect to.
        src_folder (str): Folder that contains the source files.

    Returns:
        The open DuckDB connection; the caller is responsible for closing it.

    Raises:
        Exception: Any error raised while loading is re-raised after the
            connection has been closed, so a failed load does not leak it.
    """
    data_dict = {
        "Date.txt": "wh_db.DimDate",
        "Time.txt": "wh_db.DimTime",
        "StatusType.txt": "wh_db.StatusType",
        "TaxRate.txt": "wh_db.TaxRate",
        "TradeType.txt": "wh_db.TradeType",
        "HR.csv": "wh_db.DimBroker"
        # "FinwireCMP.txt": "wh_db.DimCompany"
    }

    con = duckdb.connect(database=db_path)

    try:
        for file_name, table_name in data_dict.items():
            # Double single quotes so the path stays a single SQL string
            # literal even if a folder name contains an apostrophe.
            file_path = os.path.join(src_folder, file_name).replace("'", "''")

            if file_name == "HR.csv":
                query = f"""
                CREATE OR REPLACE TABLE {table_name} AS 
                SELECT * FROM read_csv('{file_path}', delim=',', columns={{
                    'employeeid': 'BIGINT',
                    'managerid': 'BIGINT',
                    'employeefirstname': 'STRING',
                    'employeelastname': 'STRING',
                    'employeemi': 'STRING',
                    'employeejobcode': 'STRING',
                    'employeebranch': 'STRING',
                    'employeeoffice': 'STRING',
                    'employeephone': 'STRING'
                }}, header=False) 
                WHERE employeejobcode = '314';
            """
            else:
                query = f"COPY {table_name} FROM '{file_path}' (DELIMITER '|');"

            con.sql(query)
            print(f"Loaded {file_name} into {table_name}")
    except Exception:
        # The connection is only handed to the caller on success; close it
        # here so a failed load does not keep the database file open.
        con.close()
        raise

    return con

# Example usage:
db_path = 'initial_db.duckdb'
src_folder = 'src/data/Batch1'
con = load_data_to_duckdb(db_path, src_folder)

# Test loading
print(con.sql("SELECT * FROM wh_db.DimTime limit 10").fetchdf())


Loaded Date.txt into wh_db.DimDate
Loaded Time.txt into wh_db.DimTime
Loaded StatusType.txt into wh_db.StatusType
Loaded TaxRate.txt into wh_db.TaxRate
Loaded TradeType.txt into wh_db.TradeType
Loaded HR.csv into wh_db.DimBroker
   sk_timeid timevalue  hourid hourdesc  minuteid minutedesc  secondid  \
0          1  00:00:01       0       00         0      00:00         1   
1          2  00:00:02       0       00         0      00:00         2   
2          3  00:00:03       0       00         0      00:00         3   
3          4  00:00:04       0       00         0      00:00         4   
4          5  00:00:05       0       00         0      00:00         5   
5          6  00:00:06       0       00         0      00:00         6   
6          7  00:00:07       0       00         0      00:00         7   
7          8  00:00:08       0       00         0      00:00         8   
8          9  00:00:09       0       00         0      00:00         9   
9         10  00:00:10       0 

In [96]:
# TODO: the insert into the *_stages table(s) is still missing; the query
# result is only converted to a DataFrame and displayed, not stored.
#
# Staging query over the flattened customer data:
#   - casts the ids, tax status, tier, date of birth and action timestamp with
#     try_cast,
#   - turns empty strings into NULL with nullif,
#   - derives status ('Active' / 'Inactive' / NULL) from ActionType,
#   - builds phone1..phone3 from country code, area code, local number and
#     extension; a phone is NULL when its local number is empty or NULL.
# NOTE(review): this runs through the module-level duckdb.sql(), not through
# the `con` connection opened earlier. The `xml_dataframe` name in the FROM
# clause refers to the pandas DataFrame built above (presumably resolved by
# DuckDB's replacement scan of Python variables; confirm).

duckdb.sql(""" SELECT
        try_cast(C_ID as BIGINT) customerid,
        try_cast(CA_ID as BIGINT) accountid,
        try_cast(CA_B_ID as BIGINT) brokerid,
        nullif(C_TAX_ID, '') taxid,
        nullif(CA_NAME, '') accountdesc,
        try_cast(CA_TAX_ST as TINYINT) taxstatus,
        CASE
            WHEN ActionType IN ('NEW', 'ADDACCT', 'UPDACCT', 'UPDCUST') THEN 'Active'
            WHEN ActionType IN ('CLOSEACCT', 'INACT') THEN 'Inactive'
            ELSE NULL
        END AS status,
        nullif(C_L_NAME, '') lastname,
        nullif(C_F_NAME, '') firstname,
        nullif(C_M_NAME, '') middleinitial,
        nullif(upper(C_GNDR), '') gender,
        try_cast(C_TIER as TINYINT) tier,
        try_cast(C_DOB as DATE) dob,
        nullif(C_ADLINE1, '') addressline1,
        nullif(C_ADLINE2, '') addressline2,
        nullif(C_ZIPCODE, '') postalcode,
        nullif(C_CITY, '') city,
        nullif(C_STATE_PROV, '') stateprov,
        nullif(C_CTRY, '') country,
        CASE
            WHEN nullif(C_LOCAL, '') IS NOT NULL THEN
                concat(
                    CASE WHEN nullif(C_CTRY_CODE, '') IS NOT NULL THEN '+' || C_CTRY_CODE || ' ' ELSE '' END,
                    CASE WHEN nullif(C_AREA_CODE, '') IS NOT NULL THEN '(' || C_AREA_CODE || ') ' ELSE '' END,
                    C_LOCAL,
                    COALESCE(C_EXT, '')
                )
            ELSE NULL
        END AS phone1,
        CASE
            WHEN nullif(C_LOCAL_1, '') IS NOT NULL THEN
                concat(
                    CASE WHEN nullif(C_CTRY_CODE_1, '') IS NOT NULL THEN '+' || C_CTRY_CODE_1 || ' ' ELSE '' END,
                    CASE WHEN nullif(C_AREA_CODE_1, '') IS NOT NULL THEN '(' || C_AREA_CODE_1 || ') ' ELSE '' END,
                    C_LOCAL_1,
                    COALESCE(C_EXT_1, '')
                )
            ELSE NULL
        END AS phone2,
        CASE
            WHEN nullif(C_LOCAL_2, '') IS NOT NULL THEN
                concat(
                    CASE WHEN nullif(C_CTRY_CODE_2, '') IS NOT NULL THEN '+' || C_CTRY_CODE_2 || ' ' ELSE '' END,
                    CASE WHEN nullif(C_AREA_CODE_2, '') IS NOT NULL THEN '(' || C_AREA_CODE_2 || ') ' ELSE '' END,
                    C_LOCAL_2,
                    COALESCE(C_EXT_2, '')
                )
            ELSE NULL
        END AS phone3,
        nullif(C_PRIM_EMAIL, '') email1,
        nullif(C_ALT_EMAIL, '') email2,
        nullif(C_LCL_TX_ID, '') lcl_tx_id,
        nullif(C_NAT_TX_ID, '') nat_tx_id,
        try_cast(ActionTS as TIMESTAMP) update_ts,
        ActionType
           
            FROM xml_dataframe""").df()

Unnamed: 0,customerid,accountid,brokerid,taxid,accountdesc,taxstatus,status,lastname,firstname,middleinitial,...,country,phone1,phone2,phone3,email1,email2,lcl_tx_id,nat_tx_id,update_ts,ActionType
0,0,0,15746,923-54-6498,CJlmMuFyibKOmKLHIaTeWugvCgZdmcfpDsYb,1,Active,Joannis,Adara,,...,Canada,+1 (872) 523-8928,492-3961,,Adara.Joannis@moose-mail.com,Adara.Joannis@gmx.com,CA3,YT3,2007-07-07 04:28:56,NEW
1,1,1,15467,645-68-9627,BbxTgVGOlgyrYtVRjsXDJKmKDUp s ApIzUvH nFk,2,Active,Paperno,Jirina,P,...,United States of America,767-4707,,,Jirina.P.Paperno@ip6.li,Jirina.P.Paperno@devils.com,BC6,NU7,2007-07-07 04:47:03,NEW
2,2,2,13243,332-28-3838,IGzIDNTTRUDKwGaoVczrTMFJYIxSzLxJVLulovjqdqH j,1,Active,McBryan,Mariam,,...,United States of America,(420) 757-364261998,811-7498,,Mariam.McBryan@aggies.com,Mariam.McBryan@shtrudel.tv,NY4,IA1,2007-07-07 06:17:28,NEW
3,3,3,9688,472-49-1339,ZHXwHtCcLZqdWhWOPSWWSsvHxqBuIbXSTRTIIGzFENztrH,1,Active,Adey,Robinia,L,...,United States of America,+1 (819) 163-0774,(777) 787-1085,,Robinia.L.Adey@12Freeukisp.co.uk,Robinia.L.Adey@icqmail.com,WI4,MO1,2007-07-07 07:57:28,NEW
4,4,4,11326,700-39-4024,mzlYZlTIDmOGuKQHOEFTaOGNiAQcjtE,1,Active,Haubert,Lulu,,...,United States of America,734-4072,713-2893,,Lulu.Haubert@buffaloes.com,,ON4,MB7,2007-07-07 09:38:29,NEW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24565,1939,3707,,,,,Inactive,,,,...,,,,,,,,,2017-07-07 07:33:34,CLOSEACCT
24566,7407,15107,5274,,xVYNQSzhFPHOPcFMAIGKMDoUKKLrZRRJRK,1,Active,,,,...,,,,,,,,,2017-07-07 11:25:40,ADDACCT
24567,4294,15108,4258,,IiqVbPKUxQtzVxVRdTZtRrMQqtAuzJrXolt,1,Active,,,,...,,,,,,,,,2017-07-07 17:40:37,ADDACCT
24568,5790,11196,3559,,tdmUhxMhzRgOSBLCKYrlkBLbSma DQXAG,,Active,,,,...,,,,,,,,,2017-07-07 20:40:04,UPDACCT


In [97]:
# Commit and then close the connection that load_data_to_duckdb returned.
con.commit()

con.close()

In [101]:
# Reconnect to the same database file and list the tables it contains.
con = duckdb.connect(database=db_path)
#con.sql("select * from sqlite_temp_schema ")
# Last expression of the cell, so the notebook renders the resulting relation.
con.sql("select * from information_schema.tables ")

┌───────────────┬──────────────┬─────────────────────┬────────────┬──────────────────────────────┬──────────────────────┬───────────────────────────┬──────────────────────────┬────────────────────────┬────────────────────┬──────────┬───────────────┬───────────────┐
│ table_catalog │ table_schema │     table_name      │ table_type │ self_referencing_column_name │ reference_generation │ user_defined_type_catalog │ user_defined_type_schema │ user_defined_type_name │ is_insertable_into │ is_typed │ commit_action │ TABLE_COMMENT │
│    varchar    │   varchar    │       varchar       │  varchar   │           varchar            │       varchar        │          varchar          │         varchar          │        varchar         │      varchar       │ varchar  │    varchar    │    varchar    │
├───────────────┼──────────────┼─────────────────────┼────────────┼──────────────────────────────┼──────────────────────┼───────────────────────────┼──────────────────────────┼────────────────────────┼─

1. Information schemast tabelid ja skeemad sisse
2. Data pipelineid ja nende logimine, et pärast võrrelda LLM-i tulemusi päris tulemustega.
3. 

Küsimused:
1. Kas a la risk metric summa on alati numbriline väärtus?
2. Meil oleks vaja luua kontroll, et statustype ei tohi olla D. Milliseid samme peaksime muutma?