In [16]:

from pyspark.sql import SparkSession
import sqlite3
import pandas as pd

def incremental_load(
    db_path='../../../Customers_ELT.db',
    customers_csv="../../../Customer.csv",
    invoices_csv="../../../Invoice.csv",
    invoice_lines_csv="../../../InvoiceLine.csv",
):
    """Merge CSV rows changed since the last run into the aggregate table.

    The watermark is ``MAX(updated_at)`` of
    ``customer_loyalty_and_invoice_size_ELT`` (or 1900-01-01 when the table
    is empty). Rows of the three CSVs whose ``UpdatedAt`` is later than the
    watermark are staged in temporary tables. For every staged customer the
    existing aggregate row is replaced by a row that adds the staged invoice
    count to the previous ``loyalty_score`` and folds the staged spend into
    ``avg_invoice_size``.

    Behaviour worth knowing:

    * The target table is created when it does not exist yet, so the very
      first run no longer fails with "no such table".
    * A staged customer without staged invoices keeps the previous
      ``avg_invoice_size`` instead of having it overwritten with NULL.
    * The delete and the re-insert run in one transaction; if the insert
      fails, the deleted rows are restored by the rollback.

    Args:
        db_path: Path of the SQLite database file.
        customers_csv: Path of the customer CSV (needs ``CustomerId``,
            ``FirstName``, ``LastName`` and ``UpdatedAt`` columns).
        invoices_csv: Path of the invoice CSV (needs ``InvoiceId``,
            ``CustomerId`` and ``UpdatedAt`` columns).
        invoice_lines_csv: Path of the invoice line CSV (needs ``InvoiceId``,
            ``UnitPrice``, ``Quantity`` and ``UpdatedAt`` columns).

    Raises:
        sqlite3.Error: If a statement fails; the merge transaction is rolled
            back before the error propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        # EXTRACT (Loading CSVs from S3 or local storage)
        customers = pd.read_csv(customers_csv)
        invoices = pd.read_csv(invoices_csv)
        invoice_lines = pd.read_csv(invoice_lines_csv)

        # Bootstrap the target table so every later statement can rely on it.
        conn.execute("""
            CREATE TABLE IF NOT EXISTS customer_loyalty_and_invoice_size_ELT (
                CustomerId INTEGER,
                FirstName TEXT,
                LastName TEXT,
                loyalty_score INTEGER,
                avg_invoice_size REAL,
                created_at TEXT,
                updated_at TEXT,
                updated_by TEXT
            )
        """)
        conn.commit()

        # Watermark: the newest updated_at already stored in the target table.
        last_updated = conn.execute(
            "SELECT MAX(updated_at) FROM customer_loyalty_and_invoice_size_ELT"
        ).fetchone()[0]
        if last_updated is None:
            last_updated = '1900-01-01 00:00:00'  # Default to a very old timestamp

        last_updated = pd.to_datetime(last_updated)
        print(f"last_updated: {last_updated}")

        # Convert 'UpdatedAt' to datetime format, assuming format is MM/DD/YYYY.
        # NOTE(review): with errors='coerce' every value that does not match
        # this format becomes NaT and is silently excluded by the filter
        # below - confirm the CSVs really use month-first dates.
        customers['UpdatedAt'] = pd.to_datetime(customers['UpdatedAt'], format='%m/%d/%Y', errors='coerce')
        invoices['UpdatedAt'] = pd.to_datetime(invoices['UpdatedAt'], format='%m/%d/%Y', errors='coerce')
        invoice_lines['UpdatedAt'] = pd.to_datetime(invoice_lines['UpdatedAt'], format='%m/%d/%Y', errors='coerce')

        # Filter rows that have been created or updated after the last load
        customers_filtered = customers[customers['UpdatedAt'] > last_updated]
        invoices_filtered = invoices[invoices['UpdatedAt'] > last_updated]
        invoice_lines_filtered = invoice_lines[invoice_lines['UpdatedAt'] > last_updated]

        # Print filtered data to verify
        print("Filtered Customers:")
        print(customers_filtered)
        print("Filtered Invoices:")
        print(invoices_filtered)
        print("Filtered Invoice Lines:")
        print(invoice_lines_filtered)

        # Stage the filtered rows in temporary tables
        customers_filtered.to_sql('Customers_temp', conn, if_exists='replace', index=False)
        invoices_filtered.to_sql('Invoices_temp', conn, if_exists='replace', index=False)
        invoice_lines_filtered.to_sql('Invoices_Line_temp', conn, if_exists='replace', index=False)

        # Save historical data for customers to be updated (loyalty_score, avg_invoice_size, created_at)
        conn.execute("""DROP TABLE IF EXISTS customers_old_data_temp""")
        conn.execute("""
            CREATE TABLE customers_old_data_temp AS
            SELECT CustomerId, loyalty_score, avg_invoice_size, created_at
            FROM customer_loyalty_and_invoice_size_ELT
            WHERE CustomerId IN (SELECT CustomerId FROM Customers_temp)
        """)
        conn.commit()

        # TRANSFORM (Perform transformations and combine with old data)
        # NOTE(review): only customers present in Customers_temp are rebuilt;
        # staged invoices of customers whose own CSV row did not change are
        # not applied - confirm this is intended.
        transform_query = """
            INSERT INTO customer_loyalty_and_invoice_size_ELT 
            (CustomerId, FirstName, LastName, loyalty_score, avg_invoice_size, created_at, updated_at, updated_by)
            SELECT 
                C.CustomerId,
                C.FirstName,
                C.LastName,
                COALESCE(O.loyalty_score, 0) + COUNT(I.InvoiceId) AS loyalty_score,  -- Combine old and new loyalty score
                (
                    (COALESCE(O.loyalty_score, 0) * COALESCE(O.avg_invoice_size, 0))
                    + COALESCE(SUM(IL.total_spend), 0)  -- SUM is NULL when nothing was staged
                ) / NULLIF(COALESCE(O.loyalty_score, 0) + COUNT(I.InvoiceId), 0) AS avg_invoice_size,  -- Combine old and new avg invoice size
                COALESCE(O.created_at, CURRENT_TIMESTAMP) AS created_at,  -- Retain old created_at or set to current
                CURRENT_TIMESTAMP AS updated_at,
                'process:SL' AS updated_by
            FROM 
                Customers_temp C
            LEFT JOIN Invoices_temp I ON C.CustomerId = I.CustomerId
            LEFT JOIN (
                SELECT InvoiceId, SUM(UnitPrice * Quantity) AS total_spend
                FROM Invoices_Line_temp
                GROUP BY InvoiceId
            ) IL ON I.InvoiceId = IL.InvoiceId
            LEFT JOIN customers_old_data_temp O ON C.CustomerId = O.CustomerId
            GROUP BY 
                C.CustomerId, C.FirstName, C.LastName;
        """

        # Delete and re-insert atomically: the connection context manager
        # commits on success and rolls back if either statement raises.
        with conn:
            conn.execute("""
                DELETE FROM customer_loyalty_and_invoice_size_ELT
                WHERE CustomerId IN (SELECT CustomerId FROM Customers_temp)
            """)
            conn.execute(transform_query)

    finally:
        try:
            # Clean up temporary tables
            conn.execute("DROP TABLE IF EXISTS Customers_temp")
            conn.execute("DROP TABLE IF EXISTS Invoices_temp")
            conn.execute("DROP TABLE IF EXISTS Invoices_Line_temp")
            conn.execute("DROP TABLE IF EXISTS customers_old_data_temp")
            conn.commit()
        finally:
            # Close the SQLite connection even if the cleanup itself fails
            conn.close()


# Execute the incremental load defined in this cell.
incremental_load()

last_updated: 2024-09-14 21:29:40
Filtered Customers:
Empty DataFrame
Columns: [CustomerId, FirstName, LastName, Company, Address, City, State, Country, PostalCode, Phone, Fax, Email, SupportRepId, UpdatedAt]
Index: []
Filtered Invoices:
     InvoiceId  CustomerId       InvoiceDate  \
225        226          40  21/09/2023 00:00   
226        227          44  22/09/2023 00:00   
227        228          50  25/09/2023 00:00   
256        257          34  01/02/2024 00:00   
257        258          48  09/02/2024 00:00   
258        259          49  22/02/2024 00:00   
285        286          23  12/06/2024 00:00   
286        287          24  25/06/2024 00:00   
287        288          26  25/06/2024 00:00   
316        317           3  28/10/2024 00:00   
317        318           7  29/10/2024 00:00   
318        319          13  01/11/2024 00:00   
346        347          47  05/03/2025 00:00   
347        348          56  10/03/2025 00:00   
348        349          11  18/03/2025 00:

In [39]:
# Dump every row of the aggregate table to inspect the result of a load.
conn = sqlite3.connect('../../../Customers_ELT.db')
try:
    cursor = conn.cursor()

    # No CustomerId filter is applied: the query returns the whole table.
    check_query = "SELECT * FROM customer_loyalty_and_invoice_size_ELT"
    cursor.execute(check_query)

    # Fetch all rows from the result set
    results = cursor.fetchall()

    # Print each row
    for row in results:
        print(row)
finally:
    # Release the database handle even when the query fails
    # (for example because the table does not exist yet).
    conn.close()

(1, 'Luís', 'Gonçalves', 10, 5.660000000000001, '2024-09-14 22:16:05', '2024-09-14 22:16:05', 'process:SL')
(2, 'Leonie', 'Köhler', 7, 5.3742857142857146, '2024-09-14 22:16:05', '2024-09-14 22:16:05', 'process:SL')
(3, 'François', 'Tremblay', 7, 5.659999999999999, '2024-09-14 22:16:05', '2024-09-14 22:16:05', 'process:SL')
(4, 'Bjørn', 'Hansen', 7, 5.659999999999999, '2024-09-14 22:16:05', '2024-09-14 22:16:05', 'process:SL')
(5, 'František', 'Wichterlová', 7, 5.802857142857143, '2024-09-14 22:16:05', '2024-09-14 22:16:05', 'process:SL')
(6, 'Helena', 'Holý', 7, 7.088571428571427, '2024-09-14 22:16:05', '2024-09-14 22:16:05', 'process:SL')
(7, 'Astrid', 'Gruber', 7, 6.088571428571428, '2024-09-14 22:16:05', '2024-09-14 22:16:05', 'process:SL')
(8, 'Daan', 'Peeters', 7, 5.3742857142857146, '2024-09-14 22:16:05', '2024-09-14 22:16:05', 'process:SL')
(9, 'Kara', 'Nielsen', 7, 5.3742857142857146, '2024-09-14 22:16:05', '2024-09-14 22:16:05', 'process:SL')
(10, 'Eduardo', 'Martins', 7, 5.37

In [38]:
from pyspark.sql import SparkSession
import sqlite3  # Assuming you're using sqlite3
import pandas as pd

def load(
    db_path='../../../Customers_ELT.db',
    customers_csv="../../../Customer.csv",
    invoices_csv="../../../Invoice.csv",
    invoice_lines_csv="../../../InvoiceLine.csv",
):
    """Run the full ELT load: raw CSVs into SQLite, then rebuild the aggregate.

    The three CSVs are written unchanged to the tables ``Customers``,
    ``Invoices`` and ``Invoices_Line`` (replacing previous contents). The
    table ``customer_loyalty_and_invoice_size_ELT`` is then rebuilt with one
    row per customer: ``loyalty_score`` is the number of invoices and
    ``avg_invoice_size`` the average of the per-invoice totals
    (``SUM(UnitPrice * Quantity)``).

    The old aggregate table is dropped and the new one created inside a
    single transaction, so a failing transform leaves the previous table in
    place instead of removing it.

    Args:
        db_path: Path of the SQLite database file.
        customers_csv: Path of the customer CSV (needs ``CustomerId``,
            ``FirstName`` and ``LastName`` columns).
        invoices_csv: Path of the invoice CSV (needs ``InvoiceId`` and
            ``CustomerId`` columns).
        invoice_lines_csv: Path of the invoice line CSV (needs ``InvoiceId``,
            ``UnitPrice`` and ``Quantity`` columns).

    Raises:
        sqlite3.Error: If a statement fails; the rebuild transaction is
            rolled back before the error propagates.
    """
    conn = sqlite3.connect(db_path)

    try:
        # EXTRACT (Loading CSVs from S3 or local storage)
        # -----------------------------------------------
        customers = pd.read_csv(customers_csv)
        invoices = pd.read_csv(invoices_csv)
        invoice_lines = pd.read_csv(invoice_lines_csv)

        # LOAD (Save the raw data into SQLite without transformation)
        # -----------------------------------------------------------------------
        customers.to_sql('Customers', conn, if_exists='replace', index=False)
        invoices.to_sql('Invoices', conn, if_exists='replace', index=False)
        invoice_lines.to_sql('Invoices_Line', conn, if_exists='replace', index=False)

        # TRANSFORM (Perform transformations with SQL queries)
        # -------------------------------------------------------------------------
        drop_query = """DROP TABLE IF EXISTS customer_loyalty_and_invoice_size_ELT"""

        transform_query = """
            CREATE TABLE customer_loyalty_and_invoice_size_ELT AS 
            SELECT 
                C.CustomerId,
                C.FirstName,
                C.LastName,
                COUNT(I.InvoiceId) AS loyalty_score,
                AVG(IL.total_spend) AS avg_invoice_size,
                CURRENT_TIMESTAMP AS created_at,
                CURRENT_TIMESTAMP AS updated_at,
                'process:SL' AS updated_by
            FROM 
                Customers C
            LEFT JOIN Invoices I ON C.CustomerId = I.CustomerId
            LEFT JOIN (
                SELECT InvoiceId, SUM(UnitPrice * Quantity) AS total_spend
                FROM Invoices_Line
                GROUP BY InvoiceId
            ) IL ON I.InvoiceId = IL.InvoiceId
            GROUP BY 
                C.CustomerId, C.FirstName, C.LastName;
        """

        # sqlite3 does not open a transaction implicitly for DDL, so start
        # one explicitly; the connection context manager then commits on
        # success and rolls back the DROP if the CREATE fails.
        with conn:
            conn.execute("BEGIN")
            conn.execute(drop_query)
            conn.execute(transform_query)

    finally:
        # Close the SQLite connection
        conn.close()
        
# Execute the full load defined in this cell.
load()


In [31]:
from pyspark.sql import SparkSession
import sqlite3
import pandas as pd

def incremental_load(
    db_path='../../../Customers_ELT.db',
    customers_csv="../../../Customer.csv",
    invoices_csv="../../../Invoice.csv",
    invoice_lines_csv="../../../InvoiceLine.csv",
):
    """Merge CSV rows changed since the last run into the aggregate table.

    The watermark is ``MAX(updated_at)`` of
    ``customer_loyalty_and_invoice_size_ELT`` (or 1900-01-01 when the table
    is empty). Rows of the three CSVs whose day-first ``UpdatedAt`` is later
    than the watermark are staged in temporary tables. For every staged
    customer the existing aggregate row is replaced by a row that adds the
    staged invoice count to the previous ``loyalty_score`` and folds the
    staged spend into ``avg_invoice_size``.

    Behaviour worth knowing:

    * The target table is created when it does not exist yet, so the very
      first run no longer fails with "no such table".
    * A staged customer without staged invoices keeps the previous
      ``avg_invoice_size`` instead of having it overwritten with NULL.
    * The delete and the re-insert run in one transaction; if the insert
      fails, the deleted rows are restored by the rollback.

    Args:
        db_path: Path of the SQLite database file.
        customers_csv: Path of the customer CSV (needs ``CustomerId``,
            ``FirstName``, ``LastName`` and ``UpdatedAt`` columns).
        invoices_csv: Path of the invoice CSV (needs ``InvoiceId``,
            ``CustomerId`` and ``UpdatedAt`` columns).
        invoice_lines_csv: Path of the invoice line CSV (needs ``InvoiceId``,
            ``UnitPrice``, ``Quantity`` and ``UpdatedAt`` columns).

    Raises:
        sqlite3.Error: If a statement fails; the merge transaction is rolled
            back before the error propagates.
    """
    conn = sqlite3.connect(db_path)

    try:
        # EXTRACT (Loading CSVs from local storage)
        customers = pd.read_csv(customers_csv)
        invoices = pd.read_csv(invoices_csv)
        invoice_lines = pd.read_csv(invoice_lines_csv)

        # Bootstrap the target table so every later statement can rely on it.
        conn.execute("""
            CREATE TABLE IF NOT EXISTS customer_loyalty_and_invoice_size_ELT (
                CustomerId INTEGER,
                FirstName TEXT,
                LastName TEXT,
                loyalty_score INTEGER,
                avg_invoice_size REAL,
                created_at TEXT,
                updated_at TEXT,
                updated_by TEXT
            )
        """)
        conn.commit()

        # Watermark: the newest updated_at already stored in the target table.
        last_updated = conn.execute(
            "SELECT MAX(updated_at) FROM customer_loyalty_and_invoice_size_ELT"
        ).fetchone()[0]
        if last_updated is None:
            last_updated = '1900-01-01 00:00:00'  # Default to a very old timestamp

        # Convert last_updated to a datetime object
        last_updated = pd.to_datetime(last_updated)
        print(f"Last Updated: {last_updated}")

        # Convert 'UpdatedAt' columns in CSVs to datetime, assuming format is DD/MM/YYYY.
        # Values that cannot be parsed become NaT and never pass the filter below.
        customers['UpdatedAt'] = pd.to_datetime(customers['UpdatedAt'], dayfirst=True, errors='coerce')
        invoices['UpdatedAt'] = pd.to_datetime(invoices['UpdatedAt'], dayfirst=True, errors='coerce')
        invoice_lines['UpdatedAt'] = pd.to_datetime(invoice_lines['UpdatedAt'], dayfirst=True, errors='coerce')

        # Filter rows that have been created or updated after the last load
        customers_filtered = customers[customers['UpdatedAt'] > last_updated]
        invoices_filtered = invoices[invoices['UpdatedAt'] > last_updated]
        invoice_lines_filtered = invoice_lines[invoice_lines['UpdatedAt'] > last_updated]

        # Load filtered data into temporary tables
        customers_filtered.to_sql('Customers_temp', conn, if_exists='replace', index=False)
        invoices_filtered.to_sql('Invoices_temp', conn, if_exists='replace', index=False)
        invoice_lines_filtered.to_sql('Invoices_Line_temp', conn, if_exists='replace', index=False)

        # Save historical data for customers to be updated (loyalty_score, avg_invoice_size, created_at)
        conn.execute("""DROP TABLE IF EXISTS customers_old_data_temp""")
        conn.execute("""
            CREATE TABLE customers_old_data_temp AS
            SELECT CustomerId, loyalty_score, avg_invoice_size, created_at
            FROM customer_loyalty_and_invoice_size_ELT
            WHERE CustomerId IN (SELECT CustomerId FROM Customers_temp)
        """)
        conn.commit()

        # TRANSFORM (Perform transformations and combine with old data)
        # NOTE(review): only customers present in Customers_temp are rebuilt;
        # staged invoices of customers whose own CSV row did not change are
        # not applied - confirm this is intended.
        transform_query = """
            INSERT INTO customer_loyalty_and_invoice_size_ELT 
            (CustomerId, FirstName, LastName, loyalty_score, avg_invoice_size, created_at, updated_at, updated_by)
            SELECT 
                C.CustomerId,
                C.FirstName,
                C.LastName,
                COALESCE(O.loyalty_score, 0) + COUNT(I.InvoiceId) AS loyalty_score,  -- Combine old and new loyalty score
                (
                    (COALESCE(O.loyalty_score, 0) * COALESCE(O.avg_invoice_size, 0))
                    + COALESCE(SUM(IL.total_spend), 0)  -- SUM is NULL when nothing was staged
                ) / NULLIF(COALESCE(O.loyalty_score, 0) + COUNT(I.InvoiceId), 0) AS avg_invoice_size,  -- Combine old and new avg invoice size
                COALESCE(O.created_at, CURRENT_TIMESTAMP) AS created_at,  -- Retain old created_at or set to current
                CURRENT_TIMESTAMP AS updated_at,
                'process:SL' AS updated_by
            FROM 
                Customers_temp C
            LEFT JOIN Invoices_temp I ON C.CustomerId = I.CustomerId
            LEFT JOIN (
                SELECT InvoiceId, SUM(UnitPrice * Quantity) AS total_spend
                FROM Invoices_Line_temp
                GROUP BY InvoiceId
            ) IL ON I.InvoiceId = IL.InvoiceId
            LEFT JOIN customers_old_data_temp O ON C.CustomerId = O.CustomerId
            GROUP BY 
                C.CustomerId, C.FirstName, C.LastName;
        """

        # Delete and re-insert atomically: the connection context manager
        # commits on success and rolls back if either statement raises.
        with conn:
            conn.execute("""
                DELETE FROM customer_loyalty_and_invoice_size_ELT
                WHERE CustomerId IN (SELECT CustomerId FROM Customers_temp)
            """)
            conn.execute(transform_query)

    finally:
        try:
            # Clean up temporary tables
            conn.execute("DROP TABLE IF EXISTS Customers_temp")
            conn.execute("DROP TABLE IF EXISTS Invoices_temp")
            conn.execute("DROP TABLE IF EXISTS Invoices_Line_temp")
            conn.execute("DROP TABLE IF EXISTS customers_old_data_temp")
            conn.commit()
        finally:
            # Close the SQLite connection even if the cleanup itself fails
            conn.close()
# Execute the incremental load defined in this cell.
incremental_load()

Last Updated: 2024-09-14 21:58:31


In [23]:
# Efrat's code

In [37]:
import sqlite3
import pandas as pd

def load(
    db_path='../../../Customers_ELT.db',
    customers_csv="../../../Customer.csv",
    invoices_csv="../../../Invoice.csv",
    invoice_lines_csv="../../../InvoiceLine.csv",
):
    """Incrementally merge changed CSV rows into the aggregate table.

    Despite its name this function performs an incremental load. The
    watermark is ``MAX(updated_at)`` of
    ``customer_loyalty_and_invoice_size_ELT`` (or 1900-01-01 when that value
    is missing). Rows of the three CSVs whose day-first ``UpdatedAt`` is later
    than the watermark are staged in temporary tables. For every staged
    customer the existing aggregate row is replaced by a row that adds the
    staged invoice count to the previous ``loyalty_score`` and folds the
    staged spend into ``avg_invoice_size``.

    Behaviour worth knowing:

    * The target table is created when it does not exist yet, so the very
      first run no longer fails with "no such table".
    * A staged customer without staged invoices keeps the previous
      ``avg_invoice_size`` instead of having it overwritten with NULL.
    * The delete and the re-insert run in one transaction; if the insert
      fails, the deleted rows are restored by the rollback.

    Args:
        db_path: Path of the SQLite database file.
        customers_csv: Path of the customer CSV (needs ``CustomerId``,
            ``FirstName``, ``LastName`` and ``UpdatedAt`` columns).
        invoices_csv: Path of the invoice CSV (needs ``InvoiceId``,
            ``CustomerId`` and ``UpdatedAt`` columns).
        invoice_lines_csv: Path of the invoice line CSV (needs ``InvoiceId``,
            ``UnitPrice``, ``Quantity`` and ``UpdatedAt`` columns).

    Raises:
        sqlite3.Error: If a statement fails; the merge transaction is rolled
            back before the error propagates.
    """
    conn = sqlite3.connect(db_path)

    try:
        # EXTRACT (Loading CSVs from local storage)
        customers = pd.read_csv(customers_csv)
        invoices = pd.read_csv(invoices_csv)
        invoice_lines = pd.read_csv(invoice_lines_csv)

        # Bootstrap the target table so every later statement can rely on it.
        conn.execute("""
            CREATE TABLE IF NOT EXISTS customer_loyalty_and_invoice_size_ELT (
                CustomerId INTEGER,
                FirstName TEXT,
                LastName TEXT,
                loyalty_score INTEGER,
                avg_invoice_size REAL,
                created_at TEXT,
                updated_at TEXT,
                updated_by TEXT
            )
        """)
        conn.commit()

        # Watermark: the newest updated_at already stored in the target table.
        last_updated = conn.execute(
            "SELECT MAX(updated_at) FROM customer_loyalty_and_invoice_size_ELT"
        ).fetchone()[0]
        if not last_updated:
            last_updated = '1900-01-01 00:00:00'  # Default to a very old timestamp

        # Convert last_updated to a datetime object
        last_updated = pd.to_datetime(last_updated)
        print(f"Last Updated: {last_updated}")

        # Convert 'UpdatedAt' columns in CSVs to datetime, assuming format is DD/MM/YYYY.
        # Values that cannot be parsed become NaT and never pass the filter below.
        customers['UpdatedAt'] = pd.to_datetime(customers['UpdatedAt'], dayfirst=True, errors='coerce')
        invoices['UpdatedAt'] = pd.to_datetime(invoices['UpdatedAt'], dayfirst=True, errors='coerce')
        invoice_lines['UpdatedAt'] = pd.to_datetime(invoice_lines['UpdatedAt'], dayfirst=True, errors='coerce')

        # Filter rows that have been created or updated after the last load
        customers_filtered = customers[customers['UpdatedAt'] > last_updated]
        invoices_filtered = invoices[invoices['UpdatedAt'] > last_updated]
        invoice_lines_filtered = invoice_lines[invoice_lines['UpdatedAt'] > last_updated]

        # Load filtered data into temporary tables
        customers_filtered.to_sql('Customers_temp', conn, if_exists='replace', index=False)
        invoices_filtered.to_sql('Invoices_temp', conn, if_exists='replace', index=False)
        invoice_lines_filtered.to_sql('Invoices_Line_temp', conn, if_exists='replace', index=False)

        # Save historical data for customers to be updated (loyalty_score, avg_invoice_size, created_at)
        conn.execute("""DROP TABLE IF EXISTS customers_old_data_temp""")
        conn.execute("""
            CREATE TABLE customers_old_data_temp AS
            SELECT CustomerId, loyalty_score, avg_invoice_size, created_at
            FROM customer_loyalty_and_invoice_size_ELT
            WHERE CustomerId IN (SELECT CustomerId FROM Customers_temp)
        """)
        conn.commit()

        # TRANSFORM (Insert or update existing records with new data)
        # NOTE(review): only customers present in Customers_temp are rebuilt;
        # staged invoices of customers whose own CSV row did not change are
        # not applied - confirm this is intended.
        transform_query = """
            INSERT INTO customer_loyalty_and_invoice_size_ELT 
            (CustomerId, FirstName, LastName, loyalty_score, avg_invoice_size, created_at, updated_at, updated_by)
            SELECT 
                C.CustomerId,
                C.FirstName,
                C.LastName,
                COALESCE(O.loyalty_score, 0) + COUNT(I.InvoiceId) AS loyalty_score,  -- Combine old and new loyalty score
                (
                    (COALESCE(O.loyalty_score, 0) * COALESCE(O.avg_invoice_size, 0))
                    + COALESCE(SUM(IL.total_spend), 0)  -- SUM is NULL when nothing was staged
                ) / NULLIF(COALESCE(O.loyalty_score, 0) + COUNT(I.InvoiceId), 0) AS avg_invoice_size,  -- Combine old and new avg invoice size
                COALESCE(O.created_at, CURRENT_TIMESTAMP) AS created_at,  -- Retain old created_at or set to current
                CURRENT_TIMESTAMP AS updated_at,
                'process:SL' AS updated_by
            FROM 
                Customers_temp C
            LEFT JOIN Invoices_temp I ON C.CustomerId = I.CustomerId
            LEFT JOIN (
                SELECT InvoiceId, SUM(UnitPrice * Quantity) AS total_spend
                FROM Invoices_Line_temp
                GROUP BY InvoiceId
            ) IL ON I.InvoiceId = IL.InvoiceId
            LEFT JOIN customers_old_data_temp O ON C.CustomerId = O.CustomerId
            GROUP BY 
                C.CustomerId, C.FirstName, C.LastName;
        """

        # Delete and re-insert atomically: the connection context manager
        # commits on success and rolls back if either statement raises.
        with conn:
            conn.execute("""
                DELETE FROM customer_loyalty_and_invoice_size_ELT
                WHERE CustomerId IN (SELECT CustomerId FROM Customers_temp)
            """)
            conn.execute(transform_query)

    finally:
        try:
            # Clean up temporary tables
            conn.execute("DROP TABLE IF EXISTS Customers_temp")
            conn.execute("DROP TABLE IF EXISTS Invoices_temp")
            conn.execute("DROP TABLE IF EXISTS Invoices_Line_temp")
            conn.execute("DROP TABLE IF EXISTS customers_old_data_temp")
            conn.commit()
        finally:
            # Close the SQLite connection even if the cleanup itself fails
            conn.close()


# Execute the load defined in this cell.
load()

Last Updated: 2024-09-14 21:58:31
