<a href="https://colab.research.google.com/github/cluciani-angel/documentation/blob/main/ETL_Debugging_Script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# https://gemini.google.com/app/988afc90d752b723
import pandas as pd
import traceback # Import traceback to get more detailed error info
import uuid # Import the uuid library for the demonstration

def process_data_safely(dataframe):
    """
    Processes a DataFrame row by row to identify problematic data.
    You need to add your actual processing logic inside the 'try' block below.
    """
    processed_rows = []

    # Iterate over each row in the DataFrame using its index
    for index, row in dataframe.iterrows():
        try:
            # --- START: YOUR POTENTIAL ERROR-PRONE CODE ---
            #
            # ATTENTION: Replace the line below with your actual data transformation code.
            # This example line intentionally causes the error with the dummy data.
            # It tries to convert a hex string into a 16-byte UUID. It will fail on
            # the 'corrupted_data' row because its hex string is only 8 bytes long.

            row['uuid_column'] = uuid.UUID(bytes=bytes.fromhex(row['id_bytes_hex']))

            # --- END: YOUR POTENTIAL ERROR-PRONE CODE ---

            # If the row is processed successfully, add it to our list
            processed_rows.append(row)

        except (ValueError, TypeError) as e:
            # Catch specific errors related to data conversion
            print(f"❌ Error processing row index: {index}")
            print(f"   Problematic Row Data:\n{row}\n")
            print(f"   Error Type: {type(e).__name__}")
            print(f"   Error Message: {e}\n")

            # Use traceback to print the exact line where the error occurred
            print("   Traceback:")
            traceback.print_exc()

            # You can decide to stop the process or continue
            print("--> Stopping ETL process due to error.")
            return None # Stop and return nothing

    print("✅ All rows processed successfully.")
    return pd.DataFrame(processed_rows)


# --- Main Execution ---

# 1. Load your data
# This should be the real path to your CSV file
file_path = '/content/drive/MyDrive/Reportes Auditoria/SIELCON/08-2025/Tickets08-2025.csv'
print(f"--> Starting debug process for: {file_path}")

try:
    # --- IMPORTANT ---
    # In your actual use, you should comment out the dummy data and
    # load your real data from the CSV file.
    # df = pd.read_csv(file_path, dtype=str) # Reading all as string initially is safer

    # Dummy data that replicates the error condition.
    # 'corrupted_data' is encoded into an 8-byte hex string, which will cause the error.
    dummy_data = {
        'id_column': ['d3d6e0a01b3a4c9f8f1e7d6a5b4c3d2e', 'corrupted_data', 'e1c2a3b45d6e7f8a9b0c1d2e3f4a5b6c'],
        'other_data': [100, 200, 300]
    }
    df = pd.DataFrame(dummy_data)
    # Simulate a column that should contain 32 hex characters (16 bytes)
    # The second entry will only have 16 hex characters (8 bytes)
    df['id_bytes_hex'] = df['id_column'].apply(lambda x: x.encode().hex() if x != 'corrupted_data' else '636f727275707465')

    print("--> Data loaded. Starting safe processing...")

    # 2. Run the safe processing function
    final_df = process_data_safely(df)

    if final_df is not None:
        print("\n--> ETL process completed without fatal errors.")
        print("Final DataFrame head:")
        print(final_df.head())

except FileNotFoundError:
    print(f"❌ Error: The file was not found at {file_path}")
except Exception as e:
    print(f"❌ An unexpected error occurred during initial data loading: {e}")

--> Starting debug process for: /content/drive/MyDrive/Reportes Auditoria/SIELCON/08-2025/Tickets08-2025.csv
--> Data loaded. Starting safe processing...
❌ Error processing row index: 0
   Problematic Row Data:
id_column                        d3d6e0a01b3a4c9f8f1e7d6a5b4c3d2e
other_data                                                    100
id_bytes_hex    6433643665306130316233613463396638663165376436...
Name: 0, dtype: object

   Error Type: ValueError
   Error Message: bytes is not a 16-char string

   Traceback:
--> Stopping ETL process due to error.


Traceback (most recent call last):
  File "/tmp/ipython-input-3297398693.py", line 22, in process_data_safely
    row['uuid_column'] = uuid.UUID(bytes=bytes.fromhex(row['id_bytes_hex']))
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/uuid.py", line 187, in __init__
    raise ValueError('bytes is not a 16-char string')
ValueError: bytes is not a 16-char string
