In [0]:
# Register the notebook's three text widgets, each with an empty-string default.
for widget_name in ("target_dataset", "temp_target_table", "target_table"):
    dbutils.widgets.text(widget_name, "")

In [0]:
# Read the widget values. Surrounding whitespace is stripped because the values
# are interpolated verbatim into table ids and SQL further down.
target_dataset = dbutils.widgets.get("target_dataset").strip()
temp_target_table = dbutils.widgets.get("temp_target_table").strip()
target_table = dbutils.widgets.get("target_table").strip()

# The widgets default to "", so fail fast with a clear message instead of
# letting a blank value surface later as a malformed table id.
_blank_widgets = [
    widget_name
    for widget_name, widget_value in (
        ("target_dataset", target_dataset),
        ("temp_target_table", temp_target_table),
        ("target_table", target_table),
    )
    if not widget_value
]
if _blank_widgets:
    raise ValueError(
        "Widget value(s) must be set before running the comparison: "
        + ", ".join(_blank_widgets)
    )

In [0]:
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd

# All BigQuery connection settings are read from a single Databricks secret scope.
_bq_secret_scope = "bigquery_ims_analytics"

bq_creds_path = dbutils.secrets.get(scope=_bq_secret_scope, key="path_to_credentials_json_file")
bq_project_id = dbutils.secrets.get(scope=_bq_secret_scope, key="project_id")
# NOTE(review): bq_location is fetched but not passed to the client below and is
# not used in the visible cells — confirm whether it should be.
bq_location = dbutils.secrets.get(scope=_bq_secret_scope, key="location")

# Build service-account credentials from the file path held in the secret.
credentials_path = bq_creds_path
credentials = service_account.Credentials.from_service_account_file(credentials_path)

# Client shared by every BigQuery call in the notebook.
bq_client = bigquery.Client(credentials=credentials, project=bq_project_id)

In [0]:
# Function to extract schema
def get_table_schema(client, table_id):
    """Return the schema of a table as a two-column DataFrame.

    Args:
        client: Object exposing ``get_table(table_id)``; the returned table
            must have a ``schema`` iterable whose items carry ``name`` and
            ``field_type`` attributes.
        table_id: Table identifier handed to ``client.get_table`` unchanged.

    Returns:
        pandas.DataFrame with columns ``column_name`` and ``data_type``, one
        row per entry of ``table.schema`` in schema order.
    """
    bq_table = client.get_table(table_id)

    records = []
    for column in bq_table.schema:
        records.append((column.name, column.field_type))

    return pd.DataFrame(records, columns=["column_name", "data_type"])

In [0]:

# Load both schemas and compare how many columns each table schema lists.
# The target table lives in the widget-supplied dataset; the temp table is
# always looked up in staging_raw.
target_table_id = f"dealshare-d82f7.{target_dataset}.{target_table}"
temp_target_table_id = f"dealshare-d82f7.staging_raw.{temp_target_table}"

schema_df_1 = get_table_schema(bq_client, target_table_id)
schema_df_2 = get_table_schema(bq_client, temp_target_table_id)

# One row per schema entry, so the frame length is the column count.
target_column_count = len(schema_df_1)
temp_column_count = len(schema_df_2)

if target_column_count == temp_column_count:
    column_count_status = "MATCH"
else:
    column_count_status = "MISMATCH"

metrics_df = pd.DataFrame([
    {
        "metric": "column_count",
        "target_value": target_column_count,
        "temp_value": temp_column_count,
        "status": column_count_status,
    }
])

display(metrics_df)

In [0]:
import re

def split_fields(schema):
    """Split a field-list string at top-level commas only.

    A comma is "top-level" when it is not enclosed in any ``<...>`` or
    ``(...)`` pair. Nesting depth is tracked explicitly, so a comma inside an
    arbitrarily deep type such as ``STRUCT<x INT64, y ARRAY<INT64>>`` is never
    treated as a separator, and a parameterized type such as
    ``NUMERIC(10, 2)`` stays in one piece.

    Args:
        schema: Text such as ``"a INT64, b ARRAY<STRUCT<c STRING, d INT64>>"``.

    Returns:
        List of substrings between top-level commas. The pieces are returned
        untrimmed (callers strip them); an empty input yields ``[""]``.
    """
    pieces = []
    depth = 0
    piece_start = 0

    for position, char in enumerate(schema):
        if char in "<(":
            depth += 1
        elif char in ">)":
            # Clamp at zero so a stray closer cannot hide later top-level commas.
            depth = max(depth - 1, 0)
        elif char == "," and depth == 0:
            pieces.append(schema[piece_start:position])
            piece_start = position + 1

    # Whatever follows the last top-level comma (or the whole string).
    pieces.append(schema[piece_start:])
    return pieces

def explode_array_struct(schema_str, parent_col):
    """
    Flatten an ARRAY<STRUCT<...>> type string into a {path: type} dict.

    The type string is trimmed and upper-cased before parsing; field names are
    lower-cased when they become part of a path. Entries produced:

    * ``"<parent_col>[]"`` -> ``"ARRAY"`` for the array itself;
    * ``"<parent_col>[].<field>"`` -> the field's type text for every field
      that is not an array (plain STRUCT fields are kept as-is, not expanded);
    * ``"<parent_col>[].<field>[]"`` -> the full type text for array fields
      whose element is not a STRUCT;
    * the recursively flattened entries for fields that are themselves
      ARRAY<STRUCT<...>>.

    Raises:
        ValueError: if ``schema_str`` does not start with ``ARRAY<STRUCT<``,
            or if a field has no space separating its name from its type.
    """
    array_struct_prefix = "ARRAY<STRUCT<"

    normalized = schema_str.strip().upper()
    if not normalized.startswith(array_struct_prefix):
        raise ValueError("Schema is not ARRAY<STRUCT<...>>")

    # Drop the leading prefix and the final two characters (the closing ">>").
    struct_body = normalized[len(array_struct_prefix):-2]

    # The array column itself gets its own entry.
    flattened = {f"{parent_col}[]": "ARRAY"}

    for raw_field in split_fields(struct_body):
        name, dtype = raw_field.strip().split(" ", 1)
        name = name.lower()
        dtype = dtype.strip()
        key = f"{parent_col}[].{name}"

        if dtype.startswith(array_struct_prefix):
            # Nested array of structs: recurse with the field path as parent.
            flattened.update(explode_array_struct(dtype, key))
            continue

        if dtype.startswith("ARRAY<"):
            # Array with a non-struct element type.
            flattened[f"{key}[]"] = dtype
            continue

        # Primitive or plain STRUCT.
        flattened[key] = dtype

    return flattened


In [0]:
def get_schema_dict(target_dataset, table_name, bq_client):
    """Build a flattened {column path: data type} dict for one table.

    Queries ``INFORMATION_SCHEMA.COLUMNS`` of ``target_dataset`` for
    ``table_name``, ordered by ``ordinal_position``. Column names are
    lower-cased and data types upper-cased. A column whose type starts with
    ``ARRAY<STRUCT<`` is replaced by the entries that
    ``explode_array_struct`` returns for it; every other column is stored
    under its own name.

    Args:
        target_dataset: Dataset name interpolated into the FROM clause. No
            project qualifier is added here — presumably the client's default
            project applies; confirm against the client configuration.
        table_name: Table name interpolated into the WHERE clause.
        bq_client: Object exposing ``query(sql)`` whose result has a
            ``result()`` method yielding rows indexable by column name.

    Returns:
        dict mapping column path to data type; empty when the query returns
        no rows.
    """
    query = f"""
    SELECT table_name, column_name, ordinal_position, data_type
    FROM `{target_dataset}.INFORMATION_SCHEMA.COLUMNS`
    WHERE table_name = '{table_name}'
    order by ordinal_position ASC
    """
    array_struct_prefix = "ARRAY<STRUCT<"
    flattened = {}

    for record in bq_client.query(query).result():
        column = record["column_name"].lower()
        type_text = record["data_type"].upper()

        if type_text.startswith(array_struct_prefix):
            # ARRAY<STRUCT> column -> explode into one entry per nested field
            flattened.update(explode_array_struct(type_text, column))
            continue

        # Any other column is recorded as-is
        flattened[column] = type_text

    return flattened

In [0]:
# Flattened {column path: data type} lookups for both sides of the comparison:
# schema_dict1 is the target table, schema_dict2 the temp table in staging_raw.
schema_dict1 = get_schema_dict(target_dataset, target_table, bq_client)
schema_dict2 = get_schema_dict("staging_raw", temp_target_table, bq_client)

# Union of the column paths seen on either side
all_columns = set(schema_dict1) | set(schema_dict2)

rows = []

for col in sorted(all_columns):
    # .get() yields None when the column is absent from that table
    dt1 = schema_dict1.get(col)
    dt2 = schema_dict2.get(col)

    if dt1 is None:
        comparison = "MISSING_IN_TABLE_1"
    elif dt2 is None:
        comparison = "MISSING_IN_TABLE_2"
    elif dt1 != dt2:
        comparison = "DATATYPE_MISMATCH"
    else:
        comparison = "MATCH"

    rows.append({
        "column_name": col,
        "data_type_table1": dt1,
        "data_type_table2": dt2,
        "comparison_result": comparison
    })

# Explicit columns keep the headers in place even when there is nothing to
# compare (both lookups empty); otherwise the frame would have no columns.
column_comparison_df = pd.DataFrame(
    rows,
    columns=[
        "column_name",
        "data_type_table1",
        "data_type_table2",
        "comparison_result",
    ],
)

display(column_comparison_df)