# Data Catalog Python Script
* Repository located here: https://github.com/emilyporter920/Data_Cataloging

## Import Libraries

In [None]:
# Import dependencies
from snowflake.snowpark.session import Session
import json
import pandas as pd
from platform import python_version
from datetime import datetime
import openpyxl
from config import account, user, authenticator, warehouse1, role1, warehouse2, role2, warehouse3 

# Print the Python version in use (original note: Snowpark needs a 3.8.x-or-lower interpreter — TODO confirm against the installed Snowpark release)
print(python_version())

# GVR PROD Database

In [None]:
# Open the Snowpark session for the GVR_PROD queries (warehouse1 / role1 come from config)
connection_parameters = dict(
    account=account,
    user=user,
    authenticator=authenticator,
    warehouse=warehouse1,
    role=role1,
)

session = Session.builder.configs(connection_parameters).create()

## Primary Keys

In [None]:
# Fetch the primary-key constraints of GVR_PROD and load the returned rows into a DataFrame
pk_rows = session.sql("SHOW PRIMARY KEYS IN DATABASE GVR_PROD").collect()

primary_keys = pd.DataFrame(list(pk_rows))

In [None]:
# Discard the SHOW PRIMARY KEYS columns that the catalog does not use
unused_pk_columns = ['created_on', 'constraint_name', 'rely']
primary_keys = primary_keys.drop(columns=unused_pk_columns)
primary_keys.head()

In [None]:
# Give the PRIMARY_KEYS columns the same names as COLUMN_TABLE so the two frames can be merged
pk_column_names = {
    'database_name': 'DATABASE',
    'schema_name': 'SCHEMA',
    'table_name': 'TABLE_NAME',
    'column_name': 'COLUMN_NAME',
    'key_sequence': 'PK',
}
primary_keys = primary_keys.rename(columns=pk_column_names)

primary_keys.head()

In [None]:
# Remove the primary-key 'comment' column, which is not carried into the catalog
primary_keys = primary_keys.drop(columns=['comment'])

## Information Schema

In [None]:
# Column-level metadata for GVR_PROD, read from INFORMATION_SCHEMA.COLUMNS
prod_columns_query = "SELECT TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE, CHARACTER_MAXIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, IS_NULLABLE, COMMENT FROM GVR_PROD.INFORMATION_SCHEMA.COLUMNS"
column_table = session.sql(prod_columns_query).to_pandas()

column_table.head()

In [None]:
# Shorten the INFORMATION_SCHEMA column names to the names used in the catalog
catalog_column_names = {
    'TABLE_CATALOG': 'DATABASE',
    'TABLE_SCHEMA': 'SCHEMA',
    'CHARACTER_MAXIMUM_LENGTH': 'LENGTH',
    'NUMERIC_PRECISION': 'PRECISION',
    'NUMERIC_SCALE': 'SCALE',
    'IS_NULLABLE': 'NULLABLE',
}
column_table = column_table.rename(columns=catalog_column_names)

column_table.head()

In [None]:
# Left-join the primary-key sequence (PK) onto the column metadata; every COLUMN_TABLE row is kept
pk_join_keys = ['DATABASE', 'SCHEMA', 'TABLE_NAME', 'COLUMN_NAME']
merged_tables = column_table.merge(primary_keys, how='left', on=pk_join_keys)

merged_tables.head()

# Roles

In [None]:
# Load the role grants from the GRANTS_TO_ROLES_2023_04_26 snapshot table in GVR_DEV.ADMIN
roles_query = "SELECT TABLE_CATALOG, TABLE_SCHEMA, GRANTEE_NAME, DELETED_ON, NAME FROM GVR_DEV.ADMIN.GRANTS_TO_ROLES_2023_04_26 "
roles = session.sql(roles_query).to_pandas()

In [None]:
# DELETED_ON is not used by the catalog, so remove it
roles = roles.drop(columns=['DELETED_ON'])

In [None]:
# Keep only the grants whose TABLE_CATALOG is GVR_PROD or GVR_IS360_DEV_DB
cataloged_databases = ['GVR_PROD', 'GVR_IS360_DEV_DB']
roles = roles.loc[roles['TABLE_CATALOG'].isin(cataloged_databases)]

In [None]:
# Keep a single row for each (grantee, database, schema, object name) combination
grant_key = ['GRANTEE_NAME', 'TABLE_CATALOG', 'TABLE_SCHEMA', 'NAME']
roles = roles.drop_duplicates(subset=grant_key)

In [None]:
# For every (database, schema, object name) group, build one comma-separated, sorted list of its grantees
# and store it on each row of that group
grantees_by_object = roles.groupby(['TABLE_CATALOG', 'TABLE_SCHEMA', 'NAME'])['GRANTEE_NAME']
roles['CONCAT_ROLES'] = grantees_by_object.transform(lambda names: ', '.join(sorted(names)))

In [None]:
# Use the catalog's column names so the roles can later be joined on DATABASE / SCHEMA / TABLE_NAME
role_column_names = {
    'TABLE_CATALOG': 'DATABASE',
    'TABLE_SCHEMA': 'SCHEMA',
    'NAME': 'TABLE_NAME',
    'CONCAT_ROLES': 'ROLES',
}
roles = roles.rename(columns=role_column_names)

In [None]:
# The individual grantee is no longer needed once the combined ROLES string exists
roles = roles.drop(columns=['GRANTEE_NAME'])

In [None]:
# Preview the first five rows of the roles table
roles.head(5)

# IS360 Database

In [None]:
# Replace the session with one for the IS360 queries (warehouse2 / role2 come from config)
connection_parameters = dict(
    account=account,
    user=user,
    authenticator=authenticator,
    warehouse=warehouse2,
    role=role2,
)

session = Session.builder.configs(connection_parameters).create()

## Primary Keys

In [None]:
# Fetch the primary-key constraints of GVR_IS360_DEV_DB and load the returned rows into a DataFrame
pk_rows2 = session.sql("SHOW PRIMARY KEYS IN DATABASE GVR_IS360_DEV_DB").collect()

primary_keys2 = pd.DataFrame(list(pk_rows2))

In [None]:
# Discard the SHOW PRIMARY KEYS columns that the catalog does not use
unused_pk_columns2 = ['created_on', 'constraint_name', 'rely']
primary_keys2 = primary_keys2.drop(columns=unused_pk_columns2)

primary_keys2.head()

In [None]:
# Give the PRIMARY_KEYS columns the same names as COLUMN_TABLE so the two frames can be merged
pk_column_names2 = {
    'database_name': 'DATABASE',
    'schema_name': 'SCHEMA',
    'table_name': 'TABLE_NAME',
    'column_name': 'COLUMN_NAME',
    'key_sequence': 'PK',
}
primary_keys2 = primary_keys2.rename(columns=pk_column_names2)

primary_keys2.head()

In [None]:
# Remove the primary-key 'comment' column, which is not carried into the catalog
primary_keys2 = primary_keys2.drop(columns=['comment'])

## Information Schema

In [None]:
# Column-level metadata for GVR_IS360_DEV_DB, read from INFORMATION_SCHEMA.COLUMNS
is360_columns_query = "SELECT TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE, CHARACTER_MAXIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, IS_NULLABLE, COMMENT FROM GVR_IS360_DEV_DB.INFORMATION_SCHEMA.COLUMNS"
column_table2 = session.sql(is360_columns_query).to_pandas()

column_table2.head()

In [None]:
# Shorten the INFORMATION_SCHEMA column names to the names used in the catalog
catalog_column_names2 = {
    'TABLE_CATALOG': 'DATABASE',
    'TABLE_SCHEMA': 'SCHEMA',
    'CHARACTER_MAXIMUM_LENGTH': 'LENGTH',
    'NUMERIC_PRECISION': 'PRECISION',
    'NUMERIC_SCALE': 'SCALE',
    'IS_NULLABLE': 'NULLABLE',
}
column_table2 = column_table2.rename(columns=catalog_column_names2)

column_table2.head()

In [None]:
# Left-join the primary-key sequence (PK) onto the IS360 column metadata; every COLUMN_TABLE row is kept
pk_join_keys2 = ['DATABASE', 'SCHEMA', 'TABLE_NAME', 'COLUMN_NAME']
merged_tables2 = column_table2.merge(primary_keys2, how='left', on=pk_join_keys2)

merged_tables2.head()

In [None]:
# Exclude the INFORMATION_SCHEMA, STAGE and ADMIN schemas from the IS360 rows
# NOTE(review): merged_tables3 is reassigned in the next section from the unfiltered merged_tables2,
# so this filtered frame is not what ends up in the catalog — confirm whether that is intended
excluded_schemas = ['INFORMATION_SCHEMA', 'STAGE', 'ADMIN']
merged_tables3 = merged_tables2.loc[~merged_tables2['SCHEMA'].isin(excluded_schemas)]

merged_tables3.head()

# Merge GVR PROD with IS360 Database

In [None]:
# Stack the IS360 rows underneath the GVR_PROD rows and renumber the index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# NOTE(review): this combines the unfiltered merged_tables2; the INFORMATION_SCHEMA / STAGE / ADMIN
# schemas are only removed later, in the "Drop Schemas" section.
merged_tables3 = pd.concat([merged_tables, merged_tables2], ignore_index=True)

merged_tables3.head()

In [None]:
# # Rename columns in COLUMN_TABLE table
# merged_tables3 = column_table2.rename(columns= {'TABLE_CATALOG': 'DATABASE', 'TABLE_SCHEMA': 'SCHEMA', 'CHARACTER_MAXIMUM_LENGTH': 'LENGTH', 
#                                              'NUMERIC_PRECISION': 'PRECISION', 'NUMERIC_SCALE': 'SCALE', 'IS_NULLABLE': 'NULLABLE'})

# column_table2.head()

# Zones

In [None]:
# Generates values for the ZONE column
def zone(column):
    """Return the zone label for a catalog row, decided by its SCHEMA value.

    column: a row-like object that supports ``column['SCHEMA']`` (for example
        a DataFrame row passed in by ``DataFrame.apply(..., axis=1)``).

    Returns 'DATALAKE', 'DATAMART', 'DATAWAREHOUSE' or 'LEGACY REPORTING' for
    the schemas listed below, and a single space for every other schema.
    """
    schema = column['SCHEMA']

    # Each entry pairs a zone label with the schemas that belong to it
    zone_members = (
        ('DATALAKE', (
            'SMS', 'SALESFORCE', 'AX', 'MAC-PAC', 'PROTHEUS_AR', 'PROTHEUS_BR', 'PROTHEUS_CH', 'QAD',
            'HFM', 'INSITE360_TELEMETRY', 'AVA_DEMO', 'IOT_CORE', 'AVA_LEGACY', 'AVA_DEV',
            'AVA_CORE_DEV', 'AVA_UAT', 'AVA_QA', 'AVA_CORE_UAT', 'AVA', 'ARCHIVE', 'AVA_CORE_QA',
            'AVA_CORE_DEMO', 'PUSH_SALE_EVENT', 'CENSUS',
        )),
        ('DATAMART', (
            'DATA_MART_FIN_NA', 'DATA_MART_FIN_LA', 'DATA_MART_AMO_NA', 'DATA_MART_FIN_GLOBAL',
            'DATA_MART_CUSTOMER',
        )),
        ('DATAWAREHOUSE', ('DW',)),
        ('LEGACY REPORTING', ('RPT',)),
    )

    for label, schemas in zone_members:
        if schema in schemas:
            return label

    # Schemas that belong to no zone get a blank (single space) label
    return ' '

In [None]:
# Compute the zone label row by row and store it in the new ZONE column
zone_labels = merged_tables3.apply(zone, axis=1)
merged_tables3['ZONE'] = zone_labels

merged_tables3.head()

# Historical Retention

In [None]:
# Historical retention is the same hard-coded value for every row for now
retention_period = '32 DAYS'
merged_tables3['HISTORICAL_RETENTION'] = retention_period

merged_tables3.head()

# Load Times

## GVR PROD

In [None]:
# Schema, table and last load time for GVR_PROD, read from INFORMATION_SCHEMA.LOAD_HISTORY
# NOTE(review): `session` was re-created with warehouse2 / role2 in the IS360 section, so this GVR_PROD
# query runs under that role — confirm it is allowed to read GVR_PROD
prod_load_query = "SELECT SCHEMA_NAME, TABLE_NAME, LAST_LOAD_TIME FROM GVR_PROD.INFORMATION_SCHEMA.LOAD_HISTORY"
load_table1 = session.sql(prod_load_query).to_pandas()

load_table1.head()

In [None]:
# Reduce LAST_LOAD_TIME to its calendar date (the time of day is dropped)
load_timestamps1 = pd.to_datetime(load_table1['LAST_LOAD_TIME'])
load_table1['LAST_LOAD_TIME'] = load_timestamps1.dt.date

load_table1.head()

In [None]:
# Call the schema column SCHEMA, as MERGED_TABLES does, so the two frames can be joined on it
load_column_names1 = {'SCHEMA_NAME': 'SCHEMA'}
load_table1 = load_table1.rename(columns=load_column_names1)

load_table1.head()

In [None]:
# Order the load history from the newest load date to the oldest
most_recent_times1 = load_table1.sort_values(by='LAST_LOAD_TIME', ascending=False)

most_recent_times1.head()

In [None]:
# Keep only the most recent load time of each table (the frame is already sorted newest first).
# The key is SCHEMA + TABLE_NAME, not TABLE_NAME alone: the load times are later merged into the
# catalog on both columns, so de-duplicating on the table name only would discard the load time of
# every same-named table in another schema.
most_recent_times1 = most_recent_times1.drop_duplicates(subset=['SCHEMA', 'TABLE_NAME'], keep='first')

most_recent_times1 = most_recent_times1.reset_index(drop=True)

most_recent_times1.head()

## IS360

In [None]:
# Schema, table and last load time for GVR_IS360_DEV_DB, read from INFORMATION_SCHEMA.LOAD_HISTORY
is360_load_query = "SELECT SCHEMA_NAME, TABLE_NAME, LAST_LOAD_TIME FROM GVR_IS360_DEV_DB.INFORMATION_SCHEMA.LOAD_HISTORY"
load_table2 = session.sql(is360_load_query).to_pandas()

load_table2.head()

In [None]:
# Reduce LAST_LOAD_TIME to its calendar date (the time of day is dropped)
load_timestamps2 = pd.to_datetime(load_table2['LAST_LOAD_TIME'])
load_table2['LAST_LOAD_TIME'] = load_timestamps2.dt.date

load_table2.head()

In [None]:
# Call the schema column SCHEMA, as MERGED_TABLES does, so the two frames can be joined on it
load_column_names2 = {'SCHEMA_NAME': 'SCHEMA'}
load_table2 = load_table2.rename(columns=load_column_names2)

load_table2.head()

In [None]:
# Order the load history from the newest load date to the oldest
most_recent_times2 = load_table2.sort_values(by='LAST_LOAD_TIME', ascending=False)

most_recent_times2.head()

In [None]:
# Keep only the most recent load time of each table (the frame is already sorted newest first).
# The key is SCHEMA + TABLE_NAME, not TABLE_NAME alone: the load times are later merged into the
# catalog on both columns, so de-duplicating on the table name only would discard the load time of
# every same-named table in another schema.
most_recent_times2 = most_recent_times2.drop_duplicates(subset=['SCHEMA', 'TABLE_NAME'], keep='first')

most_recent_times2 = most_recent_times2.reset_index(drop=True)

most_recent_times2.head()

## Appending GVR PROD & IS360 Load Times

In [None]:
# Stack the GVR_PROD and IS360 load-time tables into one frame and renumber the index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
most_recent_times = pd.concat([most_recent_times1, most_recent_times2], ignore_index=True)

most_recent_times.head()

## Merge Load Times To Merged_Tables

In [None]:
# Left-join the most recent load time onto the catalog rows by schema and table name
# NOTE(review): the load-time frame has no DATABASE column, so a schema/table pair that exists in both
# databases would pick up load times from either one — confirm this cannot happen
load_time_keys = ['SCHEMA', 'TABLE_NAME']
merged_tables3 = merged_tables3.merge(most_recent_times, how='left', on=load_time_keys)

merged_tables3.head()

## Table Type

In [None]:
# Every table is labelled with the same hard-coded table type: 'History Available'
table_type_label = 'History Available'
merged_tables3['TABLE_TYPE'] = table_type_label

merged_tables3.head()

## Load Strategy

In [None]:
# Distinct load-strategy settings per table, read from GVR_PROD.ADMIN.BUILD_WAREHOUSE
# NOTE(review): at this point `session` is the one created with warehouse2 / role2 — confirm that role
# can read GVR_PROD.ADMIN
load_strategy_query = "SELECT DISTINCT TABLE_NAME, TABLE_TYPE, INCREMENTAL_LOAD, FULL_LOAD FROM GVR_PROD.ADMIN.BUILD_WAREHOUSE"
load_strat = session.sql(load_strategy_query).to_pandas()

load_strat.head()

In [None]:
# BUILD_WAREHOUSE.TABLE_TYPE is used as the schema key when joining to the catalog, so rename it to SCHEMA
load_strat_column_names = {'TABLE_TYPE': 'SCHEMA'}
load_strat = load_strat.rename(columns=load_strat_column_names)

load_strat.head()

In [None]:
# Replace the abbreviated Protheus codes with the full schema names used in the catalog
protheus_schema_names = {'P-BR': 'PROTHEUS_BR', 'P-CH': 'PROTHEUS_CH', 'P-AR': 'PROTHEUS_AR'}
load_strat["SCHEMA"] = load_strat["SCHEMA"].replace(protheus_schema_names)

In [None]:
# Left-join the INCREMENTAL_LOAD / FULL_LOAD settings onto the catalog rows by table name and schema
load_strategy_keys = ['TABLE_NAME', 'SCHEMA']
merged_tables3 = merged_tables3.merge(load_strat, how='left', on=load_strategy_keys)

merged_tables3.head()

In [None]:
# Create a function to determine load strategy
def determine_load_strategy(column):
    """Return the load-strategy label for a catalog row.

    column: a row-like object that supports ``column['INCREMENTAL_LOAD']`` and
        ``column['FULL_LOAD']`` (for example a DataFrame row passed in by
        ``DataFrame.apply(..., axis=1)``).

    Returns:
        'FULL LOAD'        when both settings are greater than 0, or only
                           FULL_LOAD is;
        'INCREMENTAL LOAD' when only INCREMENTAL_LOAD is greater than 0;
        ' '                when neither is (including missing values).

    The combined case is tested first. Previously it came after the
    INCREMENTAL_LOAD test and could never be reached, so rows with both
    settings were labelled 'INCREMENTAL LOAD'.
    """
    def _is_positive(value):
        # Missing values (None / NaN / NA) count as "not set" rather than raising
        return bool(pd.notna(value) and value > 0)

    incremental = _is_positive(column['INCREMENTAL_LOAD'])
    full = _is_positive(column['FULL_LOAD'])

    if incremental and full:
        return 'FULL LOAD'
    if incremental:
        return 'INCREMENTAL LOAD'
    if full:
        return 'FULL LOAD'
    return ' '

In [None]:
# Derive the load-strategy label row by row and store it in the new LOAD_STRATEGY column
load_strategy_labels = merged_tables3.apply(determine_load_strategy, axis=1)
merged_tables3['LOAD_STRATEGY'] = load_strategy_labels

merged_tables3.head()

## Delete Unnecessary Columns

In [None]:
# The raw load settings are no longer needed once LOAD_STRATEGY has been derived from them
raw_load_columns = ['INCREMENTAL_LOAD', 'FULL_LOAD']
merged_tables3 = merged_tables3.drop(columns=raw_load_columns)

merged_tables3.head()

# Join Roles to Catalog

In [None]:
# Attach the ROLES string to every catalog row of the matching database / schema / table.
# `roles` still holds one row per grantee, each carrying the same ROLES value for its table, so it is
# reduced to one row per table first; otherwise the left join repeats every catalog row once per
# grantee before the duplicates are removed again.
roles_per_table = roles.drop_duplicates(subset=['DATABASE', 'SCHEMA', 'TABLE_NAME'])
merged_tables3 = merged_tables3.merge(roles_per_table, on=['DATABASE', 'SCHEMA', 'TABLE_NAME'], how='left')

merged_tables3.head()

In [None]:
# Keep one catalog row per (database, schema, table, column); rows repeated by the joins above are dropped
catalog_row_key = ['DATABASE', 'SCHEMA', 'TABLE_NAME', 'COLUMN_NAME']
merged_tables3 = merged_tables3.drop_duplicates(subset=catalog_row_key)

merged_tables3.head()

# Data Profiling Links

In [None]:
# Function to generate the URL based on schema name
def generate_url(database, schema, table_name):
    """Build the SharePoint data-profiling link for a table.

    database:   database name of the catalog row (e.g. "GVR_PROD").
    schema:     schema name of the catalog row.
    table_name: table name; every "_" in it is written as "%5F" in the URL.

    Returns the URL string when the (schema, database) pair is listed in the
    table below, otherwise None.
    """
    site = "https://vontier.sharepoint.com/sites/GVR-DataAnalytics/Shared%20Documents/Forms/AllItems.aspx?id="
    profiling_root = ("%2Fsites%2FGVR%2DDataAnalytics%2FShared%20Documents%2F1%2E%20DA%20Document%20Management%20System"
                      "%2F5%2E%20Artifacts%2FData%20Lake%20Artifacts%2FData%20Profiling%2F")
    view = "viewid=1735308e%2Da40f%2D4ff1%2D8987%2D77fd3877b77c"

    # URL-encoded names of the two SharePoint folders that hold the profiles
    dev_folder = "GVR%5FDEV"
    is360_folder = "GVR%5FIS360%5FDEV%5FDB"

    # (schema, database, SharePoint database folder, link to a per-table page?)
    # Entries flagged False link to the schema folder view and get no ".html" / parent suffix.
    # The AS400_GER schema of GVR_PROD is deliberately not listed (it was disabled in the original mapping).
    # NOTE(review): AVA_CORE_DEMO is keyed on GVR_IS360_PROD_DB while every other IS360 schema is keyed on
    # GVR_IS360_DEV_DB — confirm whether that database name is intended.
    # NOTE(review): for the two folder-view entries the table name is appended directly after the viewid
    # value — confirm the resulting link is what is wanted.
    profile_folders = (
        ("AX", "GVR_PROD", dev_folder, True),
        ("HFM", "GVR_PROD", dev_folder, True),
        ("MAC-PAC", "GVR_PROD", dev_folder, True),
        ("PROTHEUS_AR", "GVR_PROD", dev_folder, True),
        ("PROTHEUS_BR", "GVR_PROD", dev_folder, True),
        ("PROTHEUS_CH", "GVR_PROD", dev_folder, True),
        ("QAD", "GVR_PROD", dev_folder, True),
        ("SMS", "GVR_PROD", dev_folder, True),
        ("AVA", "GVR_IS360_DEV_DB", is360_folder, True),
        ("AVA_CORE", "GVR_IS360_DEV_DB", is360_folder, False),
        ("AVA_CORE_DEMO", "GVR_IS360_PROD_DB", is360_folder, False),
        ("AVA_CORE_DEV", "GVR_IS360_DEV_DB", is360_folder, True),
        ("AVA_CORE_QA", "GVR_IS360_DEV_DB", is360_folder, True),
        ("AVA_CORE_UAT", "GVR_IS360_DEV_DB", is360_folder, True),
        ("AVA_DEV", "GVR_IS360_DEV_DB", is360_folder, True),
        ("AVA_QA", "GVR_IS360_DEV_DB", is360_folder, True),
        ("AVA_UAT", "GVR_IS360_DEV_DB", is360_folder, True),
        ("INSITE360_TELEMETRY", "GVR_IS360_DEV_DB", is360_folder, True),
        ("PUSH_SALE_EVENT", "GVR_IS360_DEV_DB", is360_folder, True),
    )

    for known_schema, known_database, database_folder, per_table_page in profile_folders:
        if schema == known_schema and database == known_database:
            break
    else:
        # No profiling folder is mapped for this schema/database pair
        return None

    # Encode the schema the way SharePoint writes it: "_" -> %5F and "-" -> %2D
    encoded_schema = known_schema.replace("_", "%5F").replace("-", "%2D")
    schema_folder = profiling_root + database_folder + "%2F" + encoded_schema

    if per_table_page:
        base_url = site + schema_folder + "%2F"
        parent_url = "%2Ehtml&" + view + "&parent=" + schema_folder
    else:
        base_url = site + schema_folder + "&" + view
        parent_url = ""

    return base_url + table_name.replace("_", "%5F") + parent_url

# Add the data-profiling link of every catalog row (None where generate_url has no mapping)
def _profile_url_for(row):
    return generate_url(row['DATABASE'], row['SCHEMA'], row['TABLE_NAME'])

merged_tables3['DATA_PROFILE_URL'] = merged_tables3.apply(_profile_url_for, axis=1)

In [None]:
# Preview the first five catalog rows, now including DATA_PROFILE_URL
merged_tables3.head(5)

# Drop Schemas

In [None]:
# Remove the schemas that are not published in the catalog: the system/administrative ones
# (INFORMATION_SCHEMA, STAGE, ADMIN) plus ARCHIVE, AVA_DEMO, AVA_LEGACY, CENSUS and IOT_CORE
schemas_to_drop = [
    'INFORMATION_SCHEMA', 'STAGE', 'ADMIN', 'ARCHIVE',
    'AVA_DEMO', 'AVA_LEGACY', 'CENSUS', 'IOT_CORE',
]
merged_tables3 = merged_tables3.loc[~merged_tables3['SCHEMA'].isin(schemas_to_drop)]

merged_tables3.head()

In [None]:
# Convert every column name to upper case
upper_case_columns = merged_tables3.columns.str.upper()
merged_tables3.columns = upper_case_columns

merged_tables3.head()

In [None]:
# Print the number of catalog rows so it can be compared with the row count loaded into Snowflake
catalog_row_count = len(merged_tables3)
print(catalog_row_count)

# Write to Excel

In [None]:
# with pd.ExcelWriter('Catalog/Data_Catalog.xlsx', mode='a', engine='openpyxl', if_sheet_exists="replace",) as writer:
#     merged_tables3.to_excel(writer, sheet_name='Catalog', index=False)

# DataFrame to Snowflake Table

In [None]:
import snowflake.connector
import pandas as pd
import numpy as np

# Snowflake connection parameters
conn_params = {
    "account": account,
    "user": user,
    "authenticator": authenticator,
    "warehouse": warehouse3,
    "role": role1
}

# Fully qualified name of the table that receives the catalog
table_name = 'GVR_DEV.UDD.CATALOG'

# Replace NaN values with None so missing values are sent to Snowflake as NULL
merged_tables3_cleaned = merged_tables3.replace({np.nan: None})

# The INSERT below is positional: the DataFrame must supply exactly the 18 columns declared in the
# CREATE TABLE statement, in the same order. Check this before the existing table is replaced so a
# mismatch cannot leave an empty table behind.
expected_column_count = 18
if merged_tables3_cleaned.shape[1] != expected_column_count:
    raise ValueError(
        f"Expected {expected_column_count} catalog columns but found "
        f"{merged_tables3_cleaned.shape[1]}: {list(merged_tables3_cleaned.columns)}"
    )

# Table definition (CREATE OR REPLACE drops any previous version of the table)
create_query = f"""
        CREATE OR REPLACE TABLE {table_name} (
            DATABASE TEXT(50) NOT NULL,
            SCHEMA TEXT(50) NOT NULL,
            TABLE_NAME TEXT(100) NOT NULL,
            COLUMN_NAME TEXT(50) NOT NULL,
            DATA_TYPE TEXT(25) NOT NULL,
            LENGTH NUMBER(15),
            PRECISION NUMBER(5),
            SCALE NUMBER(5),
            NULLABLE TEXT(3) NOT NULL,
            COMMENT TEXT(500),
            PRIMARY_KEY NUMBER(2),
            ZONE TEXT(25) NOT NULL,
            HISTORICAL_RETENTION TEXT(25) NOT NULL,
            LAST_LOAD_TIME TIMESTAMP_NTZ(9),
            TABLE_TYPE TEXT(25) NOT NULL,
            LOAD_STRATEGY TEXT(25),
            ROLES TEXT(750),
            DATA_PROFILE_URL TEXT(750)
        )
    """

# Prepare the INSERT statement (one "?" placeholder per column)
insert_query = f"""
    INSERT INTO {table_name} VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""

# Establish connection to Snowflake.
# The "?" placeholders require qmark binding; the connector's default paramstyle is pyformat ("%s"),
# so qmark is requested explicitly for this connection.
conn = snowflake.connector.connect(paramstyle="qmark", **conn_params)
try:
    # Create the table in Snowflake
    with conn.cursor() as cursor:
        cursor.execute(create_query)

    # Execute the INSERT statement for each row in the DataFrame
    # TODO(review): one round trip per row is slow for a large catalog; consider a bulk load
    with conn.cursor() as cursor:
        for row in merged_tables3_cleaned.itertuples(index=False, name=None):
            cursor.execute(insert_query, row)
finally:
    # Close the connection even when a statement fails
    conn.close()
