# Data Catalog Python Script
* Repository located here: https://github.com/emilyporter920/Data_catalog

## Import Libaries

In [1]:
# Import dependencies
from snowflake.snowpark.session import Session
import json
import pandas as pd
from platform import python_version
from datetime import datetime
import openpyxl
from config import account, user, authenticator, warehouse1, role1, warehouse2, role2 

# Shows Python version (SnowPark uses anything below 3.8.x)
print(python_version())

3.8.15


## Connection to Snowflake
* REMEMBER: Change the email address inside of creds.json to appropriate email address

In [2]:
# Create Snowflake Session object (GVR_PROD)
connection_parameters = {
    "account": account,
    "user": user,
    "authenticator": authenticator,
    "warehouse": warehouse1,
    "role": role1
}

session = Session.builder.configs(connection_parameters).create()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...


## Creating Catalog
* Merging 'PRIMARY_KEYS' & 'COLUMN_TABLE'

# GVR_PROD Database

In [3]:
# Selecting the UAT data from the Snowflake table
primary_keys = session.sql("SHOW PRIMARY KEYS IN DATABASE GVR_PROD").collect()

print(primary_keys)

[Row(created_on=datetime.datetime(2021, 11, 30, 14, 59, 55, 769000, tzinfo=<DstTzInfo 'America/New_York' EST-1 day, 19:00:00 STD>), database_name='GVR_PROD', schema_name='ADMIN', table_name='BATCHJOBHISTORY_GERMANY', column_name='DAILY_JOB_MONITORING', key_sequence=3, constraint_name='SYS_CONSTRAINT_1a66e28b-f502-486c-a6a1-9fc1f7a4046a', rely='false', comment=None), Row(created_on=datetime.datetime(2021, 11, 30, 14, 59, 55, 769000, tzinfo=<DstTzInfo 'America/New_York' EST-1 day, 19:00:00 STD>), database_name='GVR_PROD', schema_name='ADMIN', table_name='BATCHJOBHISTORY_GERMANY', column_name='DATE_', key_sequence=2, constraint_name='SYS_CONSTRAINT_1a66e28b-f502-486c-a6a1-9fc1f7a4046a', rely='false', comment=None), Row(created_on=datetime.datetime(2021, 11, 30, 14, 59, 55, 769000, tzinfo=<DstTzInfo 'America/New_York' EST-1 day, 19:00:00 STD>), database_name='GVR_PROD', schema_name='ADMIN', table_name='BATCHJOBHISTORY_GERMANY', column_name='SERIALNUMBER', key_sequence=1, constraint_name='S

In [4]:
primary_keys = pd.DataFrame(list(primary_keys))

In [5]:
primary_keys.head()

Unnamed: 0,created_on,database_name,schema_name,table_name,column_name,key_sequence,constraint_name,rely,comment
0,2021-11-30 14:59:55.769000-05:00,GVR_PROD,ADMIN,BATCHJOBHISTORY_GERMANY,DAILY_JOB_MONITORING,3,SYS_CONSTRAINT_1a66e28b-f502-486c-a6a1-9fc1f7a...,False,
1,2021-11-30 14:59:55.769000-05:00,GVR_PROD,ADMIN,BATCHJOBHISTORY_GERMANY,DATE_,2,SYS_CONSTRAINT_1a66e28b-f502-486c-a6a1-9fc1f7a...,False,
2,2021-11-30 14:59:55.769000-05:00,GVR_PROD,ADMIN,BATCHJOBHISTORY_GERMANY,SERIALNUMBER,1,SYS_CONSTRAINT_1a66e28b-f502-486c-a6a1-9fc1f7a...,False,
3,2021-11-15 01:03:08.191000-05:00,GVR_PROD,ADMIN,BATCHJOBHISTORY_GIL,RECID,1,SYS_CONSTRAINT_e7326ebe-4345-4c63-a328-de5fa62...,False,
4,2023-06-07 16:22:07.769000-04:00,GVR_PROD,AX,ACCOUNTINGDISTRIBUTION,RECID,1,SYS_CONSTRAINT_fa781a07-1e01-4eb2-8f06-85415c7...,False,


In [6]:
# Get rid of columns you don't need in PRIMARY_KEYS table
primary_keys = primary_keys.drop(['created_on', 'constraint_name', 'rely'], axis=1)
primary_keys.head()

Unnamed: 0,database_name,schema_name,table_name,column_name,key_sequence,comment
0,GVR_PROD,ADMIN,BATCHJOBHISTORY_GERMANY,DAILY_JOB_MONITORING,3,
1,GVR_PROD,ADMIN,BATCHJOBHISTORY_GERMANY,DATE_,2,
2,GVR_PROD,ADMIN,BATCHJOBHISTORY_GERMANY,SERIALNUMBER,1,
3,GVR_PROD,ADMIN,BATCHJOBHISTORY_GIL,RECID,1,
4,GVR_PROD,AX,ACCOUNTINGDISTRIBUTION,RECID,1,


In [7]:
# Rename columns in PRIMARY_KEYS table to match COLUMN_TABLE columns
primary_keys = primary_keys.rename(columns= {'database_name': 'DATABASE', 'schema_name': 'SCHEMA', 
                                             'table_name': 'TABLE_NAME', 'column_name': 'COLUMN_NAME', 'key_sequence': 'PK'})

primary_keys.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,PK,comment
0,GVR_PROD,ADMIN,BATCHJOBHISTORY_GERMANY,DAILY_JOB_MONITORING,3,
1,GVR_PROD,ADMIN,BATCHJOBHISTORY_GERMANY,DATE_,2,
2,GVR_PROD,ADMIN,BATCHJOBHISTORY_GERMANY,SERIALNUMBER,1,
3,GVR_PROD,ADMIN,BATCHJOBHISTORY_GIL,RECID,1,
4,GVR_PROD,AX,ACCOUNTINGDISTRIBUTION,RECID,1,


In [8]:
# Add last time table refreshed 
column_table = session.sql("SELECT TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE, CHARACTER_MAXIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, IS_NULLABLE FROM GVR_PROD.INFORMATION_SCHEMA.COLUMNS").to_pandas()

column_table.head()

Unnamed: 0,TABLE_CATALOG,TABLE_SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,CHARACTER_MAXIMUM_LENGTH,NUMERIC_PRECISION,NUMERIC_SCALE,IS_NULLABLE
0,GVR_PROD,SALESFORCE,LEAD,CURRENCYISOCODE,TEXT,16777216.0,,,YES
1,GVR_PROD,SMS,CUSTOMER_SITES,SITE_SEC_ENABLED,TEXT,100.0,,,NO
2,GVR_PROD,SALESFORCE,TASK,REBATE_REVIEW__C,BOOLEAN,,,,YES
3,GVR_PROD,STAGE,CUSTTRANS,SETTLEMENT,NUMBER,,38.0,0.0,YES
4,GVR_PROD,STAGE,DIRPARTYTABLE,ISCONSOLIDATIONCOMPANY,TEXT,5.0,,,YES


In [9]:
# Rename columns in COLUMN_TABLE table
column_table = column_table.rename(columns= {'TABLE_CATALOG': 'DATABASE', 'TABLE_SCHEMA': 'SCHEMA', 'CHARACTER_MAXIMUM_LENGTH': 'LENGTH', 
                                             'NUMERIC_PRECISION': 'PRECISION', 'NUMERIC_SCALE': 'SCALE', 'IS_NULLABLE': 'NULLABLE'})

column_table.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE
0,GVR_PROD,SALESFORCE,LEAD,CURRENCYISOCODE,TEXT,16777216.0,,,YES
1,GVR_PROD,SMS,CUSTOMER_SITES,SITE_SEC_ENABLED,TEXT,100.0,,,NO
2,GVR_PROD,SALESFORCE,TASK,REBATE_REVIEW__C,BOOLEAN,,,,YES
3,GVR_PROD,STAGE,CUSTTRANS,SETTLEMENT,NUMBER,,38.0,0.0,YES
4,GVR_PROD,STAGE,DIRPARTYTABLE,ISCONSOLIDATIONCOMPANY,TEXT,5.0,,,YES


In [10]:
# Merge PRIMARY_KEYS and COLUMN_TABLE tables
merged_tables = pd.merge(column_table, primary_keys, how='left', on=['DATABASE', 'SCHEMA', 'TABLE_NAME', 'COLUMN_NAME'])

merged_tables.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,PK,comment
0,GVR_PROD,SALESFORCE,LEAD,CURRENCYISOCODE,TEXT,16777216.0,,,YES,,
1,GVR_PROD,SMS,CUSTOMER_SITES,SITE_SEC_ENABLED,TEXT,100.0,,,NO,,
2,GVR_PROD,SALESFORCE,TASK,REBATE_REVIEW__C,BOOLEAN,,,,YES,,
3,GVR_PROD,STAGE,CUSTTRANS,SETTLEMENT,NUMBER,,38.0,0.0,YES,,
4,GVR_PROD,STAGE,DIRPARTYTABLE,ISCONSOLIDATIONCOMPANY,TEXT,5.0,,,YES,,


In [11]:
print(len(merged_tables))

61920


In [12]:
# # Verifying that this is a dataframe so we can drop the columns we don't need
# merged_tables = pd.DataFrame(merged_tables)

In [13]:
# Get rid of schemas: Information_Schema, Admin, and Stage
merged_tables=merged_tables[~merged_tables['SCHEMA'].isin(['INFORMATION_SCHEMA'])]

In [14]:
merged_tables.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,PK,comment
0,GVR_PROD,SALESFORCE,LEAD,CURRENCYISOCODE,TEXT,16777216.0,,,YES,,
1,GVR_PROD,SMS,CUSTOMER_SITES,SITE_SEC_ENABLED,TEXT,100.0,,,NO,,
2,GVR_PROD,SALESFORCE,TASK,REBATE_REVIEW__C,BOOLEAN,,,,YES,,
3,GVR_PROD,STAGE,CUSTTRANS,SETTLEMENT,NUMBER,,38.0,0.0,YES,,
4,GVR_PROD,STAGE,DIRPARTYTABLE,ISCONSOLIDATIONCOMPANY,TEXT,5.0,,,YES,,


# IS360 Database

In [15]:
# Create Snowflake Session object (IS360)
connection_parameters = {
    "account": account,
    "user": user,
    "authenticator": authenticator,
    "warehouse": warehouse2,
    "role": role2
}

session = Session.builder.configs(connection_parameters).create()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...


In [16]:
# Selecting the UAT data from the Snowflake table
primary_keys2 = session.sql("SHOW PRIMARY KEYS IN DATABASE GVR_IS360_DEV_DB").collect()

print(primary_keys2)

[Row(created_on=datetime.datetime(2023, 6, 6, 15, 39, 22, 335000, tzinfo=<DstTzInfo 'America/New_York' EDT-1 day, 20:00:00 DST>), database_name='GVR_IS360_DEV_DB', schema_name='AVA_CORE_DEMO', table_name='flyway_schema_history', column_name='installed_rank', key_sequence=1, constraint_name='SYS_CONSTRAINT_ec40699a-8db8-4679-808e-8d0e31e5f316', rely='false', comment=None), Row(created_on=datetime.datetime(2023, 6, 7, 13, 28, 49, 415000, tzinfo=<DstTzInfo 'America/New_York' EDT-1 day, 20:00:00 DST>), database_name='GVR_IS360_DEV_DB', schema_name='AVA_CORE_DEV', table_name='flyway_schema_history', column_name='installed_rank', key_sequence=1, constraint_name='SYS_CONSTRAINT_4f76a153-93bb-4e51-9511-eb31135a6cdf', rely='false', comment=None), Row(created_on=datetime.datetime(2023, 6, 7, 14, 8, 23, 125000, tzinfo=<DstTzInfo 'America/New_York' EDT-1 day, 20:00:00 DST>), database_name='GVR_IS360_DEV_DB', schema_name='AVA_CORE_QA', table_name='flyway_schema_history', column_name='installed_rank

In [17]:
primary_keys2 = pd.DataFrame(list(primary_keys2))

In [18]:
# Get rid of columns you don't need in PRIMARY_KEYS table
primary_keys2 = primary_keys2.drop(['created_on', 'constraint_name', 'rely'], axis=1)
primary_keys2.head()

Unnamed: 0,database_name,schema_name,table_name,column_name,key_sequence,comment
0,GVR_IS360_DEV_DB,AVA_CORE_DEMO,flyway_schema_history,installed_rank,1,
1,GVR_IS360_DEV_DB,AVA_CORE_DEV,flyway_schema_history,installed_rank,1,
2,GVR_IS360_DEV_DB,AVA_CORE_QA,flyway_schema_history,installed_rank,1,
3,GVR_IS360_DEV_DB,AVA_CORE_UAT,flyway_schema_history,installed_rank,1,
4,GVR_IS360_DEV_DB,AVA_DEMO,THERMISTOR_PROBE_LOOKUP,ATG_TYPE,1,


In [19]:
# Rename columns in PRIMARY_KEYS table to match COLUMN_TABLE columns
primary_keys2 = primary_keys2.rename(columns= {'database_name': 'DATABASE', 'schema_name': 'SCHEMA', 
                                             'table_name': 'TABLE_NAME', 'column_name': 'COLUMN_NAME', 'key_sequence': 'PK'})

primary_keys2.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,PK,comment
0,GVR_IS360_DEV_DB,AVA_CORE_DEMO,flyway_schema_history,installed_rank,1,
1,GVR_IS360_DEV_DB,AVA_CORE_DEV,flyway_schema_history,installed_rank,1,
2,GVR_IS360_DEV_DB,AVA_CORE_QA,flyway_schema_history,installed_rank,1,
3,GVR_IS360_DEV_DB,AVA_CORE_UAT,flyway_schema_history,installed_rank,1,
4,GVR_IS360_DEV_DB,AVA_DEMO,THERMISTOR_PROBE_LOOKUP,ATG_TYPE,1,


In [20]:
# Add last time table refreshed 
column_table2 = session.sql("SELECT TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE, CHARACTER_MAXIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, IS_NULLABLE FROM GVR_IS360_DEV_DB.INFORMATION_SCHEMA.COLUMNS").to_pandas()

column_table2.head()

Unnamed: 0,TABLE_CATALOG,TABLE_SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,CHARACTER_MAXIMUM_LENGTH,NUMERIC_PRECISION,NUMERIC_SCALE,IS_NULLABLE
0,GVR_IS360_DEV_DB,CENSUS,RESULTS_MAPPING_TABLE_MANCHESTER_NASHUA,COUNTY,TEXT,5000.0,,,YES
1,GVR_IS360_DEV_DB,PUSH_SALE_EVENT,GVR1_CUSTOMER_SITES_WL_V,SITE_ALT_MAILING_ADD_FLAG,TEXT,1.0,,,YES
2,GVR_IS360_DEV_DB,ARCHIVE,PIPELINE_PUSH_SALE_EVENT_TMP_HISTORY,PAYLOAD_TRANSACTIONAMOUNT,NUMBER,,38.0,14.0,YES
3,GVR_IS360_DEV_DB,ARCHIVE,PUSH_SALE_EVENT_LINE_ITEMS_HISTORY3,ACTUALSALESPRICE,FLOAT,,,,YES
4,GVR_IS360_DEV_DB,AVA_CORE_DEMO,flyway_schema_history,installed_on,TIMESTAMP_LTZ,,,,NO


In [21]:
# Rename columns in COLUMN_TABLE table
column_table2 = column_table2.rename(columns= {'TABLE_CATALOG': 'DATABASE', 'TABLE_SCHEMA': 'SCHEMA', 'CHARACTER_MAXIMUM_LENGTH': 'LENGTH', 
                                             'NUMERIC_PRECISION': 'PRECISION', 'NUMERIC_SCALE': 'SCALE', 'IS_NULLABLE': 'NULLABLE'})

column_table2.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE
0,GVR_IS360_DEV_DB,CENSUS,RESULTS_MAPPING_TABLE_MANCHESTER_NASHUA,COUNTY,TEXT,5000.0,,,YES
1,GVR_IS360_DEV_DB,PUSH_SALE_EVENT,GVR1_CUSTOMER_SITES_WL_V,SITE_ALT_MAILING_ADD_FLAG,TEXT,1.0,,,YES
2,GVR_IS360_DEV_DB,ARCHIVE,PIPELINE_PUSH_SALE_EVENT_TMP_HISTORY,PAYLOAD_TRANSACTIONAMOUNT,NUMBER,,38.0,14.0,YES
3,GVR_IS360_DEV_DB,ARCHIVE,PUSH_SALE_EVENT_LINE_ITEMS_HISTORY3,ACTUALSALESPRICE,FLOAT,,,,YES
4,GVR_IS360_DEV_DB,AVA_CORE_DEMO,flyway_schema_history,installed_on,TIMESTAMP_LTZ,,,,NO


In [22]:
# Merge PRIMARY_KEYS and COLUMN_TABLE tables
merged_tables2 = pd.merge(column_table2, primary_keys2, how='left', on=['DATABASE', 'SCHEMA', 'TABLE_NAME', 'COLUMN_NAME'])

merged_tables2.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,PK,comment
0,GVR_IS360_DEV_DB,CENSUS,RESULTS_MAPPING_TABLE_MANCHESTER_NASHUA,COUNTY,TEXT,5000.0,,,YES,,
1,GVR_IS360_DEV_DB,PUSH_SALE_EVENT,GVR1_CUSTOMER_SITES_WL_V,SITE_ALT_MAILING_ADD_FLAG,TEXT,1.0,,,YES,,
2,GVR_IS360_DEV_DB,ARCHIVE,PIPELINE_PUSH_SALE_EVENT_TMP_HISTORY,PAYLOAD_TRANSACTIONAMOUNT,NUMBER,,38.0,14.0,YES,,
3,GVR_IS360_DEV_DB,ARCHIVE,PUSH_SALE_EVENT_LINE_ITEMS_HISTORY3,ACTUALSALESPRICE,FLOAT,,,,YES,,
4,GVR_IS360_DEV_DB,AVA_CORE_DEMO,flyway_schema_history,installed_on,TIMESTAMP_LTZ,,,,NO,,


In [23]:
# Get rid of schemas: Information_Schema, Admin, and Stage
merged_tables3 = merged_tables2[~merged_tables2['SCHEMA'].isin(['INFORMATION_SCHEMA', 'STAGE', 'ADMIN'])]

merged_tables3.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,PK,comment
0,GVR_IS360_DEV_DB,CENSUS,RESULTS_MAPPING_TABLE_MANCHESTER_NASHUA,COUNTY,TEXT,5000.0,,,YES,,
1,GVR_IS360_DEV_DB,PUSH_SALE_EVENT,GVR1_CUSTOMER_SITES_WL_V,SITE_ALT_MAILING_ADD_FLAG,TEXT,1.0,,,YES,,
2,GVR_IS360_DEV_DB,ARCHIVE,PIPELINE_PUSH_SALE_EVENT_TMP_HISTORY,PAYLOAD_TRANSACTIONAMOUNT,NUMBER,,38.0,14.0,YES,,
3,GVR_IS360_DEV_DB,ARCHIVE,PUSH_SALE_EVENT_LINE_ITEMS_HISTORY3,ACTUALSALESPRICE,FLOAT,,,,YES,,
4,GVR_IS360_DEV_DB,AVA_CORE_DEMO,flyway_schema_history,installed_on,TIMESTAMP_LTZ,,,,NO,,


## Merge GVR_PROD with IS360 Database

In [24]:
# merged_tables3 = pd.merge(merged_tables, merged_tables2, how='left', on=['DATABASE', 'SCHEMA', 'TABLE_NAME', 'COLUMN_NAME'])

# merged_tables3.head()

In [25]:
# Appending IS360 to GVR_PROD dataframe
merged_tables3 = merged_tables.append(merged_tables2, ignore_index=True)

merged_tables3.head()

  merged_tables3 = merged_tables.append(merged_tables2, ignore_index=True)


Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,PK,comment
0,GVR_PROD,SALESFORCE,LEAD,CURRENCYISOCODE,TEXT,16777216.0,,,YES,,
1,GVR_PROD,SMS,CUSTOMER_SITES,SITE_SEC_ENABLED,TEXT,100.0,,,NO,,
2,GVR_PROD,SALESFORCE,TASK,REBATE_REVIEW__C,BOOLEAN,,,,YES,,
3,GVR_PROD,STAGE,CUSTTRANS,SETTLEMENT,NUMBER,,38.0,0.0,YES,,
4,GVR_PROD,STAGE,DIRPARTYTABLE,ISCONSOLIDATIONCOMPANY,TEXT,5.0,,,YES,,


# Hard coding:

In [26]:
# # Generates values for the ZONE column

# def zone(column):
#     if column['SCHEMA'] in ['SMS', 'SALESFORCE', 'AX', 'MAC-PAC', 'PROTHEUS_AR', 'PROTHEUS_BR', 'PROTHEUS_CH', 'QAD', 'HFM']:
#         val = 'DATALAKE'
#     elif column['SCHEMA'] in ['DATA_MART_FIN_NA', 'DATA_MART_AMO_NA', 'DATA_MART_FIN_GLOBAL', 'DATA_MART_CUSTOMER']:
#         val = 'DATAMART'
#     elif column['SCHEMA'] in ['DW']:
#         val = 'DATAWAREHOUSE'
#     elif column['SCHEMA'] in ['RPT']:
#         val = 'CONSUMPTION'
#     elif column['SCHEMA'] in ['INSITE360_TELEMETRY', 'AVA_DEMO', 'IOT_CORE', 'AVA_LEGACY', 'AVA_DEV',
#                               'AVA_CORE_DEV', 'AVA_UAT', 'AVA_QA', 'AVA_CORE_UAT', 'AVA', 'ARCHIVE', 
#                               'AVA_CORE_QA', 'AVA_CORE_DEMO', 'PUSH_SALE_EVENT', 'CENSUS']:
#         val = 'RAW'
#     else:
#         val = ' '
#     return val

In [27]:
# Generates values for the ZONE column
# THIS ONE IS FOR IS360 DATALAKE POSSIBILITY

def zone(column):
    if column['SCHEMA'] in ['SMS', 'SALESFORCE', 'AX', 'MAC-PAC', 'PROTHEUS_AR', 'PROTHEUS_BR', 'PROTHEUS_CH', 'QAD', 'HFM', 'INSITE360_TELEMETRY', 'AVA_DEMO', 'IOT_CORE', 'AVA_LEGACY', 
                            'AVA_DEV', 'AVA_CORE_DEV', 'AVA_UAT', 'AVA_QA', 'AVA_CORE_UAT', 'AVA', 'ARCHIVE', 'AVA_CORE_QA', 'AVA_CORE_DEMO', 'PUSH_SALE_EVENT', 'CENSUS']:
        val = 'DATALAKE'
    elif column['SCHEMA'] in ['DATA_MART_FIN_NA', 'DATA_MART_AMO_NA', 'DATA_MART_FIN_GLOBAL', 'DATA_MART_CUSTOMER']:
        val = 'DATAMART'
    elif column['SCHEMA'] in ['DW']:
        val = 'DATAWAREHOUSE'
    elif column['SCHEMA'] in ['RPT']:
        val = 'CONSUMPTION'
    else:
        val = ' '
    return val

In [28]:
# Apply the zone function to the merged_tables dataframe
merged_tables3['ZONE'] = merged_tables3.apply(zone, axis=1)

merged_tables3.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,PK,comment,ZONE
0,GVR_PROD,SALESFORCE,LEAD,CURRENCYISOCODE,TEXT,16777216.0,,,YES,,,DATALAKE
1,GVR_PROD,SMS,CUSTOMER_SITES,SITE_SEC_ENABLED,TEXT,100.0,,,NO,,,DATALAKE
2,GVR_PROD,SALESFORCE,TASK,REBATE_REVIEW__C,BOOLEAN,,,,YES,,,DATALAKE
3,GVR_PROD,STAGE,CUSTTRANS,SETTLEMENT,NUMBER,,38.0,0.0,YES,,,
4,GVR_PROD,STAGE,DIRPARTYTABLE,ISCONSOLIDATIONCOMPANY,TEXT,5.0,,,YES,,,


In [29]:
# Add the historical retention data (hard coded for now, will be available in the future when this historical retention changes)
merged_tables3['HISTORICAL_RETENTION'] = '32 DAYS'

In [30]:
merged_tables3.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,PK,comment,ZONE,HISTORICAL_RETENTION
0,GVR_PROD,SALESFORCE,LEAD,CURRENCYISOCODE,TEXT,16777216.0,,,YES,,,DATALAKE,32 DAYS
1,GVR_PROD,SMS,CUSTOMER_SITES,SITE_SEC_ENABLED,TEXT,100.0,,,NO,,,DATALAKE,32 DAYS
2,GVR_PROD,SALESFORCE,TASK,REBATE_REVIEW__C,BOOLEAN,,,,YES,,,DATALAKE,32 DAYS
3,GVR_PROD,STAGE,CUSTTRANS,SETTLEMENT,NUMBER,,38.0,0.0,YES,,,,32 DAYS
4,GVR_PROD,STAGE,DIRPARTYTABLE,ISCONSOLIDATIONCOMPANY,TEXT,5.0,,,YES,,,,32 DAYS


# Load Times: currently duplicates everything
* Take most recent load time for each table

In [31]:
# # Load_table created to show when the table was last loaded
# load_table = session.sql("SELECT SCHEMA_NAME, TABLE_NAME, LAST_LOAD_TIME FROM GVR_PROD.INFORMATION_SCHEMA.LOAD_HISTORY").to_pandas()

# load_table.head()

In [32]:
# # Change the last_load_time to only be yyyy/mm/dd
# load_table['LAST_LOAD_TIME'] = pd.to_datetime(load_table['LAST_LOAD_TIME']).dt.date

# load_table.head()

In [33]:
# # Rename columns in LOAD_TABLE table to match MERGED_TABLES table
# load_table = load_table.rename(columns= {'SCHEMA_NAME': 'SCHEMA'})

# load_table.head()

# This duplicates everything.
* Find way to retrieve most recent load_date time and then don't gather the others?

In [34]:
# merged_tables1 = pd.merge(merged_tables, load_table, how='left', on=['SCHEMA', 'TABLE_NAME'])

# merged_tables1.head()

In [35]:
# print(len(merged_tables1))

# Table Type - Add Later
* Everything should be a Type 1 table

In [36]:
# # Hard code that all table types are Type1
# merged_tables3['TABLE_TYPE'] = 'Type1'

# merged_tables3.head()

# Data Retention - Add Later

In [37]:
# parameters = session.sql("SHOW PARAMETERS IN DATABASE GVR_PROD").collect()

# parameters = pd.DataFrame(list(parameters))

# parameters.head()

## Write Data Frame to Excel Worksheet 'Catalog.xlsx' Inside 'Catalog' Folder
* https://stackoverflow.com/questions/68798422/python-excelwriter-overwriting-excel-sheet
* https://stackoverflow.com/questions/68759330/python-appending-dataframe-to-exsiting-excel-file-and-sheet

In [38]:
# Get rid of schemas: Information_Schema, Admin, and Stage
merged_tables3 = merged_tables3[~merged_tables3['SCHEMA'].isin(['INFORMATION_SCHEMA', 'STAGE', 'ADMIN'])]

merged_tables3.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,PK,comment,ZONE,HISTORICAL_RETENTION
0,GVR_PROD,SALESFORCE,LEAD,CURRENCYISOCODE,TEXT,16777216.0,,,YES,,,DATALAKE,32 DAYS
1,GVR_PROD,SMS,CUSTOMER_SITES,SITE_SEC_ENABLED,TEXT,100.0,,,NO,,,DATALAKE,32 DAYS
2,GVR_PROD,SALESFORCE,TASK,REBATE_REVIEW__C,BOOLEAN,,,,YES,,,DATALAKE,32 DAYS
7,GVR_PROD,SALESFORCE,LEAD,CREATEDBYID,TEXT,16777216.0,,,YES,,,DATALAKE,32 DAYS
18,GVR_PROD,MAC-PAC,GLTRANP2,TSJEFP,NUMBER,,38.0,0.0,YES,,,DATALAKE,32 DAYS


In [39]:
# Verify the columns are all capitalized
merged_tables3.columns = merged_tables3.columns.str.upper()

merged_tables3.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,PK,COMMENT,ZONE,HISTORICAL_RETENTION
0,GVR_PROD,SALESFORCE,LEAD,CURRENCYISOCODE,TEXT,16777216.0,,,YES,,,DATALAKE,32 DAYS
1,GVR_PROD,SMS,CUSTOMER_SITES,SITE_SEC_ENABLED,TEXT,100.0,,,NO,,,DATALAKE,32 DAYS
2,GVR_PROD,SALESFORCE,TASK,REBATE_REVIEW__C,BOOLEAN,,,,YES,,,DATALAKE,32 DAYS
7,GVR_PROD,SALESFORCE,LEAD,CREATEDBYID,TEXT,16777216.0,,,YES,,,DATALAKE,32 DAYS
18,GVR_PROD,MAC-PAC,GLTRANP2,TSJEFP,NUMBER,,38.0,0.0,YES,,,DATALAKE,32 DAYS


# Rearrange Columns - Add Later

In [40]:
# Rearrange columns

# Write to Excel

In [41]:
with pd.ExcelWriter('Catalog/Data_Catalog.xlsx', mode='a', engine='openpyxl', if_sheet_exists="replace",) as writer:
    merged_tables3.to_excel(writer, sheet_name='Catalog', index=False)