# Data Catalog Python Script
* Repository located here: https://github.com/emilyporter920/Data_Cataloging

## Import Libaries

In [1]:
# Import dependencies
from snowflake.snowpark.session import Session
import json
import pandas as pd
from platform import python_version
from datetime import datetime
import openpyxl
from config import account, user, authenticator, warehouse1, role1, warehouse2, role2 

# Shows Python version (SnowPark uses anything below 3.8.x)
print(python_version())

3.8.15


# GVR PROD Database

In [2]:
# Create Snowflake Session object (GVR_PROD)
connection_parameters = {
    "account": account,
    "user": user,
    "authenticator": authenticator,
    "warehouse": warehouse1,
    "role": role1
}

session = Session.builder.configs(connection_parameters).create()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...


## Primary Keys

In [3]:
# Grabbing primary keys from PROD
primary_keys = session.sql("SHOW PRIMARY KEYS IN DATABASE GVR_PROD").collect()

primary_keys= pd.DataFrame(list(primary_keys))

In [4]:
# Get rid of columns you don't need in PRIMARY_KEYS table
primary_keys = primary_keys.drop(['created_on', 'constraint_name', 'rely'], axis=1)
primary_keys.head()

Unnamed: 0,database_name,schema_name,table_name,column_name,key_sequence,comment
0,GVR_PROD,ADMIN,BATCHJOBHISTORY_GERMANY,DAILY_JOB_MONITORING,3,
1,GVR_PROD,ADMIN,BATCHJOBHISTORY_GERMANY,DATE_,2,
2,GVR_PROD,ADMIN,BATCHJOBHISTORY_GERMANY,SERIALNUMBER,1,
3,GVR_PROD,ADMIN,BATCHJOBHISTORY_GIL,RECID,1,
4,GVR_PROD,AX,ACCOUNTINGDISTRIBUTION,RECID,1,


In [5]:
# Rename columns in PRIMARY_KEYS table to match COLUMN_TABLE columns
primary_keys = primary_keys.rename(columns= {'database_name': 'DATABASE', 'schema_name': 'SCHEMA', 
                                             'table_name': 'TABLE_NAME', 'column_name': 'COLUMN_NAME', 'key_sequence': 'PK'})

primary_keys.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,PK,comment
0,GVR_PROD,ADMIN,BATCHJOBHISTORY_GERMANY,DAILY_JOB_MONITORING,3,
1,GVR_PROD,ADMIN,BATCHJOBHISTORY_GERMANY,DATE_,2,
2,GVR_PROD,ADMIN,BATCHJOBHISTORY_GERMANY,SERIALNUMBER,1,
3,GVR_PROD,ADMIN,BATCHJOBHISTORY_GIL,RECID,1,
4,GVR_PROD,AX,ACCOUNTINGDISTRIBUTION,RECID,1,


In [6]:
# Get rid of extra comment column
primary_keys = primary_keys.drop(['comment'], axis=1)

## Information Schema

In [7]:
# Information_Schema data
column_table = session.sql("SELECT TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE, CHARACTER_MAXIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, IS_NULLABLE, COMMENT FROM GVR_PROD.INFORMATION_SCHEMA.COLUMNS").to_pandas()

column_table.head()

Unnamed: 0,TABLE_CATALOG,TABLE_SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,CHARACTER_MAXIMUM_LENGTH,NUMERIC_PRECISION,NUMERIC_SCALE,IS_NULLABLE,COMMENT
0,GVR_PROD,RPT,TBLWARREXPENDITURES,PAYMENTAPPROVEDDATE,DATE,,,,YES,
1,GVR_PROD,RPT,VSALESREPORTDETAIL,REVENUE,NUMBER,,38.0,3.0,NO,
2,GVR_PROD,SALESFORCE,ACCOUNT_BLUE_SHEET__C,NAME,TEXT,80.0,,,YES,
3,GVR_PROD,SALESFORCE,GVR_ACCOUNTFORECASTS__C,APR_WK_1_KIT_FC__C,NUMBER,,18.0,0.0,YES,
4,GVR_PROD,SMS,MCS_SVC_REQ,MSR_EST_AMT,NUMBER,,38.0,0.0,YES,


In [8]:
# Rename columns in COLUMN_TABLE table
column_table = column_table.rename(columns= {'TABLE_CATALOG': 'DATABASE', 'TABLE_SCHEMA': 'SCHEMA', 'CHARACTER_MAXIMUM_LENGTH': 'LENGTH', 
                                             'NUMERIC_PRECISION': 'PRECISION', 'NUMERIC_SCALE': 'SCALE', 'IS_NULLABLE': 'NULLABLE'})

column_table.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,COMMENT
0,GVR_PROD,RPT,TBLWARREXPENDITURES,PAYMENTAPPROVEDDATE,DATE,,,,YES,
1,GVR_PROD,RPT,VSALESREPORTDETAIL,REVENUE,NUMBER,,38.0,3.0,NO,
2,GVR_PROD,SALESFORCE,ACCOUNT_BLUE_SHEET__C,NAME,TEXT,80.0,,,YES,
3,GVR_PROD,SALESFORCE,GVR_ACCOUNTFORECASTS__C,APR_WK_1_KIT_FC__C,NUMBER,,18.0,0.0,YES,
4,GVR_PROD,SMS,MCS_SVC_REQ,MSR_EST_AMT,NUMBER,,38.0,0.0,YES,


In [9]:
# Merge PRIMARY_KEYS and COLUMN_TABLE tables
merged_tables = pd.merge(column_table, primary_keys, how='left', on=['DATABASE', 'SCHEMA', 'TABLE_NAME', 'COLUMN_NAME'])

merged_tables.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,COMMENT,PK
0,GVR_PROD,RPT,TBLWARREXPENDITURES,PAYMENTAPPROVEDDATE,DATE,,,,YES,,
1,GVR_PROD,RPT,VSALESREPORTDETAIL,REVENUE,NUMBER,,38.0,3.0,NO,,
2,GVR_PROD,SALESFORCE,ACCOUNT_BLUE_SHEET__C,NAME,TEXT,80.0,,,YES,,
3,GVR_PROD,SALESFORCE,GVR_ACCOUNTFORECASTS__C,APR_WK_1_KIT_FC__C,NUMBER,,18.0,0.0,YES,,
4,GVR_PROD,SMS,MCS_SVC_REQ,MSR_EST_AMT,NUMBER,,38.0,0.0,YES,,


# IS360 Database

In [10]:
# Create Snowflake Session object (IS360)
connection_parameters = {
    "account": account,
    "user": user,
    "authenticator": authenticator,
    "warehouse": warehouse2,
    "role": role2
}

session = Session.builder.configs(connection_parameters).create()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...


## Primary Keys

In [11]:
# Selecting the UAT data from the Snowflake table
primary_keys2 = session.sql("SHOW PRIMARY KEYS IN DATABASE GVR_IS360_DEV_DB").collect()

primary_keys2 = pd.DataFrame(list(primary_keys2))

In [12]:
# Get rid of columns you don't need in PRIMARY_KEYS table
primary_keys2 = primary_keys2.drop(['created_on', 'constraint_name', 'rely'], axis=1)

primary_keys2.head()

Unnamed: 0,database_name,schema_name,table_name,column_name,key_sequence,comment
0,GVR_IS360_DEV_DB,AVA_CORE_DEV,flyway_schema_history,installed_rank,1,
1,GVR_IS360_DEV_DB,AVA_CORE_QA,flyway_schema_history,installed_rank,1,
2,GVR_IS360_DEV_DB,AVA_CORE_UAT,flyway_schema_history,installed_rank,1,
3,GVR_IS360_DEV_DB,AVA_DEV,THERMISTOR_PROBE_LOOKUP,ATG_TYPE,1,
4,GVR_IS360_DEV_DB,AVA_DEV,THERMISTOR_PROBE_LOOKUP,LENGTH_UOM,2,


In [13]:
# Rename columns in PRIMARY_KEYS table to match COLUMN_TABLE columns
primary_keys2 = primary_keys2.rename(columns= {'database_name': 'DATABASE', 'schema_name': 'SCHEMA', 
                                             'table_name': 'TABLE_NAME', 'column_name': 'COLUMN_NAME', 'key_sequence': 'PK'})

primary_keys2.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,PK,comment
0,GVR_IS360_DEV_DB,AVA_CORE_DEV,flyway_schema_history,installed_rank,1,
1,GVR_IS360_DEV_DB,AVA_CORE_QA,flyway_schema_history,installed_rank,1,
2,GVR_IS360_DEV_DB,AVA_CORE_UAT,flyway_schema_history,installed_rank,1,
3,GVR_IS360_DEV_DB,AVA_DEV,THERMISTOR_PROBE_LOOKUP,ATG_TYPE,1,
4,GVR_IS360_DEV_DB,AVA_DEV,THERMISTOR_PROBE_LOOKUP,LENGTH_UOM,2,


In [14]:
# Get rid of extra comment column
primary_keys2 = primary_keys2.drop(['comment'], axis=1)

## Information Schema

In [15]:
# Information_Schema data
column_table2 = session.sql("SELECT TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE, CHARACTER_MAXIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, IS_NULLABLE, COMMENT FROM GVR_IS360_DEV_DB.INFORMATION_SCHEMA.COLUMNS").to_pandas()

column_table2.head()

Unnamed: 0,TABLE_CATALOG,TABLE_SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,CHARACTER_MAXIMUM_LENGTH,NUMERIC_PRECISION,NUMERIC_SCALE,IS_NULLABLE,COMMENT
0,GVR_IS360_DEV_DB,CENSUS,RESULTS_MAPPING_TABLE_MANCHESTER_NASHUA,COUNTY,TEXT,5000.0,,,YES,
1,GVR_IS360_DEV_DB,PUSH_SALE_EVENT,GVR1_CUSTOMER_SITES_WL_V,SITE_ALT_MAILING_ADD_FLAG,TEXT,1.0,,,YES,
2,GVR_IS360_DEV_DB,AVA_DEV,INVENTORY_INCREASE,VOLUME_TC_INCREASE,NUMBER,,15.0,4.0,YES,Amount of fuel added to the tank as a Temperat...
3,GVR_IS360_DEV_DB,AVA_UAT,flyway_schema_history,version,TEXT,50.0,,,YES,
4,GVR_IS360_DEV_DB,ARCHIVE,PIPELINE_PUSH_SALE_EVENT_TMP_HISTORY,PAYLOAD_TRANSACTIONAMOUNT,NUMBER,,38.0,14.0,YES,


In [16]:
# Rename columns in COLUMN_TABLE table
column_table2 = column_table2.rename(columns= {'TABLE_CATALOG': 'DATABASE', 'TABLE_SCHEMA': 'SCHEMA', 'CHARACTER_MAXIMUM_LENGTH': 'LENGTH', 
                                             'NUMERIC_PRECISION': 'PRECISION', 'NUMERIC_SCALE': 'SCALE', 'IS_NULLABLE': 'NULLABLE'})

column_table2.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,COMMENT
0,GVR_IS360_DEV_DB,CENSUS,RESULTS_MAPPING_TABLE_MANCHESTER_NASHUA,COUNTY,TEXT,5000.0,,,YES,
1,GVR_IS360_DEV_DB,PUSH_SALE_EVENT,GVR1_CUSTOMER_SITES_WL_V,SITE_ALT_MAILING_ADD_FLAG,TEXT,1.0,,,YES,
2,GVR_IS360_DEV_DB,AVA_DEV,INVENTORY_INCREASE,VOLUME_TC_INCREASE,NUMBER,,15.0,4.0,YES,Amount of fuel added to the tank as a Temperat...
3,GVR_IS360_DEV_DB,AVA_UAT,flyway_schema_history,version,TEXT,50.0,,,YES,
4,GVR_IS360_DEV_DB,ARCHIVE,PIPELINE_PUSH_SALE_EVENT_TMP_HISTORY,PAYLOAD_TRANSACTIONAMOUNT,NUMBER,,38.0,14.0,YES,


In [17]:
# Merge PRIMARY_KEYS and COLUMN_TABLE tables
merged_tables2 = pd.merge(column_table2, primary_keys2, how='left', on=['DATABASE', 'SCHEMA', 'TABLE_NAME', 'COLUMN_NAME'])

merged_tables2.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,COMMENT,PK
0,GVR_IS360_DEV_DB,CENSUS,RESULTS_MAPPING_TABLE_MANCHESTER_NASHUA,COUNTY,TEXT,5000.0,,,YES,,
1,GVR_IS360_DEV_DB,PUSH_SALE_EVENT,GVR1_CUSTOMER_SITES_WL_V,SITE_ALT_MAILING_ADD_FLAG,TEXT,1.0,,,YES,,
2,GVR_IS360_DEV_DB,AVA_DEV,INVENTORY_INCREASE,VOLUME_TC_INCREASE,NUMBER,,15.0,4.0,YES,Amount of fuel added to the tank as a Temperat...,
3,GVR_IS360_DEV_DB,AVA_UAT,flyway_schema_history,version,TEXT,50.0,,,YES,,
4,GVR_IS360_DEV_DB,ARCHIVE,PIPELINE_PUSH_SALE_EVENT_TMP_HISTORY,PAYLOAD_TRANSACTIONAMOUNT,NUMBER,,38.0,14.0,YES,,


In [18]:
# Get rid of schemas: Information_Schema, Admin, and Stage
merged_tables3 = merged_tables2[~merged_tables2['SCHEMA'].isin(['INFORMATION_SCHEMA', 'STAGE', 'ADMIN'])]

merged_tables3.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,COMMENT,PK
0,GVR_IS360_DEV_DB,CENSUS,RESULTS_MAPPING_TABLE_MANCHESTER_NASHUA,COUNTY,TEXT,5000.0,,,YES,,
1,GVR_IS360_DEV_DB,PUSH_SALE_EVENT,GVR1_CUSTOMER_SITES_WL_V,SITE_ALT_MAILING_ADD_FLAG,TEXT,1.0,,,YES,,
2,GVR_IS360_DEV_DB,AVA_DEV,INVENTORY_INCREASE,VOLUME_TC_INCREASE,NUMBER,,15.0,4.0,YES,Amount of fuel added to the tank as a Temperat...,
3,GVR_IS360_DEV_DB,AVA_UAT,flyway_schema_history,version,TEXT,50.0,,,YES,,
4,GVR_IS360_DEV_DB,ARCHIVE,PIPELINE_PUSH_SALE_EVENT_TMP_HISTORY,PAYLOAD_TRANSACTIONAMOUNT,NUMBER,,38.0,14.0,YES,,


# Merge GVR PROD with IS360 Database

In [19]:
# Appending IS360 to GVR_PROD dataframe
merged_tables3 = merged_tables.append(merged_tables2, ignore_index=True)

merged_tables3.head()

  merged_tables3 = merged_tables.append(merged_tables2, ignore_index=True)


Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,COMMENT,PK
0,GVR_PROD,RPT,TBLWARREXPENDITURES,PAYMENTAPPROVEDDATE,DATE,,,,YES,,
1,GVR_PROD,RPT,VSALESREPORTDETAIL,REVENUE,NUMBER,,38.0,3.0,NO,,
2,GVR_PROD,SALESFORCE,ACCOUNT_BLUE_SHEET__C,NAME,TEXT,80.0,,,YES,,
3,GVR_PROD,SALESFORCE,GVR_ACCOUNTFORECASTS__C,APR_WK_1_KIT_FC__C,NUMBER,,18.0,0.0,YES,,
4,GVR_PROD,SMS,MCS_SVC_REQ,MSR_EST_AMT,NUMBER,,38.0,0.0,YES,,


# Zones

In [20]:
# Generates values for the ZONE column
def zone(column):
    if column['SCHEMA'] in ['SMS', 'SALESFORCE', 'AX', 'MAC-PAC', 'PROTHEUS_AR', 'PROTHEUS_BR', 'PROTHEUS_CH', 'QAD', 'HFM', 'INSITE360_TELEMETRY', 'AVA_DEMO', 'IOT_CORE', 'AVA_LEGACY', 
                            'AVA_DEV', 'AVA_CORE_DEV', 'AVA_UAT', 'AVA_QA', 'AVA_CORE_UAT', 'AVA', 'ARCHIVE', 'AVA_CORE_QA', 'AVA_CORE_DEMO', 'PUSH_SALE_EVENT', 'CENSUS']:
        val = 'DATALAKE'
    elif column['SCHEMA'] in ['DATA_MART_FIN_NA', 'DATA_MART_AMO_NA', 'DATA_MART_FIN_GLOBAL', 'DATA_MART_CUSTOMER']:
        val = 'DATAMART'
    elif column['SCHEMA'] in ['DW']:
        val = 'DATAWAREHOUSE'
    elif column['SCHEMA'] in ['RPT']:
        val = 'LEGACY REPORTING'
    else:
        val = ' '
    return val

In [21]:
# Apply the zone function to the merged_tables3 dataframe
merged_tables3['ZONE'] = merged_tables3.apply(zone, axis=1)

merged_tables3.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,COMMENT,PK,ZONE
0,GVR_PROD,RPT,TBLWARREXPENDITURES,PAYMENTAPPROVEDDATE,DATE,,,,YES,,,LEGACY REPORTING
1,GVR_PROD,RPT,VSALESREPORTDETAIL,REVENUE,NUMBER,,38.0,3.0,NO,,,LEGACY REPORTING
2,GVR_PROD,SALESFORCE,ACCOUNT_BLUE_SHEET__C,NAME,TEXT,80.0,,,YES,,,DATALAKE
3,GVR_PROD,SALESFORCE,GVR_ACCOUNTFORECASTS__C,APR_WK_1_KIT_FC__C,NUMBER,,18.0,0.0,YES,,,DATALAKE
4,GVR_PROD,SMS,MCS_SVC_REQ,MSR_EST_AMT,NUMBER,,38.0,0.0,YES,,,DATALAKE


# Historical Retention

In [22]:
# Add the historical retention data (hard coded for now, will be available in the future when this historical retention changes)
merged_tables3['HISTORICAL_RETENTION'] = '32 DAYS'

merged_tables3.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,COMMENT,PK,ZONE,HISTORICAL_RETENTION
0,GVR_PROD,RPT,TBLWARREXPENDITURES,PAYMENTAPPROVEDDATE,DATE,,,,YES,,,LEGACY REPORTING,32 DAYS
1,GVR_PROD,RPT,VSALESREPORTDETAIL,REVENUE,NUMBER,,38.0,3.0,NO,,,LEGACY REPORTING,32 DAYS
2,GVR_PROD,SALESFORCE,ACCOUNT_BLUE_SHEET__C,NAME,TEXT,80.0,,,YES,,,DATALAKE,32 DAYS
3,GVR_PROD,SALESFORCE,GVR_ACCOUNTFORECASTS__C,APR_WK_1_KIT_FC__C,NUMBER,,18.0,0.0,YES,,,DATALAKE,32 DAYS
4,GVR_PROD,SMS,MCS_SVC_REQ,MSR_EST_AMT,NUMBER,,38.0,0.0,YES,,,DATALAKE,32 DAYS


# Load Times

## GVR PROD

In [23]:
# Load_table created to show when the table was last loaded
load_table1 = session.sql("SELECT SCHEMA_NAME, TABLE_NAME, LAST_LOAD_TIME FROM GVR_PROD.INFORMATION_SCHEMA.LOAD_HISTORY").to_pandas()

load_table1.head()

Unnamed: 0,SCHEMA_NAME,TABLE_NAME,LAST_LOAD_TIME
0,ADMIN,SOURCE_TABLE_COUNTS,2023-06-29 01:33:08.984000-04:00
1,ADMIN,SOURCE_TABLE_COUNTS,2023-06-29 01:33:08.984000-04:00
2,ADMIN,SOURCE_TABLE_COUNTS,2023-06-29 01:33:08.984000-04:00
3,ADMIN,SOURCE_TABLE_COUNTS,2023-06-28 01:33:21.518000-04:00
4,ADMIN,SOURCE_TABLE_COUNTS,2023-06-28 01:33:21.518000-04:00


In [24]:
# Change the last_load_time to only be yyyy/mm/dd
load_table1['LAST_LOAD_TIME'] = pd.to_datetime(load_table1['LAST_LOAD_TIME']).dt.date

load_table1.head()

Unnamed: 0,SCHEMA_NAME,TABLE_NAME,LAST_LOAD_TIME
0,ADMIN,SOURCE_TABLE_COUNTS,2023-06-29
1,ADMIN,SOURCE_TABLE_COUNTS,2023-06-29
2,ADMIN,SOURCE_TABLE_COUNTS,2023-06-29
3,ADMIN,SOURCE_TABLE_COUNTS,2023-06-28
4,ADMIN,SOURCE_TABLE_COUNTS,2023-06-28


In [25]:
# Rename columns in LOAD_TABLE table to match MERGED_TABLES table
load_table1 = load_table1.rename(columns= {'SCHEMA_NAME': 'SCHEMA'})

load_table1.head()

Unnamed: 0,SCHEMA,TABLE_NAME,LAST_LOAD_TIME
0,ADMIN,SOURCE_TABLE_COUNTS,2023-06-29
1,ADMIN,SOURCE_TABLE_COUNTS,2023-06-29
2,ADMIN,SOURCE_TABLE_COUNTS,2023-06-29
3,ADMIN,SOURCE_TABLE_COUNTS,2023-06-28
4,ADMIN,SOURCE_TABLE_COUNTS,2023-06-28


In [26]:
# Sort values to show the most recent load times
most_recent_times1 = load_table1.sort_values('LAST_LOAD_TIME', ascending=False)

most_recent_times1.head()

Unnamed: 0,SCHEMA,TABLE_NAME,LAST_LOAD_TIME
0,ADMIN,SOURCE_TABLE_COUNTS,2023-06-29
6138,STAGE,LOGISTICSPOSTALADDRESS,2023-06-29
2300,SALESFORCE,OPPORTUNITYFIELDHISTORY,2023-06-29
2299,SALESFORCE,OPPORTUNITYFIELDHISTORY,2023-06-29
2298,SALESFORCE,OPPORTUNITYFIELDHISTORY,2023-06-29


In [27]:
# Select only the most recent load time
most_recent_times1 = most_recent_times1.drop_duplicates(subset='TABLE_NAME', keep='first')

most_recent_times1 = most_recent_times1.reset_index(drop=True)

most_recent_times1.head()

Unnamed: 0,SCHEMA,TABLE_NAME,LAST_LOAD_TIME
0,ADMIN,SOURCE_TABLE_COUNTS,2023-06-29
1,STAGE,LOGISTICSPOSTALADDRESS,2023-06-29
2,SALESFORCE,OPPORTUNITYFIELDHISTORY,2023-06-29
3,STAGE,LEDGERJOURNALTABLE,2023-06-29
4,STAGE,LEDGERJOURNALTRANS,2023-06-29


## IS360

In [28]:
# Load_table created to show when the table was last loaded
load_table2 = session.sql("SELECT SCHEMA_NAME, TABLE_NAME, LAST_LOAD_TIME FROM GVR_IS360_DEV_DB.INFORMATION_SCHEMA.LOAD_HISTORY").to_pandas()

load_table2.head()

Unnamed: 0,SCHEMA_NAME,TABLE_NAME,LAST_LOAD_TIME
0,AVA_3809,AVA_CONFIGURATION,2023-06-20 15:50:02.069000-04:00
1,AVA_3809,AVA_CONFIGURATION,2023-06-20 15:07:35.383000-04:00
2,AVA_3809,AVA_CONFIGURATION,2023-06-20 14:37:32.421000-04:00
3,AVA_3809,DATA_EXCEPTION,2023-06-20 15:50:03.821000-04:00
4,AVA_3809,DATA_EXCEPTION,2023-06-20 15:07:36.489000-04:00


In [29]:
# Change the last_load_time to only be yyyy/mm/dd
load_table2['LAST_LOAD_TIME'] = pd.to_datetime(load_table2['LAST_LOAD_TIME']).dt.date

load_table2.head()

Unnamed: 0,SCHEMA_NAME,TABLE_NAME,LAST_LOAD_TIME
0,AVA_3809,AVA_CONFIGURATION,2023-06-20
1,AVA_3809,AVA_CONFIGURATION,2023-06-20
2,AVA_3809,AVA_CONFIGURATION,2023-06-20
3,AVA_3809,DATA_EXCEPTION,2023-06-20
4,AVA_3809,DATA_EXCEPTION,2023-06-20


In [30]:
# Rename columns in LOAD_TABLE table to match MERGED_TABLES table
load_table2 = load_table2.rename(columns= {'SCHEMA_NAME': 'SCHEMA'})

load_table2.head()

Unnamed: 0,SCHEMA,TABLE_NAME,LAST_LOAD_TIME
0,AVA_3809,AVA_CONFIGURATION,2023-06-20
1,AVA_3809,AVA_CONFIGURATION,2023-06-20
2,AVA_3809,AVA_CONFIGURATION,2023-06-20
3,AVA_3809,DATA_EXCEPTION,2023-06-20
4,AVA_3809,DATA_EXCEPTION,2023-06-20


In [31]:
# Sort values to show the most recent load times
most_recent_times2 = load_table2.sort_values('LAST_LOAD_TIME', ascending=False)

most_recent_times2.head()

Unnamed: 0,SCHEMA,TABLE_NAME,LAST_LOAD_TIME
228,AVA_LP4473,DATA_EXCEPTION,2023-06-29
234,AVA_LP4473,THERMISTOR_PROBE_LOOKUP,2023-06-29
191,AVA_LP-4232,THERMISTOR_PROBE_LOOKUP,2023-06-29
226,AVA_LP4473,DATA_EXCEPTION,2023-06-29
227,AVA_LP4473,DATA_EXCEPTION,2023-06-29


In [32]:
# Select only the most recent load time 
most_recent_times2 = most_recent_times2.drop_duplicates(subset='TABLE_NAME', keep='first')

most_recent_times2 = most_recent_times2.reset_index(drop=True)

most_recent_times2.head()

Unnamed: 0,SCHEMA,TABLE_NAME,LAST_LOAD_TIME
0,AVA_LP4473,DATA_EXCEPTION,2023-06-29
1,AVA_LP4473,THERMISTOR_PROBE_LOOKUP,2023-06-29
2,AVA_LP4473,AVA_CONFIGURATION,2023-06-29
3,AVA_LEGACY,TANKS_LINES,2023-06-28
4,AVA_LEGACY,WSM_CALIB_CHART_POINTS,2023-06-28


## Appending GVR PROD & IS360 Load Times

In [33]:
# Append the two most recent times tables together
most_recent_times = most_recent_times1.append(most_recent_times2, ignore_index=True)

most_recent_times.head()

  most_recent_times = most_recent_times1.append(most_recent_times2, ignore_index=True)


Unnamed: 0,SCHEMA,TABLE_NAME,LAST_LOAD_TIME
0,ADMIN,SOURCE_TABLE_COUNTS,2023-06-29
1,STAGE,LOGISTICSPOSTALADDRESS,2023-06-29
2,SALESFORCE,OPPORTUNITYFIELDHISTORY,2023-06-29
3,STAGE,LEDGERJOURNALTABLE,2023-06-29
4,STAGE,LEDGERJOURNALTRANS,2023-06-29


## Merge Load Times To Merged_Tables

In [34]:
# Merge PRIMARY_KEYS and COLUMN_TABLE tables
merged_tables3 = pd.merge(merged_tables3, most_recent_times, how='left', on=['SCHEMA', 'TABLE_NAME'])

merged_tables3.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,COMMENT,PK,ZONE,HISTORICAL_RETENTION,LAST_LOAD_TIME
0,GVR_PROD,RPT,TBLWARREXPENDITURES,PAYMENTAPPROVEDDATE,DATE,,,,YES,,,LEGACY REPORTING,32 DAYS,
1,GVR_PROD,RPT,VSALESREPORTDETAIL,REVENUE,NUMBER,,38.0,3.0,NO,,,LEGACY REPORTING,32 DAYS,
2,GVR_PROD,SALESFORCE,ACCOUNT_BLUE_SHEET__C,NAME,TEXT,80.0,,,YES,,,DATALAKE,32 DAYS,
3,GVR_PROD,SALESFORCE,GVR_ACCOUNTFORECASTS__C,APR_WK_1_KIT_FC__C,NUMBER,,18.0,0.0,YES,,,DATALAKE,32 DAYS,
4,GVR_PROD,SMS,MCS_SVC_REQ,MSR_EST_AMT,NUMBER,,38.0,0.0,YES,,,DATALAKE,32 DAYS,


## Table Type

In [35]:
# Hard code that all table types are Type1 (History Available)
merged_tables3['TABLE_TYPE'] = 'History Available'

merged_tables3.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,COMMENT,PK,ZONE,HISTORICAL_RETENTION,LAST_LOAD_TIME,TABLE_TYPE
0,GVR_PROD,RPT,TBLWARREXPENDITURES,PAYMENTAPPROVEDDATE,DATE,,,,YES,,,LEGACY REPORTING,32 DAYS,,History Available
1,GVR_PROD,RPT,VSALESREPORTDETAIL,REVENUE,NUMBER,,38.0,3.0,NO,,,LEGACY REPORTING,32 DAYS,,History Available
2,GVR_PROD,SALESFORCE,ACCOUNT_BLUE_SHEET__C,NAME,TEXT,80.0,,,YES,,,DATALAKE,32 DAYS,,History Available
3,GVR_PROD,SALESFORCE,GVR_ACCOUNTFORECASTS__C,APR_WK_1_KIT_FC__C,NUMBER,,18.0,0.0,YES,,,DATALAKE,32 DAYS,,History Available
4,GVR_PROD,SMS,MCS_SVC_REQ,MSR_EST_AMT,NUMBER,,38.0,0.0,YES,,,DATALAKE,32 DAYS,,History Available


## Dropping Schemas

In [36]:
# Get rid of schemas: Information_Schema, Admin, and Stage
merged_tables3 = merged_tables3[~merged_tables3['SCHEMA'].isin(['INFORMATION_SCHEMA', 'STAGE', 'ADMIN',  'ARCHIVE', 
                                                                'AVA_DEMO', 'AVA_LEGACY', 'CENSUS', 'IOT_CORE'])]

merged_tables3.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,COMMENT,PK,ZONE,HISTORICAL_RETENTION,LAST_LOAD_TIME,TABLE_TYPE
0,GVR_PROD,RPT,TBLWARREXPENDITURES,PAYMENTAPPROVEDDATE,DATE,,,,YES,,,LEGACY REPORTING,32 DAYS,,History Available
1,GVR_PROD,RPT,VSALESREPORTDETAIL,REVENUE,NUMBER,,38.0,3.0,NO,,,LEGACY REPORTING,32 DAYS,,History Available
2,GVR_PROD,SALESFORCE,ACCOUNT_BLUE_SHEET__C,NAME,TEXT,80.0,,,YES,,,DATALAKE,32 DAYS,,History Available
3,GVR_PROD,SALESFORCE,GVR_ACCOUNTFORECASTS__C,APR_WK_1_KIT_FC__C,NUMBER,,18.0,0.0,YES,,,DATALAKE,32 DAYS,,History Available
4,GVR_PROD,SMS,MCS_SVC_REQ,MSR_EST_AMT,NUMBER,,38.0,0.0,YES,,,DATALAKE,32 DAYS,,History Available


In [37]:
# Verify the columns are all capitalized
merged_tables3.columns = merged_tables3.columns.str.upper()

merged_tables3.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,COMMENT,PK,ZONE,HISTORICAL_RETENTION,LAST_LOAD_TIME,TABLE_TYPE
0,GVR_PROD,RPT,TBLWARREXPENDITURES,PAYMENTAPPROVEDDATE,DATE,,,,YES,,,LEGACY REPORTING,32 DAYS,,History Available
1,GVR_PROD,RPT,VSALESREPORTDETAIL,REVENUE,NUMBER,,38.0,3.0,NO,,,LEGACY REPORTING,32 DAYS,,History Available
2,GVR_PROD,SALESFORCE,ACCOUNT_BLUE_SHEET__C,NAME,TEXT,80.0,,,YES,,,DATALAKE,32 DAYS,,History Available
3,GVR_PROD,SALESFORCE,GVR_ACCOUNTFORECASTS__C,APR_WK_1_KIT_FC__C,NUMBER,,18.0,0.0,YES,,,DATALAKE,32 DAYS,,History Available
4,GVR_PROD,SMS,MCS_SVC_REQ,MSR_EST_AMT,NUMBER,,38.0,0.0,YES,,,DATALAKE,32 DAYS,,History Available


## Data Profiling Links

In [38]:
# Generates values for the DATA_PROFILING column
def profiling(column):
    if column['SCHEMA'] == 'AX' and column['DATABASE'] == 'GVR_PROD':
        val = 'https://vontier.sharepoint.com/sites/GVR-DataAnalytics/Shared%20Documents/Forms/AllItems.aspx?id=%2Fsites%2FGVR%2DDataAnalytics%2FShared%20Documents%2F1%2E%20DA%20Document%20Management%20System%2F5%2E%20Artifacts%2FData%20Lake%20Artifacts%2FData%20Profiling%2FGVR%5FDEV%2FAX&viewid=1735308e%2Da40f%2D4ff1%2D8987%2D77fd3877b77c'
    elif column['SCHEMA'] == 'HFM' and column['DATABASE'] == 'GVR_PROD':
        val = 'https://vontier.sharepoint.com/sites/GVR-DataAnalytics/Shared%20Documents/Forms/AllItems.aspx?id=%2Fsites%2FGVR%2DDataAnalytics%2FShared%20Documents%2F1%2E%20DA%20Document%20Management%20System%2F5%2E%20Artifacts%2FData%20Lake%20Artifacts%2FData%20Profiling%2FGVR%5FDEV%2FHFM&viewid=1735308e%2Da40f%2D4ff1%2D8987%2D77fd3877b77c'
    elif column['SCHEMA'] == 'PROTHEUS_AR' and column['DATABASE'] == 'GVR_PROD':
        val = 'https://vontier.sharepoint.com/sites/GVR-DataAnalytics/Shared%20Documents/Forms/AllItems.aspx?id=%2Fsites%2FGVR%2DDataAnalytics%2FShared%20Documents%2F1%2E%20DA%20Document%20Management%20System%2F5%2E%20Artifacts%2FData%20Lake%20Artifacts%2FData%20Profiling%2FGVR%5FDEV%2FPROTHEUS%5FAR&viewid=1735308e%2Da40f%2D4ff1%2D8987%2D77fd3877b77c'
    elif column['SCHEMA'] == 'PROTHEUS_BR' and column['DATABASE'] == 'GVR_PROD':
        val = 'https://vontier.sharepoint.com/sites/GVR-DataAnalytics/Shared%20Documents/Forms/AllItems.aspx?id=%2Fsites%2FGVR%2DDataAnalytics%2FShared%20Documents%2F1%2E%20DA%20Document%20Management%20System%2F5%2E%20Artifacts%2FData%20Lake%20Artifacts%2FData%20Profiling%2FGVR%5FDEV%2FPROTHEUS%5FBR&viewid=1735308e%2Da40f%2D4ff1%2D8987%2D77fd3877b77c'
    elif column['SCHEMA'] == 'PROTHEUS_CH' and column['DATABASE'] == 'GVR_PROD':
        val = 'https://vontier.sharepoint.com/sites/GVR-DataAnalytics/Shared%20Documents/Forms/AllItems.aspx?id=%2Fsites%2FGVR%2DDataAnalytics%2FShared%20Documents%2F1%2E%20DA%20Document%20Management%20System%2F5%2E%20Artifacts%2FData%20Lake%20Artifacts%2FData%20Profiling%2FGVR%5FDEV%2FPROTHEUS%5FCH&viewid=1735308e%2Da40f%2D4ff1%2D8987%2D77fd3877b77c'    
    elif column['SCHEMA'] == 'QAD' and column['DATABASE'] == 'GVR_PROD':
        val = 'https://vontier.sharepoint.com/sites/GVR-DataAnalytics/Shared%20Documents/Forms/AllItems.aspx?id=%2Fsites%2FGVR%2DDataAnalytics%2FShared%20Documents%2F1%2E%20DA%20Document%20Management%20System%2F5%2E%20Artifacts%2FData%20Lake%20Artifacts%2FData%20Profiling%2FGVR%5FDEV%2FQAD&viewid=1735308e%2Da40f%2D4ff1%2D8987%2D77fd3877b77c'  
    elif column['SCHEMA'] == 'SMS' and column['DATABASE'] == 'GVR_PROD':
        val = 'https://vontier.sharepoint.com/sites/GVR-DataAnalytics/Shared%20Documents/Forms/AllItems.aspx?id=%2Fsites%2FGVR%2DDataAnalytics%2FShared%20Documents%2F1%2E%20DA%20Document%20Management%20System%2F5%2E%20Artifacts%2FData%20Lake%20Artifacts%2FData%20Profiling%2FGVR%5FDEV%2FSMS&viewid=1735308e%2Da40f%2D4ff1%2D8987%2D77fd3877b77c'  
    elif column['SCHEMA'] == 'AVA' and column['DATABASE'] == 'GVR_IS360_DEV_DB':
        val = 'https://vontier.sharepoint.com/sites/GVR-DataAnalytics/Shared%20Documents/Forms/AllItems.aspx?id=%2Fsites%2FGVR%2DDataAnalytics%2FShared%20Documents%2F1%2E%20DA%20Document%20Management%20System%2F5%2E%20Artifacts%2FData%20Lake%20Artifacts%2FData%20Profiling%2FGVR%5FIS360%5FDEV%5FDB%2FAVA&viewid=1735308e%2Da40f%2D4ff1%2D8987%2D77fd3877b77c'  
    elif column['SCHEMA'] == 'AVA_CORE_DEMO' and column['DATABASE'] == 'GVR_IS360_DEV_DB':
        val = 'https://vontier.sharepoint.com/sites/GVR-DataAnalytics/Shared%20Documents/Forms/AllItems.aspx?id=%2Fsites%2FGVR%2DDataAnalytics%2FShared%20Documents%2F1%2E%20DA%20Document%20Management%20System%2F5%2E%20Artifacts%2FData%20Lake%20Artifacts%2FData%20Profiling%2FGVR%5FIS360%5FDEV%5FDB%2FAVA%5FCORE%5FDEMO&viewid=1735308e%2Da40f%2D4ff1%2D8987%2D77fd3877b77c'  
    elif column['SCHEMA'] == 'AVA_CORE_DEV' and column['DATABASE'] == 'GVR_IS360_DEV_DB':
        val = 'https://vontier.sharepoint.com/sites/GVR-DataAnalytics/Shared%20Documents/Forms/AllItems.aspx?id=%2Fsites%2FGVR%2DDataAnalytics%2FShared%20Documents%2F1%2E%20DA%20Document%20Management%20System%2F5%2E%20Artifacts%2FData%20Lake%20Artifacts%2FData%20Profiling%2FGVR%5FIS360%5FDEV%5FDB%2FAVA%5FCORE%5FDEV&viewid=1735308e%2Da40f%2D4ff1%2D8987%2D77fd3877b77c'      
    elif column['SCHEMA'] == 'AVA_CORE_QA' and column['DATABASE'] == 'GVR_IS360_DEV_DB':
        val = 'https://vontier.sharepoint.com/sites/GVR-DataAnalytics/Shared%20Documents/Forms/AllItems.aspx?id=%2Fsites%2FGVR%2DDataAnalytics%2FShared%20Documents%2F1%2E%20DA%20Document%20Management%20System%2F5%2E%20Artifacts%2FData%20Lake%20Artifacts%2FData%20Profiling%2FGVR%5FIS360%5FDEV%5FDB%2FAVA%5FCORE%5FQA&viewid=1735308e%2Da40f%2D4ff1%2D8987%2D77fd3877b77c'      
    elif column['SCHEMA'] == 'AVA_CORE_UAT' and column['DATABASE'] == 'GVR_IS360_DEV_DB':
        val = 'https://vontier.sharepoint.com/sites/GVR-DataAnalytics/Shared%20Documents/Forms/AllItems.aspx?id=%2Fsites%2FGVR%2DDataAnalytics%2FShared%20Documents%2F1%2E%20DA%20Document%20Management%20System%2F5%2E%20Artifacts%2FData%20Lake%20Artifacts%2FData%20Profiling%2FGVR%5FIS360%5FDEV%5FDB%2FAVA%5FCORE%5FQA&viewid=1735308e%2Da40f%2D4ff1%2D8987%2D77fd3877b77c'      
    elif column['SCHEMA'] == 'AVA_DEV' and column['DATABASE'] == 'GVR_IS360_DEV_DB':
        val = 'https://vontier.sharepoint.com/sites/GVR-DataAnalytics/Shared%20Documents/Forms/AllItems.aspx?id=%2Fsites%2FGVR%2DDataAnalytics%2FShared%20Documents%2F1%2E%20DA%20Document%20Management%20System%2F5%2E%20Artifacts%2FData%20Lake%20Artifacts%2FData%20Profiling%2FGVR%5FIS360%5FDEV%5FDB%2FAVA%5FDEV&viewid=1735308e%2Da40f%2D4ff1%2D8987%2D77fd3877b77c'      
    elif column['SCHEMA'] == 'AVA_QA' and column['DATABASE'] == 'GVR_IS360_DEV_DB':
        val = 'https://vontier.sharepoint.com/sites/GVR-DataAnalytics/Shared%20Documents/Forms/AllItems.aspx?id=%2Fsites%2FGVR%2DDataAnalytics%2FShared%20Documents%2F1%2E%20DA%20Document%20Management%20System%2F5%2E%20Artifacts%2FData%20Lake%20Artifacts%2FData%20Profiling%2FGVR%5FIS360%5FDEV%5FDB%2FAVA%5FQA&viewid=1735308e%2Da40f%2D4ff1%2D8987%2D77fd3877b77c'      
    elif column['SCHEMA'] == 'AVA_UAT' and column['DATABASE'] == 'GVR_IS360_DEV_DB':
        val = 'https://vontier.sharepoint.com/sites/GVR-DataAnalytics/Shared%20Documents/Forms/AllItems.aspx?id=%2Fsites%2FGVR%2DDataAnalytics%2FShared%20Documents%2F1%2E%20DA%20Document%20Management%20System%2F5%2E%20Artifacts%2FData%20Lake%20Artifacts%2FData%20Profiling%2FGVR%5FIS360%5FDEV%5FDB%2FAVA%5FUAT&viewid=1735308e%2Da40f%2D4ff1%2D8987%2D77fd3877b77c' 
    elif column['SCHEMA'] == 'INSITE360_TELEMETRY' and column['DATABASE'] == 'GVR_IS360_DEV_DB':
        val = 'https://vontier.sharepoint.com/sites/GVR-DataAnalytics/Shared%20Documents/Forms/AllItems.aspx?id=%2Fsites%2FGVR%2DDataAnalytics%2FShared%20Documents%2F1%2E%20DA%20Document%20Management%20System%2F5%2E%20Artifacts%2FData%20Lake%20Artifacts%2FData%20Profiling%2FGVR%5FIS360%5FDEV%5FDB%2FINSITE360%5FTELEMETRY&viewid=1735308e%2Da40f%2D4ff1%2D8987%2D77fd3877b77c' 
    elif column['SCHEMA'] == 'PUSH_SALES_EVENT' and column['DATABASE'] == 'GVR_IS360_DEV_DB':
        val = 'https://vontier.sharepoint.com/sites/GVR-DataAnalytics/Shared%20Documents/Forms/AllItems.aspx?id=%2Fsites%2FGVR%2DDataAnalytics%2FShared%20Documents%2F1%2E%20DA%20Document%20Management%20System%2F5%2E%20Artifacts%2FData%20Lake%20Artifacts%2FData%20Profiling%2FGVR%5FIS360%5FDEV%5FDB%2FPUSH%5FSALE%5FEVENT&viewid=1735308e%2Da40f%2D4ff1%2D8987%2D77fd3877b77c' 
    else:
        val = ' '
    return val

In [39]:
# Apply the zone function to the merged_tables3 dataframe
merged_tables3['DATA_PROFILING'] = merged_tables3.apply(profiling, axis=1)

merged_tables3.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,COMMENT,PK,ZONE,HISTORICAL_RETENTION,LAST_LOAD_TIME,TABLE_TYPE,DATA_PROFILING
0,GVR_PROD,RPT,TBLWARREXPENDITURES,PAYMENTAPPROVEDDATE,DATE,,,,YES,,,LEGACY REPORTING,32 DAYS,,History Available,
1,GVR_PROD,RPT,VSALESREPORTDETAIL,REVENUE,NUMBER,,38.0,3.0,NO,,,LEGACY REPORTING,32 DAYS,,History Available,
2,GVR_PROD,SALESFORCE,ACCOUNT_BLUE_SHEET__C,NAME,TEXT,80.0,,,YES,,,DATALAKE,32 DAYS,,History Available,
3,GVR_PROD,SALESFORCE,GVR_ACCOUNTFORECASTS__C,APR_WK_1_KIT_FC__C,NUMBER,,18.0,0.0,YES,,,DATALAKE,32 DAYS,,History Available,
4,GVR_PROD,SMS,MCS_SVC_REQ,MSR_EST_AMT,NUMBER,,38.0,0.0,YES,,,DATALAKE,32 DAYS,,History Available,https://vontier.sharepoint.com/sites/GVR-DataA...


## Load Strategy

In [40]:
# Load strategy table
load_strat = session.sql("SELECT DISTINCT TABLE_NAME, TABLE_TYPE, INCREMENTAL_LOAD, FULL_LOAD FROM GVR_PROD.ADMIN.BUILD_WAREHOUSE").to_pandas()

load_strat.head()

Unnamed: 0,TABLE_NAME,TABLE_TYPE,INCREMENTAL_LOAD,FULL_LOAD
0,BATCHJOBHISTORY_GERMANY,ADMIN,0,0
1,BATCHJOBHISTORY_GIL,ADMIN,0,0
2,BATCHJOBHISTORY_ITALY,ADMIN,0,0
3,dimAccount,DIM,0,1
4,dimAddress,DIM,0,1


In [41]:
load_strat = load_strat.rename(columns={'TABLE_TYPE': 'SCHEMA'})

load_strat.head()

Unnamed: 0,TABLE_NAME,SCHEMA,INCREMENTAL_LOAD,FULL_LOAD
0,BATCHJOBHISTORY_GERMANY,ADMIN,0,0
1,BATCHJOBHISTORY_GIL,ADMIN,0,0
2,BATCHJOBHISTORY_ITALY,ADMIN,0,0
3,dimAccount,DIM,0,1
4,dimAddress,DIM,0,1


In [42]:
load_strat["SCHEMA"] = load_strat["SCHEMA"].replace({'P-BR': 'PROTHEUS_BR', 'P-CH': 'PROTHEUS_CH', 'P-AR': 'PROTHEUS_AR'})

In [43]:
merged_tables3 = merged_tables3.merge(load_strat, on=['TABLE_NAME', 'SCHEMA'], how='left')

merged_tables3.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,COMMENT,PK,ZONE,HISTORICAL_RETENTION,LAST_LOAD_TIME,TABLE_TYPE,DATA_PROFILING,INCREMENTAL_LOAD,FULL_LOAD
0,GVR_PROD,RPT,TBLWARREXPENDITURES,PAYMENTAPPROVEDDATE,DATE,,,,YES,,,LEGACY REPORTING,32 DAYS,,History Available,,0.0,1.0
1,GVR_PROD,RPT,VSALESREPORTDETAIL,REVENUE,NUMBER,,38.0,3.0,NO,,,LEGACY REPORTING,32 DAYS,,History Available,,,
2,GVR_PROD,SALESFORCE,ACCOUNT_BLUE_SHEET__C,NAME,TEXT,80.0,,,YES,,,DATALAKE,32 DAYS,,History Available,,,
3,GVR_PROD,SALESFORCE,GVR_ACCOUNTFORECASTS__C,APR_WK_1_KIT_FC__C,NUMBER,,18.0,0.0,YES,,,DATALAKE,32 DAYS,,History Available,,,
4,GVR_PROD,SMS,MCS_SVC_REQ,MSR_EST_AMT,NUMBER,,38.0,0.0,YES,,,DATALAKE,32 DAYS,,History Available,https://vontier.sharepoint.com/sites/GVR-DataA...,3.0,0.0


In [44]:
# Create a function to determine load strategy
def determine_load_strategy(column):
    if column['INCREMENTAL_LOAD'] > 0:
        return 'INCREMENTAL LOAD'
    elif column['FULL_LOAD'] > 0:
        return 'FULL LOAD'
    elif column['INCREMENTAL_LOAD'] > 0 and column['FULL_LOAD'] > 0:
        return 'INCREMENTAL LOAD'
    else:
        return 'NA'        

In [45]:
merged_tables3['LOAD_STRATEGY'] = merged_tables3.apply(determine_load_strategy, axis=1)

merged_tables3.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,COMMENT,PK,ZONE,HISTORICAL_RETENTION,LAST_LOAD_TIME,TABLE_TYPE,DATA_PROFILING,INCREMENTAL_LOAD,FULL_LOAD,LOAD_STRATEGY
0,GVR_PROD,RPT,TBLWARREXPENDITURES,PAYMENTAPPROVEDDATE,DATE,,,,YES,,,LEGACY REPORTING,32 DAYS,,History Available,,0.0,1.0,FULL LOAD
1,GVR_PROD,RPT,VSALESREPORTDETAIL,REVENUE,NUMBER,,38.0,3.0,NO,,,LEGACY REPORTING,32 DAYS,,History Available,,,,
2,GVR_PROD,SALESFORCE,ACCOUNT_BLUE_SHEET__C,NAME,TEXT,80.0,,,YES,,,DATALAKE,32 DAYS,,History Available,,,,
3,GVR_PROD,SALESFORCE,GVR_ACCOUNTFORECASTS__C,APR_WK_1_KIT_FC__C,NUMBER,,18.0,0.0,YES,,,DATALAKE,32 DAYS,,History Available,,,,
4,GVR_PROD,SMS,MCS_SVC_REQ,MSR_EST_AMT,NUMBER,,38.0,0.0,YES,,,DATALAKE,32 DAYS,,History Available,https://vontier.sharepoint.com/sites/GVR-DataA...,3.0,0.0,INCREMENTAL LOAD


## Delete Unneccessary Columns

In [46]:
merged_tables3 = merged_tables3.drop(['INCREMENTAL_LOAD', 'FULL_LOAD'], axis=1)

merged_tables3.head()

Unnamed: 0,DATABASE,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE,LENGTH,PRECISION,SCALE,NULLABLE,COMMENT,PK,ZONE,HISTORICAL_RETENTION,LAST_LOAD_TIME,TABLE_TYPE,DATA_PROFILING,LOAD_STRATEGY
0,GVR_PROD,RPT,TBLWARREXPENDITURES,PAYMENTAPPROVEDDATE,DATE,,,,YES,,,LEGACY REPORTING,32 DAYS,,History Available,,FULL LOAD
1,GVR_PROD,RPT,VSALESREPORTDETAIL,REVENUE,NUMBER,,38.0,3.0,NO,,,LEGACY REPORTING,32 DAYS,,History Available,,
2,GVR_PROD,SALESFORCE,ACCOUNT_BLUE_SHEET__C,NAME,TEXT,80.0,,,YES,,,DATALAKE,32 DAYS,,History Available,,
3,GVR_PROD,SALESFORCE,GVR_ACCOUNTFORECASTS__C,APR_WK_1_KIT_FC__C,NUMBER,,18.0,0.0,YES,,,DATALAKE,32 DAYS,,History Available,,
4,GVR_PROD,SMS,MCS_SVC_REQ,MSR_EST_AMT,NUMBER,,38.0,0.0,YES,,,DATALAKE,32 DAYS,,History Available,https://vontier.sharepoint.com/sites/GVR-DataA...,INCREMENTAL LOAD


## Rearrange Columns

In [47]:
# Rearrange columns - to be added later

# Write to Excel

In [48]:
with pd.ExcelWriter('Catalog/Data_Catalog.xlsx', mode='a', engine='openpyxl', if_sheet_exists="replace",) as writer:
    merged_tables3.to_excel(writer, sheet_name='Catalog', index=False)

# DataFrame to Snowflake Table

In [49]:
# # DataFrame location
# conn = {
#     "account": account,
#     "user": user,
#     "authenticator": authenticator,
#     "warehouse": warehouse2,
#     "role": role2
# }

# # Create a table name
# table_name = 'Data_Catalog'

# # Create the table in Snowflake
# with conn.cursor() as cursor:

#     # CHANGE THE VARIABLES AND DATA TYPES OF THESE COLUMNS TO MATCH THE DATAFRAME
#     cursor.execute(f"CREATE OR REPLACE TABLE {table_name} (Name STRING, Age INTEGER, City STRING)")

# # Insert DataFrame records into the table
# with conn.cursor() as cursor:
#     for _, row in merged_tables3.iterrows():

#         # %s is a placeholder for strings, (%d is for integer values & %f is for float values)
#         cursor.execute(f"INSERT INTO {table_name} VALUES (%s, %s, %s)", tuple(row))

# # Close the connection
# conn.close()