In [1]:
#ProgramName: Daily-05-UpdateMasterCatalog-Db-Sc-Tbl-LastUpdated
#Purpose: Weekly Update Master Catalog - Routine Maintenance-LastUpdated, DatabaseName, SchemaName, TableName
#Author:  Greg Turmel, Director, Data Governance 
#Date:    2020.08.30 - 2021.06.30
#Errata:  0.1 Improvements can be made to script using for/looping through the metadata captured

import os, sys, argparse, csv, pyodbc, sql, time, datetime
import sqlalchemy as db
import errno, pathlib2

from dotenv import load_dotenv # add this line
import pandas as pd
import numpy as np

# Start-up timestamp; later cells rebind now01 each time they report completion.
now01 = datetime.datetime.now()

# load_dotenv() must run before the lookups so that values kept in a local
# .env file are visible through os.getenv().
# NOTE(review): 'db' rebinds the name used as the sqlalchemy import alias above.
load_dotenv()
user, password, host, db = (
    os.getenv(envName)
    for envName in ('MySQLeUser', 'MySQLeUserPass', 'MySQLeHOST', 'MySQLeDB')
)

In [2]:
# This segment builds the appropriate file system structure as a variable driven exercise
# Take time to set your 'eeeeeeeeee' number as variable 'pn' below
# and set the program directory variable called 'programDirectory' before running
# ===================================================================================================================
pn = r'eeeeeeeeee'            #This represents the windows system employee login folder - IBM team uses a 9 number
# ===================================================================================================================

programDirectory = 'Daily' # Update this variable to wherever you want the program subfolder/files to be located
un = r'C:\Users'
cn = r'Documents\Py'
an = r'Master\Process'
bn = r'Master\Reports'

sn = r'Master\sql'
tn = r'config'

#Set a parent directory
# Joined with an explicit backslash. The earlier "{}\{}\{}" template depended on
# the invalid escape sequence "\{", which current Python versions warn about;
# the resulting string is unchanged.
parentDirectory = '\\'.join((un, pn, cn))
print('Parent Directory is: ', parentDirectory)
# NOTE(review): 0o666 has no execute/search bit, which POSIX directories need.
# It appears harmless for the Windows paths used here - confirm before reuse.
mode = 0o666

#Set path location for working with local file(s)
path = os.path.join(parentDirectory, programDirectory)
pathMP = os.path.join(parentDirectory, programDirectory, an)
pathMR = os.path.join(parentDirectory, programDirectory, bn)
pathMS = os.path.join(parentDirectory, programDirectory, sn)
pathMT = os.path.join(parentDirectory, programDirectory, tn)

procpath = pathMP
csvpath = pathMR
sqlpath = pathMS
configpath = pathMT

# One entry per folder: (directory to create, name shown in messages,
# label used when the folder is already present).
directoryPlan = [
    (path,   programDirectory, 'Directory:'),
    (pathMP, pathMP,           'Directory:'),
    (pathMR, pathMR,           'Directory:'),
    (pathMS, pathMS,           'Directory:'),
    (pathMT, pathMT,           'Central Config Directory:'),
]

# Each folder is handled independently: an OSError on one is reported and the
# remaining folders are still attempted.
for target, shownName, existsLabel in directoryPlan:
    try:
        if not os.path.exists(target):
            os.makedirs(target, mode)
            print('Program Directory subfolder has been created: ', shownName)
        else:
            print(existsLabel, shownName, '>>>>> Note ---- this folder already exists <<<<<')
    except OSError as error:
        print(error)

Parent Directory is:  C:\Users\e201873842\Documents\Py
Directory: Daily >>>>> Note ---- this folder already exists <<<<<
Directory: C:\Users\e201873842\Documents\Py\Daily\Master\Process >>>>> Note ---- this folder already exists <<<<<
Directory: C:\Users\e201873842\Documents\Py\Daily\Master\Reports >>>>> Note ---- this folder already exists <<<<<
Directory: C:\Users\e201873842\Documents\Py\Daily\Master\sql >>>>> Note ---- this folder already exists <<<<<
Central Config Directory: C:\Users\e201873842\Documents\Py\Daily\config >>>>> Note ---- this folder already exists <<<<<


In [3]:
# Collect the table catalog (database, schema, table, last modified date) from
# every production database on PRODODSSQL and append it to upDate.csv in the
# Master\Reports folder (pathMR).
catalogCsv = os.path.join(pathMR, 'upDate.csv')

# Truncate the file first so the header test below (f.tell() == 0) is true only
# for the first successful database.
with open(catalogCsv, 'w') as f:
    pass

dbList = ['ApplicationData','Assessment','Control','DownstreamFeeds','DownstreamFeeds_V1.5','GCPS_Operations','GSDR','GSDR_Synergy','GSDR_Synergy_Temp','GSDR_Temp','ODS_WebApps','PPROD','Predictive_Analytics']

# sys.tables is joined on object_id. The earlier join on modify_date paired each
# table with every table sharing the same timestamp and relied on DISTINCT to
# collapse the duplicates; the rows returned are the same.
catalogQuery = '''
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
select DISTINCT CONVERT(char(8), GetDate(),112) as TodaysDate, DB_Name() AS DatabaseName, s.name as SchemaName, OBJECT_NAME(o.OBJECT_ID) AS TableName, t.modify_date AS LastUpdated
FROM
SYS.objects o JOIN SYS.schemas s
   ON o.schema_id=s.schema_id
        JOIN sys.dm_db_partition_stats p
   ON o.object_id=p.object_id
        JOIN sys.tables t
   ON o.object_id = t.object_id
WHERE o.type LIKE 'U'
---AND p.row_count > 0
ORDER BY TableName, LastUpdated
'''

for dbName in dbList:
    try:
        # Adjacent literals are concatenated before .format(), so '{{' / '}}'
        # become the literal braces required around the driver name.
        conn = pyodbc.connect('Server=PRODODSSQL;'
                              'Trusted_Connection=yes;'
                              'DRIVER={{SQL Server}};'
                              'Database={0}'.format(dbName))
        try:
            dfp = pd.read_sql_query(catalogQuery, conn)
        finally:
            # Release the connection even when the query fails.
            conn.close()

        # newline='' stops Python translating line endings, and to_csv then
        # writes its default terminator (os.linesep). The bytes on disk match
        # what the old text-mode write with line_terminator='\n' produced, and
        # the call no longer needs that keyword, which pandas 2.0 removed.
        with open(catalogCsv, 'a', newline='') as f:
            dfp.to_csv(f, header=f.tell()==0, index=False)
    except Exception as error:
        # Best effort: report the failure and carry on with the next database
        # instead of silently logging it as collected.
        print('Production Catalog FAILED: ', dbName, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), error)
        continue

    # Timestamp is taken now, not reused from notebook start-up.
    print('Production Catalog collected: ', dbName, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

now01 = datetime.datetime.now()
print('Production System Catalog Data Collection Process is now Complete: ', now01.strftime("%Y-%m-%d %H:%M:%S"))

Production Catalog collected:  ApplicationData 2021-06-14 07:50:50
Production Catalog collected:  Assessment 2021-06-14 07:50:50
Production Catalog collected:  Control 2021-06-14 07:50:50
Production Catalog collected:  DownstreamFeeds 2021-06-14 07:50:50
Production Catalog collected:  DownstreamFeeds_V1.5 2021-06-14 07:50:50
Production Catalog collected:  GCPS_Operations 2021-06-14 07:50:50
Production Catalog collected:  GSDR 2021-06-14 07:50:50
Production Catalog collected:  GSDR_Synergy 2021-06-14 07:50:50
Production Catalog collected:  GSDR_Synergy_Temp 2021-06-14 07:50:50
Production Catalog collected:  GSDR_Temp 2021-06-14 07:50:50
Production Catalog collected:  ODS_WebApps 2021-06-14 07:50:50
Production Catalog collected:  PPROD 2021-06-14 07:50:50
Production Catalog collected:  Predictive_Analytics 2021-06-14 07:50:50
Production System Catalog Data Collection Process is now Complete:  2021-06-14 07:50:52


In [4]:
#Set up table - remove previous records
#Comment out if you want to track changes in table by the TodaysDate column
#TypeError: 'NoneType' object is not iterable --- Solution is: SET ANSI_WARNINGS OFF

#Connect to SQL Server
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=DEVODSSQL;'
                      'Database=Greg;'
                      'Trusted_Connection=yes;')
try:
    cursor = conn.cursor()
    # Rows whose LastUpdated is NULL do not satisfy the > '19000101' predicate
    # and are therefore left in the table.
    cursor.execute(''' 
SET NOCOUNT ON
SET ANSI_WARNINGS OFF
IF EXISTS(SELECT 1 FROM dbo.MasterPRODCatalog)
BEGIN
   DELETE FROM Greg.dbo.MasterPRODCatalog WHERE [LastUpdated] > '19000101';
END
''')
    conn.commit()
finally:
    # Always release the connection, including when the DELETE or commit fails.
    conn.close()

print(' *** Note:  database table Greg.dbo.MasterPRODCatalog cleaned out - step complete ***')

 *** Note:  database table Greg.dbo.MasterPRODCatalog cleaned out - step complete ***


In [5]:
# Import CSV
# os.path.join keeps the separator consistent with how pathMR itself was built.
data = pd.read_csv(os.path.join(pathMR, 'upDate.csv'))
df = pd.DataFrame(data, columns= ['TodaysDate','DatabaseName','SchemaName','TableName','LastUpdated'])

# Connect to SQL Server
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=DEVODSSQL;'
                      'Database=Greg;'
                      'Trusted_Connection=yes;')
try:
    cursor = conn.cursor()

    # Insert DataFrame to Table
    # Plain tuples in column order line up with the five '?' markers.
    rows = list(df.itertuples(index=False, name=None))
    if rows:  # executemany() rejects an empty parameter list
        cursor.executemany('''
                INSERT INTO Greg.dbo.MasterPRODCatalog (TodaysDate,DatabaseName,SchemaName,TableName,LastUpdated)
                VALUES (?,?,?,?,?);
                ''',
                rows)
    # Single commit: either every row of this load is kept or none is.
    conn.commit()

    cursor.close()

    print(' *** Note: csv file written to database table complete ***')

    sql_query = pd.read_sql_query(''' 
select count(*) FROM Greg.dbo.MasterPRODCatalog WHERE [TodaysDate] LIKE convert(varchar, getdate(), 112);
                              '''
                              ,conn)
finally:
    # Release the connection even if the insert or the count query fails.
    conn.close()

now01 = datetime.datetime.now()
print('Processing Step to LOAD MasterCatalog Table is now Complete: ',now01.strftime("%Y-%m-%d %H:%M:%S"))
# Print the count itself, not the one-cell DataFrame that wraps it.
print(' *** Note: count from table data loaded is: ***', sql_query.iat[0, 0])


 *** Note: csv file written to database table complete ***
Processing Step to LOAD MasterCatalog Table is now Complete:  2021-06-14 07:51:03
 *** Note: count from table data loaded is: ***        
0  4310


In [6]:
# Import CSV
# os.path.join keeps the separator consistent with how pathMR itself was built.
data = pd.read_csv(os.path.join(pathMR, 'upDate.csv'))
df = pd.DataFrame(data, columns= ['TodaysDate','DatabaseName','SchemaName','TableName','LastUpdated'])

# Connect to SQL Server
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=DEVODSSQL;'
                      'Database=Greg;'
                      'Trusted_Connection=yes;')
try:
    cursor = conn.cursor()

    # READ ME
    # Insert DataFrame to Master Tracking Table - Analysis to determine systematic changes overwritten by the next step
    # By using this tracking table the IBM team can monitor the gross total changes month by month that are lost by the
    # system catalog being updated daily; loss of that dynamic change can impact decisions on activities for migration
    # DO NOT look at this STEP as a duplicate of the MasterPRODCatalog load. This one TRACKS; the Master table update
    # that follows applies the new changes found

    # Plain tuples in column order line up with the five '?' markers.
    rows = list(df.itertuples(index=False, name=None))
    if rows:  # executemany() rejects an empty parameter list
        cursor.executemany('''
                INSERT INTO Greg.dbo.MasterPRODCatalogTracking (TodaysDate,DatabaseName,SchemaName,TableName,LastUpdated)
                VALUES (?,?,?,?,?);
                ''',
                rows)
    # Single commit: either every row of this load is kept or none is.
    conn.commit()

    cursor.close()

    print(' *** Note: csv file written to database table complete ***')

    sql_query = pd.read_sql_query(''' 
select count(*) FROM Greg.dbo.MasterPRODCatalogTracking WHERE [TodaysDate] LIKE convert(varchar, getdate(), 112);
                              '''
                              ,conn)
finally:
    # Release the connection even if the insert or the count query fails.
    conn.close()

now01 = datetime.datetime.now()
print(' *** LOAD MasterCatalogTracking Table is now Complete: ',now01.strftime("%Y-%m-%d %H:%M:%S"))
# Print the count itself, not the one-cell DataFrame that wraps it.
print(' *** Note: count from table data loaded is: ***', sql_query.iat[0, 0])


 *** Note: csv file written to database table complete ***
 *** LOAD MasterCatalogTracking Table is now Complete:  2021-06-14 07:51:13
 *** Note: count from table data loaded is: ***        
0  4310


In [7]:
#Connect to SQL Server
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=DEVODSSQL;'
                      'Database=Greg;'
                      'Trusted_Connection=yes;')
try:
    cursor = conn.cursor()

    #If it exists, clear out the #tempGreg table left over from a previous use
    # NOTE(review): #tempGreg is a session-local temp table and this connection
    # was just opened, so this looks like a no-op - confirm before removing.
    cursor.execute(''' 
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED
SET NOCOUNT ON
SET ANSI_WARNINGS OFF
IF OBJECT_ID('tempdb..#tempGreg') IS NOT NULL 
BEGIN 
    DROP TABLE #tempGreg
END 
;
''')
    conn.commit()

    #Update the Master table with any tables that have been updated by the ODS programming team(s) and or software processes
    # Rows matched on (DatabaseName, SchemaName, TableName) get the new
    # LastUpdated value; rows missing from Master are inserted.
    cursor.execute(''' 
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED
SET NOCOUNT ON
SET ANSI_WARNINGS OFF
MERGE [Greg].[dbo].[Master] T
USING dbo.[MasterPRODCatalog] S ON T.DatabaseName=S.DatabaseName AND T.SchemaName=S.SchemaName AND T.TableName=S.TableName
WHEN MATCHED THEN
UPDATE SET LastUpdated=S.LastUpdated
WHEN NOT MATCHED BY TARGET THEN 
INSERT (DatabaseName,SchemaName,TableName,LastUpdated)
VALUES (S.DatabaseName,S.SchemaName,S.TableName,S.LastUpdated);
''')
    conn.commit()
finally:
    # Always release the connection, including when a statement or commit fails.
    conn.close()

now01 = datetime.datetime.now()
print(' *** Note:  database table Greg.dbo.Master updated: ', now01.strftime("%Y-%m-%d %H:%M:%S"))

 *** Note:  database table Greg.dbo.Master updated:  2021-06-14 07:51:13


In [8]:
# Stamp the end of the run and report it as YYYY-MM-DD HH:MM:SS.
now01 = datetime.datetime.now()
print(
    'Routine Maintenance Job - Last Updated Field updated - Process Complete: ',
    '{:%Y-%m-%d %H:%M:%S}'.format(now01),
)

Routine Maintenance Job - Last Updated Field updated - Process Complete:  2021-06-14 07:51:13
