In [1]:
#ProgramName:    Daily-03-Load-SchemaCompare-QA-vs-P-DEV
#Purpose:        Find differences in object details between QA and Prod ODS
#Author:         Greg Turmel, Director, Data Governance 
#Date:           2020.08.30 - 2021.06.30
#Errata:         0.1 Improvements can be made to script using for/looping through the databases

import os, sys, argparse, csv, pyodbc, sql, time, datetime
import sqlalchemy as db
import errno, pathlib2

from dotenv import load_dotenv # add this line
import pandas as pd
import numpy as np

load_dotenv() # add this line
user = os.getenv('MySQLeUser')
password = os.getenv('MySQLeUserPass')
host = os.getenv('MySQLeHOST')
db = os.getenv('MySQLeDB')

In [2]:
# This segment builds the appropriatte file system structure as a variable driven exercise
# Take time to set your 'eeeeeeeeee' number as variable 'pn' below 
# and set the program directory variable called 'programDirectory' before running
# ===================================================================================================================
pn = r'eeeeeeeeee'            #This represents the windows system employee login folder - IBM team uses a 9 number
# ===================================================================================================================

programDirectory = 'Daily' # Update this variable to wherever you want the program subfolder/files to be located 
un = r'C:\Users'
cn = r'Documents\Py'
an = r'Compare\Process'
bn = r'Compare\Reports'

sn = r'Compare\sql'
tn = r'config'

#Set a parent directory
parentDirectory = "{}\{}\{}".format(un,pn,cn)
print('Parent Directory is: ', parentDirectory)
mode = 0o666

#Set path location for working with local file(s)
path = os.path.join(parentDirectory, programDirectory,)
pathCP = os.path.join(parentDirectory, programDirectory, an)
pathCR = os.path.join(parentDirectory, programDirectory, bn)
pathCS = os.path.join(parentDirectory, programDirectory, sn)
pathCT = os.path.join(parentDirectory, programDirectory, tn)

procpath = pathCP
csvpath = pathCR
sqlpath = pathCS
configpath = pathCT

try:
    if not os.path.exists(path):
        os.makedirs(path, mode)
        print('Program Directory subfolder has been created: ', programDirectory)
    else:
        print('Directory:', programDirectory, '>>>>> Note ---- this folder already exists <<<<<')
except OSError as error:
    print(error)
    pass

try:
    if not os.path.exists(pathCP):
        os.makedirs(pathCP, mode)
        print('Program Directory subfolder has been created: ', pathCP)
    else:
        print('Directory:', pathCP, '>>>>> Note ---- this folder already exists <<<<<')
except OSError as error:
    print(error)
    pass

try:
    if not os.path.exists(pathCR):
        os.makedirs(pathCR, mode)
        print('Program Directory subfolder has been created: ', pathCR)
    else:
        print('Directory:', pathCR, '>>>>> Note ---- this folder already exists <<<<<')
except OSError as error:
    print(error)
    pass

try:
    if not os.path.exists(pathCS):
        os.makedirs(pathCS, mode)
        print('Program Directory subfolder has been created: ', pathCS)
    else:
        print('Directory:', pathCS, '>>>>> Note ---- this folder already exists <<<<<')
except OSError as error:
    print(error)
    pass

try:
    if not os.path.exists(pathCT):
        os.makedirs(pathCT, mode)
        print('Program Directory subfolder has been created: ', pathCT)
    else:
        print('Directory:', pathCT, '>>>>> Note ---- this folder already exists <<<<<')
except OSError as error:
    print(error)
    pass

Parent Directory is:  C:\Users\e201873842\Documents\Py
Directory: Daily >>>>> Note ---- this folder already exists <<<<<
Directory: C:\Users\e201873842\Documents\Py\Daily\Compare\Process >>>>> Note ---- this folder already exists <<<<<
Directory: C:\Users\e201873842\Documents\Py\Daily\Compare\Reports >>>>> Note ---- this folder already exists <<<<<
Directory: C:\Users\e201873842\Documents\Py\Daily\Compare\sql >>>>> Note ---- this folder already exists <<<<<
Directory: C:\Users\e201873842\Documents\Py\Daily\config >>>>> Note ---- this folder already exists <<<<<


In [3]:
################################################################################
# Variables: 
################################################################################
now01 = datetime.datetime.now()
csvtype = 'Base' # csvtype - use 'Test or Base'
Dbug = 'Off'     # Debugging - use On or Off  
# Process files in procpath
file00 = 'dbListC.csv'
file01 = 'dbListN.csv'
file02 = 'tblListC.csv'
file03 = 'tblListN.csv'
file04 = 'schemaListC.csv'
file05 = 'schemaListN.csv'
# Report files in csvpath
file06 = 'compareReport0.csv'
file07 = 'compareReport1.csv'
file08 = 'compareReport2.csv'
# Report files in csvpath
file16 = 'compareReport10.csv'
file17 = 'compareReport11.csv'
file18 = 'compareReport12.csv'
# Report files in csvpath
file19 = 'collectionP1.csv'
file20 = 'collectionQ1.csv'

In [4]:
################################################################################
# Specify a path for reports: Make sure output location is available for write 
# Built this for enhancements and more automation 
# Note: currently the specificly named files below are the only ones used
################################################################################
now01 = datetime.datetime.now()
irange = (1,2)
for i in irange:
    with open(os.path.join(csvpath, 'compareReport{0}.csv').format(i), 'w') as fp: pass 

krange = (0,1,2)
for k in krange:
    with open(os.path.join(sqlpath, 'sqlfile0{0}.sql').format(k), 'w') as fp: pass

    with open(os.path.join(configpath, 'dbListC.csv'), 'w') as fp: pass
    with open(os.path.join(configpath, 'dbListN.csv'), 'w') as fp: pass
    with open(os.path.join(configpath, 'tblListC.csv'), 'w') as fp: pass
    with open(os.path.join(configpath, 'tblListN.csv'), 'w') as fp: pass
    with open(os.path.join(configpath, 'schemaListC.csv'), 'w') as fp: pass
    with open(os.path.join(configpath, 'schemaListN.csv'), 'w') as fp: pass
    
    with open(os.path.join(procpath, 'collectionP1.csv'), 'w') as fp: pass
    with open(os.path.join(procpath, 'collectionQ1.csv'), 'w') as fp: pass

now01 = datetime.datetime.now()
print('Processing Complete: ',now01.strftime("%Y-%m-%d %H:%M:%S"))

Processing Complete:  2021-06-14 07:25:09


In [5]:
#Connect to SQL Server
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=DEVODSSQL;'
                      'Database=Greg;'
                      'Trusted_Connection=yes;')

sql_query = pd.read_sql_query(''' 
select DISTINCT DatabaseName FROM Greg.dbo.Master WHERE Core Like 'Core' ORDER BY 1
                              '''
                              ,conn) # here, the 'conn' is the variable that contains your database connection information from above

df = pd.DataFrame(sql_query)
with open(os.path.join(configpath, 'dbListC.csv'), 'w') as f:
    df.to_csv(f, header=None, index=False, line_terminator='\n')
   
sql_query = pd.read_sql_query(''' 
select DISTINCT SchemaName FROM Greg.dbo.Master WHERE Core Like 'Core' ORDER BY 1
                              '''
                              ,conn) # here, the 'conn' is the variable that contains your database connection information from above

df = pd.DataFrame(sql_query)
with open(os.path.join(configpath, 'schemaListC.csv'), 'w') as f:
    df.to_csv(f, header=None, index=False, line_terminator='\n')
   
sql_query = pd.read_sql_query(''' 
select DISTINCT TableName FROM Greg.dbo.Master WHERE Core Like 'Core' ORDER BY 1
                              '''
                              ,conn) # here, the 'conn' is the variable that contains your database connection information from above

df = pd.DataFrame(sql_query)
with open(os.path.join(configpath, 'tblListC.csv'), 'w') as f:
    df.to_csv(f, header=None, index=False, line_terminator='\n')
   
conn.close()

now01 = datetime.datetime.now()
print('Core Source files created: ',now01.strftime("%Y-%m-%d %H:%M:%S"))


Core Source files created:  2021-06-14 07:25:10


In [6]:
#Connect to SQL Server
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=DEVODSSQL;'
                      'Database=Greg;'
                      'Trusted_Connection=yes;')

sql_query = pd.read_sql_query(''' 
select DISTINCT DatabaseName FROM Greg.dbo.Master WHERE Core Like 'Non' ORDER BY 1
                              '''
                              ,conn) # here, the 'conn' is the variable that contains your database connection information from above

df = pd.DataFrame(sql_query)
with open(os.path.join(configpath, 'dbListN.csv'), 'w') as f:
    df.to_csv(f, header=None, index=False, line_terminator='\n')
   
sql_query = pd.read_sql_query(''' 
select DISTINCT SchemaName FROM Greg.dbo.Master WHERE Core Like 'Non' ORDER BY 1
                              '''
                              ,conn) # here, the 'conn' is the variable that contains your database connection information from above

df = pd.DataFrame(sql_query)
with open(os.path.join(configpath, 'schemaListN.csv'), 'w') as f:
    df.to_csv(f, header=None, index=False, line_terminator='\n')
   
sql_query = pd.read_sql_query(''' 
select DISTINCT TableName FROM Greg.dbo.Master WHERE Core Like 'Non' ORDER BY 1
                              '''
                              ,conn) # here, the 'conn' is the variable that contains your database connection information from above

df = pd.DataFrame(sql_query)
with open(os.path.join(configpath, 'tblListN.csv'), 'w') as f:
    df.to_csv(f, header=None, index=False, line_terminator='\n')
   
conn.close()

now01 = datetime.datetime.now()
print('Non-Core Source files created: ',now01.strftime("%Y-%m-%d %H:%M:%S"))


Non-Core Source files created:  2021-06-14 07:25:10


In [7]:
# Connect to SQL Server
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=DEVODSSQL;'
                      'Database=Greg;'
                      'Trusted_Connection=yes;')

sql_query = pd.read_sql_query(''' 
select * FROM Greg.dbo.Master WHERE Core Like 'Core' ORDER BY 'DatabaseName', 'SchemaName', 'TableName'
                              '''
                              ,conn) # here, the 'conn' is the variable that contains your database connection information from above

df = pd.DataFrame(sql_query)
with open(os.path.join(configpath, 'coreSourceBase.csv'), 'w') as f:
    df.to_csv(f, header=f.tell()==0, index=False, line_terminator='\n')
   
conn.close()

now01 = datetime.datetime.now()
print('Core Source file created: ',now01.strftime("%Y-%m-%d %H:%M:%S"))


Core Source file created:  2021-06-14 07:25:10


In [8]:
# Connect to SQL Server
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=DEVODSSQL;'
                      'Database=Greg;'
                      'Trusted_Connection=yes;')

sql_query = pd.read_sql_query(''' 
select * FROM Greg.dbo.Master WHERE Core Like 'Non' ORDER BY 'DatabaseName', 'SchemaName', 'TableName'
                              '''
                              ,conn) # here, the 'conn' is the variable that contains your database connection information from above

df = pd.DataFrame(sql_query)
with open(os.path.join(configpath, 'noncoreSourceBase.csv'), 'w') as f:
    df.to_csv(f, header=f.tell()==0, index=False, line_terminator='\n')
   
conn.close()

now01 = datetime.datetime.now()
print('Core Source file created: ',now01.strftime("%Y-%m-%d %H:%M:%S"))


Core Source file created:  2021-06-14 07:25:10


In [9]:
#Create CORE and NonCore source files from analysis
#Need: (copy from or unpack Py.zip file to local drive)
#1. coreSourceBase.csv 
#2. coreSourceTest.csv
#3. noncoreSourceBase.csv
#4. noncoreSourceTest.csv

# Read excel file with core source tables
f1a = pd.read_csv("{}/{}".format(pathCT, 'coreSource'+ csvtype +'.csv', encoding='unicode_escape'))
f1b = pd.read_csv("{}/{}".format(pathCT, 'noncoreSource'+ csvtype +'.csv', encoding='unicode_escape'))

#Core                               # (Core) Create dataframe with what's found in f1 dataframe
f1c = f1a['DatabaseName']
f1d = f1a['TableName']
f1g = f1a['SchemaName']

#NONCore                            # (Non) Create dataframe with what is found in f1 dataframe
f1e = f1b['DatabaseName']
f1f = f1b['TableName']
f1h = f1b['SchemaName']

#Core                               # Create dataframe without dupes and use in connection for-loop
dbListC = f1c.drop_duplicates() 
tblListC = f1d.drop_duplicates()
schemaListC = f1g.drop_duplicates()

#NONCore                            # Create dataframe without dupes and use in connection for-loop
dbListN = f1e.drop_duplicates()     
tblListN = f1f.drop_duplicates()    
schemaListN = f1h.drop_duplicates() 

In [10]:
#Sanity check the dataframe 
dbListC.head()

0     ApplicationData
2          Assessment
17               GSDR
Name: DatabaseName, dtype: object

In [11]:
for x in dbListC:
    try:
        conn = pyodbc.connect('Server=PRODODSSQL;'
                              'Trusted_Connection=yes;'
                              'DRIVER={{SQL Server}};'
                              'Database={0}'.format(x))

        sql_query = pd.read_sql_query(''' 
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
SELECT DISTINCT CONVERT(char(8), GetDate(),112) as TodaysDate, TABLE_CATALOG as DatabaseName, TABLE_SCHEMA as SchemaName, TABLE_NAME as TableName, COLUMN_NAME as ColumnName, ORDINAL_POSITION as OrdinalPosition, IS_NULLABLE as IsNullable, DATA_TYPE as DataType
FROM INFORMATION_SCHEMA.COLUMNS
ORDER BY TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, ORDINAL_POSITION;
                              '''
                              ,conn) # here, the 'conn' is the variable that contains your database connection information from above

        dfp = pd.DataFrame(sql_query)
        with open(os.path.join(pathCP, 'collectionP1.csv'), 'a') as f:
            dfp.to_csv(f, header=f.tell()==0, index=False, line_terminator='\n')

        conn.close()
    except:
        continue
    finally:
        print('Production CORE collected: ', x, now01.strftime("%Y-%m-%d %H:%M:%S"))
        continue

now01 = datetime.datetime.now()
print('Production CORE Collection Process Complete: ', now01.strftime("%Y-%m-%d %H:%M:%S"))

for x in dbListC:
    try:
        conn = pyodbc.connect('Server=QAODSSQL;'
                              'Trusted_Connection=yes;'
                              'DRIVER={{SQL Server}};'
                              'Database={0}'.format(x))

        sql_query = pd.read_sql_query(''' 
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
SELECT DISTINCT CONVERT(char(8), GetDate(),112) as TodaysDate, TABLE_CATALOG as DatabaseName, TABLE_SCHEMA as SchemaName, TABLE_NAME as TableName, COLUMN_NAME as ColumnName, ORDINAL_POSITION as OrdinalPosition, IS_NULLABLE as IsNullable, DATA_TYPE as DataType
FROM INFORMATION_SCHEMA.COLUMNS
ORDER BY TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, ORDINAL_POSITION;
                              '''
                              ,conn) # here, the 'conn' is the variable that contains your database connection information from above

        dfq = pd.DataFrame(sql_query)
        with open(os.path.join(pathCP, 'collectionQ1.csv'), 'a') as f:
            dfq.to_csv(f, header=f.tell()==0, index=False, line_terminator='\n')

        conn.close()
    except:
        continue
    finally:
        print('QA CORE collected: ', x, now01.strftime("%Y-%m-%d %H:%M:%S"))
        continue

now01 = datetime.datetime.now()
print('QA CORE Collection Process Complete: ', now01.strftime("%Y-%m-%d %H:%M:%S"))

Production CORE collected:  ApplicationData 2021-06-14 07:25:10
Production CORE collected:  Assessment 2021-06-14 07:25:10
Production CORE collected:  GSDR 2021-06-14 07:25:10
Production CORE Collection Process Complete:  2021-06-14 07:25:12
QA CORE collected:  ApplicationData 2021-06-14 07:25:12
QA CORE collected:  Assessment 2021-06-14 07:25:12
QA CORE collected:  GSDR 2021-06-14 07:25:12
QA CORE Collection Process Complete:  2021-06-14 07:25:17


#LOAD excel collections into sp and sq dataframes for debug and process evaluations

In [12]:
df3 = pd.merge(dfp, dfq, how='outer', indicator='Exist')
df3 = df3.loc[df3['Exist'] != 'both']

In [13]:
with open(os.path.join(pathCR, 'CompareReport1.csv'), 'a') as f:
    df3.to_csv(f, header=f.tell()==0, index=False, line_terminator='\n')

In [14]:
df4 = pd.merge(dfp, dfq, how='outer', indicator='Exist')
df4 = df4.loc[df4['Exist'] == 'both']

In [15]:
with open(os.path.join(pathCR, 'CompareReport2.csv'), 'a') as f:
    df4.to_csv(f, header=f.tell()==0, index=False, line_terminator='\n')

In [16]:
# LOAD Daily results into Compare Tables (Difference & Match Exists) 

In [17]:
# Import CSV
data = pd.read_csv("{}/{}".format(pathCR, 'CompareReport1.csv'), encoding='unicode_escape')   
df = pd.DataFrame(data, columns= ['TodaysDate','DatabaseName','SchemaName','TableName','ColumnName','OrdinalPosition','IsNullable','DataType','Exist'])
df2 = df.fillna(value=0)
#Sanity check the dataframe --- uncomment out (df head) to see results during processing
#df2.head()

In [18]:
df2.columns.values

array(['TodaysDate', 'DatabaseName', 'SchemaName', 'TableName',
       'ColumnName', 'OrdinalPosition', 'IsNullable', 'DataType', 'Exist'],
      dtype=object)

In [19]:
df2.columns

Index(['TodaysDate', 'DatabaseName', 'SchemaName', 'TableName', 'ColumnName',
       'OrdinalPosition', 'IsNullable', 'DataType', 'Exist'],
      dtype='object')

In [20]:
# Import CSV
data = pd.read_csv("{}/{}".format(pathCR, 'CompareReport1.csv'), encoding='unicode_escape')   
df = pd.DataFrame(data, columns= ['TodaysDate','DatabaseName','SchemaName','TableName','ColumnName','OrdinalPosition','IsNullable','DataType','Exist'])
df2 = df.fillna(value=0)

# Connect to SQL Server
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=DEVODSSQL;'
                      'Database=Greg;'
                      'Trusted_Connection=yes;')
cursor = conn.cursor()

# Insert DataFrame to Table
for row in df2.itertuples():
    cursor.execute('''
                INSERT INTO Greg.dbo.CompareDifference (TodaysDate,DatabaseName,SchemaName,TableName,ColumnName,OrdinalPosition,IsNullable,DataType,Exist)
                VALUES (?,?,?,?,?,?,?,?,?);
                ''',
                row.TodaysDate,
                row.DatabaseName,
                row.SchemaName,
                row.TableName,
                row.ColumnName,
                row.OrdinalPosition,
                row.IsNullable,
                row.DataType,
                row.Exist
                )
conn.commit()

cursor.close()

print(' *** Note: csv file written to database table complete ***')

sql_query = pd.read_sql_query(''' 
select count(*) from Greg.dbo.CompareDifference
                              '''
                              ,conn) # here, the 'conn' is the variable that contains your database connection information from above
conn.close()
print(' *** Note: count from table data loaded is: ***', sql_query)
now01 = datetime.datetime.now()
print('Processing Complete: ',now01.strftime("%Y-%m-%d %H:%M:%S"))

 *** Note: csv file written to database table complete ***
 *** Note: count from table data loaded is: ***          
0  110799
Processing Complete:  2021-06-14 07:25:27


In [21]:
# Import CSV
data = pd.read_csv("{}/{}".format(pathCR, 'CompareReport2.csv'), encoding='unicode_escape')   
df = pd.DataFrame(data, columns= ['TodaysDate','DatabaseName','SchemaName','TableName','ColumnName','OrdinalPosition','IsNullable','DataType','Exist'])
df2 = df.fillna(value=0)

# Connect to SQL Server
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=DEVODSSQL;'
                      'Database=Greg;'
                      'Trusted_Connection=yes;')
cursor = conn.cursor()

# Insert DataFrame to Table
for row in df2.itertuples():
    cursor.execute('''
                INSERT INTO Greg.dbo.CompareMatchExists (TodaysDate,DatabaseName,SchemaName,TableName,ColumnName,OrdinalPosition,IsNullable,DataType,Exist)
                VALUES (?,?,?,?,?,?,?,?,?);
                ''',
                row.TodaysDate,
                row.DatabaseName,
                row.SchemaName,
                row.TableName,
                row.ColumnName,
                row.OrdinalPosition,
                row.IsNullable,
                row.DataType,
                row.Exist
                )
conn.commit()

cursor.close()

print(' *** Note: csv file written to database table complete ***')

sql_query = pd.read_sql_query(''' 
select count(*) from Greg.dbo.CompareMatchExists
                              '''
                              ,conn) # here, the 'conn' is the variable that contains your database connection information from above
conn.close()
print(' *** Note: count from table data loaded is: ***', sql_query)
now01 = datetime.datetime.now()
print('Processing Complete: ',now01.strftime("%Y-%m-%d %H:%M:%S"))

 *** Note: csv file written to database table complete ***
 *** Note: count from table data loaded is: ***          
0  975631
Processing Complete:  2021-06-14 07:26:36
