Data QC Utility D2D

In [21]:
import os
import numpy as np 
import pandas as pd 
from io import BytesIO
from fuzzywuzzy import process
import tableauserverclient as TSC 

In [22]:
def tableau_authentication(user_name, password, site_id, tableau_cloud_url):
    """Create the Tableau credentials object and the server handle.

    Signing in is left to the caller (via ``server.auth.sign_in(tableau_auth)``).

    Parameters:
        user_name: Tableau account user name.
        password: Tableau account password.
        site_id: Identifier of the Tableau site to connect to.
        tableau_cloud_url: Base URL of the Tableau server / Tableau Cloud pod.

    Returns:
        A ``(tableau_auth, server)`` tuple: the ``TSC.TableauAuth`` credentials
        and the ``TSC.Server`` built for ``tableau_cloud_url``.
    """
    credentials = TSC.TableauAuth(user_name, password, site_id)
    # use_server_version=True is forwarded unchanged to the TSC.Server constructor.
    connection = TSC.Server(tableau_cloud_url, use_server_version=True)
    return credentials, connection

In [23]:
def download_parameter_data(workbook_name, parameter_name, parameter_value, save_location, tableau_auth, server):
    """Export every view of one workbook to Excel with a filter value applied.

    Signs in to the server, locates the workbook whose name equals
    ``workbook_name``, and for each of its views requests the CSV data with
    ``parameter_name=parameter_value`` applied through the request's ``vf``
    option. Each view is de-duplicated and written to
    ``<save_location>/<parameter_value> <sheet name>.xlsx``.

    Parameters:
        workbook_name: Exact name of the workbook to export.
        parameter_name: Name passed to ``CSVRequestOptions.vf``.
        parameter_value: Value passed to ``CSVRequestOptions.vf``; also used as
            the file-name prefix.
        save_location: Directory that receives the Excel files (created if missing).
        tableau_auth: Credentials object used for ``server.auth.sign_in``.
        server: ``TSC.Server`` instance to query.

    Raises:
        ValueError: If no workbook named ``workbook_name`` exists on the server.
    """
    with server.auth.sign_in(tableau_auth):  # Server login
        # server.workbooks.get() only returns the first page of results, so a
        # workbook further down the listing was wrongly reported as missing.
        # TSC.Pager transparently walks every page.
        target_workbook = next(
            (wb for wb in TSC.Pager(server.workbooks) if wb.name == workbook_name),
            None,
        )
        if target_workbook is None:
            raise ValueError(f"Workbook {workbook_name} not found.")

        server.workbooks.populate_views(target_workbook)  # Load the workbook's sheets
        os.makedirs(save_location, exist_ok=True)  # to_excel does not create missing folders

        for target_view in target_workbook.views:
            sheet_name = target_view.name
            options = TSC.CSVRequestOptions()
            options.vf(parameter_name, parameter_value)
            try:
                server.views.populate_csv(target_view, req_options=options)
            except TSC.ServerResponseError as e:
                print(f"Error querying data for Parameter {parameter_value}: {e}")
                continue

            # target_view.csv yields byte chunks; join them into one in-memory file.
            csv_data = BytesIO(b''.join(target_view.csv))
            try:
                df = pd.read_csv(csv_data)
            except pd.errors.EmptyDataError:
                # A view that returns no data must not abort the remaining views.
                print(f"No data returned for sheet {sheet_name} with Parameter {parameter_value}; skipped.")
                continue

            df = df.drop_duplicates()  # Remove duplicate rows before saving
            file_name = os.path.join(save_location, f"{parameter_value} {sheet_name}.xlsx")
            df.to_excel(file_name, index=False)
    print(f"{parameter_value} data downloaded for {workbook_name} to location! ")

In [24]:
def save_data(control_df, save_location, tableau_auth, server):
    """Download the data for every row of a control table.

    Rows are processed in order. Processing stops at the first row whose
    ``'Dashboard'`` value is missing, which marks the end of the filled-in rows.

    Parameters:
        control_df: DataFrame with the columns ``'Dashboard'``,
            ``'Parameter Name'`` and ``'Parameter Value'``.
        save_location: Directory passed through to ``download_parameter_data``.
        tableau_auth: Credentials passed through to ``download_parameter_data``.
        server: Server handle passed through to ``download_parameter_data``.
    """
    # Iterate positionally so the function does not depend on the frame having
    # a 0..n-1 index (the previous .loc[f, ...] lookup did).
    control_rows = zip(
        control_df['Dashboard'],
        control_df['Parameter Name'],
        control_df['Parameter Value'],
    )
    for workbook_name, parameter_name, parameter_value in control_rows:
        # pd.isna recognises every missing-value flavour (None, float nan,
        # np.float64 nan, pd.NA); an identity test against np.nan does not.
        if pd.isna(workbook_name):
            break
        download_parameter_data(workbook_name, parameter_name, parameter_value, save_location, tableau_auth, server)

In [25]:
def compare_data(source_workbook_data_location, QC_workbook_data_location, compared_data_location):
    """Compare same-named Excel files from two folders and save the differences.

    For every file name present in both folders (names containing
    ``'Dashboard'`` are skipped) the two workbooks are read and compared with
    ``DataFrame.compare``. When they differ, the result is written to
    ``compared_data_location`` under the same file name, in a sheet called
    ``'Differences'`` with the row index included. When they are identical
    nothing is written, and a result file left over from an earlier run is removed.

    Errors for a single file (unreadable workbook, frames that cannot be
    compared, write failure) are printed and do not stop the remaining files.

    Parameters:
        source_workbook_data_location: Folder holding the source workbook exports.
        QC_workbook_data_location: Folder holding the QC workbook exports.
        compared_data_location: Folder that receives the comparison workbooks.

    Returns:
        None.
    """
    source_files = {f for f in os.listdir(source_workbook_data_location) if os.path.isfile(os.path.join(source_workbook_data_location, f))}
    QC_files = {f for f in os.listdir(QC_workbook_data_location) if os.path.isfile(os.path.join(QC_workbook_data_location, f))}
    common_files = sorted(source_files & QC_files)  # sorted: deterministic processing order
    if not common_files:
        print("No common files to compare.")
        return
    for file_name in common_files:
        if 'Dashboard' in file_name:
            continue
        source_file_path = os.path.join(source_workbook_data_location, file_name)
        QC_file_path = os.path.join(QC_workbook_data_location, file_name)
        output_path = os.path.join(compared_data_location, file_name)
        try:
            sdf = pd.read_excel(source_file_path)
            qcdf = pd.read_excel(QC_file_path)
            if sdf.equals(qcdf):
                # Decide before opening a writer: a writer closed without any
                # sheet fails and can leave an unreadable file behind.
                print(f"The {file_name} files have no difference and the data will not be saved!!")
                if os.path.exists(output_path):
                    os.remove(output_path)  # drop a stale result from a previous run
                continue
            comparision_result = sdf.compare(qcdf)
            with pd.ExcelWriter(output_path) as writer:
                comparision_result.to_excel(writer, sheet_name='Differences', index=True)
        except Exception as e:
            print(f"Error comparing '{file_name}': {e}")
    print(f"Comparison result are saved!!")

In [26]:
def get_group(file_name, path):  # Function to get the group name
    """Return the non-missing values of the column a sheet is grouped by.

    The grouping measure is taken from the third space-separated token of
    ``file_name`` (with ``.xlsx`` removed). It is fuzzy-matched against the
    column names of the workbook ``<path>/<file_name>``, and the values of the
    best-matching column are returned.

    Parameters:
        file_name: Name of the Excel file; must contain at least three
            space-separated tokens.
        path: Directory expected to contain ``file_name``.

    Returns:
        A list with the non-missing values of the matched column, or an empty
        list when the file does not exist in ``path`` or has no columns to match.

    Raises:
        IndexError: If ``file_name`` has fewer than three space-separated tokens.
    """
    file_path = os.path.join(path, file_name)
    if not os.path.isfile(file_path):
        # Always hand back a list so callers can index or measure the result.
        return []

    measure = file_name.split(' ')[2].strip().replace('.xlsx', '')
    df = pd.read_excel(file_path)
    best_match = process.extractOne(measure, df.columns)
    if best_match is None:  # nothing to match against
        return []

    # dropna() removes every kind of missing value; an identity test against
    # np.nan misses NaNs that are not that exact object.
    return df[best_match[0]].dropna().tolist()

In [27]:
def output(compared_data_location, output_file_location, source_workbook_data_location, control_df):
    """Collect the differing 'Measure Calculation' values into one feed file.

    Every comparison workbook in ``compared_data_location`` that has a
    ``'Measure Calculation'`` column followed by another column is read. The
    first of the two holds the source values and the next one the QC values;
    the literal header cells ``'self'`` / ``'other'`` are ignored. One output
    row is produced for each sheet row where at least one of the two values is
    present, so source and QC values always come from the same row.

    The group label for a row is looked up in the list returned by
    ``get_group`` using the row label stored in the first column of the
    comparison workbook. If ``'Measure Calculation'`` is itself the first
    column, the position of the difference is used instead. A label that
    cannot be resolved yields NaN.

    The result is written to ``<output_file_location>/Output Feed.xlsx``.

    Parameters:
        compared_data_location: Folder holding the comparison workbooks.
        output_file_location: Folder that receives ``Output Feed.xlsx``.
        source_workbook_data_location: Folder passed to ``get_group``.
        control_df: Control table; the first ``'QC Dashboard'`` value is used
            as the dashboard name of every output row.

    Raises:
        ValueError: If a compared value cannot be converted to a number.
    """

    def _to_float(value):
        # Cells may be text such as '1,234.5' or already numeric; str() makes
        # the thousands-separator removal safe for both.
        return float(str(value).replace(',', ''))

    def _group_at(group, key):
        # Resolve a row label / position to a group value; NaN when impossible.
        try:
            position = int(key)
        except (TypeError, ValueError):
            return np.nan
        return group[position] if 0 <= position < len(group) else np.nan

    feed_columns = [
        'Dashboard Name',
        'Chart Name',
        'KPI',
        'Data Measure',
        'Source Workbook Data Points',
        'QC Workbook Data Points',
    ]
    # Same for every file, so read it once.
    dashboard_name = control_df['QC Dashboard'].iloc[0] if len(control_df) else np.nan

    files = sorted(
        f for f in os.listdir(compared_data_location)
        if os.path.isfile(os.path.join(compared_data_location, f))
    )
    diff_data = []
    for file in files:
        df = pd.read_excel(os.path.join(compared_data_location, file))
        if 'Measure Calculation' not in df.columns:
            continue
        colindex = df.columns.get_loc('Measure Calculation')
        if colindex + 1 >= len(df.columns):  # no QC column next to it
            continue
        qcdb_name = df.columns[colindex + 1]

        # Blank out the 'self' / 'other' sub-header cells, then keep the rows
        # where at least one side carries a value.
        source_values = df['Measure Calculation'].mask(df['Measure Calculation'] == 'self')
        qc_values = df[qcdb_name].mask(df[qcdb_name] == 'other')
        changed = (source_values.notna() | qc_values.notna()).to_numpy()
        if not changed.any():
            continue

        group = get_group(file, source_workbook_data_location) or []
        # NOTE(review): assumes the first column holds the original row labels,
        # as written by to_excel(index=True) — confirm against the comparison files.
        row_labels = df.iloc[:, 0] if colindex > 0 else None
        kpi_name = file.split(' ')[0]  # KPI value of the sheet being processed
        chart_name = file.replace('.xlsx', '')  # Chart name of the sheet being processed

        for position, row in enumerate(df.index[changed]):
            key = row_labels.loc[row] if row_labels is not None else position
            diff_data.append({
                'Dashboard Name': dashboard_name,
                'Chart Name': chart_name,
                'KPI': kpi_name,
                'Data Measure': _group_at(group, key),
                'Source Workbook Data Points': _to_float(source_values.loc[row]),
                'QC Workbook Data Points': _to_float(qc_values.loc[row]),
            })

    # Passing the column list keeps the headers even when there are no differences.
    odf = pd.DataFrame(diff_data, columns=feed_columns)
    output_file = os.path.join(output_file_location, "Output Feed.xlsx")
    odf.to_excel(output_file, index=False)  # Saving output
    print("Results are Saved!!")

In [28]:
# Locations of the Source Workbook, QC Workbook and Compared data folders, the
# Output Feed folder, the Control File and the Tableau Auth Details file.
# Every path is derived from one project root so they cannot drift apart
# (the QC folder used to be the only relative path and depended on the kernel's
# working directory).
project_root = "/workspaces/Tableau_Python"
output_file_location = f"{project_root}/Workbook Data/Dashboard to Dashboard"
source_workbook_data_location = f"{output_file_location}/Source Workbook Data"
QC_workbook_data_location = f"{output_file_location}/QC Workbook Data"
compared_data_location = f"{output_file_location}/Compared Data"
tableau_authentication_details_file_location = f"{project_root}/Tableau Auth Details.txt"
control_file_location = f"{project_root}/Control_File.xlsx"

In [29]:
# Load the Tableau sign-in details from the auth details text file.
# Lines are read in a fixed order: user name, password, site id, cloud URL.
# Each value is the stripped line with its label text removed.
with open(tableau_authentication_details_file_location, 'r') as details:
    user_details = list(details)
user_name, password, site_id, tableau_cloud_url = (
    user_details[line_number].strip().replace(label, '')
    for line_number, label in enumerate(('User Name: ', 'Password: ', 'Site-ID: ', 'Cloud URL: '))
)

In [30]:
# Load the control file and number its rows from 0.
control_df = pd.read_excel(control_file_location)
control_df['row_number'] = range(len(control_df))

# Source workbook details.
control_source_df = control_df.loc[:, ["row_number", "Dashboard", "Parameter Name", "Parameter Value"]]

# QC workbook details, renamed to the same column names as the source details
# so both frames can be handled by the same code.
control_QC_df = control_df.loc[
    :, ["row_number", "QC Dashboard", "QC Parameter Name", "QC Parameter Value"]
].rename(
    columns={
        'QC Dashboard': 'Dashboard',
        'QC Parameter Name': 'Parameter Name',
        'QC Parameter Value': 'Parameter Value',
    }
)

In [31]:
# Build the credentials and server handle used by the download step.
tableau_auth, server = tableau_authentication(
    user_name=user_name,
    password=password,
    site_id=site_id,
    tableau_cloud_url=tableau_cloud_url,
)

In [None]:
# Download the Source Workbook data.
save_data(
    control_df=control_source_df,
    save_location=source_workbook_data_location,
    tableau_auth=tableau_auth,
    server=server,
)
# Download the QC Workbook data.
save_data(
    control_df=control_QC_df,
    save_location=QC_workbook_data_location,
    tableau_auth=tableau_auth,
    server=server,
)

In [None]:
# Compare the downloaded Source and QC files and save the differences.
compare_data(
    source_workbook_data_location=source_workbook_data_location,
    QC_workbook_data_location=QC_workbook_data_location,
    compared_data_location=compared_data_location,
)

In [None]:
# Build and save the Output Feed file for Tableau.
output(
    compared_data_location=compared_data_location,
    output_file_location=output_file_location,
    source_workbook_data_location=source_workbook_data_location,
    control_df=control_df,
)