In [1]:
import data_fingerprint

In [2]:
# Define the function for generating fingerprints
def generate_fingerprints(file_path, data_fingerprint):
    """Compute both fingerprint variants for one file.

    Parameters
    ----------
    file_path
        Path of the file to fingerprint; handed to the library unchanged.
    data_fingerprint
        Object exposing ``process_file_with_order_dependent_fingerprint`` and
        ``process_file_with_order_independent_fingerprint``.

    Returns
    -------
    tuple
        ``(order_dependent, order_independent)``, in that order. The
        order-dependent call is made first.
    """
    return (
        data_fingerprint.process_file_with_order_dependent_fingerprint(file_path),
        data_fingerprint.process_file_with_order_independent_fingerprint(file_path),
    )

In [3]:
import warnings
import pandas as pd
import os

# Suppress all warnings for the rest of the session
warnings.filterwarnings('ignore')

# Test numbers; each one selects a "file{num}" family on disk
nums = list(range(1, 6))

# Human-readable label for each test number (numbered from 1)
test_names = dict(enumerate(
    [
        "Standard csv",
        "European format csv",
        "Mixed Formats csv",
        "csv with missing Data",
        "csv without header and unusual delimiter",
    ],
    start=1,
))

# Folder that holds the validation files
file_directory = "validation_files/"

# Function to apply conditional formatting
def highlight_status(val):
    """Return the CSS colour rule for a single status cell.

    ``'Pass'`` maps to green; every other value maps to red. Used below as
    the element-wise callback when styling the status columns.
    """
    if val == 'Pass':
        return 'color: green'
    return 'color: red'

# Function to extract the file extension
def get_file_extension(file_path):
    """Return the final extension of ``file_path`` (leading dot included).

    Only the last suffix is returned, so ``file1.csvoutput.xlsx`` yields
    ``.xlsx``. A path without an extension yields an empty string.
    """
    _root, extension = os.path.splitext(file_path)
    return extension

# Suffix appended to "file{num}" for the source CSV and each converted copy.
# The plain ".csv" entry is listed first so the baseline fingerprints are
# captured before any converted copy is compared against them.
file_suffixes = [
    ".csv",
    ".csvoutput.xlsx",
    ".csvoutput.json",
    ".csvoutput.xml",
    ".csvoutput.html",
    ".csvoutput.h5",
    ".csvoutput.dta",
    ".csvoutput.feather",
    ".csvoutput.parquet",
    ".csvoutput.pkl",
]

# Columns holding Pass/Fail verdicts; these are the ones that get coloured
status_columns = ['Order-Dependent Status', 'Order-Independent Status']

# Build and display one Pass/Fail table per test number
for num in nums:
    # Paths of the source CSV and every converted copy for this test
    file_paths = [
        os.path.join(file_directory, f"file{num}{suffix}")
        for suffix in file_suffixes
    ]

    # One row per file for the current test
    fingerprint_data = []

    # Baseline fingerprints, taken from the first path ending in ".csv"
    first_csv_dep_fingerprint = None
    first_csv_indep_fingerprint = None

    for file_path in file_paths:
        fingerprint_dep, fingerprint_indep = generate_fingerprints(file_path, data_fingerprint)

        # Only the first CSV seen sets the baseline; later files never replace it
        if file_path.endswith(".csv") and first_csv_dep_fingerprint is None:
            first_csv_dep_fingerprint = fingerprint_dep
            first_csv_indep_fingerprint = fingerprint_indep

        # A file passes when its fingerprint equals the CSV baseline.
        # NOTE(review): if the baseline is still None here, a None fingerprint
        # from another file compares equal and is reported as "Pass" - confirm
        # what the fingerprint functions return when a file cannot be loaded.
        dep_status = "Pass" if fingerprint_dep == first_csv_dep_fingerprint else "Fail"
        indep_status = "Pass" if fingerprint_indep == first_csv_indep_fingerprint else "Fail"

        # Only the extension is shown in the table, not the full path
        file_extension = get_file_extension(file_path)

        # The fingerprints themselves are deliberately left out of the row
        fingerprint_data.append({
            'Test Name': test_names[num],
            'File Extension': file_extension,
            'Order-Dependent Status': dep_status,
            'Order-Independent Status': indep_status
        })

    # Build the table once from the collected rows
    df_fingerprints = pd.DataFrame(fingerprint_data)

    # Colour "Pass" green and everything else red. Styler.applymap is
    # deprecated since pandas 2.1 in favour of Styler.map, so use the new name
    # when it exists and fall back to the old one on earlier pandas versions.
    styler = df_fingerprints.style
    colour_cells = styler.map if hasattr(styler, "map") else styler.applymap
    styled_df = colour_cells(highlight_status, subset=status_columns)

    # Show the styled table, followed by its caption line
    display(styled_df)
    print(f"Fingerprint Data for file{num}")


Unnamed: 0,Test Name,File Extension,Order-Dependent Status,Order-Independent Status
0,Standard csv,.csv,Pass,Pass
1,Standard csv,.xlsx,Pass,Pass
2,Standard csv,.json,Pass,Pass
3,Standard csv,.xml,Pass,Pass
4,Standard csv,.html,Pass,Pass
5,Standard csv,.h5,Pass,Pass
6,Standard csv,.dta,Pass,Pass
7,Standard csv,.feather,Pass,Pass
8,Standard csv,.parquet,Pass,Pass
9,Standard csv,.pkl,Pass,Pass


Fingerprint Data for file1


Unnamed: 0,Test Name,File Extension,Order-Dependent Status,Order-Independent Status
0,European format csv,.csv,Pass,Pass
1,European format csv,.xlsx,Pass,Pass
2,European format csv,.json,Pass,Pass
3,European format csv,.xml,Pass,Pass
4,European format csv,.html,Pass,Pass
5,European format csv,.h5,Pass,Pass
6,European format csv,.dta,Pass,Pass
7,European format csv,.feather,Pass,Pass
8,European format csv,.parquet,Pass,Pass
9,European format csv,.pkl,Pass,Pass


Fingerprint Data for file2


Unnamed: 0,Test Name,File Extension,Order-Dependent Status,Order-Independent Status
0,Mixed Formats csv,.csv,Pass,Pass
1,Mixed Formats csv,.xlsx,Pass,Pass
2,Mixed Formats csv,.json,Pass,Pass
3,Mixed Formats csv,.xml,Pass,Pass
4,Mixed Formats csv,.html,Fail,Fail
5,Mixed Formats csv,.h5,Pass,Pass
6,Mixed Formats csv,.dta,Pass,Pass
7,Mixed Formats csv,.feather,Pass,Pass
8,Mixed Formats csv,.parquet,Pass,Pass
9,Mixed Formats csv,.pkl,Pass,Pass


Fingerprint Data for file3


Unnamed: 0,Test Name,File Extension,Order-Dependent Status,Order-Independent Status
0,csv with missing Data,.csv,Pass,Pass
1,csv with missing Data,.xlsx,Pass,Pass
2,csv with missing Data,.json,Pass,Pass
3,csv with missing Data,.xml,Pass,Pass
4,csv with missing Data,.html,Pass,Pass
5,csv with missing Data,.h5,Pass,Pass
6,csv with missing Data,.dta,Pass,Pass
7,csv with missing Data,.feather,Pass,Pass
8,csv with missing Data,.parquet,Pass,Pass
9,csv with missing Data,.pkl,Pass,Pass


Fingerprint Data for file4
Could not detect delimiter. Defaulting to comma.
Could not detect delimiter. Defaulting to comma.
Error loading file based on extension: error parsing attribute name, line 2, column 18 (<string>, line 2)
Error loading file based on extension: error parsing attribute name, line 2, column 18 (<string>, line 2)


Unnamed: 0,Test Name,File Extension,Order-Dependent Status,Order-Independent Status
0,csv without header and unusual delimiter,.csv,Pass,Pass
1,csv without header and unusual delimiter,.xlsx,Pass,Pass
2,csv without header and unusual delimiter,.json,Pass,Pass
3,csv without header and unusual delimiter,.xml,Fail,Fail
4,csv without header and unusual delimiter,.html,Pass,Pass
5,csv without header and unusual delimiter,.h5,Pass,Pass
6,csv without header and unusual delimiter,.dta,Pass,Pass
7,csv without header and unusual delimiter,.feather,Pass,Pass
8,csv without header and unusual delimiter,.parquet,Pass,Pass
9,csv without header and unusual delimiter,.pkl,Pass,Pass


Fingerprint Data for file5


In [4]:
import warnings
import pandas as pd
import os

# Suppress all warnings for the rest of the session
warnings.filterwarnings('ignore')

# Folder that holds the validation files
file_directory = "validation_files/"

# Human-readable label for each test number
test_names = dict(zip(
    range(1, 6),
    (
        "Standard csv",
        "European format csv",
        "Mixed Formats csv",
        "csv with missing Data",
        "csv without header and unusual delimiter",
    ),
))

# Test numbers, in ascending order
nums = sorted(test_names)

# Function to apply conditional formatting
def highlight_status(val):
    """Map a status value to a CSS colour rule: green for 'Pass', red otherwise.

    NOTE(review): this repeats the definition from the previous cell and is
    not referenced anywhere in this cell.
    """
    if val == 'Pass':
        color = 'green'
    else:
        color = 'red'
    return 'color: ' + color

# Function to extract the file extension
def get_file_extension(file_path):
    """Return the last suffix of ``file_path``, dot included ('' if there is none)."""
    return os.path.splitext(file_path)[-1]

# One record per test: 'Test Name' plus a verdict for every file extension
validation_summary = []

# Extensions of the converted copies written next to each source CSV
converted_extensions = (
    ".xlsx", ".json", ".xml", ".html", ".h5",
    ".dta", ".feather", ".parquet", ".pkl",
)

for num in nums:
    # The source CSV comes first; every copy is named "<csv name>output<ext>"
    csv_path = os.path.join(file_directory, f"file{num}.csv")
    file_paths = [csv_path] + [csv_path + "output" + ext for ext in converted_extensions]

    # Verdict per file extension for the current test
    file_validation = {}

    # Baseline fingerprints, taken from the first path ending in ".csv"
    first_csv_dep_fingerprint = first_csv_indep_fingerprint = None

    for file_path in file_paths:
        fingerprint_dep, fingerprint_indep = generate_fingerprints(file_path, data_fingerprint)

        # Only the first CSV seen sets the baseline
        if file_path.endswith(".csv") and first_csv_dep_fingerprint is None:
            first_csv_dep_fingerprint, first_csv_indep_fingerprint = fingerprint_dep, fingerprint_indep

        # A fingerprint passes when it equals the CSV baseline
        dep_matches = fingerprint_dep == first_csv_dep_fingerprint
        indep_matches = fingerprint_indep == first_csv_indep_fingerprint
        dep_status = "Pass" if dep_matches else "Fail"
        indep_status = "Pass" if indep_matches else "Fail"

        file_extension = get_file_extension(file_path)

        # Any failing fingerprint marks the extension as failed; a passing
        # file only records "Validated" when no verdict is stored yet
        if "Fail" in (dep_status, indep_status):
            file_validation[file_extension] = "Failed Validation"
        else:
            file_validation.setdefault(file_extension, "Validated")

    # Test label first, then one column per file extension
    record = {'Test Name': test_names[num]}
    record.update(file_validation)
    validation_summary.append(record)

# One row per test, one column per file extension
df_validation_summary = pd.DataFrame(validation_summary)

# Render the summary table in the notebook
display(df_validation_summary)


Could not detect delimiter. Defaulting to comma.
Could not detect delimiter. Defaulting to comma.
Error loading file based on extension: error parsing attribute name, line 2, column 18 (<string>, line 2)
Error loading file based on extension: error parsing attribute name, line 2, column 18 (<string>, line 2)


Unnamed: 0,Test Name,.csv,.xlsx,.json,.xml,.html,.h5,.dta,.feather,.parquet,.pkl
0,Standard csv,Validated,Validated,Validated,Validated,Validated,Validated,Validated,Validated,Validated,Validated
1,European format csv,Validated,Validated,Validated,Validated,Validated,Validated,Validated,Validated,Validated,Validated
2,Mixed Formats csv,Validated,Validated,Validated,Validated,Failed Validation,Validated,Validated,Validated,Validated,Validated
3,csv with missing Data,Validated,Validated,Validated,Validated,Validated,Validated,Validated,Validated,Validated,Validated
4,csv without header and unusual delimiter,Validated,Validated,Validated,Failed Validation,Validated,Validated,Validated,Validated,Validated,Validated


In [5]:
import warnings
import pandas as pd
import os

# Suppress all warnings for the rest of the session
warnings.filterwarnings('ignore')

# Test numbers; each one selects a "file{num}" family on disk
nums = [*range(1, 6)]

# Human-readable label for each test number
test_names = dict([
    (1, "Standard csv"),
    (2, "European format csv"),
    (3, "Mixed Formats csv"),
    (4, "csv with missing Data"),
    (5, "csv without header and unusual delimiter"),
])

# Folder that holds the validation files
file_directory = "validation_files/"

# Function to extract the file extension
def get_file_extension(file_path):
    """Return the trailing extension of ``file_path`` including its dot.

    For a name with several dots only the last suffix is returned; a name
    without any extension gives an empty string.
    """
    root_and_extension = os.path.splitext(file_path)
    return root_and_extension[1]

# Extensions under test, listed in the order the files are fingerprinted
tracked_extensions = (
    '.csv', '.xlsx', '.json', '.xml', '.html',
    '.h5', '.dta', '.feather', '.parquet', '.pkl',
)

# Per extension, the list of "Pass"/"Fail" outcomes gathered across all tests
validation_data = dict((ext, []) for ext in tracked_extensions)

for num in nums:
    # Source CSV first, then one "<csv name>output<ext>" copy per other extension
    base_name = f"file{num}.csv"
    file_paths = [os.path.join(file_directory, base_name)]
    file_paths.extend(
        os.path.join(file_directory, f"{base_name}output{ext}")
        for ext in tracked_extensions[1:]
    )

    # Baseline fingerprints, taken from the first path ending in ".csv"
    first_csv_dep_fingerprint = first_csv_indep_fingerprint = None

    for file_path in file_paths:
        fingerprint_dep, fingerprint_indep = generate_fingerprints(file_path, data_fingerprint)

        # Only the first CSV seen sets the baseline
        if file_path.endswith(".csv") and first_csv_dep_fingerprint is None:
            first_csv_dep_fingerprint, first_csv_indep_fingerprint = fingerprint_dep, fingerprint_indep

        # A fingerprint passes when it equals the CSV baseline
        if fingerprint_dep == first_csv_dep_fingerprint:
            dep_status = "Pass"
        else:
            dep_status = "Fail"
        if fingerprint_indep == first_csv_indep_fingerprint:
            indep_status = "Pass"
        else:
            indep_status = "Fail"

        file_extension = get_file_extension(file_path)

        # The file counts as "Pass" for this test only when both fingerprints match
        both_pass = dep_status == "Pass" and indep_status == "Pass"
        validation_data[file_extension].append("Pass" if both_pass else "Fail")

# A format is "Validated" only if it passed in every test
summary = []

for file_format, results in validation_data.items():
    if any(result != "Pass" for result in results):
        status = "Failed Validation"
    else:
        status = "Validated"
    summary.append({'File Format': file_format, 'Validation Status': status})

# One row per file format
df_summary = pd.DataFrame(summary)

# Render the per-format summary table in the notebook
display(df_summary)


Could not detect delimiter. Defaulting to comma.
Could not detect delimiter. Defaulting to comma.
Error loading file based on extension: error parsing attribute name, line 2, column 18 (<string>, line 2)
Error loading file based on extension: error parsing attribute name, line 2, column 18 (<string>, line 2)


Unnamed: 0,File Format,Validation Status
0,.csv,Validated
1,.xlsx,Validated
2,.json,Validated
3,.xml,Failed Validation
4,.html,Failed Validation
5,.h5,Validated
6,.dta,Validated
7,.feather,Validated
8,.parquet,Validated
9,.pkl,Validated
