# **Step 2: Understand your tools**
Import external libraries:

In [None]:
%pip install pandas
import pandas as pd # imports the 'pandas' library for use in this script and assigns it the nickname 'pd'

## Using `pandas`
Load data from file:

In [None]:
df = pd.read_csv('sales.csv') # read sales.csv into a pandas DataFrame
df.index = pd.RangeIndex(start=1, stop=len(df) + 1) # label the rows 1, 2, 3, ... instead of 0, 1, 2, ...

Some DataFrame functions built into `pandas`:

In [None]:
df.head() # returns the first 5 rows of the table (5 is the default when no row count is given)

In [None]:
l = df.shape[0] # number of rows in the table; later cells read this value through the name `l`
print(l)

In [None]:
df['SalesID'] # select the SalesID column (one value per row)

In [None]:
df.loc[1] # get the row labelled 1 - this is the 1st row because the index was shifted to start at 1 after loading

In [None]:
df.loc[1, 'SalesID'] # get the SalesID value of the 1st row (row label and column name in a single lookup)

In [None]:
column_names = df.columns.tolist() # the column names of the table, as a plain list
print(column_names)

# **Steps 3 & 4: Data quality check functions**
Each function checks the values in 1 column against 1 validation rule.<br>
Here we define the check for **Completeness**:

In [None]:
def check_cannot_be_blank (df, column_name): # returns True if column_name contains no blanks, False otherwise
  # isna() gives True for every blank (missing) cell, so the column is complete only if no cell is flagged
  has_blank = df[column_name].isna().any()
  return not has_blank

Run the check for **Completeness**:

In [None]:
# run the cannot be blank check on the SalesID column and show the True/False result
print(check_cannot_be_blank(df = df, column_name = 'SalesID'))

Use a `for` loop to run the **Completeness** check on all columns:

In [None]:
# run the cannot be blank check for every column in the table, printing one True/False per column
for column_name in column_names:
  print(check_cannot_be_blank(df = df, column_name = column_name))

## Check SalesDate column

Define the **consistency** validation rule check functions for the SalesDate column:

In [None]:
from collections import defaultdict
import math

def is_intable (s): # returns True if int(s) succeeds, i.e. the input value can be converted to an integer
  # note: a float such as 3.7 counts as convertible (int() truncates it), but the text '3.7' does not
  try:
    int(s)
  except (TypeError, ValueError, OverflowError): # e.g. None / non-numeric text or NaN / infinity
    return False
  return True

def is_alphabetical (s): # returns True if the input is a string made up only of the letters a-z and A-Z
  if not isinstance(s, str):
    return False # anything that is not text cannot be alphabetical
  # every character must be an unaccented lower-case or upper-case letter (an empty string passes)
  return all('a' <= char <= 'z' or 'A' <= char <= 'Z' for char in s)

def check_data_format (df, column_name, character_type, pieces_max_lengths): # returns True if all values in column_name comply with the data format specified
  # character_type is 'N' (numeric), 'A' (alphabetical) or 'X' (any characters)
  # pieces_max_lengths is [p] for N(p), A(p) and X(p), or [p, q] for N(p,q)
  # any other combination is not handled and gives None
  if character_type == 'N':
    if len(pieces_max_lengths) == 2: # data format is N(p,q)
      for cell in df[column_name]:
        pieces = str(cell).split('.') # text before and after the decimal point
        if len(pieces) != 2:
          return False # there must be exactly one '.'
        whole_part, decimal_part = pieces
        if not (is_intable(whole_part) and is_intable(decimal_part)):
          return False # both sides of the decimal point must be integers
        if len(whole_part) > pieces_max_lengths[0] or len(decimal_part) > pieces_max_lengths[1]:
          return False # one of the sides is longer than allowed
      return True
    elif len(pieces_max_lengths) == 1: # data format is N(p)
      # each value must be an integer whose text is at most p characters long
      return all(is_intable(cell) and len(str(cell)) <= pieces_max_lengths[0] for cell in df[column_name])
  elif character_type == 'A': # data format is A(p)
    # each value must be alphabetical and at most p characters long
    return all(is_alphabetical(cell) and len(cell) <= pieces_max_lengths[0] for cell in df[column_name])
  elif character_type == 'X': # data format is X(p)
    # each value, converted to text, must be at most p characters long
    return all(len(str(cell)) <= pieces_max_lengths[0] for cell in df[column_name])
  return None # format not recognised

def check_date_format_yyyymmdd (df, column_name): # returns True if all values in column_name follow the date format yyyy-mm-dd
  for value in df[column_name]:
    if not isinstance(value, str):
      return False # blanks and other non-text values cannot be dates in yyyy-mm-dd form
    value_split = value.split('-') # split each value into 3 parts to assess individually
    if len(value_split) != 3:
      return False # column fails the check if the number of parts is not exactly 3
    year, month, day = value_split
    if len(year) != 4 or len(month) != 2 or len(day) != 2:
      return False # reject if year is not 4 characters, or month or date is not 2 characters
    if not all(piece.isascii() and piece.isdigit() for piece in value_split):
      return False # reject if year, month or date contains anything other than the digits 0-9
    if not 1 <= int(month) <= 12:
      return False # reject if month is outside 01-12
    if not 1 <= int(day) <= 31:
      return False # reject if date is outside 01-31
  return True

Run the **Consistency** checks on the SalesDate column:

In [None]:
# run both Consistency checks on the SalesDate column and print the True/False results
print('Check SalesDate column', '====', sep='\n')
print(f"Data format: {check_data_format(df, 'SalesDate', 'X', [10])}")
print(f"Date format: {check_date_format_yyyymmdd(df, 'SalesDate')}")

Modify the functions to return the rows that fail to comply with the **Consistency** rules:

In [None]:
def data_format_identify_non_complying_rows (df, column_name, character_type, pieces_max_lengths): # returns a list of row(s) that do not comply to data format rule
  # character_type is 'N' (numeric), 'A' (alphabetical) or 'X' (any characters)
  # pieces_max_lengths is [p] for N(p), A(p) and X(p), or [p, q] for N(p,q)
  # any other combination is not handled and gives None
  # the rows are taken from the df that was passed in, so the function works for any table and any row labels

  # step 1: pick the rule that decides whether one value complies
  if character_type == 'N' and len(pieces_max_lengths) == 2: # data format is N(p,q)
    def complies (value):
      pieces = str(value).split('.') # text before and after the decimal point
      return (len(pieces) == 2 and is_intable(pieces[0]) and is_intable(pieces[1])
              and len(pieces[0]) <= pieces_max_lengths[0] and len(pieces[1]) <= pieces_max_lengths[1])
  elif character_type == 'N' and len(pieces_max_lengths) == 1: # data format is N(p)
    def complies (value):
      return is_intable(value) and len(str(value)) <= pieces_max_lengths[0]
  elif character_type == 'A': # data format is A(p)
    def complies (value):
      return is_alphabetical(value) and len(value) <= pieces_max_lengths[0]
  elif character_type == 'X': # data format is X(p)
    def complies (value):
      return len(str(value)) <= pieces_max_lengths[0] # convert to text first so that numbers and blanks can be measured too
  else:
    return None # format not recognised

  # step 2: collect every row whose value in column_name breaks the rule
  non_compliers = []
  for position, value in enumerate(df[column_name]):
    if not complies(value):
      non_compliers.append(df.iloc[position]) # iloc picks the row by position, whatever its label is
  return non_compliers

def date_format_identify_non_complying_rows (df, column_name): # returns a list of row(s) that do not comply to date format rule (yyyy-mm-dd)
  def follows_yyyymmdd (value): # True if one value is text of the form yyyy-mm-dd
    if not isinstance(value, str):
      return False # blanks and other non-text values cannot be dates in yyyy-mm-dd form
    value_split = value.split('-') # split the value into 3 parts to assess individually
    if len(value_split) != 3:
      return False # the number of parts must be exactly 3
    year, month, day = value_split
    if len(year) != 4 or len(month) != 2 or len(day) != 2:
      return False # year must be 4 characters, month and date 2 characters each
    if not all(piece.isascii() and piece.isdigit() for piece in value_split):
      return False # year, month and date may contain only the digits 0-9
    return 1 <= int(month) <= 12 and 1 <= int(day) <= 31 # month within 01-12 and date within 01-31

  non_compliers = [] # keep a list of rows that do not comply to this rule
  for position, value in enumerate(df[column_name]): # going through the column value by value
    if not follows_yyyymmdd(value):
      non_compliers.append(df.iloc[position]) # add the whole row to the list of non-compliers
  return non_compliers

*Define a helper function to print relevant rows neatly:*

In [None]:
def pprint_row_list (lis): # prints every item of a list under a numbered heading
  for item_number, item in enumerate(lis, start=1): # item numbers start at 1
    print(f'Item number {item_number}') # heading with the item's position in the list
    print(item) # the item itself
    print('\n') # blank lines to separate it from the next item

Run the modified **Consistency** check functions:

In [None]:
# list the SalesDate rows that break the data format rule X(10)
print('SalesDate non-complying rows for data format :')
pprint_row_list(data_format_identify_non_complying_rows(df = df, column_name = 'SalesDate', character_type = 'X', pieces_max_lengths = [10]))
print('\n')

# list the SalesDate rows that break the date format rule
print('SalesDate non-complying rows for date format :')
pprint_row_list(date_format_identify_non_complying_rows(df = df, column_name = 'SalesDate'))

Modify the functions again to provide a percentage of all rows that do not comply with the specified **Consistency** rules:

In [120]:
# returns the percentage (0-100) of row(s) that do not comply to data format rule
def data_format_percentage_non_compliance (df, column_name, character_type, pieces_max_lengths):
  total_rows = len(df) # count the rows of the table that was passed in
  if total_rows == 0:
    return 0.0 # an empty table has no non-complying rows (and dividing by 0 rows is not possible)
  return len(data_format_identify_non_complying_rows(df, column_name, character_type, pieces_max_lengths)) / total_rows * 100

# returns the percentage (0-100) of row(s) that do not comply to date format rule
def date_format_percentage_non_compliance (df, column_name):
  total_rows = len(df) # count the rows of the table that was passed in
  if total_rows == 0:
    return 0.0 # an empty table has no non-complying rows (and dividing by 0 rows is not possible)
  return len(date_format_identify_non_complying_rows(df, column_name)) / total_rows * 100

Run the modified checks to find the percentage non-compliance of the SalesDate column for each of the **Consistency** checks:

In [None]:
print('Consistency checks:')
print('======')
# the data format rule needs the format to check against: SalesDate is X(10), the same format used in the earlier SalesDate cells
print('SalesDate percentage non-compliance for data format rule: ' + str(data_format_percentage_non_compliance(df, 'SalesDate', 'X', [10])))
print('SalesDate percentage non-compliance for date format rule: ' + str(date_format_percentage_non_compliance(df, 'SalesDate')))

Calculate **Consistency** score:

In [None]:
# score = 100 minus the average percentage non-compliance of the two SalesDate rules (data format X(10) and date format)
consistency_score = 100 - (data_format_percentage_non_compliance(df, 'SalesDate', 'X', [10]) + date_format_percentage_non_compliance(df, 'SalesDate')) / 2
print('Consistency score: ' + str(consistency_score))

## Define rest of validation check functions
Define the rest of the **Consistency** validation check functions to also return a list of non-complying rows:

In [None]:
def unique_values_identify_non_complying_rows (df, column_name): # returns a list of row(s) that do not comply to unique values rule
  # the rows are taken from the df that was passed in, so the function works for any table and any row labels
  positions_by_value = defaultdict(list) # for each value that occurs in the column, keep the positions of the rows that have this value
  for position, value in enumerate(df[column_name]):
    positions_by_value[value].append(position)
  non_compliers = [] # keep a list of rows that do not comply to this rule
  for positions in positions_by_value.values(): # values are visited in the order in which they first appear in the column
    if len(positions) > 1: # the value occurs in more than 1 row, so every one of those rows breaks the rule
      non_compliers.extend(df.iloc[position] for position in positions)
  return non_compliers

def data_type_identify_non_complying_rows (df, column_name, type_specified): # returns a list of rows that do not comply with data type rule (Integer, Float etc.)
  # type_specified is a Python type such as int, float or str
  # the rows are taken from the df that was passed in, so the function works for any table and any row labels
  non_compliers = [] # keep a list of rows that do not comply to this rule
  for position, value in enumerate(df[column_name]):
    if not isinstance(value, type_specified):
      non_compliers.append(df.iloc[position]) # add the whole row to the list of non-compliers
  return non_compliers # hand the list back so that it can be printed or counted

Define the functions to return per-column percentage non-compliance for the **Consistency** checks:

In [None]:
# returns the percentage (0-100) of row(s) that do not comply to unique values rule
def unique_values_percentage_non_compliance (df, column_name):
  total_rows = len(df) # count the rows of the table that was passed in
  if total_rows == 0:
    return 0.0 # an empty table has no non-complying rows (and dividing by 0 rows is not possible)
  return len(unique_values_identify_non_complying_rows(df, column_name)) / total_rows * 100

# returns the percentage (0-100) of row(s) that do not comply to data type rule
def data_type_percentage_non_compliance (df, column_name, type_specified):
  total_rows = len(df) # count the rows of the table that was passed in
  if total_rows == 0:
    return 0.0 # an empty table has no non-complying rows (and dividing by 0 rows is not possible)
  return len(data_type_identify_non_complying_rows(df, column_name, type_specified)) / total_rows * 100

Define the rest of the **Accuracy** validation check functions to also return a list of non-complying rows:

In [None]:
def numerical_range_identify_non_complying_rows (df, column_name, lower_bound = - math.inf, upper_bound = math.inf): # returns a list of row(s) that do not comply to numerical range rule
  # a value complies when lower_bound < value < upper_bound (the bounds themselves are not allowed)
  # the rows are taken from the df that was passed in, so the function works for any table and any row labels
  non_compliers = [] # keep a list of rows that do not comply to this rule
  for position, value in enumerate(df[column_name]):
    try:
      number = float(value) # keep the decimals: 0.5 must stay 0.5 and not be cut down to 0
    except (TypeError, ValueError, OverflowError):
      number = math.nan # the value is not numerical
    if math.isnan(number) or number <= lower_bound or number >= upper_bound: # blank, not numerical, or not within specified range
      non_compliers.append(df.iloc[position]) # add the row to the list of non-compliers
  return non_compliers

Define the functions to return per-column percentage non-compliance for the **Accuracy** checks:

In [None]:
# returns the percentage (0-100) of row(s) that do not comply to numerical range rule
def numerical_range_percentage_non_compliance (df, column_name, lower_bound = - math.inf, upper_bound = math.inf):
  total_rows = len(df) # count the rows of the table that was passed in
  if total_rows == 0:
    return 0.0 # an empty table has no non-complying rows (and dividing by 0 rows is not possible)
  return len(numerical_range_identify_non_complying_rows(df, column_name, lower_bound, upper_bound)) / total_rows * 100

## Check SalesID column
Identify rows in SalesID column that do not comply with **Consistency** and **Accuracy** rules:

In [None]:
print('Consistency checks:', '======', sep='\n')
# rule 1: SalesID has data format N(4)
print('SalesID non-complying rows for data format:')
pprint_row_list(data_format_identify_non_complying_rows(df, column_name = 'SalesID', character_type = 'N', pieces_max_lengths = [4]))
print('\n')

# rule 2: every SalesID appears only once
print('SalesID non-complying rows for unique values rule:')
pprint_row_list(unique_values_identify_non_complying_rows(df, column_name = 'SalesID'))
print('\n')

# rule 3: SalesID is of Integer type
print('SalesID non-complying rows for Integer type rule:')
pprint_row_list(data_type_identify_non_complying_rows(df, column_name = 'SalesID', type_specified = int))
print('\n')

print('Accuracy checks:', '======', sep='\n')
# rule 4: SalesID is greater than 0
print('SalesID non-complying rows for > 0 rule:')
pprint_row_list(numerical_range_identify_non_complying_rows(df, column_name = 'SalesID', lower_bound = 0))

Find percentage non-compliance for each rule:

In [None]:
print('Consistency checks:', '======', sep='\n')
# percentage of rows that break each Consistency rule of the SalesID column
print(f"SalesID percentage non-compliance for data format :{data_format_percentage_non_compliance(df, 'SalesID', 'N', [4])}")

print(f"SalesID percentage non-compliance for unique values rule: {unique_values_percentage_non_compliance(df, 'SalesID')}")

print(f"SalesID percentage non-compliance for Integer type rule: {data_type_percentage_non_compliance(df, 'SalesID', int)}")
print('\n')

print('Accuracy checks:', '======', sep='\n')
# percentage of rows that break the Accuracy rule of the SalesID column
print('SalesID percentage non-compliance for > 0 rule: ')
print(numerical_range_percentage_non_compliance(df, 'SalesID', lower_bound = 0))

Calculate **Consistency** and **Accuracy** scores:

In [None]:
# Consistency score = 100 minus the average percentage non-compliance of the 3 Consistency rules
consistency_score = 100 - (
  data_format_percentage_non_compliance(df, 'SalesID', 'N', [4])
  + unique_values_percentage_non_compliance(df, 'SalesID')
  + data_type_percentage_non_compliance(df, 'SalesID', int)
) / 3
print(f'Consistency score: {consistency_score}')

# Accuracy score = 100 minus the percentage non-compliance of the single Accuracy rule
accuracy_score = 100 - numerical_range_percentage_non_compliance(df, 'SalesID', lower_bound = 0)
print(f'Accuracy score: {accuracy_score}')

## Check CustomerID column
Identify rows in CustomerID column that do not comply with **Consistency** and **Accuracy** rules:

In [None]:
print('Consistency checks:', '======', sep='\n')
# rule 1: CustomerID has data format N(3)
print('CustomerID non-complying rows for data format:')
pprint_row_list(data_format_identify_non_complying_rows(df, column_name = 'CustomerID', character_type = 'N', pieces_max_lengths = [3]))
print('\n')

# rule 2: CustomerID is of Integer type
print('CustomerID non-complying rows for Integer type rule:')
pprint_row_list(data_type_identify_non_complying_rows(df, column_name = 'CustomerID', type_specified = int))
print('\n')

print('Accuracy checks:', '======', sep='\n')
# rule 3: CustomerID is greater than 0
print('CustomerID non-complying rows for > 0 rule:')
pprint_row_list(numerical_range_identify_non_complying_rows(df, column_name = 'CustomerID', lower_bound = 0))

Find percentage non-compliance for each rule:

In [None]:
print('Consistency checks:', '======', sep='\n')
# percentage of rows that break each Consistency rule of the CustomerID column
print(f"CustomerID percentage non-compliance for data format: {data_format_percentage_non_compliance(df, 'CustomerID', 'N', [3])}")

print(f"CustomerID percentage non-compliance for Integer type rule: {data_type_percentage_non_compliance(df, 'CustomerID', int)}")
print('\n')

print('Accuracy checks:', '======', sep='\n')
# percentage of rows that break the Accuracy rule of the CustomerID column
print('CustomerID percentage non-compliance for > 0 rule: ')
print(numerical_range_percentage_non_compliance(df, 'CustomerID', lower_bound = 0))

Calculate **Consistency** and **Accuracy** scores:

In [None]:
# Consistency score = 100 minus the average percentage non-compliance of the 2 Consistency rules
consistency_score = 100 - (
  data_format_percentage_non_compliance(df, 'CustomerID', 'N', [3])
  + data_type_percentage_non_compliance(df, 'CustomerID', int)
) / 2
print(f'Consistency score: {consistency_score}')

# Accuracy score = 100 minus the percentage non-compliance of the single Accuracy rule
accuracy_score = 100 - numerical_range_percentage_non_compliance(df, 'CustomerID', lower_bound = 0)
print(f'Accuracy score: {accuracy_score}')

## Check SalesPersonID column
Identify rows in SalesPersonID column that do not comply with **Consistency** and **Accuracy** rules:

In [None]:
print('Consistency checks:', '======', sep='\n')
# rule 1: SalesPersonID has data format N(1)
print('SalesPersonID non-complying rows for data format:')
pprint_row_list(data_format_identify_non_complying_rows(df, column_name = 'SalesPersonID', character_type = 'N', pieces_max_lengths = [1]))
print('\n')

# rule 2: SalesPersonID is of Integer type
print('SalesPersonID non-complying rows for Integer type rule:')
pprint_row_list(data_type_identify_non_complying_rows(df, column_name = 'SalesPersonID', type_specified = int))
print('\n')

print('Accuracy checks:', '======', sep='\n')
# rule 3: SalesPersonID is greater than 0
print('SalesPersonID non-complying rows for > 0 rule:')
pprint_row_list(numerical_range_identify_non_complying_rows(df, column_name = 'SalesPersonID', lower_bound = 0))

Find percentage non-compliance for each rule:

In [None]:
print('Consistency checks:', '======', sep='\n')
# NOTE(review): the format checked here is N(3), but the cell that lists the non-complying SalesPersonID rows uses N(1) - confirm which one is the intended rule
print(f"SalesPersonID percentage non-compliance for data format: {data_format_percentage_non_compliance(df, 'SalesPersonID', 'N', [3])}")

print(f"SalesPersonID percentage non-compliance for Integer type rule: {data_type_percentage_non_compliance(df, 'SalesPersonID', int)}")
print('\n')

print('Accuracy checks:', '======', sep='\n')
# percentage of rows that break the Accuracy rule of the SalesPersonID column
print('SalesPersonID percentage non-compliance for > 0 rule: ')
print(numerical_range_percentage_non_compliance(df, 'SalesPersonID', lower_bound = 0))

Calculate **Consistency** and **Accuracy** scores:

In [None]:
# Consistency score = 100 minus the average percentage non-compliance of the 2 Consistency rules
# NOTE(review): the format checked here is N(3), but the cell that lists the non-complying SalesPersonID rows uses N(1) - confirm which one is the intended rule
consistency_score = 100 - (
  data_format_percentage_non_compliance(df, 'SalesPersonID', 'N', [3])
  + data_type_percentage_non_compliance(df, 'SalesPersonID', int)
) / 2
print(f'Consistency score: {consistency_score}')

# Accuracy score = 100 minus the percentage non-compliance of the single Accuracy rule
accuracy_score = 100 - numerical_range_percentage_non_compliance(df, 'SalesPersonID', lower_bound = 0)
print(f'Accuracy score: {accuracy_score}')

# Your turn!
Fill in the code blocks to identify non-complying rows, calculate percentage non-compliance, and calculate **Consistency** and **Accuracy** scores for the ProductID and Quantity columns

## Check ProductID column
Identify rows in ProductID column that do not comply with **Consistency** and **Accuracy** rules:

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Run the validation check functions to find non-complying rows ↓↓↓</b>
</div>

In [None]:
# fill in the code!

Find percentage non-compliance for each rule:

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Run the validation check functions to find percentage non-compliance ↓↓↓</b>
</div>

In [None]:
# fill in the code!

Calculate **Consistency** and **Accuracy** scores:

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Calculate scores for ProductID column ↓↓↓</b>
</div>

In [None]:
# fill in the code!

## Check Quantity column
Identify rows in Quantity column that do not comply with **Consistency** and **Accuracy** rules:

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Run the validation check functions to find non-complying rows ↓↓↓</b>
</div>

In [None]:
# fill in the code!

Find percentage non-compliance for each rule:

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Run the validation check functions to find percentage non-compliance ↓↓↓</b>
</div>

In [None]:
# fill in the code!

Calculate **Consistency** and **Accuracy** scores:

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Calculate scores for Quantity column ↓↓↓</b>
</div>

In [None]:
# fill in the code!

## Check Sales column
Identify rows in Sales column that do not comply with **Consistency** and **Accuracy** rules:

In [None]:
print('Consistency checks:', '======', sep='\n')
# rule 1: Sales has data format N(10,2)
print('Sales non-complying rows for data format:')
pprint_row_list(data_format_identify_non_complying_rows(df, column_name = 'Sales', character_type = 'N', pieces_max_lengths = [10,2]))
print('\n')

# rule 2: Sales is of Float type
print('Sales non-complying rows for Float type rule:')
pprint_row_list(data_type_identify_non_complying_rows(df, column_name = 'Sales', type_specified = float))
print('\n')

print('Accuracy checks:', '======', sep='\n')
# NOTE(review): this cell checks '> 0' with lower_bound = 0, while the percentage and score cells for Sales check '>= 0' with lower_bound = -0.001 - confirm which rule is intended
print('Sales non-complying rows for > 0 rule:')
pprint_row_list(numerical_range_identify_non_complying_rows(df, column_name = 'Sales', lower_bound = 0))

Find percentage non-compliance for each rule:

In [None]:
print('Consistency checks:', '======', sep='\n')
# percentage of rows that break each Consistency rule of the Sales column
print(f"Sales percentage non-compliance for data format: {data_format_percentage_non_compliance(df, 'Sales', 'N', [10,2])}")

print(f"Sales percentage non-compliance for Float type rule: {data_type_percentage_non_compliance(df, 'Sales', float)}")
print('\n')

print('Accuracy checks:', '======', sep='\n')
# NOTE(review): this cell checks '>= 0' with lower_bound = -0.001, while the cell that lists the non-complying Sales rows checks '> 0' with lower_bound = 0 - confirm which rule is intended
print('Sales percentage non-compliance for >= 0 rule: ')
print(numerical_range_percentage_non_compliance(df, 'Sales', lower_bound = -0.001))

Calculate **Consistency** and **Accuracy** scores:

In [None]:
# Consistency score = 100 minus the average percentage non-compliance of the 2 Consistency rules
consistency_score = 100 - (
  data_format_percentage_non_compliance(df, 'Sales', 'N', [10,2])
  + data_type_percentage_non_compliance(df, 'Sales', float)
) / 2
print(f'Consistency score: {consistency_score}')

# Accuracy score = 100 minus the percentage non-compliance of the single Accuracy rule
accuracy_score = 100 - numerical_range_percentage_non_compliance(df, 'Sales', lower_bound = -0.001)
print(f'Accuracy score: {accuracy_score}')

## Check StoreID column
Identify rows in StoreID column that do not comply with data format rule:

In [None]:
print('Consistency checks:', '======', sep='\n')
# list the StoreID rows that break the data format rule N(1)
print('StoreID non-complying rows for data format:')
pprint_row_list(data_format_identify_non_complying_rows(df, column_name = 'StoreID', character_type = 'N', pieces_max_lengths = [1]))

Find percentage non-compliance for data format rule:

In [None]:
print('Consistency checks:', '======', sep='\n')
# percentage of rows that break the data format rule N(1) of the StoreID column
print(f"StoreID percentage non-compliance for data format: {data_format_percentage_non_compliance(df, 'StoreID', 'N', [1])}")

### Using ChatGPT to generate validation check function - **Accuracy**
New function to check that all values in the column 'StoreID' contain only the permitted values (1 - 9)

In [None]:
def check_store_id_quality(df):
    # StoreID may only be one of the whole numbers 1 to 9
    allowed_store_ids = set(range(1, 10))

    # True for every row whose StoreID is one of the allowed numbers
    is_allowed = df['StoreID'].isin(allowed_store_ids)

    # Keep only the rows that are not allowed
    rows_with_bad_store_id = df[~is_allowed]

    # One dictionary (column name -> value) per invalid row
    return rows_with_bad_store_id.to_dict(orient='records')

In [None]:
# the permitted values rule is an Accuracy check, so it is printed under the Accuracy heading
print('Accuracy checks:')
print('======')
print('StoreID non-complying rows for permitted values:')
print(check_store_id_quality(df))

Find percentage non-compliance for permitted values rule:

In [None]:
# percentage of rows with a StoreID that is not permitted; the row count is taken from df itself so it always matches the table being checked
store_id_permitted_values_percentage_non_compliance = len(check_store_id_quality(df)) / len(df) * 100

In [None]:
print('Accuracy checks:', '======', sep='\n')
# show the percentage calculated in the previous cell
print(f'StoreID percentage non-compliance for permitted values: {store_id_permitted_values_percentage_non_compliance}')

Calculate **Consistency** and **Accuracy** scores:

In [None]:
# Consistency score = 100 minus the percentage non-compliance of the data format rule N(1)
consistency_score = 100 - data_format_percentage_non_compliance(df, column_name = 'StoreID', character_type = 'N', pieces_max_lengths = [1])
print(f'Consistency score: {consistency_score}')

# Accuracy score = 100 minus the percentage non-compliance of the permitted values rule
accuracy_score = 100 - store_id_permitted_values_percentage_non_compliance
print(f'Accuracy score: {accuracy_score}')

## Check OrderStatus column
Identify rows in OrderStatus column that do not comply with data format rule:

In [None]:
print('Consistency checks:', '======', sep='\n')
# list the OrderStatus rows that break the data format rule A(10)
print('OrderStatus non-complying rows for data format:')
pprint_row_list(data_format_identify_non_complying_rows(df, column_name = 'OrderStatus', character_type = 'A', pieces_max_lengths = [10]))

Find percentage non-compliance for data format rule:

In [None]:
print('Consistency checks:', '======', sep='\n')
# percentage of rows that break the data format rule A(10) of the OrderStatus column
print(f"OrderStatus percentage non-compliance for data format: {data_format_percentage_non_compliance(df, 'OrderStatus', 'A', [10])}")

### Using ChatGPT to generate validation check function - **Accuracy**
New function to check that all values in the column 'OrderStatus' contain only the permitted values (Complete, Pending, Incomplete)

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Get ChatGPT to write the Accuracy check function ↓↓↓</b>
</div>

In [None]:
# fill in the code using ChatGPT!

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Run the check and print the results ↓↓↓</b>
</div>

In [None]:
# the permitted values rule is an Accuracy check, so it is printed under the Accuracy heading
print('Accuracy checks:')
print('======')
print('OrderStatus non-complying rows for permitted values:')
order_status_permitted_values_non_compliers = _____ # TO DO: replace the blank with a call to the function defined by ChatGPT
print(order_status_permitted_values_non_compliers)

Find percentage non-compliance for permitted values rule:

In [None]:
# percentage of rows with an OrderStatus that is not permitted; the row count is taken from df itself so it always matches the table being checked
order_status_permitted_values_percentage_non_compliance = len(order_status_permitted_values_non_compliers) / len(df) * 100

In [None]:
print('Accuracy checks:', '======', sep='\n')
# show the percentage calculated in the previous cell
print(f'OrderStatus percentage non-compliance for permitted values: {order_status_permitted_values_percentage_non_compliance}')

Calculate **Consistency** and **Accuracy** scores:

In [None]:
# Consistency score = 100 minus the percentage non-compliance of the data format rule A(10)
consistency_score = 100 - data_format_percentage_non_compliance(df, column_name = 'OrderStatus', character_type = 'A', pieces_max_lengths = [10])
print(f'Consistency score: {consistency_score}')

# Accuracy score = 100 minus the percentage non-compliance of the permitted values rule
accuracy_score = 100 - order_status_permitted_values_percentage_non_compliance
print(f'Accuracy score: {accuracy_score}')