# **DataCon 2024: Data Quality Checks**
#### Learn how to run simple Completeness, Consistency and Accuracy Checks based on pre-defined validation rules on a sample Sales Dataset

## Import Pandas Library and Load Data from File

In [None]:
#Import external libraries
%pip install pandas

# imports the 'pandas' library for use in this script and assigns it the nickname 'pd'
import pandas as pd

In [1]:
import pandas as pd

In [2]:
# Read sales.csv into a DataFrame, then relabel the rows 1..N (instead of the
# default 0..N-1) so that row labels match natural row numbers.
df = pd.read_csv('sales.csv')
df.index = pd.RangeIndex(start=1, stop=len(df) + 1)

#### Some DataFrame functions built into `pandas`:

In [3]:
df.head() # preview the table: with no argument, head() returns the first 5 rows

Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
1,2020-01-01,112,37,9,70,7,226.0,9,Incomplete
2,2020-01-01,113,59,4,36,37,326.1,7,Pending
3,2020-01-01,114,59,4,40,19,422.9,7,Incomplete
4,2020-01-01,115,3,9,11,60,1683.8,nd68G6h0PZbk,Pending
5,2020-01-01,116,3,9,40,12,230.9,7,Incomplete


In [4]:
# Number of rows in the table. The name `l` is kept because the row-level
# check functions defined later in this notebook read it.
l = df.shape[0]
print(l)

8498


In [5]:
df['SalesID'] # select the SalesID column (returned as a pandas Series)

1        112
2        113
3        114
4        115
5        116
        ... 
8494    2843
8495    2677
8496    2844
8497    2678
8498    2845
Name: SalesID, Length: 8498, dtype: object

In [6]:
df.loc[1] # get the row labelled 1, i.e. the 1st row (the index was shifted to start at 1 when the data was loaded)

SalesDate        2020-01-01
SalesID                 112
CustomerID               37
SalesPersonID             9
ProductID                70
Quantity                  7
Sales                 226.0
StoreID                   9
OrderStatus      Incomplete
Name: 1, dtype: object

In [7]:
# Get the SalesID value of the row labelled 1 (the 1st row).
# A single .loc[row, column] lookup avoids building the whole row first,
# which is what the chained form df.loc[1]['SalesID'] does.
df.loc[1, 'SalesID']

'112'

In [8]:
# Collect the column labels of the table into a plain Python list
column_names = df.columns.tolist()
print(column_names)

['SalesDate', 'SalesID', 'CustomerID', 'SalesPersonID', 'ProductID', 'Quantity', 'Sales', 'StoreID', 'OrderStatus']


## Define and Run Data Quality Check Functions
Each function checks the values in 1 column against 1 validation rule.<br>
## Check for **Completeness** of dataset

In [9]:
#Defining the Completeness check function
def check_cannot_be_blank (df, column_name): # returns true if column_name contains no blanks
  """Completeness check: report whether a column is free of blank values.

  df          -- the table to check
  column_name -- the column whose values are inspected

  Returns True when no value in the column is blank, False as soon as one is.
  """
  # pd.isna(cell) is True when the cell is blank (NaN / None / NaT),
  # so the column passes only if that never happens.
  return not any(pd.isna(cell) for cell in df[column_name])

In [10]:
# Run the "cannot be blank" (i.e. completeness) check on one specific column (SalesID).
# Prints True when the column contains no blank values.
print(check_cannot_be_blank(df, 'SalesID')) 

True


In [115]:
# Run the **Completeness** check on every column of the table, one line of output per column
for column_name in column_names:
  print(f'{column_name}: {check_cannot_be_blank(df, column_name)}')

SalesDate: True
SalesID: True
CustomerID: True
SalesPersonID: True
ProductID: True
Quantity: True
Sales: True
StoreID: True
OrderStatus: True


## Check for **Consistency** of dataset 
### Start with SalesDate Column in dataset

In [12]:
#Define the Consistency check function

from collections import defaultdict
import math

def is_intable (s): # returns true if the input value can be converted to an integer
  """Report whether int(s) succeeds.

  Note this is a convertibility test, not a type test: the string '12' and
  the float 3.7 are both "intable", while '3.7', 'abc' and None are not.
  """
  try:
    int(s)
  except (TypeError, ValueError, OverflowError):
    # TypeError: unsupported type (e.g. None); ValueError: unparsable text or NaN;
    # OverflowError: infinity. Anything else (e.g. KeyboardInterrupt) is not swallowed.
    return False
  return True

def is_floatable (s): # returns true if the input value can be converted to a float
  """Report whether float(s) succeeds.

  This is a convertibility test: '2.5', '7' and 3 are all "floatable",
  while 'abc' and None are not.
  """
  try:
    float(s)
  except (TypeError, ValueError, OverflowError):
    # TypeError: unsupported type (e.g. None); ValueError: unparsable text;
    # OverflowError: an integer too large for a float.
    return False
  return True

def is_alphabetical (s): # returns true if the input value is alphabetical (consists only of alphabets)
  """Report whether s is a string made up solely of the letters a-z / A-Z.

  Non-string inputs are never alphabetical. An empty string has no
  offending character and is therefore reported as alphabetical.
  """
  if isinstance(s, str):
    # every character must fall in one of the two ASCII letter ranges
    return all('a' <= letter <= 'z' or 'A' <= letter <= 'Z' for letter in s)
  return False

def check_data_format (df, column_name, character_type, pieces_max_lengths): # returns true if all values in column_name comply with data format specified
  """Check a whole column against a data format specification.

  character_type / pieces_max_lengths select the format:
    'N', [p, q] -- N(p,q): exactly one '.', integer parts of at most p and q characters
    'N', [p]    -- N(p):   convertible to an integer, at most p characters
    'A', [p]    -- A(p):   letters only, at most p characters
    'X', [p]    -- X(p):   any characters, at most p characters

  Returns True when every value complies and False at the first value that
  does not. Any other combination falls through and returns None.
  """
  if character_type == 'N':
    if len(pieces_max_lengths) == 2: # data format is N(p,q)
      for cell in df[column_name]:
        parts = str(cell).split('.') # text before and after the decimal point
        if len(parts) != 2: # zero, or more than one, decimal point
          return False
        whole, fraction = parts
        if not (is_intable(whole) and is_intable(fraction)): # both sides must be integers
          return False
        if len(whole) > pieces_max_lengths[0] or len(fraction) > pieces_max_lengths[1]: # and short enough
          return False
      return True
    if len(pieces_max_lengths) == 1: # data format is N(p)
      return all(is_intable(cell) and len(str(cell)) <= pieces_max_lengths[0] for cell in df[column_name])
  elif character_type == 'A': # data format is A(p)
    return all(is_alphabetical(cell) and len(cell) <= pieces_max_lengths[0] for cell in df[column_name])
  elif character_type == 'X': # data format is X(p)
    return all(len(str(cell)) <= pieces_max_lengths[0] for cell in df[column_name])
  return None # unrecognised format specification

def check_date_format_yyyymmdd (df, column_name): # returns true if all values in column_name follow the date format yyyy-mm-dd
  """Check that every value in a column is written as yyyy-mm-dd.

  A value complies when it is a string of exactly three '-'-separated parts
  that are a 4-digit year, a 2-digit month in 01..12 and a 2-digit day in
  01..31. Only the format is checked: the day is not validated against the
  actual length of the month.

  Returns True when every value complies (including when the column is
  empty), otherwise False. Non-string values such as blanks do not comply.
  """
  def is_yyyymmdd (value):
    if not isinstance(value, str): # blanks (NaN/None) and numbers are not dates
      return False
    parts = value.split('-')
    if len(parts) != 3:
      return False
    # every part must consist of plain ASCII digits (rejects '+1', ' 1', 'ab', ...)
    if not all(part.isascii() and part.isdigit() for part in parts):
      return False
    year, month, day = parts
    if len(year) != 4 or len(month) != 2 or len(day) != 2:
      return False
    return 1 <= int(month) <= 12 and 1 <= int(day) <= 31

  return all(is_yyyymmdd(value) for value in df[column_name])

In [13]:
#Run the Consistency check on SalesDate column
print('Check SalesDate column')
print('====')
print('Data format: ' + str(check_data_format(df, 'SalesDate', 'X', [10])))
print('Date format: ' + str(check_date_format_yyyymmdd(df, 'SalesDate')))

Check SalesDate column
====
Data format: False
Date format: False


#### Modify the functions to investigate the rows that fail to comply with the **Consistency** rules:

In [14]:
def data_format_identify_non_complying_rows (df, column_name, character_type, pieces_max_lengths): # returns a list of row(s) that do not comply to data format rule
  """Collect the rows whose value in column_name breaks a data format rule.

  character_type / pieces_max_lengths select the format:
    'N', [p, q] -- N(p,q): exactly one '.', integer parts of at most p and q characters
    'N', [p]    -- N(p):   convertible to an integer, at most p characters
    'A', [p]    -- A(p):   letters only, at most p characters
    'X', [p]    -- X(p):   any characters, at most p characters

  Returns a list of rows (each a pandas Series), in table order.
  Raises ValueError for a format specification that is not listed above.

  The rows are walked by position over df itself, so the function works for
  any table regardless of its length or index labels.
  """
  if character_type == 'N' and len(pieces_max_lengths) == 2: # data format is N(p,q)
    def complies (value):
      pieces = str(value).split('.') # text before and after the decimal point
      return (len(pieces) == 2
              and is_intable(pieces[0]) and is_intable(pieces[1])
              and len(pieces[0]) <= pieces_max_lengths[0]
              and len(pieces[1]) <= pieces_max_lengths[1])
  elif character_type == 'N' and len(pieces_max_lengths) == 1: # data format is N(p)
    def complies (value):
      return is_intable(value) and len(str(value)) <= pieces_max_lengths[0]
  elif character_type == 'A': # data format is A(p)
    def complies (value):
      return is_alphabetical(value) and len(value) <= pieces_max_lengths[0]
  elif character_type == 'X': # data format is X(p)
    def complies (value):
      # str() so that non-string values are measured the same way as in check_data_format
      return len(str(value)) <= pieces_max_lengths[0]
  else:
    raise ValueError('Unsupported data format: ' + repr(character_type) + ' with max lengths ' + repr(pieces_max_lengths))

  non_compliers = [] # keep a list of rows that do not comply to this rule
  for position, value in enumerate(df[column_name]):
    if not complies(value):
      non_compliers.append(df.iloc[position]) # add the whole row to the list of non-compliers
  return non_compliers

def date_format_identify_non_complying_rows (df, column_name): # returns a list of row(s) that do not comply to date format rule
  """Collect the rows whose value in column_name is not written as yyyy-mm-dd.

  A value complies when it is a string of exactly three '-'-separated parts
  that are a 4-digit year, a 2-digit month in 01..12 and a 2-digit day in
  01..31 (the same rule as check_date_format_yyyymmdd is meant to apply).
  Only the format is checked: the day is not validated against the actual
  length of the month. Non-string values such as blanks do not comply.

  Returns a list of rows (each a pandas Series), in table order. The rows
  are walked by position over df itself, so the function works for any
  table regardless of its length or index labels.
  """
  def complies (value):
    if not isinstance(value, str): # blanks (NaN/None) and numbers are not dates
      return False
    parts = value.split('-') # split each value into 3 parts to assess individually
    if len(parts) != 3:
      return False
    # every part must consist of plain ASCII digits (rejects '+1', ' 1', 'ab', ...)
    if not all(part.isascii() and part.isdigit() for part in parts):
      return False
    year, month, day = parts
    if len(year) != 4 or len(month) != 2 or len(day) != 2:
      return False
    return 1 <= int(month) <= 12 and 1 <= int(day) <= 31

  non_compliers = [] # keep a list of rows that do not comply to this rule
  for position, value in enumerate(df[column_name]):
    if not complies(value):
      non_compliers.append(df.iloc[position]) # add the whole row to the list of non-compliers
  return non_compliers

#### Define a helper function to print relevant rows neatly

In [15]:
def pprint_row_list (lis): # prints a list of rows in a nice format
  """Print each item of lis under an 'Item number N' heading (N starts at 1),
  followed by blank lines to separate it from the next item."""
  for item_number, row in enumerate(lis, start=1):
    print('Item number ' + str(item_number)) # heading with the item's position in the list
    print(row) # the item itself
    print('\n') # spacing before the next item

#### Run the modified **Consistency** check function to display all rows with errors

In [16]:
# SalesDate must fit the X(10) data format (any characters, at most 10 of them)
print('SalesDate non-complying rows for data format :')
pprint_row_list(
  data_format_identify_non_complying_rows(df, column_name='SalesDate', character_type='X', pieces_max_lengths=[10])
)
print('\n')

# SalesDate must also be written as yyyy-mm-dd
print('SalesDate non-complying rows for date format :')
pprint_row_list(
  date_format_identify_non_complying_rows(df, column_name='SalesDate')
)

SalesDate non-complying rows for data format :
Item number 1
SalesDate        4KJlZDR2y8ary
SalesID                   1810
CustomerID                  37
SalesPersonID                3
ProductID                   64
Quantity                     7
Sales                    311.4
StoreID                      2
OrderStatus           Complete
Name: 1775, dtype: object


Item number 2
SalesDate        g9q7jkijclM4tCu
SalesID                     2227
CustomerID                    19
SalesPersonID                  3
ProductID                      2
Quantity                      26
Sales                      461.0
StoreID                        3
OrderStatus             Complete
Name: 2230, dtype: object


Item number 3
SalesDate        2xN93XPWEgreAXJCkfM
SalesID                         3685
CustomerID                        63
SalesPersonID                      7
ProductID                         44
Quantity                          21
Sales                         1805.1
StoreID             

#### Modify the functions again to provide a percentage of all rows that do not comply with the specified **Consistency** rules:

In [17]:
# returns the percentage of row(s) that do not comply to data format rule
def data_format_percentage_non_compliance (df, column_name, character_type, pieces_max_lengths):
  """Percentage (0-100) of the rows of df whose column_name value breaks the
  data format rule given by character_type and pieces_max_lengths.

  The denominator is the row count of the df that was passed in. An empty
  table has nothing that can fail and yields 0.0.
  """
  total_rows = len(df)
  if total_rows == 0:
    return 0.0
  non_compliers = data_format_identify_non_complying_rows(df, column_name, character_type, pieces_max_lengths)
  return len(non_compliers) / total_rows * 100

# returns the percentage of row(s) that do not comply to date format rule
def date_format_percentage_non_compliance (df, column_name):
  """Percentage (0-100) of the rows of df whose column_name value breaks the
  date format rule.

  The denominator is the row count of the df that was passed in. An empty
  table has nothing that can fail and yields 0.0.
  """
  total_rows = len(df)
  if total_rows == 0:
    return 0.0
  non_compliers = date_format_identify_non_complying_rows(df, column_name)
  return len(non_compliers) / total_rows * 100

#### Run the modified checks to find the percentage non-compliance of the SalesDate column for each of the **Consistency** checks:

In [18]:
# Report, for each consistency rule, the share of SalesDate rows that break it
print('Consistency checks:', '======', sep='\n')
print(f"SalesDate percentage non-compliance for data format rule: {data_format_percentage_non_compliance(df, 'SalesDate', 'X', [10])}")
print(f"SalesDate percentage non-compliance for date format rule: {date_format_percentage_non_compliance(df, 'SalesDate')}")

Consistency checks:
SalesDate percentage non-compliance for data format rule: 0.03530242409978819
SalesDate percentage non-compliance for date format rule: 0.2353494939985879


#### Calculate **Consistency** and **Accuracy** scores:

In [19]:
# Consistency score = 100 minus the average percentage non-compliance
# over the 2 consistency rules that apply to SalesDate
sales_date_consistency_score = 100 - (
  data_format_percentage_non_compliance(df, 'SalesDate', 'X', [10])
  + date_format_percentage_non_compliance(df, 'SalesDate')
) / 2
print(f'Consistency score: {sales_date_consistency_score}')

# SalesDate column has no accuracy rules / checks, so it gets full marks
sales_date_accuracy_score = 100
print(f'Accuracy score: {sales_date_accuracy_score}')

Consistency score: 99.86467404095082
Accuracy score: 100


### Find non-complying rows
#### Define the rest of the **Consistency** validation check functions to also return a list of non-complying rows:

In [20]:
def unique_values_identify_non_complying_rows (df, column_name): # returns a list of row(s) that do not comply to unique values rule
  """Collect every row whose column_name value occurs more than once.

  Returns a list of rows (each a pandas Series). Rows are grouped by value,
  values appear in order of their first occurrence, and within a group rows
  keep table order. All rows sharing a duplicated value are reported,
  including the first one.

  The rows are walked by position over df itself, so the function works for
  any table regardless of its length or index labels.
  """
  positions_by_value = {} # for each value in the column, the positions of the rows holding it
  for position, value in enumerate(df[column_name]):
    positions_by_value.setdefault(value, []).append(position)

  non_compliers = [] # keep a list of rows that do not comply to this rule
  for positions in positions_by_value.values():
    if len(positions) > 1: # the value is shared by more than 1 row
      non_compliers.extend(df.iloc[position] for position in positions)
  return non_compliers

def data_type_identify_non_complying_rows (df, column_name, type_specified): # returns a list of rows that do not comply with data type rule (Integer, Float etc.)
  """Collect the rows whose column_name value cannot be converted to type_specified.

  type_specified is a conversion callable such as int or float; a value
  complies when type_specified(value) succeeds. For int and float this is
  the same test that is_intable / is_floatable perform.

  Returns a list of rows (each a pandas Series), in table order. The rows
  are walked by position over df itself, so the function works for any
  table regardless of its length or index labels.
  """
  non_compliers = [] # keep a list of rows that do not comply to this rule
  for position, value in enumerate(df[column_name]):
    try:
      type_specified(value)
    except (TypeError, ValueError, OverflowError):
      non_compliers.append(df.iloc[position]) # conversion failed: the row does not comply
  return non_compliers

#### Define the rest of the **Accuracy** validation check functions to also return a list of non-complying rows:

In [21]:
def numerical_range_identify_non_complying_rows (df, column_name, lower_bound = - math.inf, upper_bound = math.inf): # returns a list of row(s) that do not comply to numerical range rule
  """Collect the rows whose column_name value is outside (lower_bound, upper_bound).

  Both bounds are exclusive: a value equal to either bound does not comply.
  Values that cannot be converted with float() do not comply either.

  Returns a list of rows (each a pandas Series), in table order. The rows
  are walked by position over df itself, so the function works for any
  table regardless of its length or index labels.
  """
  non_compliers = [] # keep a list of rows that do not comply to this rule
  for position, value in enumerate(df[column_name]):
    try:
      number = float(value)
    except (TypeError, ValueError, OverflowError):
      non_compliers.append(df.iloc[position]) # value is not numerical
      continue
    if number >= upper_bound or number <= lower_bound: # value is not within the specified range
      non_compliers.append(df.iloc[position])
  return non_compliers

#### Calculate percentage non-compliance per-column for the **Consistency** checks

In [22]:
# returns the percentage of row(s) that do not comply to unique values rule
def unique_values_percentage_non_compliance (df, column_name):
  """Percentage (0-100) of the rows of df whose column_name value is not unique.

  The denominator is the row count of the df that was passed in. An empty
  table has nothing that can fail and yields 0.0.
  """
  total_rows = len(df)
  if total_rows == 0:
    return 0.0
  non_compliers = unique_values_identify_non_complying_rows(df, column_name)
  return len(non_compliers) / total_rows * 100

# returns the percentage of row(s) that do not comply to data type rule
def data_type_percentage_non_compliance (df, column_name, type_specified):
  """Percentage (0-100) of the rows of df whose column_name value breaks the
  data type rule for type_specified.

  The denominator is the row count of the df that was passed in. An empty
  table has nothing that can fail and yields 0.0.
  """
  total_rows = len(df)
  if total_rows == 0:
    return 0.0
  non_compliers = data_type_identify_non_complying_rows(df, column_name, type_specified)
  return len(non_compliers) / total_rows * 100

#### Calculate  percentage non-compliance per-column for the **Accuracy** checks

In [23]:
# returns the percentage of row(s) that do not comply to numerical range rule
def numerical_range_percentage_non_compliance (df, column_name, lower_bound = - math.inf, upper_bound = math.inf):
  """Percentage (0-100) of the rows of df whose column_name value breaks the
  numerical range rule given by lower_bound and upper_bound.

  The denominator is the row count of the df that was passed in. An empty
  table has nothing that can fail and yields 0.0.
  """
  total_rows = len(df)
  if total_rows == 0:
    return 0.0
  non_compliers = numerical_range_identify_non_complying_rows(df, column_name, lower_bound, upper_bound)
  return len(non_compliers) / total_rows * 100

### Move on to validation checks for SalesID column
#### Identify rows in SalesID column that do not comply with **Consistency** and **Accuracy** rules

In [24]:
# Consistency rules for SalesID: N(4) data format, unique values, Integer type
print('Consistency checks:', '======', sep='\n')
print('SalesID non-complying rows for data format:')
pprint_row_list(
  data_format_identify_non_complying_rows(df, column_name='SalesID', character_type='N', pieces_max_lengths=[4])
)
print('\n')

print('SalesID non-complying rows for unique values rule:')
pprint_row_list(
  unique_values_identify_non_complying_rows(df, column_name='SalesID')
)
print('\n')

print('SalesID non-complying rows for Integer type rule:')
pprint_row_list(
  data_type_identify_non_complying_rows(df, column_name='SalesID', type_specified=int)
)
print('\n')

# Accuracy rule for SalesID: value must be greater than 0
print('Accuracy checks:', '======', sep='\n')
print('SalesID non-complying rows for > 0 rule:')
pprint_row_list(
  numerical_range_identify_non_complying_rows(df, column_name='SalesID', lower_bound=0)
)

Consistency checks:
SalesID non-complying rows for data format:
Item number 1
SalesDate        2020-02-02
SalesID                   y
CustomerID               79
SalesPersonID             7
ProductID                52
Quantity                 23
Sales                1702.3
StoreID                   9
OrderStatus        Complete
Name: 597, dtype: object


Item number 2
SalesDate        2020-02-10
SalesID                  FH
CustomerID                5
SalesPersonID             2
ProductID                73
Quantity                 27
Sales                  29.1
StoreID                   4
OrderStatus        Complete
Name: 676, dtype: object


Item number 3
SalesDate        2020-03-03
SalesID                   Q
CustomerID               44
SalesPersonID             8
ProductID                30
Quantity                 15
Sales                 488.3
StoreID                   3
OrderStatus        Complete
Name: 970, dtype: object


Item number 4
SalesDate        2020-08-28
SalesID        

#### Find percentage non-compliance for each rule:

In [25]:
# Share of SalesID rows breaking each consistency rule
print('Consistency checks:', '======', sep='\n')
print(f"SalesID percentage non-compliance for data format: {data_format_percentage_non_compliance(df, 'SalesID', 'N', [4])}")

print(f"SalesID percentage non-compliance for unique values rule: {unique_values_percentage_non_compliance(df, 'SalesID')}")

print(f"SalesID percentage non-compliance for Integer type rule: {data_type_percentage_non_compliance(df, 'SalesID', int)}")
print('\n')

# Share of SalesID rows breaking the accuracy rule
print('Accuracy checks:', '======', sep='\n')
print(f"SalesID percentage non-compliance for > 0 rule: {numerical_range_percentage_non_compliance(df, 'SalesID', lower_bound = 0)}")

Consistency checks:
SalesID percentage non-compliance for data format: 0.18827959519887033
SalesID percentage non-compliance for unique values rule: 0.11767474699929395
SalesID percentage non-compliance for Integer type rule: 0.18827959519887033


Accuracy checks:
SalesID percentage non-compliance for > 0 rule: 0.18827959519887033


#### Calculate **Consistency** and **Accuracy** scores:

In [26]:
# Consistency score = 100 minus the average percentage non-compliance
# over the 3 consistency rules that apply to SalesID
sales_id_consistency_score = 100 - (
  data_format_percentage_non_compliance(df, 'SalesID', 'N', [4])
  + unique_values_percentage_non_compliance(df, 'SalesID')
  + data_type_percentage_non_compliance(df, 'SalesID', int)
) / 3
print(f'Consistency score: {sales_id_consistency_score}')

# Accuracy score = 100 minus the percentage non-compliance of the single accuracy rule (> 0)
sales_id_accuracy_score = 100 - numerical_range_percentage_non_compliance(df, 'SalesID', lower_bound = 0)
print(f'Accuracy score: {sales_id_accuracy_score}')

Consistency score: 99.83525535420098
Accuracy score: 99.81172040480114


### Move on to validation checks for CustomerID column
#### Identify rows in CustomerID column that do not comply with **Consistency** and **Accuracy** rules

In [27]:
# Consistency rules for CustomerID: N(3) data format, Integer type
print('Consistency checks:', '======', sep='\n')
print('CustomerID non-complying rows for data format:')
pprint_row_list(
  data_format_identify_non_complying_rows(df, column_name='CustomerID', character_type='N', pieces_max_lengths=[3])
)
print('\n')

print('CustomerID non-complying rows for Integer type rule:')
pprint_row_list(
  data_type_identify_non_complying_rows(df, column_name='CustomerID', type_specified=int)
)
print('\n')

# Accuracy rule for CustomerID: value must be greater than 0
print('Accuracy checks:', '======', sep='\n')
print('CustomerID non-complying rows for > 0 rule:')
pprint_row_list(
  numerical_range_identify_non_complying_rows(df, column_name='CustomerID', lower_bound=0)
)

Consistency checks:
CustomerID non-complying rows for data format:
Item number 1
SalesDate        2020-01-31
SalesID                 109
CustomerID         ielXvjMk
SalesPersonID             3
ProductID                53
Quantity                 46
Sales                3490.3
StoreID                   1
OrderStatus        Complete
Name: 508, dtype: object


Item number 2
SalesDate        2020-02-25
SalesID                 131
CustomerID        QsdorcuT5
SalesPersonID             3
ProductID                69
Quantity                 22
Sales                  44.7
StoreID                   8
OrderStatus        Complete
Name: 743, dtype: object


Item number 3
SalesDate        2020-03-01
SalesID                 819
CustomerID                u
SalesPersonID             2
ProductID                26
Quantity                 30
Sales                 955.8
StoreID                   1
OrderStatus        Complete
Name: 764, dtype: object


Item number 4
SalesDate        2020-05-10
SalesID     

#### Find percentage non-compliance for each rule:

In [28]:
# Share of CustomerID rows breaking each consistency rule
print('Consistency checks:', '======', sep='\n')
print(f"CustomerID percentage non-compliance for data format: {data_format_percentage_non_compliance(df, 'CustomerID', 'N', [3])}")

print(f"CustomerID percentage non-compliance for Integer type rule: {data_type_percentage_non_compliance(df, 'CustomerID', int)}")
print('\n')

# Share of CustomerID rows breaking the accuracy rule (label and value on separate lines)
print('Accuracy checks:', '======', sep='\n')
print(
  'CustomerID percentage non-compliance for > 0 rule: ',
  numerical_range_percentage_non_compliance(df, 'CustomerID', lower_bound = 0),
  sep='\n',
)

Consistency checks:
CustomerID percentage non-compliance for data format: 0.20004706989879972
CustomerID percentage non-compliance for Integer type rule: 0.20004706989879972


Accuracy checks:
CustomerID percentage non-compliance for > 0 rule: 
0.20004706989879972


#### Calculate **Consistency** and **Accuracy** scores:

In [29]:
# Consistency score = 100 minus the average percentage non-compliance
# over the 2 consistency rules that apply to CustomerID
customer_id_consistency_score = 100 - (
  data_format_percentage_non_compliance(df, 'CustomerID', 'N', [3])
  + data_type_percentage_non_compliance(df, 'CustomerID', int)
) / 2
print(f'Consistency score: {customer_id_consistency_score}')

# Accuracy score = 100 minus the percentage non-compliance of the single accuracy rule (> 0)
customer_id_accuracy_score = 100 - numerical_range_percentage_non_compliance(df, 'CustomerID', lower_bound = 0)
print(f'Accuracy score: {customer_id_accuracy_score}')

Consistency score: 99.7999529301012
Accuracy score: 99.7999529301012


### Move on to validation checks for SalesPersonID column
#### Identify rows in SalesPersonID column that do not comply with **Consistency** and **Accuracy** rules

In [30]:
# Consistency rules for SalesPersonID: N(1) data format, Integer type
print('Consistency checks:', '======', sep='\n')
print('SalesPersonID non-complying rows for data format:')
pprint_row_list(
  data_format_identify_non_complying_rows(df, column_name='SalesPersonID', character_type='N', pieces_max_lengths=[1])
)
print('\n')

print('SalesPersonID non-complying rows for Integer type rule:')
pprint_row_list(
  data_type_identify_non_complying_rows(df, column_name='SalesPersonID', type_specified=int)
)
print('\n')

# Accuracy rule for SalesPersonID: value must be greater than 0
print('Accuracy checks:', '======', sep='\n')
print('SalesPersonID non-complying rows for > 0 rule:')
pprint_row_list(
  numerical_range_identify_non_complying_rows(df, column_name='SalesPersonID', lower_bound=0)
)

Consistency checks:
SalesPersonID non-complying rows for data format:
Item number 1
SalesDate        2020-02-02
SalesID                 715
CustomerID               75
SalesPersonID          NqWS
ProductID                30
Quantity                  7
Sales                 209.2
StoreID                   7
OrderStatus        Complete
Name: 592, dtype: object


Item number 2
SalesDate                       2020-03-02
SalesID                                968
CustomerID                              10
SalesPersonID    xlnqAIbfylfqG82GiTIww3WQQ
ProductID                               62
Quantity                                57
Sales                               1395.4
StoreID                                  9
OrderStatus                       Complete
Name: 913, dtype: object


Item number 3
SalesDate         2020-03-02
SalesID                 1005
CustomerID                63
SalesPersonID    89tT2ASfvMz
ProductID                 13
Quantity                  80
Sales                

#### Find percentage non-compliance for each rule:

In [31]:
# Share of SalesPersonID rows breaking each consistency rule
print('Consistency checks:', '======', sep='\n')
print(f"SalesPersonID percentage non-compliance for data format: {data_format_percentage_non_compliance(df, 'SalesPersonID', 'N', [1])}")

print(f"SalesPersonID percentage non-compliance for Integer type rule: {data_type_percentage_non_compliance(df, 'SalesPersonID', int)}")
print('\n')

# Share of SalesPersonID rows breaking the accuracy rule
print('Accuracy checks:', '======', sep='\n')
print(f"SalesPersonID percentage non-compliance for > 0 rule: {numerical_range_percentage_non_compliance(df, 'SalesPersonID', lower_bound = 0)}")

Consistency checks:
SalesPersonID percentage non-compliance for data format: 0.18827959519887033
SalesPersonID percentage non-compliance for Integer type rule: 0.18827959519887033


Accuracy checks:
SalesPersonID percentage non-compliance for > 0 rule: 0.18827959519887033


#### Calculate **Consistency** and **Accuracy** scores:

In [32]:
# Consistency score = 100 minus the average percentage non-compliance
# over the 2 consistency rules that apply to SalesPersonID
salesperson_id_consistency_score = 100 - (
  data_format_percentage_non_compliance(df, 'SalesPersonID', 'N', [1])
  + data_type_percentage_non_compliance(df, 'SalesPersonID', int)
) / 2
print(f'Consistency score: {salesperson_id_consistency_score}')

# Accuracy score = 100 minus the percentage non-compliance of the single accuracy rule (> 0)
salesperson_id_accuracy_score = 100 - numerical_range_percentage_non_compliance(df, 'SalesPersonID', lower_bound = 0)
print(f'Accuracy score: {salesperson_id_accuracy_score}')

Consistency score: 99.81172040480114
Accuracy score: 99.81172040480114


# Your turn!
Fill in the code blocks to identify non-complying rows, calculate percentage non-compliance, and calculate **Consistency** and **Accuracy** scores for the ProductID and Quantity columns

## Check ProductID column
Identify rows in ProductID column that do not comply with **Consistency** and **Accuracy** rules:

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Run the validation check functions to find non-complying rows ↓↓↓</b>
</div>

In [33]:
# Consistency rules for ProductID: N(2) data format, Integer type
print('Consistency checks:', '======', sep='\n')
print('ProductID non-complying rows for data format:')
pprint_row_list(
  data_format_identify_non_complying_rows(df, column_name='ProductID', character_type='N', pieces_max_lengths=[2])
)
print('\n')

print('ProductID non-complying rows for Integer type rule:')
pprint_row_list(
  data_type_identify_non_complying_rows(df, column_name='ProductID', type_specified=int)
)
print('\n')

# Accuracy rule for ProductID: value must be greater than 0
print('Accuracy checks:', '======', sep='\n')
print('ProductID non-complying rows for > 0 rule:')
pprint_row_list(
  numerical_range_identify_non_complying_rows(df, column_name='ProductID', lower_bound=0)
)

Consistency checks:
ProductID non-complying rows for data format:
Item number 1
SalesDate        2020-05-02
SalesID                1362
CustomerID               28
SalesPersonID             4
ProductID               -77
Quantity                  8
Sales                 136.2
StoreID                   3
OrderStatus        Complete
Name: 1326, dtype: object


Item number 2
SalesDate                     2020-06-02
SalesID                             1512
CustomerID                            59
SalesPersonID                          4
ProductID        24j2K5WcBfQH0JpmHDlxsYg
Quantity                              35
Sales                              851.6
StoreID                                7
OrderStatus                     Complete
Name: 1460, dtype: object


Item number 3
SalesDate        2020-09-02
SalesID                2143
CustomerID               19
SalesPersonID             3
ProductID              UHZy
Quantity                 34
Sales                1755.6
StoreID            

Find percentage non-compliance for each rule:

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Run the validation check functions to find percentage non-compliance ↓↓↓</b>
</div>

In [34]:
# Share of ProductID rows breaking each consistency rule
print('Consistency checks:', '======', sep='\n')
print(f"ProductID percentage non-compliance for data format: {data_format_percentage_non_compliance(df, 'ProductID', 'N', [2])}")

print(f"ProductID percentage non-compliance for Integer type rule: {data_type_percentage_non_compliance(df, 'ProductID', int)}")
print('\n')

# Share of ProductID rows breaking the accuracy rule
print('Accuracy checks:', '======', sep='\n')
print(f"ProductID percentage non-compliance for > 0 rule: {numerical_range_percentage_non_compliance(df, 'ProductID', lower_bound = 0)}")

Consistency checks:
ProductID percentage non-compliance for data format: 0.16474464579901155
ProductID percentage non-compliance for Integer type rule: 0.08237232289950577


Accuracy checks:
ProductID percentage non-compliance for > 0 rule: 0.16474464579901155


Calculate **Consistency** and **Accuracy** scores:

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Calculate scores for ProductID column ↓↓↓</b>
</div>

In [35]:
# Consistency score = 100 minus the average percentage non-compliance
# over the 2 consistency rules that apply to ProductID
product_id_consistency_score = 100 - (
  data_format_percentage_non_compliance(df, 'ProductID', 'N', [2])
  + data_type_percentage_non_compliance(df, 'ProductID', int)
) / 2
print(f'Consistency score: {product_id_consistency_score}')

# Accuracy score = 100 minus the percentage non-compliance of the single accuracy rule (> 0)
product_id_accuracy_score = 100 - numerical_range_percentage_non_compliance(df, 'ProductID', lower_bound = 0)
print(f'Accuracy score: {product_id_accuracy_score}')

Consistency score: 99.87644151565074
Accuracy score: 99.83525535420098


Identify rows in Quantity column that do not comply with **Consistency** and **Accuracy** rules:

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Run the validation check functions to find non-complying rows ↓↓↓</b>
</div>

In [37]:
# Consistency rules for Quantity: N(4) data format, Integer type
print('Consistency checks:')
print('======')
print('Quantity non-complying rows for data format:')
pprint_row_list(data_format_identify_non_complying_rows(df, 'Quantity', 'N', [4]))
print('\n')

print('Quantity non-complying rows for Integer type rule:')
pprint_row_list(data_type_identify_non_complying_rows(df, 'Quantity', int))
print('\n')

# Accuracy rule for Quantity: value must be greater than 0
print('Accuracy checks:')
print('======')
# The heading previously omitted the column name; it now names Quantity like the other columns' cells do
print('Quantity non-complying rows for > 0 rule:')
pprint_row_list(numerical_range_identify_non_complying_rows(df, 'Quantity', lower_bound = 0))

Consistency checks:
Quantity non-complying rows for data format:
Item number 1
SalesDate        2020-02-08
SalesID                 782
CustomerID               39
SalesPersonID             3
ProductID                 6
Quantity                  e
Sales                  27.0
StoreID                   1
OrderStatus        Complete
Name: 659, dtype: object


Item number 2
SalesDate        2020-09-12
SalesID                2211
CustomerID                5
SalesPersonID             3
ProductID                55
Quantity                  n
Sales                 849.4
StoreID                   1
OrderStatus        Complete
Name: 2160, dtype: object


Item number 3
SalesDate        2021-02-11
SalesID                4150
CustomerID               19
SalesPersonID             9
ProductID                21
Quantity               SoKA
Sales                 444.1
StoreID                   1
OrderStatus        Complete
Name: 3669, dtype: object


Item number 4
SalesDate            2021-04-01
SalesID 

Find percentage non-compliance for each rule:

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Run the validation check functions to find percentage non-compliance ↓↓↓</b>
</div>

In [38]:
# Report the percentage of Quantity values that break each Consistency rule.
print('Consistency checks:', '======', sep='\n')
quantity_format_pct = data_format_percentage_non_compliance(df, 'Quantity', 'N', [4])
print(f'Quantity percentage non-compliance for data format: {quantity_format_pct!s}')

quantity_type_pct = data_type_percentage_non_compliance(df, 'Quantity', int)
print(f'Quantity percentage non-compliance for Integer type rule: {quantity_type_pct!s}')
print('\n')

# Report the percentage of Quantity values that break the Accuracy (numerical range) rule.
print('Accuracy checks:', '======', sep='\n')
quantity_range_pct = numerical_range_percentage_non_compliance(df, 'Quantity', lower_bound = 0)
print(f'Quantity percentage non-compliance for > 0 rule: {quantity_range_pct!s}')

Consistency checks:
Quantity percentage non-compliance for data format: 0.14120969639915276
Quantity percentage non-compliance for Integer type rule: 0.14120969639915276


Accuracy checks:
Quantity percentage non-compliance for > 0 rule: 0.15297717109908215


Calculate **Consistency** and **Accuracy** scores:

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Calculate scores for Quantity column ↓↓↓</b>
</div>

In [39]:
# Consistency score for Quantity: 100 minus the mean of the two Consistency
# non-compliance percentages (data format rule and Integer type rule).
quantity_format_non_compliance = data_format_percentage_non_compliance(df, 'Quantity', 'N', [4])
quantity_type_non_compliance = data_type_percentage_non_compliance(df, 'Quantity', int)
quantity_consistency_score = 100 - (quantity_format_non_compliance + quantity_type_non_compliance) / 2
print(f'Consistency score: {quantity_consistency_score!s}')

# Accuracy score for Quantity: 100 minus the non-compliance percentage of the numerical range rule.
quantity_range_non_compliance = numerical_range_percentage_non_compliance(df, 'Quantity', lower_bound = 0)
quantity_accuracy_score = 100 - quantity_range_non_compliance
print(f'Accuracy score: {quantity_accuracy_score!s}')

Consistency score: 99.85879030360084
Accuracy score: 99.84702282890092


### Move on to validation checks for Sales column
#### Identify rows in Sales column that do not comply with **Consistency** and **Accuracy** rules

In [40]:
# List the rows of the Sales column that break each validation rule.
# Consistency rules: data format and Float data type.
print('Consistency checks:')
print('======')
print('Sales non-complying rows for data format:')
pprint_row_list(data_format_identify_non_complying_rows(df, 'Sales', 'N', [10,2]))
print('\n')

print('Sales non-complying rows for Float type rule:')
pprint_row_list(data_type_identify_non_complying_rows(df, 'Sales', float))
print('\n')

# Accuracy rule: the percentage and score cells for Sales evaluate a ">= 0" rule using
# lower_bound = -0.001, so the same rule is used here; otherwise the rows listed would
# not be the rows that are counted in the Accuracy score.
# NOTE(review): assumes lower_bound is treated the same way by the identify and
# percentage helpers - confirm against their definitions.
print('Accuracy checks:')
print('======')
print('Sales non-complying rows for >= 0 rule:')
pprint_row_list(numerical_range_identify_non_complying_rows(df, 'Sales', lower_bound = -0.001))

Consistency checks:
Sales non-complying rows for data format:
Item number 1
SalesDate         2020-01-03
SalesID                  520
CustomerID                73
SalesPersonID              9
ProductID                 13
Quantity                  18
Sales            WNl2tBcj5n1
StoreID                    2
OrderStatus         Complete
Name: 341, dtype: object


Item number 2
SalesDate        2020-09-10
SalesID                2198
CustomerID               63
SalesPersonID             8
ProductID                21
Quantity                 13
Sales                     R
StoreID                   9
OrderStatus        Complete
Name: 2147, dtype: object


Item number 3
SalesDate        2020-12-01
SalesID                2349
CustomerID               39
SalesPersonID             3
ProductID                15
Quantity                  5
Sales                   eFU
StoreID                   1
OrderStatus        Complete
Name: 2352, dtype: object


Item number 4
SalesDate        2021-01-13
SalesI

#### Find percentage non-compliance for each rule:

In [41]:
# Report the percentage of Sales values that break each Consistency rule.
print('Consistency checks:', '======', sep='\n')
sales_format_pct = data_format_percentage_non_compliance(df, 'Sales', 'N', [10,2])
print(f'Sales percentage non-compliance for data format: {sales_format_pct!s}')

sales_type_pct = data_type_percentage_non_compliance(df, 'Sales', float)
print(f'Sales percentage non-compliance for Float type rule: {sales_type_pct!s}')
print('\n')

# Report the percentage of Sales values that break the Accuracy (numerical range) rule.
# The label and the value are printed on separate lines.
print('Accuracy checks:', '======', sep='\n')
print('Sales percentage non-compliance for >= 0 rule: ')
sales_range_pct = numerical_range_percentage_non_compliance(df, 'Sales', lower_bound = -0.001)
print(sales_range_pct)

Consistency checks:
Sales percentage non-compliance for data format: 0.15297717109908215
Sales percentage non-compliance for Float type rule: 0.14120969639915276


Accuracy checks:
Sales percentage non-compliance for >= 0 rule: 
0.14120969639915276


#### Calculate **Consistency** and **Accuracy** scores:

In [42]:
# Consistency score for Sales: 100 minus the mean of the two Consistency
# non-compliance percentages (data format rule and Float type rule).
sales_format_non_compliance = data_format_percentage_non_compliance(df, 'Sales', 'N', [10,2])
sales_type_non_compliance = data_type_percentage_non_compliance(df, 'Sales', float)
sales_consistency_score = 100 - (sales_format_non_compliance + sales_type_non_compliance) / 2
print(f'Consistency score: {sales_consistency_score!s}')

# Accuracy score for Sales: 100 minus the non-compliance percentage of the numerical range rule.
sales_range_non_compliance = numerical_range_percentage_non_compliance(df, 'Sales', lower_bound = -0.001)
sales_accuracy_score = 100 - sales_range_non_compliance
print(f'Accuracy score: {sales_accuracy_score!s}')

Consistency score: 99.85290656625088
Accuracy score: 99.85879030360084


### Move on to validation checks for StoreID column
#### Identify rows in StoreID column that do not comply with the **Consistency** (data format) rule

In [43]:
# List the rows whose StoreID breaks the data format rule.
print('Consistency checks:', '======', 'StoreID non-complying rows for data format:', sep='\n')
store_id_format_offenders = data_format_identify_non_complying_rows(df, 'StoreID', 'N', [1])
pprint_row_list(store_id_format_offenders)

Consistency checks:
StoreID non-complying rows for data format:
Item number 1
SalesDate          2020-01-01
SalesID                   115
CustomerID                  3
SalesPersonID               9
ProductID                  11
Quantity                   60
Sales                  1683.8
StoreID          nd68G6h0PZbk
OrderStatus           Pending
Name: 4, dtype: object


Item number 2
SalesDate        2020-02-01
SalesID                 584
CustomerID               28
SalesPersonID             5
ProductID                24
Quantity                 11
Sales                  58.2
StoreID                  -2
OrderStatus        Complete
Name: 517, dtype: object


Item number 3
SalesDate           2020-04-15
SalesID                   1114
CustomerID                  16
SalesPersonID                2
ProductID                   22
Quantity                    12
Sales                     47.5
StoreID          OybsWeAABRihQ
OrderStatus           Complete
Name: 1210, dtype: object


Item number 4

#### Find percentage non-compliance for data format rule:

In [44]:
# Report the percentage of StoreID values that break the data format rule.
print('Consistency checks:', '======', sep='\n')
store_id_format_pct = data_format_percentage_non_compliance(df, 'StoreID', 'N', [1])
print(f'StoreID percentage non-compliance for data format: {store_id_format_pct!s}')

Consistency checks:
StoreID percentage non-compliance for data format: 0.15297717109908215


# Using ChatGPT to generate validation check function - **Accuracy**
### Check that all values in StoreID Column contain only the permitted values (1 - 9)

In [83]:
def check_store_id_quality(df):
    """Return the rows whose StoreID is not one of the permitted values '1' to '9'.

    Parameters:
        df: DataFrame with a 'StoreID' column.

    Returns:
        A list of dictionaries, one per non-complying row, mapping column name to
        the row's value (the row index is not included).
    """
    # Define the permitted values
    permitted_values = {'1', '2', '3', '4', '5', '6', '7', '8', '9'}

    # Compare on the string form of each value: an object column can hold a mix of
    # strings and integers, and an integer 5 would otherwise never match '5' and be
    # reported as invalid even though it is a permitted StoreID.
    store_ids_as_text = df['StoreID'].astype(str)

    # Filter rows where 'StoreID' is not in the permitted values
    invalid_rows = df[~store_ids_as_text.isin(permitted_values)]

    # Return the invalid rows as a list of dictionaries
    return invalid_rows.to_dict(orient='records')

#### Run the check defined by ChatGPT to identify rows in StoreID column that do not comply with permitted values rule

In [84]:
# The permitted-values rule is an Accuracy rule (its percentage is reported under
# 'Accuracy checks:' and feeds the StoreID Accuracy score), so the rows are listed
# under the Accuracy heading rather than the Consistency one.
print('Accuracy checks:')
print('======')
print('StoreID non-complying rows for permitted values:')
print(check_store_id_quality(df))

Consistency checks:
StoreID non-complying rows for permitted values:
[{'SalesDate': '2020-01-01', 'SalesID': '115', 'CustomerID': '3', 'SalesPersonID': '9', 'ProductID': '11', 'Quantity': '60', 'Sales': '1683.8', 'StoreID': 'nd68G6h0PZbk', 'OrderStatus': 'Pending'}, {'SalesDate': '2020-02-01', 'SalesID': '584', 'CustomerID': '28', 'SalesPersonID': '5', 'ProductID': '24', 'Quantity': '11', 'Sales': '58.2', 'StoreID': '-2', 'OrderStatus': 'Complete'}, {'SalesDate': '2020-04-15', 'SalesID': '1114', 'CustomerID': '16', 'SalesPersonID': '2', 'ProductID': '22', 'Quantity': '12', 'Sales': '47.5', 'StoreID': 'OybsWeAABRihQ', 'OrderStatus': 'Complete'}, {'SalesDate': '2020-05-02', 'SalesID': '1371', 'CustomerID': '73', 'SalesPersonID': '9', 'ProductID': '76', 'Quantity': '7', 'Sales': '2671.2', 'StoreID': 'dBA', 'OrderStatus': 'Complete'}, {'SalesDate': '2021-01-02', 'SalesID': '3355', 'CustomerID': '80', 'SalesPersonID': '1', 'ProductID': '29', 'Quantity': '11', 'Sales': '1259.2', 'StoreID': '

#### Find percentage non-compliance for permitted values rule:

In [85]:
# Percentage of all rows in df whose StoreID is not a permitted value.
# len(df) is used instead of the single-letter global `l` set near the top of the
# notebook, so the denominator always matches the df that was actually checked.
store_id_permitted_values_percentage_non_compliance = len(check_store_id_quality(df)) / len(df) * 100

In [86]:
# Report the permitted-values non-compliance percentage computed in the previous cell.
print('Accuracy checks:', '======', sep='\n')
print(f'StoreID percentage non-compliance for permitted values: {store_id_permitted_values_percentage_non_compliance!s}')

Accuracy checks:
StoreID percentage non-compliance for permitted values: 0.15297717109908215


#### Calculate **Consistency** and **Accuracy** scores:

In [87]:
# Consistency score for StoreID: a single rule (data format), so no averaging is involved.
store_id_format_non_compliance = data_format_percentage_non_compliance(df, 'StoreID', 'N', [1])
store_id_consistency_score = 100 - store_id_format_non_compliance
print(f'Consistency score: {store_id_consistency_score!s}')

# Accuracy score for StoreID: based on the permitted-values percentage computed in an earlier cell.
store_id_accuracy_score = 100 - store_id_permitted_values_percentage_non_compliance
print(f'Accuracy score: {store_id_accuracy_score!s}')

Consistency score: 99.84702282890092
Accuracy score: 99.84702282890092


### Move on to OrderStatus column
#### Identify rows in OrderStatus column that do not comply with data format rule:

In [88]:
# List the rows whose OrderStatus breaks the data format rule.
print('Consistency checks:', '======', 'OrderStatus non-complying rows for data format:', sep='\n')
order_status_format_offenders = data_format_identify_non_complying_rows(df, 'OrderStatus', 'A', [10])
pprint_row_list(order_status_format_offenders)

Consistency checks:
OrderStatus non-complying rows for data format:
Item number 1
SalesDate          2020-02-07
SalesID                   775
CustomerID                  5
SalesPersonID               9
ProductID                  35
Quantity                    7
Sales                    38.5
StoreID                     4
OrderStatus      rTc8Le3iHuV6
Name: 652, dtype: object


Item number 2
SalesDate        2020-04-01
SalesID                1075
CustomerID               32
SalesPersonID             1
ProductID                62
Quantity                  5
Sales                 161.0
StoreID                   1
OrderStatus             2Jk
Name: 1077, dtype: object


Item number 3
SalesDate        2020-04-07
SalesID                1230
CustomerID               19
SalesPersonID             9
ProductID                49
Quantity                 36
Sales                 899.8
StoreID                   2
OrderStatus          l4cNSE
Name: 1178, dtype: object


Item number 4
SalesDate          

#### Find percentage non-compliance for data format rule:

In [89]:
# Report the percentage of OrderStatus values that break the data format rule.
print('Consistency checks:', '======', sep='\n')
order_status_format_pct = data_format_percentage_non_compliance(df, 'OrderStatus', 'A', [10])
print(f'OrderStatus percentage non-compliance for data format: {order_status_format_pct!s}')

Consistency checks:
OrderStatus percentage non-compliance for data format: 0.12944222169922334


# Using ChatGPT to generate validation check function - **Accuracy**
### Check that all values in the 'OrderStatus' column contain only the permitted values (Complete, Pending, Incomplete)

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Get ChatGPT to write the Accuracy check function ↓↓↓</b>
</div>

In [102]:
def check_order_status_quality(df):
    """Return the rows whose OrderStatus is not 'Complete', 'Pending' or 'Incomplete'.

    Parameters:
        df: DataFrame with an 'OrderStatus' column.

    Returns:
        A list of dictionaries, one per non-complying row, mapping column name to
        the row's value (the row index is not included).
    """
    allowed_statuses = {'Complete', 'Pending', 'Incomplete'}

    # Boolean mask: True where the row's OrderStatus is one of the allowed statuses
    has_allowed_status = df['OrderStatus'].isin(allowed_statuses)

    # Keep only the rows outside the allowed set and convert them to records
    return df.loc[~has_allowed_status].to_dict(orient='records')

#### Run the check defined by ChatGPT to identify rows in OrderStatus column that do not comply with permitted values rule

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Run the check and print the results ↓↓↓</b>
</div>

In [103]:
# The permitted-values rule is an Accuracy rule (its percentage is reported under
# 'Accuracy checks:' and feeds the OrderStatus Accuracy score), so the rows are listed
# under the Accuracy heading rather than the Consistency one.
print('Accuracy checks:')
print('======')
print('OrderStatus non-complying rows for permitted values:')
# Keep the result in a variable: a later cell uses its length to compute the percentage.
order_status_permitted_values_non_compliers = check_order_status_quality(df) # call the function defined by ChatGPT
print(order_status_permitted_values_non_compliers)

Consistency checks:
OrderStatus non-complying rows for permitted values:
[{'SalesDate': '2020-02-07', 'SalesID': '775', 'CustomerID': '5', 'SalesPersonID': '9', 'ProductID': '35', 'Quantity': '7', 'Sales': '38.5', 'StoreID': '4', 'OrderStatus': 'rTc8Le3iHuV6'}, {'SalesDate': '2020-04-01', 'SalesID': '1075', 'CustomerID': '32', 'SalesPersonID': '1', 'ProductID': '62', 'Quantity': '5', 'Sales': '161.0', 'StoreID': '1', 'OrderStatus': '2Jk'}, {'SalesDate': '2020-04-07', 'SalesID': '1230', 'CustomerID': '19', 'SalesPersonID': '9', 'ProductID': '49', 'Quantity': '36', 'Sales': '899.8', 'StoreID': '2', 'OrderStatus': 'l4cNSE'}, {'SalesDate': '2020-07-13', 'SalesID': '1662', 'CustomerID': '13', 'SalesPersonID': '3', 'ProductID': '21', 'Quantity': '10', 'Sales': '101.4', 'StoreID': '9', 'OrderStatus': 'QhIX'}, {'SalesDate': '2020-08-11', 'SalesID': '2016', 'CustomerID': '3', 'SalesPersonID': '9', 'ProductID': '57', 'Quantity': '5', 'Sales': '101.9', 'StoreID': '5', 'OrderStatus': 'nCYNj2qGHfSV

#### Find percentage non-compliance for permitted values rule:

In [104]:
# Percentage of all rows in df whose OrderStatus is not a permitted value.
# len(df) is used instead of the single-letter global `l` set near the top of the
# notebook, so the denominator always matches the df that was actually checked.
order_status_permitted_values_percentage_non_compliance = len(order_status_permitted_values_non_compliers) / len(df) * 100

In [105]:
# Report the permitted-values non-compliance percentage computed in the previous cell.
print('Accuracy checks:', '======', sep='\n')
print(f'OrderStatus percentage non-compliance for permitted values: {order_status_permitted_values_percentage_non_compliance!s}')

Accuracy checks:
OrderStatus percentage non-compliance for permitted values: 0.2353494939985879


#### Calculate **Consistency** and **Accuracy** scores:

In [106]:
# Consistency score for OrderStatus: a single rule (data format), so no averaging is involved.
order_status_format_non_compliance = data_format_percentage_non_compliance(df, 'OrderStatus', 'A', [10])
order_status_consistency_score = 100 - order_status_format_non_compliance
print(f'Consistency score: {order_status_consistency_score!s}')

# Accuracy score for OrderStatus: based on the permitted-values percentage computed in an earlier cell.
order_status_accuracy_score = 100 - order_status_permitted_values_percentage_non_compliance
print(f'Accuracy score: {order_status_accuracy_score!s}')

Consistency score: 99.87055777830078
Accuracy score: 99.76465050600142


# Reporting of Overall Dataset quality
## Calculate and present overall **Completeness**, **Consistency**, **Accuracy** scores of dataset

**Completeness** score of entire dataset = average of **Completeness** scores for each column

In [107]:
# Completeness score of the whole dataset. It is hard-coded to 100 because, as found
# earlier in this notebook, every column passed the Completeness check.
# NOTE(review): this value is not recomputed here - update it if the data or the checks change.
dataset_completeness_score = 100

**Consistency** score of entire dataset = average of **Consistency** scores for each column

In [108]:
# Dataset-level Consistency score: the plain average of the per-column Consistency scores.
column_consistency_scores = (
    sales_date_consistency_score,
    sales_id_consistency_score,
    customer_id_consistency_score,
    salesperson_id_consistency_score,
    product_id_consistency_score,
    quantity_consistency_score,
    sales_consistency_score,
    store_id_consistency_score,
    order_status_consistency_score,
)

# Add the scores left to right, starting from the first one.
consistency_total = column_consistency_scores[0]
for column_score in column_consistency_scores[1:]:
    consistency_total = consistency_total + column_score

dataset_consistency_score = consistency_total / len(column_consistency_scores)

**Accuracy** score of entire dataset = average of **Accuracy** scores for each column

In [109]:
# Dataset-level Accuracy score: the plain average of the per-column Accuracy scores.
column_accuracy_scores = (
    sales_date_accuracy_score,
    sales_id_accuracy_score,
    customer_id_accuracy_score,
    salesperson_id_accuracy_score,
    product_id_accuracy_score,
    quantity_accuracy_score,
    sales_accuracy_score,
    store_id_accuracy_score,
    order_status_accuracy_score,
)

# Add the scores left to right, starting from the first one.
accuracy_total = column_accuracy_scores[0]
for column_score in column_accuracy_scores[1:]:
    accuracy_total = accuracy_total + column_score

dataset_accuracy_score = accuracy_total / len(column_accuracy_scores)

Present the results in a table:

In [110]:
# Target values that the dataset-level scores are compared against in the results table below.
# A score meets its target when it is greater than or equal to it.
completeness_target = 100
consistency_target = 100
accuracy_target = 90

In [111]:
def get_pass_fail(performance, target):
    """Return 'Pass' when performance is at least target, otherwise 'Fail'.

    Parameters:
        performance: the achieved score.
        target: the score that has to be reached (inclusive).

    Returns:
        The string 'Pass' or 'Fail'.
    """
    return 'Pass' if performance >= target else 'Fail'

In [112]:
def _build_results_line(cells, edge, fill=' '):
    """Build one line of the results table from four cell strings.

    Each cell is left-justified to 15 characters using `fill`; cells are separated
    by `edge` with a space on either side, and the line starts and ends with `edge`.
    """
    padded_cells = [cell.ljust(15, fill) for cell in cells]
    return edge + ' ' + (' ' + edge + ' ').join(padded_cells) + ' ' + edge


# Header row with the four column titles.
results_header_row = _build_results_line(['METRIC', 'PERFORMANCE', 'TARGET', 'PASS / FAIL'], '|')

# One row per metric: name, score cut to its first 6 characters, target, Pass/Fail verdict.
results_completeness_row = _build_results_line(
    [
        'Completeness',
        str(dataset_completeness_score)[:6],
        str(completeness_target),
        get_pass_fail(dataset_completeness_score, completeness_target),
    ],
    '|',
)

results_consistency_row = _build_results_line(
    [
        'Consistency',
        str(dataset_consistency_score)[:6],
        str(consistency_target),
        get_pass_fail(dataset_consistency_score, consistency_target),
    ],
    '|',
)

results_accuracy_row = _build_results_line(
    [
        'Accuracy',
        str(dataset_accuracy_score)[:6],
        str(accuracy_target),
        get_pass_fail(dataset_accuracy_score, accuracy_target),
    ],
    '|',
)

# Horizontal rules: empty cells filled with '-' (between rows) or '=' (under the header).
results_divider = _build_results_line(['', '', '', ''], '+', fill='-')
results_header_divider = _build_results_line(['', '', '', ''], '+', fill='=')

In [113]:
# Print the results table line by line: header, then one row per metric,
# with a divider before, between and after the rows.
results_table_lines = [
    results_divider,
    results_header_row,
    results_header_divider,
    results_completeness_row,
    results_divider,
    results_consistency_row,
    results_divider,
    results_accuracy_row,
    results_divider,
]
for table_line in results_table_lines:
    print(table_line)

+ --------------- + --------------- + --------------- + --------------- +
| METRIC          | PERFORMANCE     | TARGET          | PASS / FAIL     |
| Completeness    | 100             | 100             | Pass            |
+ --------------- + --------------- + --------------- + --------------- +
| Consistency     | 99.846          | 100             | Fail            |
+ --------------- + --------------- + --------------- + --------------- +
| Accuracy        | 99.841          | 90              | Pass            |
+ --------------- + --------------- + --------------- + --------------- +
