# **DataCon 2024: Data Quality Checks**
#### Learn how to run simple Completeness, Consistency and Accuracy Checks based on pre-defined validation rules on a sample Sales Dataset

## Import Pandas Library and Load Data from File

In [None]:
#Import external libraries
%pip install pandas

# imports the 'pandas' library for use in this script and assigns it the nickname 'pd'
import pandas as pd

In [1]:
import pandas as pd

In [2]:
# loading the data from sales.csv into the DataFrame object
# (the relative path means sales.csv must sit in the notebook's working directory)
df = pd.read_csv('sales.csv')
# shift every row label up by 1 so the first row is labelled 1 instead of 0
df.index = df.index + 1

#### Some DataFrame functions built into `pandas`:

In [3]:
df.head() # returns the first 5 rows of the table (pandas' default for head())

Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
1,2020-01-01,112,37,9,70,7,226.0,9,Incomplete
2,2020-01-01,113,59,4,36,37,326.1,7,Pending
3,2020-01-01,114,59,4,40,19,422.9,7,Incomplete
4,2020-01-01,115,3,9,11,60,1683.8,nd68G6h0PZbk,Pending
5,2020-01-01,116,3,9,40,12,230.9,7,Incomplete


In [4]:
# number of rows in the table, kept in the notebook-level variable `l`
l = len(df) # returns number of rows in table
print(l)

8498


In [5]:
df['SalesID'] # get the SalesID column (returned as a pandas Series)

1        112
2        113
3        114
4        115
5        116
        ... 
8494    2843
8495    2677
8496    2844
8497    2678
8498    2845
Name: SalesID, Length: 8498, dtype: object

In [6]:
df.loc[1] # get 1st row (.loc selects by row label, and labels were shifted to start at 1)

SalesDate        2020-01-01
SalesID                 112
CustomerID               37
SalesPersonID             9
ProductID                70
Quantity                  7
Sales                 226.0
StoreID                   9
OrderStatus      Incomplete
Name: 1, dtype: object

In [7]:
# get the SalesID value of the 1st row; a single .loc[row, column] lookup
# avoids chained indexing (building the whole row first, then indexing it)
df.loc[1, 'SalesID']

'112'

In [8]:
# collect the column labels of the table into a plain Python list
column_names = df.columns.tolist()
print(column_names)

['SalesDate', 'SalesID', 'CustomerID', 'SalesPersonID', 'ProductID', 'Quantity', 'Sales', 'StoreID', 'OrderStatus']


## Define and Run Data Quality Check Functions
Each function checks the values in 1 column against 1 validation rule.<br>
## Check for **Completeness** of dataset

In [9]:
#Defining the Completeness check function
def check_cannot_be_blank (df, column_name): # returns True if column_name contains no blanks
  """Completeness check: report whether a column is free of missing values.

  Args:
    df: pandas DataFrame holding the column.
    column_name: label of the column to inspect.

  Returns:
    True if no value in the column is missing, False as soon as at least
    one value is missing. An empty column counts as complete.
  """
  # isna() flags every missing value with True (NaN, None, ...), so the
  # column is complete only when nothing is flagged
  return not df[column_name].isna().any()

In [10]:
#Run the cannot be blank (i.e. completeness) check on a specific column (i.e. SalesID)
# prints True when the column has no blank values, False otherwise
print(check_cannot_be_blank(df, 'SalesID'))

True


In [11]:
# Run the **Completeness** check on every column and print one "<column>: <result>" line each
for column_name in column_names:
  print(f'{column_name}: {check_cannot_be_blank(df, column_name)}')

SalesDate: True
SalesID: True
CustomerID: True
SalesPersonID: True
ProductID: True
Quantity: True
Sales: True
StoreID: True
OrderStatus: True


## Check for **Consistency** of dataset 
### Start with SalesDate Column in dataset

In [12]:
#Define the Consistency check function

from collections import defaultdict
import math

def is_intable (s): # returns True if the input value can be converted to an integer
  """Report whether int(s) succeeds.

  Note that this tests convertibility, not type: the string '12' and the
  float 3.7 are both accepted because int() can convert them.

  Args:
    s: any value.

  Returns:
    True if int(s) succeeds, False if it raises a conversion error.
  """
  try:
    int(s)
  # only conversion failures mean "not an integer"; a bare except would also
  # hide KeyboardInterrupt / SystemExit
  except (TypeError, ValueError, OverflowError):
    return False
  return True

def is_floatable (s): # returns True if the input value can be converted to a float
  """Report whether float(s) succeeds.

  This tests convertibility, not type: numeric strings such as '1.5' are
  accepted because float() can convert them.

  Args:
    s: any value.

  Returns:
    True if float(s) succeeds, False if it raises a conversion error.
  """
  try:
    float(s)
  # only conversion failures mean "not a float"; a bare except would also
  # hide KeyboardInterrupt / SystemExit
  except (TypeError, ValueError, OverflowError):
    return False
  return True

def is_alphabetical (s): # returns True if the input value consists only of the letters a-z / A-Z
  """Report whether a value is a string made up solely of ASCII letters.

  Args:
    s: any value.

  Returns:
    False if s is not a string or contains any character outside a-z and
    A-Z; True otherwise (including for the empty string).
  """
  if isinstance(s, str):
    # every character must fall inside one of the two ASCII letter ranges
    return all('a' <= char <= 'z' or 'A' <= char <= 'Z' for char in s)
  return False

def check_data_format (df, column_name, character_type, pieces_max_lengths): # returns True if all values in column_name comply with data format specified
  """Consistency check: report whether every value in a column fits a data format.

  Supported formats:
    'N' with two lengths [p, q]: text form is "<integer>.<integer>" with at
      most p characters before and q characters after the single '.'.
    'N' with one length [p]: value converts to an integer and its text form
      has at most p characters.
    'A' with [p]: value is a string of letters a-z / A-Z, at most p long.
    'X' with [p]: text form of the value has at most p characters.

  Args:
    df: pandas DataFrame holding the column.
    column_name: label of the column to check.
    character_type: 'N', 'A' or 'X'.
    pieces_max_lengths: list of maximum lengths as described above.

  Returns:
    True if every value complies, False at the first value that does not.
    None when the format is not one of the supported combinations.
  """
  if character_type == 'N':
    if len(pieces_max_lengths) == 2: # data format is N(p,q)
      for entry in df[column_name]:
        pieces = str(entry).split('.') # text before and after the decimal point
        if len(pieces) != 2:
          return False # not exactly one '.'
        whole, fraction = pieces
        fits = (is_intable(whole) and is_intable(fraction)
                and len(whole) <= pieces_max_lengths[0]
                and len(fraction) <= pieces_max_lengths[1])
        if not fits:
          return False
      return True
    if len(pieces_max_lengths) == 1: # data format is N(p)
      return all(is_intable(entry) and len(str(entry)) <= pieces_max_lengths[0] for entry in df[column_name])
    return None # 'N' with any other number of lengths is not handled
  if character_type == 'A': # data format is A(p)
    return all(is_alphabetical(entry) and len(entry) <= pieces_max_lengths[0] for entry in df[column_name])
  if character_type == 'X': # data format is X(p)
    return all(len(str(entry)) <= pieces_max_lengths[0] for entry in df[column_name])
  return None # unknown character type is not handled

def check_date_format_yyyymmdd (df, column_name): # returns True if all values in column_name follow the date format yyyy-mm-dd
  """Consistency check: report whether every value in a column looks like yyyy-mm-dd.

  A value complies when it is a string of three '-'-separated groups of
  ASCII digits: a 4-digit year, a 2-digit month between 01 and 12 and a
  2-digit day between 01 and 31. The day is not validated against the
  month (e.g. 02-31 is accepted).

  Args:
    df: pandas DataFrame holding the column.
    column_name: label of the column to check.

  Returns:
    True if every value complies (also for an empty column), False at the
    first value that does not.
  """
  for value in df[column_name]:
    if not isinstance(value, str):
      return False # numbers / blanks cannot be split into date parts
    parts = value.split('-') # split each value into 3 parts to assess individually
    if len(parts) != 3:
      return False # column fails the check if the number of parts is not exactly 3
    if not all(part.isascii() and part.isdigit() for part in parts):
      return False # reject signs, spaces and letters inside any part
    year, month, day = parts
    # each part is measured against its own required width
    if len(year) != 4 or len(month) != 2 or len(day) != 2:
      return False
    if not 1 <= int(month) <= 12 or not 1 <= int(day) <= 31:
      return False # month 00 / day 00 are not dates either
  return True

In [13]:
# Run both Consistency checks on the SalesDate column:
# the X(10) data format rule and the yyyy-mm-dd date format rule
print('Check SalesDate column')
print('====')
print(f"Data format: {check_data_format(df, 'SalesDate', 'X', [10])}")
print(f"Date format: {check_date_format_yyyymmdd(df, 'SalesDate')}")

Check SalesDate column
====
Data format: False
Date format: False


#### Modify the functions to investigate the rows that fail to comply with the **Consistency** rules:

In [14]:
def data_format_identify_non_complying_rows (df, column_name, character_type, pieces_max_lengths): # returns the row(s) that do not comply to data format rule
  """Consistency check: collect the rows whose value does not fit a data format.

  Supported formats:
    'N' with two lengths [p, q]: text form is "<integer>.<integer>" with at
      most p characters before and q characters after the single '.'.
    'N' with one length [p]: value converts to an integer and its text form
      has at most p characters.
    'A' with [p]: value is a string of letters a-z / A-Z, at most p long.
    'X' with [p]: text form of the value has at most p characters.

  The rows are taken from the df argument itself, so the function works for
  any table regardless of its length or row labels.

  Args:
    df: pandas DataFrame holding the column.
    column_name: label of the column to check.
    character_type: 'N', 'A' or 'X'.
    pieces_max_lengths: list of maximum lengths as described above.

  Returns:
    DataFrame with the non-complying rows, in table order, keeping their
    original row labels and columns (no rows if everything complies).

  Raises:
    ValueError: if the format is not one of the supported combinations.
  """
  if character_type == 'N' and len(pieces_max_lengths) == 2: # data format is N(p,q)
    def complies (value):
      pieces = str(value).split('.') # text before and after the decimal point
      return (len(pieces) == 2
              and is_intable(pieces[0]) and is_intable(pieces[1])
              and len(pieces[0]) <= pieces_max_lengths[0]
              and len(pieces[1]) <= pieces_max_lengths[1])
  elif character_type == 'N' and len(pieces_max_lengths) == 1: # data format is N(p)
    def complies (value):
      return is_intable(value) and len(str(value)) <= pieces_max_lengths[0]
  elif character_type == 'A': # data format is A(p)
    def complies (value):
      return is_alphabetical(value) and len(value) <= pieces_max_lengths[0]
  elif character_type == 'X': # data format is X(p)
    def complies (value):
      # measure the text form so numeric values do not crash len()
      return len(str(value)) <= pieces_max_lengths[0]
  else:
    raise ValueError('unsupported data format: ' + repr(character_type) + ' with ' + repr(pieces_max_lengths))

  # remember the position (not the label) of every failing row so the lookup
  # does not depend on how the table is indexed
  positions = [position for position, value in enumerate(df[column_name]) if not complies(value)]
  return df.iloc[positions]

def date_format_identify_non_complying_rows (df, column_name): # returns the row(s) that do not comply to date format rule
  """Consistency check: collect the rows whose value is not a yyyy-mm-dd date.

  A value complies when it is a string of three '-'-separated groups of
  ASCII digits: a 4-digit year, a 2-digit month between 01 and 12 and a
  2-digit day between 01 and 31. The day is not validated against the
  month (e.g. 02-31 is accepted).

  The rows are taken from the df argument itself, so the function works for
  any table regardless of its length or row labels.

  Args:
    df: pandas DataFrame holding the column.
    column_name: label of the column to check.

  Returns:
    DataFrame with the non-complying rows, in table order, keeping their
    original row labels and columns (no rows if everything complies).
  """
  def follows_yyyymmdd (value):
    if not isinstance(value, str):
      return False # numbers / blanks cannot be split into date parts
    parts = value.split('-') # split each value into 3 parts to assess individually
    if len(parts) != 3:
      return False
    if not all(part.isascii() and part.isdigit() for part in parts):
      return False # reject signs, spaces and letters inside any part
    year, month, day = parts
    if len(year) != 4 or len(month) != 2 or len(day) != 2:
      return False
    return 1 <= int(month) <= 12 and 1 <= int(day) <= 31

  # remember the position (not the label) of every failing row so the lookup
  # does not depend on how the table is indexed
  positions = [position for position, value in enumerate(df[column_name]) if not follows_yyyymmdd(value)]
  return df.iloc[positions]

#### Run the modified **Consistency** check function to display all rows with errors

In [15]:
# rows whose SalesDate breaks the X(10) data format rule
print('SalesDate non-complying rows for data format :')
display(data_format_identify_non_complying_rows(df, 'SalesDate', 'X', [10]))
print('\n')

# rows whose SalesDate breaks the yyyy-mm-dd date format rule
print('SalesDate non-complying rows for date format :')
display(date_format_identify_non_complying_rows(df, 'SalesDate'))

SalesDate non-complying rows for data format :


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
1775,4KJlZDR2y8ary,1810,37,3,64,7,311.4,2,Complete
2230,g9q7jkijclM4tCu,2227,19,3,2,26,461.0,3,Complete
3059,2xN93XPWEgreAXJCkfM,3685,63,7,44,21,1805.1,7,Complete




SalesDate non-complying rows for date format :


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
741,mGetkX,164,5,2,44,18,1339.8,8,Complete
1243,U,1125,5,3,54,18,690.5,6,Complete
1775,4KJlZDR2y8ary,1810,37,3,64,7,311.4,2,Complete
1994,e,2029,37,2,36,39,268.1,4,Complete
2230,g9q7jkijclM4tCu,2227,19,3,2,26,461.0,3,Complete
2384,wiw,2381,80,1,61,29,927.8,5,Complete
2805,V5,3459,59,5,31,9,69.1,9,Complete
2894,Q,3544,32,1,46,43,672.7,5,Complete
3028,UFa3b,3658,85,5,71,20,623.4,4,Complete
3059,2xN93XPWEgreAXJCkfM,3685,63,7,44,21,1805.1,7,Complete


#### Modify the functions again to provide a percentage of all rows that do not comply with the specified **Consistency** rules:

In [16]:
# returns the percentage of row(s) that do not comply to data format rule
def data_format_percentage_non_compliance (df, column_name, character_type, pieces_max_lengths):
  """Percentage (0-100) of the rows of df that break the data format rule.

  Args:
    df: pandas DataFrame holding the column.
    column_name: label of the column to check.
    character_type: format letter passed on to data_format_identify_non_complying_rows.
    pieces_max_lengths: maximum lengths passed on to data_format_identify_non_complying_rows.

  Returns:
    Number of non-complying rows divided by the number of rows in df, times
    100. An empty table yields 0.0.
  """
  total_rows = len(df) # denominator comes from the table that was passed in
  if total_rows == 0:
    return 0.0 # nothing to check, and avoids dividing by zero
  non_complying_rows = data_format_identify_non_complying_rows(df, column_name, character_type, pieces_max_lengths)
  return len(non_complying_rows) / total_rows * 100

# returns the percentage of row(s) that do not comply to date format rule
def date_format_percentage_non_compliance (df, column_name):
  """Percentage (0-100) of the rows of df that break the date format rule.

  Args:
    df: pandas DataFrame holding the column.
    column_name: label of the column to check.

  Returns:
    Number of rows reported by date_format_identify_non_complying_rows
    divided by the number of rows in df, times 100. An empty table yields 0.0.
  """
  total_rows = len(df) # denominator comes from the table that was passed in
  if total_rows == 0:
    return 0.0 # nothing to check, and avoids dividing by zero
  non_complying_rows = date_format_identify_non_complying_rows(df, column_name)
  return len(non_complying_rows) / total_rows * 100

#### Run the modified checks to find the percentage non-compliance of the SalesDate column for each of the **Consistency** checks:

In [17]:
# percentage of SalesDate rows breaking each Consistency rule
print('Consistency checks:')
print('======')
print(f"SalesDate percentage non-compliance for data format rule: {data_format_percentage_non_compliance(df, 'SalesDate', 'X', [10])}")
print(f"SalesDate percentage non-compliance for date format rule: {date_format_percentage_non_compliance(df, 'SalesDate')}")

Consistency checks:
SalesDate percentage non-compliance for data format rule: 0.03530242409978819
SalesDate percentage non-compliance for date format rule: 0.2353494939985879


#### Calculate **Completeness**, **Consistency** and **Accuracy** score:

In [18]:
sales_date_completeness_score = 100 # determined in cannot_be_blank check earlier

# consistency score = 100 minus the average non-compliance percentage of the two SalesDate rules
sales_date_consistency_score = 100 - (
  data_format_percentage_non_compliance(df, 'SalesDate', 'X', [10])
  + date_format_percentage_non_compliance(df, 'SalesDate')
) / 2

sales_date_accuracy_score = 100 # SalesDate column has no accuracy rules / checks


print(f'Completeness score: {sales_date_completeness_score}')
print(f'Consistency score: {sales_date_consistency_score}')
print(f'Accuracy score: {sales_date_accuracy_score}')

Completeness score: 100
Consistency score: 99.86467404095082
Accuracy score: 100


### Find non-complying rows
#### Define the rest of the **Consistency** validation check functions to also return a list of non-complying rows:

In [19]:
def unique_values_identify_non_complying_rows (df, column_name): # returns the row(s) that do not comply to unique values rule
  """Consistency check: collect the rows whose value occurs more than once in a column.

  The rows are taken from the df argument itself, so the function works for
  any table regardless of its length or row labels.

  Args:
    df: pandas DataFrame holding the column.
    column_name: label of the column that should hold unique values.

  Returns:
    DataFrame with every row whose value is shared with another row. Rows
    sharing a value are listed together, and the groups appear in the order
    in which each value first occurs in the table. Original row labels and
    columns are kept (no rows if all values are unique).
  """
  # for each value that occurs in the column, keep the positions of the rows holding it
  positions_by_value = defaultdict(list)
  for position, value in enumerate(df[column_name]):
    positions_by_value[value].append(position)

  # a value breaks the rule when more than one row holds it
  duplicate_positions = [
    position
    for positions in positions_by_value.values() if len(positions) > 1
    for position in positions
  ]
  return df.iloc[duplicate_positions]

def data_type_identify_non_complying_rows (df, column_name, type_specified): # returns the rows that do not comply with data type rule (Integer, Float etc.)
  """Consistency check: collect the rows whose value cannot be converted to a type.

  The rows are taken from the df argument itself, so the function works for
  any table regardless of its length or row labels.

  Args:
    df: pandas DataFrame holding the column.
    column_name: label of the column to check.
    type_specified: the type `int` or `float`. Any other type has no rule,
      so no row is reported for it.

  Returns:
    DataFrame with the rows whose value fails is_intable (for int) or
    is_floatable (for float), in table order, keeping their original row
    labels and columns.
  """
  if type_specified == int:
    convertible = is_intable
  elif type_specified == float:
    convertible = is_floatable
  else:
    return df.iloc[[]] # no rule for this type: nothing is flagged

  # remember the position (not the label) of every failing row so the lookup
  # does not depend on how the table is indexed
  positions = [position for position, value in enumerate(df[column_name]) if not convertible(value)]
  return df.iloc[positions]

#### Define the rest of the **Accuracy** validation check functions to also return a list of non-complying rows:

In [20]:
def numerical_range_identify_non_complying_rows (df, column_name, lower_bound = - math.inf, upper_bound = math.inf): # returns the row(s) that do not comply to numerical range rule
  """Accuracy check: collect the rows whose value is outside an open numeric range.

  A value complies when float(value) is strictly greater than lower_bound
  and strictly smaller than upper_bound. Values that cannot be converted to
  a float are reported as non-complying.

  The rows are taken from the df argument itself, so the function works for
  any table regardless of its length or row labels.

  Args:
    df: pandas DataFrame holding the column.
    column_name: label of the column to check.
    lower_bound: exclusive lower limit (default: no lower limit).
    upper_bound: exclusive upper limit (default: no upper limit).

  Returns:
    DataFrame with the non-complying rows, in table order, keeping their
    original row labels and columns (no rows if everything complies).
  """
  def out_of_range (value):
    try:
      number = float(value)
    except (TypeError, ValueError, OverflowError):
      return True # a value that is not numerical cannot be inside the range
    return number >= upper_bound or number <= lower_bound

  # remember the position (not the label) of every failing row so the lookup
  # does not depend on how the table is indexed
  positions = [position for position, value in enumerate(df[column_name]) if out_of_range(value)]
  return df.iloc[positions]

#### Calculate percentage non-compliance per-column for the **Consistency** checks

In [21]:
# returns the percentage of row(s) that do not comply to unique values rule
def unique_values_percentage_non_compliance (df, column_name):
  """Percentage (0-100) of the rows of df that break the unique values rule.

  Args:
    df: pandas DataFrame holding the column.
    column_name: label of the column that should hold unique values.

  Returns:
    Number of rows reported by unique_values_identify_non_complying_rows
    divided by the number of rows in df, times 100. An empty table yields 0.0.
  """
  total_rows = len(df) # denominator comes from the table that was passed in
  if total_rows == 0:
    return 0.0 # nothing to check, and avoids dividing by zero
  non_complying_rows = unique_values_identify_non_complying_rows(df, column_name)
  return len(non_complying_rows) / total_rows * 100

# returns the percentage of row(s) that do not comply to data type rule
def data_type_percentage_non_compliance (df, column_name, type_specified):
  """Percentage (0-100) of the rows of df that break the data type rule.

  Args:
    df: pandas DataFrame holding the column.
    column_name: label of the column to check.
    type_specified: type passed on to data_type_identify_non_complying_rows.

  Returns:
    Number of rows reported by data_type_identify_non_complying_rows
    divided by the number of rows in df, times 100. An empty table yields 0.0.
  """
  total_rows = len(df) # denominator comes from the table that was passed in
  if total_rows == 0:
    return 0.0 # nothing to check, and avoids dividing by zero
  non_complying_rows = data_type_identify_non_complying_rows(df, column_name, type_specified)
  return len(non_complying_rows) / total_rows * 100

#### Calculate percentage non-compliance per-column for the **Accuracy** checks

In [22]:
# returns the percentage of row(s) that do not comply to numerical range rule
def numerical_range_percentage_non_compliance (df, column_name, lower_bound = - math.inf, upper_bound = math.inf):
  """Percentage (0-100) of the rows of df that break the numerical range rule.

  Args:
    df: pandas DataFrame holding the column.
    column_name: label of the column to check.
    lower_bound: lower limit passed on to numerical_range_identify_non_complying_rows.
    upper_bound: upper limit passed on to numerical_range_identify_non_complying_rows.

  Returns:
    Number of rows reported by numerical_range_identify_non_complying_rows
    divided by the number of rows in df, times 100. An empty table yields 0.0.
  """
  total_rows = len(df) # denominator comes from the table that was passed in
  if total_rows == 0:
    return 0.0 # nothing to check, and avoids dividing by zero
  non_complying_rows = numerical_range_identify_non_complying_rows(df, column_name, lower_bound, upper_bound)
  return len(non_complying_rows) / total_rows * 100

### Move on to validation checks for SalesID column
#### Identify rows in SalesID column that do not comply with **Consistency** and **Accuracy** rules

In [23]:
print('Consistency checks:')
print('======')
# N(4): SalesID must convert to an integer and be at most 4 characters long
print('SalesID non-complying rows for data format:')
display(data_format_identify_non_complying_rows(df, 'SalesID', 'N', [4]))
print('\n')

# every SalesID should occur only once in the table
print('SalesID non-complying rows for unique values rule:')
display(unique_values_identify_non_complying_rows(df, 'SalesID'))
print('\n')

print('SalesID non-complying rows for Integer type rule:')
display(data_type_identify_non_complying_rows(df, 'SalesID', int))
print('\n')

print('Accuracy checks:')
print('======')
# lower_bound = 0 flags values that are not greater than 0, and values that are not numerical
print('SalesID non-complying rows for > 0 rule:')
display(numerical_range_identify_non_complying_rows(df, 'SalesID', lower_bound = 0))

Consistency checks:
SalesID non-complying rows for data format:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
597,2020-02-02,y,79,7,52,23,1702.3,9,Complete
676,2020-02-10,FH,5,2,73,27,29.1,4,Complete
970,2020-03-03,Q,44,8,30,15,488.3,3,Complete
2030,2020-08-28,4Uipa,37,9,30,44,1130.4,5,Complete
2128,2020-09-09,7Uk9tk44,19,9,35,33,230.4,1,Complete
2719,2021-01-02,v,32,1,1,36,665.6,5,Complete
3351,2021-02-01,yFq2zNd0oKIwNKx,48,1,76,10,4000.6,1,Complete
3637,2021-02-09,qD,63,7,74,17,634.9,8,Complete
3873,2021-03-02,D6LEy6eD,73,9,56,29,1127.5,3,Complete
4689,2021-04-07,BO5wH,17,8,26,12,342.6,5,Complete




SalesID non-complying rows for unique values rule:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
193,2020-01-02,372,37,2,36,48,317.1,2,Complete
414,2020-01-02,372,37,2,36,48,317.1,2,Complete
404,2020-01-14,6,19,3,74,50,1360.5,2,Complete
5658,2021-06-02,6,91,8,75,37,257.9,2,Complete
427,2020-01-17,28,37,9,60,10,51.1,9,Complete
537,2020-01-17,28,37,9,60,10,51.1,9,Complete
1450,2020-06-01,1502,21,3,19,17,176.1,8,Complete
1695,2020-06-01,1502,21,3,19,17,176.1,8,Complete
3165,2021-01-17,2904,44,7,52,8,683.5,2,Complete
6644,2021-01-17,2904,44,7,52,8,683.5,2,Complete




SalesID non-complying rows for Integer type rule:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
597,2020-02-02,y,79,7,52,23,1702.3,9,Complete
676,2020-02-10,FH,5,2,73,27,29.1,4,Complete
970,2020-03-03,Q,44,8,30,15,488.3,3,Complete
2030,2020-08-28,4Uipa,37,9,30,44,1130.4,5,Complete
2128,2020-09-09,7Uk9tk44,19,9,35,33,230.4,1,Complete
2719,2021-01-02,v,32,1,1,36,665.6,5,Complete
3351,2021-02-01,yFq2zNd0oKIwNKx,48,1,76,10,4000.6,1,Complete
3637,2021-02-09,qD,63,7,74,17,634.9,8,Complete
3873,2021-03-02,D6LEy6eD,73,9,56,29,1127.5,3,Complete
4689,2021-04-07,BO5wH,17,8,26,12,342.6,5,Complete




Accuracy checks:
SalesID non-complying rows for > 0 rule:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
597,2020-02-02,y,79,7,52,23,1702.3,9,Complete
676,2020-02-10,FH,5,2,73,27,29.1,4,Complete
970,2020-03-03,Q,44,8,30,15,488.3,3,Complete
2030,2020-08-28,4Uipa,37,9,30,44,1130.4,5,Complete
2128,2020-09-09,7Uk9tk44,19,9,35,33,230.4,1,Complete
2719,2021-01-02,v,32,1,1,36,665.6,5,Complete
3351,2021-02-01,yFq2zNd0oKIwNKx,48,1,76,10,4000.6,1,Complete
3637,2021-02-09,qD,63,7,74,17,634.9,8,Complete
3873,2021-03-02,D6LEy6eD,73,9,56,29,1127.5,3,Complete
4689,2021-04-07,BO5wH,17,8,26,12,342.6,5,Complete


#### Find percentage non-compliance for each rule:

In [24]:
# percentage of SalesID rows breaking each Consistency rule
print('Consistency checks:')
print('======')
print(f"SalesID percentage non-compliance for data format: {data_format_percentage_non_compliance(df, 'SalesID', 'N', [4])}")

print(f"SalesID percentage non-compliance for unique values rule: {unique_values_percentage_non_compliance(df, 'SalesID')}")

print(f"SalesID percentage non-compliance for Integer type rule: {data_type_percentage_non_compliance(df, 'SalesID', int)}")
print('\n')

# percentage of SalesID rows breaking the Accuracy rule
print('Accuracy checks:')
print('======')
print(f"SalesID percentage non-compliance for > 0 rule: {numerical_range_percentage_non_compliance(df, 'SalesID', lower_bound = 0)}")

Consistency checks:
SalesID percentage non-compliance for data format: 0.18827959519887033
SalesID percentage non-compliance for unique values rule: 0.11767474699929395
SalesID percentage non-compliance for Integer type rule: 0.18827959519887033


Accuracy checks:
SalesID percentage non-compliance for > 0 rule: 0.18827959519887033


#### Calculate **Completeness**, **Consistency** and **Accuracy** score:

In [25]:
sales_id_completeness_score = 100 # determined in cannot_be_blank check earlier

# consistency score = 100 minus the average non-compliance percentage of the three SalesID rules
sales_id_consistency_score = 100 - (
  data_format_percentage_non_compliance(df, 'SalesID', 'N', [4])
  + unique_values_percentage_non_compliance(df, 'SalesID')
  + data_type_percentage_non_compliance(df, 'SalesID', int)
) / 3

# accuracy score = 100 minus the non-compliance percentage of the single > 0 rule
sales_id_accuracy_score = 100 - numerical_range_percentage_non_compliance(df, 'SalesID', lower_bound = 0)


print(f'Completeness score: {sales_id_completeness_score}')
print(f'Consistency score: {sales_id_consistency_score}')
print(f'Accuracy score: {sales_id_accuracy_score}')

Completeness score: 100
Consistency score: 99.83525535420098
Accuracy score: 99.81172040480114


### Move on to validation checks for CustomerID column
#### Identify rows in CustomerID column that do not comply with **Consistency** and **Accuracy** rules

In [26]:
print('Consistency checks:')
print('======')
# N(3): CustomerID must convert to an integer and be at most 3 characters long
print('CustomerID non-complying rows for data format:')
display(data_format_identify_non_complying_rows(df, 'CustomerID', 'N', [3]))
print('\n')

print('CustomerID non-complying rows for Integer type rule:')
display(data_type_identify_non_complying_rows(df, 'CustomerID', int))
print('\n')

print('Accuracy checks:')
print('======')
# lower_bound = 0 flags values that are not greater than 0, and values that are not numerical
print('CustomerID non-complying rows for > 0 rule:')
display(numerical_range_identify_non_complying_rows(df, 'CustomerID', lower_bound = 0))

Consistency checks:
CustomerID non-complying rows for data format:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
508,2020-01-31,109,ielXvjMk,3,53,46,3490.3,1,Complete
743,2020-02-25,131,QsdorcuT5,3,69,22,44.7,8,Complete
764,2020-03-01,819,u,2,26,30,955.8,1,Complete
1403,2020-05-10,1439,J,2,49,25,491.4,3,Complete
1441,2020-05-27,1287,ZMI1wTiuYavT1Lhm,7,31,20,171.6,6,Complete
1948,2020-08-03,1983,1N,1,68,21,339.9,4,Complete
2254,2020-10-04,2251,m,8,25,50,837.5,6,Complete
2305,2020-11-03,2302,QrUust,3,5,26,143.2,4,Complete
2621,2021-01-02,3275,H0PE13K5,1,59,33,348.5,7,Complete
3711,2021-02-25,2470,Up,3,58,43,2711.8,7,Complete




CustomerID non-complying rows for Integer type rule:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
508,2020-01-31,109,ielXvjMk,3,53,46,3490.3,1,Complete
743,2020-02-25,131,QsdorcuT5,3,69,22,44.7,8,Complete
764,2020-03-01,819,u,2,26,30,955.8,1,Complete
1403,2020-05-10,1439,J,2,49,25,491.4,3,Complete
1441,2020-05-27,1287,ZMI1wTiuYavT1Lhm,7,31,20,171.6,6,Complete
1948,2020-08-03,1983,1N,1,68,21,339.9,4,Complete
2254,2020-10-04,2251,m,8,25,50,837.5,6,Complete
2305,2020-11-03,2302,QrUust,3,5,26,143.2,4,Complete
2621,2021-01-02,3275,H0PE13K5,1,59,33,348.5,7,Complete
3711,2021-02-25,2470,Up,3,58,43,2711.8,7,Complete




Accuracy checks:
CustomerID non-complying rows for > 0 rule:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
508,2020-01-31,109,ielXvjMk,3,53,46,3490.3,1,Complete
743,2020-02-25,131,QsdorcuT5,3,69,22,44.7,8,Complete
764,2020-03-01,819,u,2,26,30,955.8,1,Complete
1403,2020-05-10,1439,J,2,49,25,491.4,3,Complete
1441,2020-05-27,1287,ZMI1wTiuYavT1Lhm,7,31,20,171.6,6,Complete
1948,2020-08-03,1983,1N,1,68,21,339.9,4,Complete
2254,2020-10-04,2251,m,8,25,50,837.5,6,Complete
2305,2020-11-03,2302,QrUust,3,5,26,143.2,4,Complete
2621,2021-01-02,3275,H0PE13K5,1,59,33,348.5,7,Complete
3711,2021-02-25,2470,Up,3,58,43,2711.8,7,Complete


#### Find percentage non-compliance for each rule:

In [27]:
# percentage of CustomerID rows breaking each Consistency rule
print('Consistency checks:')
print('======')
print(f"CustomerID percentage non-compliance for data format: {data_format_percentage_non_compliance(df, 'CustomerID', 'N', [3])}")

print(f"CustomerID percentage non-compliance for Integer type rule: {data_type_percentage_non_compliance(df, 'CustomerID', int)}")
print('\n')

# percentage of CustomerID rows breaking the Accuracy rule
print('Accuracy checks:')
print('======')
print(f"CustomerID percentage non-compliance for > 0 rule: {numerical_range_percentage_non_compliance(df, 'CustomerID', lower_bound = 0)}")

Consistency checks:
CustomerID percentage non-compliance for data format: 0.20004706989879972
CustomerID percentage non-compliance for Integer type rule: 0.20004706989879972


Accuracy checks:
CustomerID percentage non-compliance for > 0 rule: 0.20004706989879972


#### Calculate **Completeness**, **Consistency** and **Accuracy** score:

In [28]:
customer_id_completeness_score = 100 # determined in cannot_be_blank check earlier

# consistency score = 100 minus the average non-compliance percentage of the two CustomerID rules
customer_id_consistency_score = 100 - (
  data_format_percentage_non_compliance(df, 'CustomerID', 'N', [3])
  + data_type_percentage_non_compliance(df, 'CustomerID', int)
) / 2

# accuracy score = 100 minus the non-compliance percentage of the single > 0 rule
customer_id_accuracy_score = 100 - numerical_range_percentage_non_compliance(df, 'CustomerID', lower_bound = 0)


print(f'Completeness score: {customer_id_completeness_score}')
print(f'Consistency score: {customer_id_consistency_score}')
print(f'Accuracy score: {customer_id_accuracy_score}')

Completeness score: 100
Consistency score: 99.7999529301012
Accuracy score: 99.7999529301012


### Move on to validation checks for SalesPersonID column
#### Identify rows in SalesPersonID column that do not comply with **Consistency** and **Accuracy** rules

In [29]:
print('Consistency checks:')
print('======')
# N(1): SalesPersonID must convert to an integer and be a single character long
print('SalesPersonID non-complying rows for data format:')
display(data_format_identify_non_complying_rows(df, 'SalesPersonID', 'N', [1]))
print('\n')

print('SalesPersonID non-complying rows for Integer type rule:')
display(data_type_identify_non_complying_rows(df, 'SalesPersonID', int))
print('\n')

print('Accuracy checks:')
print('======')
# lower_bound = 0 flags values that are not greater than 0, and values that are not numerical
print('SalesPersonID non-complying rows for > 0 rule:')
display(numerical_range_identify_non_complying_rows(df, 'SalesPersonID', lower_bound = 0))

Consistency checks:
SalesPersonID non-complying rows for data format:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
592,2020-02-02,715,75,NqWS,30,7,209.2,7,Complete
913,2020-03-02,968,10,xlnqAIbfylfqG82GiTIww3WQQ,62,57,1395.4,9,Complete
950,2020-03-02,1005,63,89tT2ASfvMz,13,80,2287.3,4,Complete
1064,2020-03-28,600,41,g,36,6,46.0,1,Complete
1838,2020-07-15,1683,63,SLsAspk,38,27,2705.1,1,Complete
1907,2020-08-02,1942,63,r,35,109,757.5,4,Complete
2255,2020-10-04,2252,44,K,51,20,2445.4,6,Complete
3015,2021-01-07,3649,63,8o,51,53,5224.8,9,Complete
3170,2021-01-18,2425,74,2xxryze,18,6,152.4,4,Complete
3270,2021-02-01,3751,24,sl4K,62,6,168.0,2,Complete




SalesPersonID non-complying rows for Integer type rule:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
592,2020-02-02,715,75,NqWS,30,7,209.2,7,Complete
913,2020-03-02,968,10,xlnqAIbfylfqG82GiTIww3WQQ,62,57,1395.4,9,Complete
950,2020-03-02,1005,63,89tT2ASfvMz,13,80,2287.3,4,Complete
1064,2020-03-28,600,41,g,36,6,46.0,1,Complete
1838,2020-07-15,1683,63,SLsAspk,38,27,2705.1,1,Complete
1907,2020-08-02,1942,63,r,35,109,757.5,4,Complete
2255,2020-10-04,2252,44,K,51,20,2445.4,6,Complete
3015,2021-01-07,3649,63,8o,51,53,5224.8,9,Complete
3170,2021-01-18,2425,74,2xxryze,18,6,152.4,4,Complete
3270,2021-02-01,3751,24,sl4K,62,6,168.0,2,Complete




Accuracy checks:
SalesPersonID non-complying rows for > 0 rule:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
592,2020-02-02,715,75,NqWS,30,7,209.2,7,Complete
913,2020-03-02,968,10,xlnqAIbfylfqG82GiTIww3WQQ,62,57,1395.4,9,Complete
950,2020-03-02,1005,63,89tT2ASfvMz,13,80,2287.3,4,Complete
1064,2020-03-28,600,41,g,36,6,46.0,1,Complete
1838,2020-07-15,1683,63,SLsAspk,38,27,2705.1,1,Complete
1907,2020-08-02,1942,63,r,35,109,757.5,4,Complete
2255,2020-10-04,2252,44,K,51,20,2445.4,6,Complete
3015,2021-01-07,3649,63,8o,51,53,5224.8,9,Complete
3170,2021-01-18,2425,74,2xxryze,18,6,152.4,4,Complete
3270,2021-02-01,3751,24,sl4K,62,6,168.0,2,Complete


#### Find percentage non-compliance for each rule:

In [30]:
# percentage of SalesPersonID rows breaking each Consistency rule
print('Consistency checks:')
print('======')
print(f"SalesPersonID percentage non-compliance for data format: {data_format_percentage_non_compliance(df, 'SalesPersonID', 'N', [1])}")

print(f"SalesPersonID percentage non-compliance for Integer type rule: {data_type_percentage_non_compliance(df, 'SalesPersonID', int)}")
print('\n')

# percentage of SalesPersonID rows breaking the Accuracy rule
print('Accuracy checks:')
print('======')
print(f"SalesPersonID percentage non-compliance for > 0 rule: {numerical_range_percentage_non_compliance(df, 'SalesPersonID', lower_bound = 0)}")

Consistency checks:
SalesPersonID percentage non-compliance for data format: 0.18827959519887033
SalesPersonID percentage non-compliance for Integer type rule: 0.18827959519887033


Accuracy checks:
SalesPersonID percentage non-compliance for > 0 rule: 0.18827959519887033


#### Calculate **Completeness**, **Consistency** and **Accuracy** score:

In [31]:
salesperson_id_completeness_score = 100 # determined in cannot_be_blank check earlier

# consistency score = 100 minus the average non-compliance percentage of the two SalesPersonID rules
salesperson_id_consistency_score = 100 - (
  data_format_percentage_non_compliance(df, 'SalesPersonID', 'N', [1])
  + data_type_percentage_non_compliance(df, 'SalesPersonID', int)
) / 2

# accuracy score = 100 minus the non-compliance percentage of the single > 0 rule
salesperson_id_accuracy_score = 100 - numerical_range_percentage_non_compliance(df, 'SalesPersonID', lower_bound = 0)


print(f'Completeness score: {salesperson_id_completeness_score}')
print(f'Consistency score: {salesperson_id_consistency_score}')
print(f'Accuracy score: {salesperson_id_accuracy_score}')

Completeness score: 100
Consistency score: 99.81172040480114
Accuracy score: 99.81172040480114


# Your turn!
Fill in the code blocks to identify non-complying rows, calculate percentage non-compliance, and calculate **Consistency** and **Accuracy** scores for the ProductID and Quantity columns

## Check ProductID column
Identify rows in ProductID column that do not comply with **Consistency** and **Accuracy** rules:

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Run the validation check functions to find non-complying rows ↓↓↓</b>
</div>

In [32]:
# fill in the code!
# Consistency: show the rows whose ProductID breaks the data format rule, then the integer-type rule.
print('Consistency checks:', '======', 'ProductID non-complying rows for data format:', sep='\n')
display(data_format_identify_non_complying_rows(df, 'ProductID', 'N', [2]))
print('\n')

print('ProductID non-complying rows for Integer type rule:')
display(data_type_identify_non_complying_rows(df, 'ProductID', int))
print('\n')

# Accuracy: show the rows whose ProductID fails the "> 0" range rule.
print('Accuracy checks:', '======', 'ProductID non-complying rows for > 0 rule:', sep='\n')
display(numerical_range_identify_non_complying_rows(df, 'ProductID', lower_bound=0))

Consistency checks:
ProductID non-complying rows for data format:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
1326,2020-05-02,1362,28,4,-77,8,136.2,3,Complete
1460,2020-06-02,1512,59,4,24j2K5WcBfQH0JpmHDlxsYg,35,851.6,7,Complete
2092,2020-09-02,2143,19,3,UHZy,34,1755.6,7,Complete
2734,2021-01-02,3388,127,9,3xgXb3plr,9,44.3,7,Complete
2876,2021-01-03,3526,34,9,-19,38,322.8,4,Complete
4522,2021-04-02,4909,35,9,EK,19,1003.9,1,Complete
4814,2021-04-16,2515,51,1,-18,40,1107.1,8,Complete
5315,2021-05-09,5610,47,3,y,66,1475.1,5,Complete
5498,2021-06-01,5755,32,4,-50,21,348.8,6,Complete
5521,2021-06-01,5778,37,9,Od,12,52.6,9,Complete




ProductID non-complying rows for Integer type rule:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
1460,2020-06-02,1512,59,4,24j2K5WcBfQH0JpmHDlxsYg,35,851.6,7,Complete
2092,2020-09-02,2143,19,3,UHZy,34,1755.6,7,Complete
2734,2021-01-02,3388,127,9,3xgXb3plr,9,44.3,7,Complete
4522,2021-04-02,4909,35,9,EK,19,1003.9,1,Complete
5315,2021-05-09,5610,47,3,y,66,1475.1,5,Complete
5521,2021-06-01,5778,37,9,Od,12,52.6,9,Complete
6452,2021-07-10,6673,32,1,WhcfHZegdLdCLO49ITICn,69,342.4,5,Complete




Accuracy checks:
ProductID non-complying rows for > 0 rule:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
1326,2020-05-02,1362,28,4,-77,8,136.2,3,Complete
1460,2020-06-02,1512,59,4,24j2K5WcBfQH0JpmHDlxsYg,35,851.6,7,Complete
2092,2020-09-02,2143,19,3,UHZy,34,1755.6,7,Complete
2734,2021-01-02,3388,127,9,3xgXb3plr,9,44.3,7,Complete
2876,2021-01-03,3526,34,9,-19,38,322.8,4,Complete
4522,2021-04-02,4909,35,9,EK,19,1003.9,1,Complete
4814,2021-04-16,2515,51,1,-18,40,1107.1,8,Complete
5315,2021-05-09,5610,47,3,y,66,1475.1,5,Complete
5498,2021-06-01,5755,32,4,-50,21,348.8,6,Complete
5521,2021-06-01,5778,37,9,Od,12,52.6,9,Complete


Find percentage non-compliance for each rule:

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Run the validation check functions to find percentage non-compliance ↓↓↓</b>
</div>

In [56]:
# fill in the code!
# Consistency: percentage of rows whose ProductID breaks the data format rule and the integer-type rule.
print('Consistency checks:', '======', sep='\n')
print(
    'ProductID percentage non-compliance for data format: '
    f"{data_format_percentage_non_compliance(df, 'ProductID', 'N', [2])!s}"
)

print(
    'ProductID percentage non-compliance for Integer type rule: '
    f"{data_type_percentage_non_compliance(df, 'ProductID', int)!s}"
)
print('\n')

# Accuracy: percentage of rows whose ProductID fails the "> 0" range rule.
print('Accuracy checks:', '======', sep='\n')
print(
    'ProductID percentage non-compliance for > 0 rule: '
    f"{numerical_range_percentage_non_compliance(df, 'ProductID', lower_bound=0)!s}"
)

Consistency checks:
ProductID percentage non-compliance for data format: 0.16474464579901155
ProductID percentage non-compliance for Integer type rule: 0.08237232289950577


Accuracy checks:
ProductID percentage non-compliance for > 0 rule: 0.16474464579901155


Calculate **Completeness**, **Consistency** and **Accuracy** score:

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Calculate scores for ProductID column ↓↓↓</b>
</div>

In [57]:
# fill in the code!
# Completeness was already determined by the cannot_be_blank check earlier in the notebook.
product_id_completeness_score = 100

# Consistency: 100 minus the mean of the two consistency-rule percentages (data format, integer type).
product_id_consistency_score = 100 - (
    data_format_percentage_non_compliance(df, 'ProductID', 'N', [2])
    + data_type_percentage_non_compliance(df, 'ProductID', int)
) / 2

# Accuracy: 100 minus the percentage of rows failing the single "> 0" range rule.
product_id_accuracy_score = 100 - numerical_range_percentage_non_compliance(
    df, 'ProductID', lower_bound=0
)


print(f'Completeness score: {product_id_completeness_score!s}')
print(f'Consistency score: {product_id_consistency_score!s}')
print(f'Accuracy score: {product_id_accuracy_score!s}')

Completeness score: 100
Consistency score: 99.87644151565074
Accuracy score: 99.83525535420098


## Check Quantity column
Identify rows in Quantity column that do not comply with **Consistency** and **Accuracy** rules:

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Run the validation check functions to find non-complying rows ↓↓↓</b>
</div>

In [58]:
# fill in the code!
# Consistency: show the rows whose Quantity breaks the data format rule, then the integer-type rule.
print('Consistency checks:')
print('======')
print('Quantity non-complying rows for data format:')
display(data_format_identify_non_complying_rows(df, 'Quantity', 'N', [4]))
print('\n')

print('Quantity non-complying rows for Integer type rule:')
display(data_type_identify_non_complying_rows(df, 'Quantity', int))
print('\n')

# Accuracy: show the rows whose Quantity fails the "> 0" range rule.
# The label names the column so the printed output is self-explanatory.
print('Accuracy checks:')
print('======')
print('Quantity non-complying rows for > 0 rule:')
display(numerical_range_identify_non_complying_rows(df, 'Quantity', lower_bound = 0))

Consistency checks:
Quantity non-complying rows for data format:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
659,2020-02-08,782,39,3,6,e,27.0,1,Complete
2160,2020-09-12,2211,5,3,55,n,849.4,1,Complete
3669,2021-02-11,4150,19,9,21,SoKA,444.1,1,Complete
4268,2021-04-01,4657,87,7,14,S8dTEimdwH6723,248.7,3,Complete
4300,2021-04-01,4689,19,9,11,KBIEBM,282.8,3,Complete
4933,2021-05-01,5228,24,7,62,rQgB0KT6os5awe6,156.6,6,Complete
5858,2021-06-23,2726,47,2,45,UU,168.8,8,Complete
6157,2021-07-02,6378,75,1,68,O,96.1,7,Complete
6527,2021-07-29,2750,46,2,19,kN,193.7,3,Complete
7317,2021-09-02,7461,63,8,6,Z3R536u,180.4,3,Complete




Quantity non-complying rows for Integer type rule:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
659,2020-02-08,782,39,3,6,e,27.0,1,Complete
2160,2020-09-12,2211,5,3,55,n,849.4,1,Complete
3669,2021-02-11,4150,19,9,21,SoKA,444.1,1,Complete
4268,2021-04-01,4657,87,7,14,S8dTEimdwH6723,248.7,3,Complete
4300,2021-04-01,4689,19,9,11,KBIEBM,282.8,3,Complete
4933,2021-05-01,5228,24,7,62,rQgB0KT6os5awe6,156.6,6,Complete
5858,2021-06-23,2726,47,2,45,UU,168.8,8,Complete
6157,2021-07-02,6378,75,1,68,O,96.1,7,Complete
6527,2021-07-29,2750,46,2,19,kN,193.7,3,Complete
7317,2021-09-02,7461,63,8,6,Z3R536u,180.4,3,Complete




Accuracy checks:
 non-complying rows for > 0 rule:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
659,2020-02-08,782,39,3,6,e,27.0,1,Complete
2160,2020-09-12,2211,5,3,55,n,849.4,1,Complete
2722,2021-01-02,3376,32,1,68,0,211.8,5,Complete
3669,2021-02-11,4150,19,9,21,SoKA,444.1,1,Complete
4268,2021-04-01,4657,87,7,14,S8dTEimdwH6723,248.7,3,Complete
4300,2021-04-01,4689,19,9,11,KBIEBM,282.8,3,Complete
4933,2021-05-01,5228,24,7,62,rQgB0KT6os5awe6,156.6,6,Complete
5858,2021-06-23,2726,47,2,45,UU,168.8,8,Complete
6157,2021-07-02,6378,75,1,68,O,96.1,7,Complete
6527,2021-07-29,2750,46,2,19,kN,193.7,3,Complete


Find percentage non-compliance for each rule:

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Run the validation check functions to find percentage non-compliance ↓↓↓</b>
</div>

In [59]:
# fill in the code!
# Consistency: percentage of rows whose Quantity breaks the data format rule and the integer-type rule.
print('Consistency checks:', '======', sep='\n')
print(
    'Quantity percentage non-compliance for data format: '
    f"{data_format_percentage_non_compliance(df, 'Quantity', 'N', [4])!s}"
)

print(
    'Quantity percentage non-compliance for Integer type rule: '
    f"{data_type_percentage_non_compliance(df, 'Quantity', int)!s}"
)
print('\n')

# Accuracy: percentage of rows whose Quantity fails the "> 0" range rule.
print('Accuracy checks:', '======', sep='\n')
print(
    'Quantity percentage non-compliance for > 0 rule: '
    f"{numerical_range_percentage_non_compliance(df, 'Quantity', lower_bound=0)!s}"
)

Consistency checks:
Quantity percentage non-compliance for data format: 0.14120969639915276
Quantity percentage non-compliance for Integer type rule: 0.14120969639915276


Accuracy checks:
Quantity percentage non-compliance for > 0 rule: 0.15297717109908215


Calculate **Completeness**, **Consistency** and **Accuracy** score:

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Calculate scores for Quantity column ↓↓↓</b>
</div>

In [60]:
# fill in the code!
# Completeness was already determined by the cannot_be_blank check earlier in the notebook.
quantity_completeness_score = 100

# Consistency: 100 minus the mean of the two consistency-rule percentages (data format, integer type).
quantity_consistency_score = 100 - (
    data_format_percentage_non_compliance(df, 'Quantity', 'N', [4])
    + data_type_percentage_non_compliance(df, 'Quantity', int)
) / 2

# Accuracy: 100 minus the percentage of rows failing the single "> 0" range rule.
quantity_accuracy_score = 100 - numerical_range_percentage_non_compliance(
    df, 'Quantity', lower_bound=0
)


print(f'Completeness score: {quantity_completeness_score!s}')
print(f'Consistency score: {quantity_consistency_score!s}')
print(f'Accuracy score: {quantity_accuracy_score!s}')

Completeness score: 100
Consistency score: 99.85879030360084
Accuracy score: 99.84702282890092


### Move on to validation checks for Sales column
#### Identify rows in Sales column that do not comply with **Consistency** and **Accuracy** rules

In [61]:
# Consistency: show the rows whose Sales value breaks the data format rule, then the float-type rule.
print('Consistency checks:')
print('======')
print('Sales non-complying rows for data format:')
display(data_format_identify_non_complying_rows(df, 'Sales', 'N', [10,2]))
print('\n')

print('Sales non-complying rows for Float type rule:')
display(data_type_identify_non_complying_rows(df, 'Sales', float))
print('\n')

# Accuracy: show the rows whose Sales value fails the ">= 0" range rule.
# The Sales percentage and score cells apply the ">= 0" rule with lower_bound = -0.001
# (the bound appears to be exclusive, so a value just below zero lets 0 pass);
# the same rule is used here so the listed rows are the ones that count against the score.
print('Accuracy checks:')
print('======')
print('Sales non-complying rows for >= 0 rule:')
display(numerical_range_identify_non_complying_rows(df, 'Sales', lower_bound = -0.001))

Consistency checks:
Sales non-complying rows for data format:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
341,2020-01-03,520,73,9,13,18,WNl2tBcj5n1,2,Complete
2147,2020-09-10,2198,63,8,21,13,R,9,Complete
2352,2020-12-01,2349,39,3,15,5,eFU,1,Complete
3138,2021-01-13,2930,79,5,34,19,nCWAo9ag,2,Complete
4963,2021-05-01,5258,19,3,29,6,NB3Cu77,1,Complete
4993,2021-05-02,5288,56,7,62,20,F,4,Complete
5427,2021-06-01,5684,32,1,41,13,70.139,1,Complete
5732,2021-06-06,5989,37,3,40,12,oOn,7,Complete
5865,2021-06-27,2563,9,4,46,26,oSZtOdKa,8,Complete
6240,2021-07-02,6461,44,8,16,35,uwBoC,6,Complete




Sales non-complying rows for Float type rule:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
341,2020-01-03,520,73,9,13,18,WNl2tBcj5n1,2,Complete
2147,2020-09-10,2198,63,8,21,13,R,9,Complete
2352,2020-12-01,2349,39,3,15,5,eFU,1,Complete
3138,2021-01-13,2930,79,5,34,19,nCWAo9ag,2,Complete
4963,2021-05-01,5258,19,3,29,6,NB3Cu77,1,Complete
4993,2021-05-02,5288,56,7,62,20,F,4,Complete
5732,2021-06-06,5989,37,3,40,12,oOn,7,Complete
5865,2021-06-27,2563,9,4,46,26,oSZtOdKa,8,Complete
6240,2021-07-02,6461,44,8,16,35,uwBoC,6,Complete
7189,2021-09-01,7333,73,3,11,43,q3ohlKqQ7Ed,3,Complete




Accuracy checks:
Sales non-complying rows for > 0 rule:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
341,2020-01-03,520,73,9,13,18,WNl2tBcj5n1,2,Complete
2147,2020-09-10,2198,63,8,21,13,R,9,Complete
2352,2020-12-01,2349,39,3,15,5,eFU,1,Complete
3138,2021-01-13,2930,79,5,34,19,nCWAo9ag,2,Complete
4963,2021-05-01,5258,19,3,29,6,NB3Cu77,1,Complete
4993,2021-05-02,5288,56,7,62,20,F,4,Complete
5732,2021-06-06,5989,37,3,40,12,oOn,7,Complete
5865,2021-06-27,2563,9,4,46,26,oSZtOdKa,8,Complete
6240,2021-07-02,6461,44,8,16,35,uwBoC,6,Complete
7189,2021-09-01,7333,73,3,11,43,q3ohlKqQ7Ed,3,Complete


#### Find percentage non-compliance for each rule:

In [62]:
# Consistency: percentage of rows whose Sales value breaks the data format rule and the float-type rule.
print('Consistency checks:', '======', sep='\n')
print(
    'Sales percentage non-compliance for data format: '
    f"{data_format_percentage_non_compliance(df, 'Sales', 'N', [10,2])!s}"
)

print(
    'Sales percentage non-compliance for Float type rule: '
    f"{data_type_percentage_non_compliance(df, 'Sales', float)!s}"
)
print('\n')

# Accuracy: percentage of rows failing the ">= 0" rule; the bound is set just below zero
# (presumably because the helper treats lower_bound as exclusive - confirm against its definition).
print('Accuracy checks:', '======', sep='\n')
print(
    'Sales percentage non-compliance for >= 0 rule: '
    f"{numerical_range_percentage_non_compliance(df, 'Sales', lower_bound=-0.001)!s}"
)

Consistency checks:
Sales percentage non-compliance for data format: 0.15297717109908215
Sales percentage non-compliance for Float type rule: 0.14120969639915276


Accuracy checks:
Sales percentage non-compliance for >= 0 rule: 0.14120969639915276


#### Calculate **Completeness**, **Consistency** and **Accuracy** score:

In [63]:
# Completeness was already determined by the cannot_be_blank check earlier in the notebook.
sales_completeness_score = 100

# Consistency: 100 minus the mean of the two consistency-rule percentages (data format, float type).
sales_consistency_score = 100 - (
    data_format_percentage_non_compliance(df, 'Sales', 'N', [10,2])
    + data_type_percentage_non_compliance(df, 'Sales', float)
) / 2

# Accuracy: 100 minus the percentage of rows failing the ">= 0" range rule.
sales_accuracy_score = 100 - numerical_range_percentage_non_compliance(
    df, 'Sales', lower_bound=-0.001
)


print(f'Completeness score: {sales_completeness_score!s}')
print(f'Consistency score: {sales_consistency_score!s}')
print(f'Accuracy score: {sales_accuracy_score!s}')

Completeness score: 100
Consistency score: 99.85290656625088
Accuracy score: 99.85879030360084


### Move on to validation checks for StoreID column
#### Identify rows in StoreID column that do not comply with **Consistency** and **Accuracy** rules

In [64]:
# Consistency: show the rows whose StoreID breaks the data format rule.
print('Consistency checks:', '======', 'StoreID non-complying rows for data format:', sep='\n')
display(data_format_identify_non_complying_rows(df, 'StoreID', 'N', [1]))

Consistency checks:
StoreID non-complying rows for data format:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
4,2020-01-01,115,3,9,11,60,1683.8,nd68G6h0PZbk,Pending
517,2020-02-01,584,28,5,24,11,58.2,-2,Complete
1210,2020-04-15,1114,16,2,22,12,47.5,OybsWeAABRihQ,Complete
1335,2020-05-02,1371,73,9,76,7,2671.2,dBA,Complete
2701,2021-01-02,3355,80,1,29,11,1259.2,44,Complete
3732,2021-03-01,4197,40,4,33,7,177.3,kX,Complete
4428,2021-04-02,4815,63,8,76,90,36462.6,Oc,Complete
5721,2021-06-05,5978,46,3,72,22,219.1,w3VbbUt5,Complete
6072,2021-07-02,6293,39,3,36,46,350.5,14,Complete
6533,2021-08-01,6716,41,5,25,10,138.0,31,Complete


#### Find percentage non-compliance for data format rule:

In [65]:
# Consistency: percentage of rows whose StoreID breaks the data format rule.
print('Consistency checks:', '======', sep='\n')
print(
    'StoreID percentage non-compliance for data format: '
    f"{data_format_percentage_non_compliance(df, 'StoreID', 'N', [1])!s}"
)

Consistency checks:
StoreID percentage non-compliance for data format: 0.15297717109908215


# Using ChatGPT to generate validation check function - **Accuracy**
### Check that all values in StoreID Column contain only the permitted values (1 - 9)

In [66]:
def check_store_id_quality(df):
    """Return the rows of ``df`` whose 'StoreID' is not a permitted store ID.

    Permitted store IDs are the single digits 1 to 9. Values are compared on
    their string form, so an integer ``7`` is accepted just like the text
    ``'7'``; anything else (e.g. ``'44'``, ``'-2'``, ``'dBA'``) is flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a 'StoreID' column.

    Returns
    -------
    pandas.DataFrame
        The non-complying rows with their original index and all columns;
        empty when every row complies.
    """
    # Define the permitted values
    permitted_values = {'1', '2', '3', '4', '5', '6', '7', '8', '9'}

    # Compare on the string form so the check does not depend on whether the
    # column was loaded as text or as integers.
    is_permitted = df['StoreID'].astype(str).isin(permitted_values)

    # Keep only the rows whose 'StoreID' is NOT in the permitted values
    return df[~is_permitted]

#### Run the check defined by ChatGPT to identify rows in StoreID column that do not comply with permitted values rule

In [67]:
# The permitted-values rule is an Accuracy rule: its percentage is printed under
# 'Accuracy checks:' and it feeds store_id_accuracy_score, so the header says Accuracy.
print('Accuracy checks:')
print('======')
print('StoreID non-complying rows for permitted values:')
display(check_store_id_quality(df))

Consistency checks:
StoreID non-complying rows for permitted values:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
4,2020-01-01,115,3,9,11,60,1683.8,nd68G6h0PZbk,Pending
517,2020-02-01,584,28,5,24,11,58.2,-2,Complete
1210,2020-04-15,1114,16,2,22,12,47.5,OybsWeAABRihQ,Complete
1335,2020-05-02,1371,73,9,76,7,2671.2,dBA,Complete
2701,2021-01-02,3355,80,1,29,11,1259.2,44,Complete
3732,2021-03-01,4197,40,4,33,7,177.3,kX,Complete
4428,2021-04-02,4815,63,8,76,90,36462.6,Oc,Complete
5721,2021-06-05,5978,46,3,72,22,219.1,w3VbbUt5,Complete
6072,2021-07-02,6293,39,3,36,46,350.5,14,Complete
6533,2021-08-01,6716,41,5,25,10,138.0,31,Complete


#### Find percentage non-compliance for permitted values rule:

In [68]:
# Percentage of all rows whose StoreID is outside the permitted values.
# The row count is taken from df directly instead of the `l` cached near the top of the
# notebook, so the result stays correct if cells are re-run or df is reloaded.
store_id_permitted_values_percentage_non_compliance = len(check_store_id_quality(df)) / len(df) * 100

In [69]:
# Accuracy: report the percentage of rows whose StoreID is outside the permitted values.
print('Accuracy checks:', '======', sep='\n')
print(
    'StoreID percentage non-compliance for permitted values: '
    f'{store_id_permitted_values_percentage_non_compliance!s}'
)

Accuracy checks:
StoreID percentage non-compliance for permitted values: 0.15297717109908215


#### Calculate **Completeness**, **Consistency** and **Accuracy** score:

In [70]:
# Completeness was already determined by the cannot_be_blank check earlier in the notebook.
store_id_completeness_score = 100

# Consistency: StoreID has a single consistency rule (data format), so no averaging is needed.
store_id_consistency_score = 100 - data_format_percentage_non_compliance(
    df, 'StoreID', 'N', [1]
)

# Accuracy: 100 minus the permitted-values percentage computed in the earlier cell.
store_id_accuracy_score = 100 - store_id_permitted_values_percentage_non_compliance


print(f'Completeness score: {store_id_completeness_score!s}')
print(f'Consistency score: {store_id_consistency_score!s}')
print(f'Accuracy score: {store_id_accuracy_score!s}')

Completeness score: 100
Consistency score: 99.84702282890092
Accuracy score: 99.84702282890092


### Move on to OrderStatus column
#### Identify rows in OrderStatus column that do not comply with data format rule:

In [71]:
# Consistency: show the rows whose OrderStatus breaks the data format rule
# ('A', [10] is presumably "alphabetic, up to 10 characters" - see the helper definition).
print('Consistency checks:', '======', 'OrderStatus non-complying rows for data format:', sep='\n')
display(data_format_identify_non_complying_rows(df, 'OrderStatus', 'A', [10]))

Consistency checks:
OrderStatus non-complying rows for data format:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
652,2020-02-07,775,5,9,35,7,38.5,4,rTc8Le3iHuV6
1077,2020-04-01,1075,32,1,62,5,161.0,1,2Jk
1178,2020-04-07,1230,19,9,49,36,899.8,2,l4cNSE
1981,2020-08-11,2016,3,9,57,5,101.9,5,nCYNj2qGHfSVTaNOTaeigVRyaoajW5
2916,2021-01-03,3566,69,4,35,3,22.2,1,TR3OMwFdOBEF
4217,2021-03-19,2499,44,7,75,19,155.8,7,RVD63ltyufaIkUaTHo
5428,2021-06-01,5685,32,4,43,42,589.9,1,5ji
6826,2021-08-02,7008,61,3,41,5,54.0,5,kRIT3I2xQqtQ
7124,2021-09-01,7268,79,8,19,12,111.7,7,Mtb4Lo
7388,2021-09-02,7532,76,4,77,27,373.7,9,Python!1!1


#### Find percentage non-compliance for data format rule:

In [72]:
# Consistency: percentage of rows whose OrderStatus breaks the data format rule.
print('Consistency checks:', '======', sep='\n')
print(
    'OrderStatus percentage non-compliance for data format: '
    f"{data_format_percentage_non_compliance(df, 'OrderStatus', 'A', [10])!s}"
)

Consistency checks:
OrderStatus percentage non-compliance for data format: 0.12944222169922334


# Using ChatGPT to generate validation check function - **Accuracy**
### Check that all values in the OrderStatus column contain only the permitted values (Complete, Pending, Incomplete)

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Get ChatGPT to write the Accuracy check function ↓↓↓</b>
</div>

In [73]:
# fill in the code using ChatGPT!
def check_order_status_quality(df):
    """Return the rows of ``df`` whose 'OrderStatus' is not a permitted status.

    The permitted statuses are exactly 'Complete', 'Pending' and 'Incomplete'
    (case-sensitive).

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing an 'OrderStatus' column.

    Returns
    -------
    pandas.DataFrame
        The non-complying rows with their original index and all columns;
        empty when every row complies.
    """
    allowed_statuses = {'Complete', 'Pending', 'Incomplete'}

    # Boolean mask: True where the status is one of the allowed values
    has_allowed_status = df['OrderStatus'].isin(allowed_statuses)

    # Invert the mask to select the offending rows
    return df[~has_allowed_status]

#### Run the check defined by ChatGPT to identify rows in OrderStatus column that do not comply with permitted values rule

<div class="alert alert-block alert-info">
<b>↓↓↓ TO DO: Run the check and print the results ↓↓↓</b>
</div>

In [74]:
# The permitted-values rule is an Accuracy rule: its percentage is printed under
# 'Accuracy checks:' and it feeds order_status_accuracy_score, so the header says Accuracy.
print('Accuracy checks:')
print('======')
print('OrderStatus non-complying rows for permitted values:')
# Keep the result: the next cells reuse it to compute the percentage non-compliance.
order_status_permitted_values_non_compliers = check_order_status_quality(df) # call the function defined by ChatGPT
display(order_status_permitted_values_non_compliers)

Consistency checks:
OrderStatus non-complying rows for permitted values:


Unnamed: 0,SalesDate,SalesID,CustomerID,SalesPersonID,ProductID,Quantity,Sales,StoreID,OrderStatus
652,2020-02-07,775,5,9,35,7,38.5,4,rTc8Le3iHuV6
1077,2020-04-01,1075,32,1,62,5,161.0,1,2Jk
1178,2020-04-07,1230,19,9,49,36,899.8,2,l4cNSE
1833,2020-07-13,1662,13,3,21,10,101.4,9,QhIX
1981,2020-08-11,2016,3,9,57,5,101.9,5,nCYNj2qGHfSVTaNOTaeigVRyaoajW5
2508,2021-01-02,3104,127,3,7,10,413.8,9,YgU
2916,2021-01-03,3566,69,4,35,3,22.2,1,TR3OMwFdOBEF
2984,2021-01-06,2893,6,8,18,10,255.2,3,lFL
3812,2021-03-02,4277,80,4,43,9,102.3,2,I
4217,2021-03-19,2499,44,7,75,19,155.8,7,RVD63ltyufaIkUaTHo


#### Find percentage non-compliance for permitted values rule:

In [75]:
# Percentage of all rows whose OrderStatus is outside the permitted values.
# The row count is taken from df directly instead of the `l` cached near the top of the
# notebook, so the result stays correct if cells are re-run or df is reloaded.
order_status_permitted_values_percentage_non_compliance = len(order_status_permitted_values_non_compliers) / len(df) * 100

In [76]:
# Accuracy: report the percentage of rows whose OrderStatus is outside the permitted values.
print('Accuracy checks:', '======', sep='\n')
print(
    'OrderStatus percentage non-compliance for permitted values: '
    f'{order_status_permitted_values_percentage_non_compliance!s}'
)

Accuracy checks:
OrderStatus percentage non-compliance for permitted values: 0.2353494939985879


#### Calculate **Completeness**, **Consistency** and **Accuracy** score:

In [77]:
# Completeness was already determined by the cannot_be_blank check earlier in the notebook.
order_status_completeness_score = 100

# Consistency: OrderStatus has a single consistency rule (data format), so no averaging is needed.
order_status_consistency_score = 100 - data_format_percentage_non_compliance(
    df, 'OrderStatus', 'A', [10]
)

# Accuracy: 100 minus the permitted-values percentage computed in the earlier cell.
order_status_accuracy_score = 100 - order_status_permitted_values_percentage_non_compliance


print(f'Completeness score: {order_status_completeness_score!s}')
print(f'Consistency score: {order_status_consistency_score!s}')
print(f'Accuracy score: {order_status_accuracy_score!s}')

Completeness score: 100
Consistency score: 99.87055777830078
Accuracy score: 99.76465050600142


# Reporting of Overall Dataset quality
## Calculate and present overall **Completeness**, **Consistency**, **Accuracy** scores of dataset

**Completeness** score of entire dataset = average of **Completeness** scores for each column

In [78]:
# remember earlier we found that every column passed the Completeness check - 100% completeness!
# Dataset-level Completeness: unweighted mean of the nine per-column completeness scores
# (one per column of the sales table, hence the division by 9).
# NOTE(review): the sales_date / sales_id / customer_id scores are presumably set in
# earlier cells - this cell fails on a fresh kernel unless those cells have run.
dataset_completeness_score = (sales_date_completeness_score
+ sales_id_completeness_score
+ customer_id_completeness_score
+ salesperson_id_completeness_score
+ product_id_completeness_score
+ quantity_completeness_score
+ sales_completeness_score
+ store_id_completeness_score
+ order_status_completeness_score) / 9

**Consistency** score of entire dataset = average of **Consistency** scores for each column

In [79]:
# Dataset-level Consistency: unweighted mean of the nine per-column consistency scores
# (one per column of the sales table, hence the division by 9).
# NOTE(review): the sales_date / sales_id / customer_id scores are presumably set in
# earlier cells - this cell fails on a fresh kernel unless those cells have run.
dataset_consistency_score = (sales_date_consistency_score
+ sales_id_consistency_score
+ customer_id_consistency_score
+ salesperson_id_consistency_score
+ product_id_consistency_score
+ quantity_consistency_score
+ sales_consistency_score
+ store_id_consistency_score
+ order_status_consistency_score) / 9

**Accuracy** score of entire dataset = average of **Accuracy** scores for each column

In [80]:
# Dataset-level Accuracy: unweighted mean of the nine per-column accuracy scores
# (one per column of the sales table, hence the division by 9).
# NOTE(review): the sales_date / sales_id / customer_id scores are presumably set in
# earlier cells - this cell fails on a fresh kernel unless those cells have run.
dataset_accuracy_score = (sales_date_accuracy_score
+ sales_id_accuracy_score
+ customer_id_accuracy_score
+ salesperson_id_accuracy_score
+ product_id_accuracy_score
+ quantity_accuracy_score
+ sales_accuracy_score
+ store_id_accuracy_score
+ order_status_accuracy_score) / 9

Present the results in a table:

In [81]:
# Minimum dataset-level score each metric must reach to be reported as 'Pass'
# (scores are on a 0-100 scale).
completeness_target, consistency_target, accuracy_target = 100, 100, 90

In [82]:
def get_pass_fail(performance, target):
    """Return 'Pass' when ``performance`` meets or exceeds ``target``, else 'Fail'.

    Parameters
    ----------
    performance : number
        The achieved score.
    target : number
        The minimum score required to pass.

    Returns
    -------
    str
        'Pass' or 'Fail'.
    """
    return 'Pass' if performance >= target else 'Fail'

In [83]:
# Build the text rows of the results table. Every cell is left-justified to 15 characters
# so the columns line up; scores are shown as the first 6 characters of their string form.
results_header_row = '| ' + ' | '.join(
    heading.ljust(15) for heading in ('METRIC', 'PERFORMANCE', 'TARGET', 'PASS / FAIL')
) + ' |'

results_completeness_row = '| ' + ' | '.join(cell.ljust(15) for cell in (
    'Completeness',
    str(dataset_completeness_score)[:6],
    str(completeness_target),
    get_pass_fail(dataset_completeness_score, completeness_target),
)) + ' |'

results_consistency_row = '| ' + ' | '.join(cell.ljust(15) for cell in (
    'Consistency',
    str(dataset_consistency_score)[:6],
    str(consistency_target),
    get_pass_fail(dataset_consistency_score, consistency_target),
)) + ' |'

results_accuracy_row = '| ' + ' | '.join(cell.ljust(15) for cell in (
    'Accuracy',
    str(dataset_accuracy_score)[:6],
    str(accuracy_target),
    get_pass_fail(dataset_accuracy_score, accuracy_target),
)) + ' |'

# Horizontal rules: '-' between data rows, '=' under the header row.
results_divider = '+ ' + ' + '.join(['-' * 15] * 4) + ' +'
results_header_divider = '+ ' + ' + '.join(['=' * 15] * 4) + ' +'

In [84]:
# Print the table top to bottom: header, '=' rule, then each metric row followed by a '-' rule.
print(
    results_divider,
    results_header_row,
    results_header_divider,
    results_completeness_row,
    results_divider,
    results_consistency_row,
    results_divider,
    results_accuracy_row,
    results_divider,
    sep='\n',
)

+ --------------- + --------------- + --------------- + --------------- +
| METRIC          | PERFORMANCE     | TARGET          | PASS / FAIL     |
| Completeness    | 100.0           | 100             | Pass            |
+ --------------- + --------------- + --------------- + --------------- +
| Consistency     | 99.846          | 100             | Fail            |
+ --------------- + --------------- + --------------- + --------------- +
| Accuracy        | 99.841          | 90              | Pass            |
+ --------------- + --------------- + --------------- + --------------- +
