# **Setup script environment**
Import external libraries:

In [None]:
%pip install pandas
import pandas as pd # imports the 'pandas' library for use in this script and assigns it the nickname 'pd'

# **Using `pandas`**
Load data from file:

In [None]:
df = pd.read_csv('sales.csv') # load the data from sales.csv into a DataFrame object
df.index = df.index + 1 # add 1 to every row label so that rows are labelled from 1 instead of from 0 (read_csv numbers rows from 0 by default) - later cells rely on this when they call df.loc[1] and loop over range(1, l + 1)

Some DataFrame functions built into `pandas`:

In [None]:
df.head() # returns the first 5 rows of the table (5 is the default when no row count is passed)

In [None]:
l = len(df) # number of rows in the table - stored in a global because later cells read it (e.g. range(1, l + 1)), so re-run this cell whenever df changes
print(l)

In [None]:
df['SalesID'] # select the whole SalesID column (one value per row)

In [None]:
df.loc[1] # get the row whose label is 1 - .loc looks rows up by label, not by position

In [None]:
df.loc[1, 'SalesID'] # get the SalesID value of the row labelled 1 - a single .loc[row label, column name] lookup instead of fetching the whole row first

In [None]:
column_names = df.columns.tolist() # the column labels of the table, as a plain Python list
print(column_names)

# **Data quality check functions**
Each function checks the values in 1 column against 1 validation rule.<br>
Here we define the check for **Completeness**

In [None]:
def check_cannot_be_blank (df, column_name): # returns true if column_name contains no blanks
  """Completeness check: report whether a column is free of missing values.

  df          -- the DataFrame to inspect
  column_name -- label of the column to check
  Returns True when no value in the column is missing, False otherwise.
  """
  # isna() marks every missing value (NaN / None / NaT) with True, and
  # any() is True as soon as at least one value was marked
  has_blank = df[column_name].isna().any()
  return not has_blank

Run the check for **Completeness**

In [None]:
print(check_cannot_be_blank(df, 'SalesID')) # run the cannot be blank check on the SalesID column - prints True when the column has no blanks

Use a `for` loop to run the **Completeness** check on all columns:

In [None]:
for column_name in column_names:
  print(check_cannot_be_blank(df, column_name)) # run the cannot be blank check for every column in the table - prints one True/False per column, in the same order as column_names

Define all the individual validation rule check functions:

In [None]:
from collections import defaultdict
import math

def check_unique_values (df, column_name): # returns true if all values in column_name are unique (no duplicates)
  """Uniqueness check: report whether a column contains no repeated value.

  df          -- the DataFrame to inspect
  column_name -- label of the column to check
  Returns False as soon as a value is seen for the second time, True otherwise.
  """
  # iterate over the column itself, so the check works for any DataFrame
  # regardless of how many rows it has or how its rows are labelled
  seen_values = set() # every distinct value met so far
  for value in df[column_name]:
    if value in seen_values: # this value already appeared in an earlier row,
      return False # so the column fails the check
    seen_values.add(value)
  return True # no value appeared twice

def check_type (df, column_name, type_specified): # returns true if all values in column_name are instances of type_specified
  """Type check: report whether every value in a column has the given type.

  df             -- the DataFrame to inspect
  column_name    -- label of the column to check
  type_specified -- the type (or tuple of types) each value is tested against with isinstance
  Returns True when every value passes the isinstance test, False otherwise.
  """
  # all() stops at the first value that is not an instance of the requested type
  return all(isinstance(entry, type_specified) for entry in df[column_name])

def check_within_numerical_range (df, column_name, lower_bound = - math.inf, upper_bound = math.inf): # returns true if all values in column_name are between lower_bound and upper_bound exclusive
  """Range check: report whether every value lies strictly between two bounds.

  df          -- the DataFrame to inspect
  column_name -- label of the column to check
  lower_bound -- exclusive lower limit (defaults to -infinity, i.e. no lower limit)
  upper_bound -- exclusive upper limit (defaults to +infinity, i.e. no upper limit)
  Returns False if any value is on or outside a bound, or cannot be compared
  with the bounds at all (e.g. text); True otherwise.
  NOTE: a NaN compares as False against both bounds, so blank numeric cells
  do not fail this check.
  """
  for value in df[column_name]: # for each value in the specified column
    try:
      out_of_range = value >= upper_bound or value <= lower_bound
    except (TypeError, ValueError): # only the errors a failed comparison raises - anything else should surface
      return False # the column fails the check if a value is not numerical (>= or <= do not work)
    if out_of_range:
      return False # the column fails the check
  return True # otherwise the column passes the check

def is_intable (s): # returns true if the input value can be converted with int()
  """Report whether int(s) succeeds.

  s -- any value
  Returns True when int(s) does not raise, False otherwise. This is broader
  than "is an integer": text such as '-5' or ' 12 ' and floats such as 3.7
  convert successfully, while '1.5', None, NaN and infinity do not.
  """
  try:
    int(s)
  except (TypeError, ValueError, OverflowError): # wrong type / not a number / NaN / infinity
    return False
  return True

def is_alphabetical (s): # returns true if the input value is a string made up only of the letters a-z / A-Z
  """Report whether s is a string containing nothing but unaccented English letters.

  s -- any value
  Returns False for non-strings and for strings holding any character outside
  a-z and A-Z; True otherwise (including for the empty string, which has no
  offending character).
  """
  if not isinstance(s, str):
    return False
  # single characters compare by code point, so these two ranges are exactly a-z and A-Z
  return all('a' <= letter <= 'z' or 'A' <= letter <= 'Z' for letter in s)

def check_data_format (df, column_name, character_type, pieces_max_lengths): # returns true if all values in column_name comply with data format specified
  """Data format check: report whether every value in a column fits a format code.

  df                 -- the DataFrame to inspect
  column_name        -- label of the column to check
  character_type     -- 'N' (numeric), 'A' (alphabetical) or 'X' (any characters)
  pieces_max_lengths -- [p] for N(p), A(p) and X(p); [p, q] for N(p,q)
  Returns True/False for the supported formats below. Any other combination
  of character_type and pieces_max_lengths falls through and returns None.
  """
  if character_type == 'N':
    if len(pieces_max_lengths) == 2: # data format is N(p,q)
      for entry in df[column_name]:
        parts = str(entry).split('.') # text before and after the decimal point
        if len(parts) != 2:
          return False # the number of '.' is not exactly 1
        whole, fraction = parts
        # both pieces must convert with int() and fit within their maximum lengths
        fits = is_intable(whole) and is_intable(fraction) and len(whole) <= pieces_max_lengths[0] and len(fraction) <= pieces_max_lengths[1]
        if not fits:
          return False
      return True
    if len(pieces_max_lengths) == 1: # data format is N(p)
      # each value must convert with int() and its text must be at most p characters long
      return all(is_intable(entry) and len(str(entry)) <= pieces_max_lengths[0] for entry in df[column_name])
    return None # N with any other number of lengths is not a supported format
  if character_type == 'A': # data format is A(p)
    # len() is only reached once is_alphabetical has confirmed the value is a string
    return all(is_alphabetical(entry) and len(entry) <= pieces_max_lengths[0] for entry in df[column_name])
  if character_type == 'X': # data format is X(p)
    # any characters are allowed, only the length of the value's text is limited
    return all(len(str(entry)) <= pieces_max_lengths[0] for entry in df[column_name])
  return None # unknown character type

def check_date_format_yyyymmdd (df, column_name): # returns true if all values in column_name follow the date format yyyy-mm-dd
  """Date format check: report whether every value in a column looks like yyyy-mm-dd.

  df          -- the DataFrame to inspect
  column_name -- label of the column to check
  Returns False as soon as one value is not text, does not split into exactly
  3 parts on '-', has a part of the wrong length (4-2-2), has a part that is
  not an integer, or has a month above 12 / day above 31; True otherwise.
  NOTE(review): month 00 and day 00 are still accepted (the lower limit is 0,
  as before), and days are not checked against the month's real length.
  """
  for value in df[column_name]:
    if not isinstance(value, str):
      return False # blanks (NaN) and other non-text values cannot be in yyyy-mm-dd form
    value_split = value.split('-') # split each value into 3 parts to assess individually
    if len(value_split) != 3:
      return False # column fails the check if the number of parts is not exactly 3
    year, month, day = value_split
    if len(year) != 4 or len(month) != 2 or len(day) != 2: # each part must have its own exact width
      return False
    try:
      int(year) # only needs to be an integer, there is no range limit on the year
      month_number = int(month)
      day_number = int(day)
    except ValueError:
      return False # reject if date, month or year is not an integer
    if month_number < 0 or month_number > 12: # reject if month < 0 or month > 12
      return False
    if day_number < 0 or day_number > 31: # reject if date < 0 or date > 31
      return False
  return True


Run the defined functions on the respective columns, and print the results:

In [None]:
# check SalesID column - the only column that also has to hold unique values
print('check SalesID column')
print('====')
print(f"Data format: {check_data_format(df, 'SalesID', 'N', [4])}")
print(f"Unique values: {check_unique_values(df, 'SalesID')}")
print(f"Is integer: {check_type(df, 'SalesID', int)}")
print(f"> 0: {check_within_numerical_range(df, 'SalesID', lower_bound = 0)}")
print('\n')

# check SalesDate column - free text of at most 10 characters that must read as yyyy-mm-dd
print('check SalesDate column')
print('====')
print(f"Data format: {check_data_format(df, 'SalesDate', 'X', [10])}")
print(f"Date format: {check_date_format_yyyymmdd(df, 'SalesDate')}")
print('\n')

# check CustomerID, SalesPersonID, ProductID and Quantity columns
# these four share the same three rules - only the maximum number of digits differs
for id_column, max_digits in [('CustomerID', 3), ('SalesPersonID', 1), ('ProductID', 2), ('Quantity', 4)]:
  print('check ' + id_column + ' column')
  print('====')
  print(f"Data format: {check_data_format(df, id_column, 'N', [max_digits])}")
  print(f"Is integer: {check_type(df, id_column, int)}")
  print(f"> 0: {check_within_numerical_range(df, id_column, lower_bound = 0)}")
  print('\n')

# check Sales column
print('check Sales column')
print('====')
print('Data format: ' + str(check_data_format(df, 'Sales', 'N', [10, 2])))
print('Is float: ' + str(check_type(df, 'Sales', float)))
# the range check excludes its bounds, so lower_bound = -1 would let negative values such as -0.5 pass as '>= 0'
# math.nextafter(0.0, -math.inf) is the largest float below 0, so excluding it means exactly 'value >= 0'
print('>= 0: ' + str(check_within_numerical_range(df, 'Sales', lower_bound = math.nextafter(0.0, -math.inf))))
print('\n')

# check StoreID column (format N(1)) and OrderStatus column (format A(10))
# both columns only have a data format rule, so they share one loop
for format_column, format_type, format_lengths in [('StoreID', 'N', [1]), ('OrderStatus', 'A', [10])]:
  print('check ' + format_column + ' column')
  print('====')
  print(f"Data format: {check_data_format(df, format_column, format_type, format_lengths)}")
  print('\n')

Modify the function to also list the rows that do not comply with the specified validation rule:

In [None]:
def numerical_range_identify_non_complying_rows (df, column_name, lower_bound = - math.inf, upper_bound = math.inf): # returns a list of row(s) that do not comply to numerical range rule
  """List the rows whose value in a column breaks the numerical range rule.

  df          -- the DataFrame to inspect
  column_name -- label of the column to check
  lower_bound -- exclusive lower limit (defaults to -infinity, i.e. no lower limit)
  upper_bound -- exclusive upper limit (defaults to +infinity, i.e. no upper limit)
  Returns a list of rows (each a pandas Series), in table order. A row is
  listed when its value is blank, is not a number, or is on or outside a bound.
  """
  non_compliers = [] # keep a list of rows that do not comply to this rule
  for _, row in df.iterrows(): # walk the table row by row, whatever its length and row labels are
    try:
      # float() rather than int(): numbers stored as text ('12', '1.5') are still understood,
      # and a fraction such as 0.5 is not rounded onto a bound before being compared
      number = float(row[column_name])
    except (TypeError, ValueError):
      non_compliers.append(row) # add the row to the list of non-compliers if value is not numerical
      continue
    # a NaN (blank cell) compares as False against everything, so it is tested for explicitly
    if math.isnan(number) or number >= upper_bound or number <= lower_bound:
      non_compliers.append(row) # value in row is blank or not within specified range
  return non_compliers

*Define a helper function to print relevant rows neatly:*

In [None]:
def pprint_row_list (lis): # prints a list of rows in a nice format
  """Print every item of lis under a numbered heading, followed by a gap.

  lis -- the list of items (e.g. rows) to print
  Returns nothing; the output goes to standard output.
  """
  for position, item in enumerate(lis, start=1): # position counts from 1, matching the heading
    print('Item number ' + str(position)) # heading with the item's position in the list
    print(item) # the item itself
    print('\n') # a line break to separate it from the next item

In [None]:
pprint_row_list(numerical_range_identify_non_complying_rows(df, 'ProductID', lower_bound = 0)) # print every row whose ProductID breaks the '> 0' rule or is not numerical

Modify the rest of the relevant validation check functions to also return a list of non-complying rows:

In [None]:
def date_format_identify_non_complying_rows (df, column_name): # returns a list of row(s) that do not comply to date format rule
  """List the rows whose value in a column does not look like yyyy-mm-dd.

  df          -- the DataFrame to inspect
  column_name -- label of the column to check
  Returns a list of rows (each a pandas Series), in table order.
  NOTE(review): month 00 and day 00 are still accepted (the lower limit is 0,
  as before), and days are not checked against the month's real length.
  """
  def value_complies (value): # True when a single value is in yyyy-mm-dd form
    if not isinstance(value, str):
      return False # blanks (NaN) and other non-text values cannot be in yyyy-mm-dd form
    value_split = value.split('-') # split the value into 3 parts to assess individually
    if len(value_split) != 3:
      return False # the number of parts is not exactly 3
    year, month, day = value_split
    if len(year) != 4 or len(month) != 2 or len(day) != 2:
      return False # each part must have its own exact width, e.g. '2023-1-5' is rejected
    try:
      int(year) # only needs to be an integer, there is no range limit on the year
      month_number = int(month)
      day_number = int(day)
    except ValueError:
      return False # date, month or year is not an integer
    return 0 <= month_number <= 12 and 0 <= day_number <= 31

  # walk the table row by row, whatever its length and row labels are
  return [row for _, row in df.iterrows() if not value_complies(row[column_name])]

def unique_values_identify_non_complying_rows (df, column_name): # returns a list of row(s) that do not comply to unique values rule
  """List the rows whose value in a column also appears in another row.

  df          -- the DataFrame to inspect
  column_name -- label of the column to check
  Returns a list of rows (each a pandas Series). Rows sharing a value are
  listed together, and the groups follow the order in which each value
  first appears in the table.
  """
  rows_by_value = defaultdict(list) # for each value that occurs in the column, the rows that hold it
  for _, row in df.iterrows(): # walk the table row by row, whatever its length and row labels are
    rows_by_value[row[column_name]].append(row)
  non_compliers = [] # keep a list of rows that do not comply to this rule
  for rows in rows_by_value.values():
    if len(rows) > 1: # more than 1 row holds this value,
      non_compliers.extend(rows) # so all of them are non-compliers
  return non_compliers

def data_format_identify_non_complying_rows (df, column_name, character_type, pieces_max_lengths): # returns a list of row(s) that do not comply to data format rule
  """List the rows whose value in a column does not fit a format code.

  df                 -- the DataFrame to inspect
  column_name        -- label of the column to check
  character_type     -- 'N' (numeric), 'A' (alphabetical) or 'X' (any characters)
  pieces_max_lengths -- [p] for N(p), A(p) and X(p); [p, q] for N(p,q)
  Returns a list of rows (each a pandas Series), in table order. Any other
  combination of character_type and pieces_max_lengths returns None.
  """
  def complies_n_pq (value): # N(p,q): exactly one '.', both pieces convert with int() and fit their maximum lengths
    pieces = str(value).split('.')
    return len(pieces) == 2 and is_intable(pieces[0]) and is_intable(pieces[1]) and len(pieces[0]) <= pieces_max_lengths[0] and len(pieces[1]) <= pieces_max_lengths[1]

  def complies_n_p (value): # N(p): converts with int() and its text is at most p characters long
    return is_intable(value) and len(str(value)) <= pieces_max_lengths[0]

  def complies_a_p (value): # A(p): alphabetical text of at most p characters
    return is_alphabetical(value) and len(value) <= pieces_max_lengths[0]

  def complies_x_p (value): # X(p): any characters, at most p of them
    # convert to text first, so numbers and blanks are measured instead of raising an error
    return len(str(value)) <= pieces_max_lengths[0]

  # pick the rule that matches the requested data format
  if character_type == 'N' and len(pieces_max_lengths) == 2:
    complies = complies_n_pq
  elif character_type == 'N' and len(pieces_max_lengths) == 1:
    complies = complies_n_p
  elif character_type == 'A':
    complies = complies_a_p
  elif character_type == 'X':
    complies = complies_x_p
  else:
    return None # not a supported data format

  # walk the table row by row, whatever its length and row labels are,
  # and keep every row whose value breaks the chosen rule
  return [row for _, row in df.iterrows() if not complies(row[column_name])]

Run the newly-modified functions on the columns to find the culprit rows:

In [None]:
# each entry pairs the heading to print with the search to run for it
# the searches are wrapped in lambda so that each one only runs after its own heading has been printed
row_reports = [
  ('Check SalesId column - non-unique values:', lambda: unique_values_identify_non_complying_rows(df, 'SalesID')),
  ('Check SalesDate column - date format:', lambda: date_format_identify_non_complying_rows(df, 'SalesDate')),
  ('Check ProductID column - data format:', lambda: data_format_identify_non_complying_rows(df, 'ProductID', 'N', [2])),
  ('Check Sales column - data format:', lambda: data_format_identify_non_complying_rows(df, 'Sales', 'N', [10,2])),
]
for report_heading, find_rows in row_reports:
  print(report_heading)
  pprint_row_list(find_rows())
  print('\n')

Modify the functions again to provide a percentage of all rows that do not comply with the relevant data validation rule:

In [None]:
# returns the percentage of row(s) that do not comply to numerical range rule
def numerical_range_percentage_non_compliance (df, column_name, lower_bound = - math.inf, upper_bound = math.inf):
  """Work out what share of the table's rows break the numerical range rule.

  df          -- the DataFrame to inspect
  column_name -- label of the column to check
  lower_bound -- exclusive lower limit (defaults to -infinity, i.e. no lower limit)
  upper_bound -- exclusive upper limit (defaults to +infinity, i.e. no upper limit)
  Returns a number from 0 to 100; an empty table gives 0.0.
  """
  total_rows = len(df) # count the rows of the table that was passed in
  if total_rows == 0:
    return 0.0 # no rows means no non-complying rows, and avoids dividing by zero
  non_compliers = numerical_range_identify_non_complying_rows(df, column_name, lower_bound, upper_bound)
  return len(non_compliers) / total_rows * 100

In [None]:
numerical_range_percentage_non_compliance(df, 'ProductID', lower_bound = 0) # percentage (0 - 100) of rows whose ProductID breaks the '> 0' rule or is not numerical

New function to check that all values in the column 'StoreID' are among the permitted values (1 - 9) - generated by ChatGPT!

In [None]:
def check_store_id_quality(df):
    """List the rows whose 'StoreID' is not one of the permitted values 1 to 9.

    df -- the DataFrame to inspect; it must have a 'StoreID' column
    Returns a list with one dictionary per offending row, mapping each column
    name to that row's value. The list is empty when every StoreID is permitted.
    """
    allowed_store_ids = set(range(1, 10))  # the whole numbers 1 to 9

    # True for the rows whose StoreID is one of the allowed values
    is_allowed = df['StoreID'].isin(allowed_store_ids)

    # keep only the rows that are NOT allowed, then turn each one into a dictionary
    offending_rows = df.loc[~is_allowed]
    return offending_rows.to_dict(orient='records')

In [None]:
check_store_id_quality(df) # list the rows (one dictionary per row) whose StoreID is not one of the permitted values