# Data Quality Checks/Validations for Users, Brands, Receipts and ReceiptItems Files after Extraction

In [38]:
import pandas as pd
import numpy as np

In [39]:
# Load CSV files into pandas dataframes
users_df = pd.read_csv(r"C:\Users\embak\OneDrive\Desktop\Fetch Assessment\Solution\Databricks output\users.csv")
receipts_df = pd.read_csv(r"C:\Users\embak\OneDrive\Desktop\Fetch Assessment\Solution\Databricks output\receipts.csv")
brands_df = pd.read_csv(r"C:\Users\embak\OneDrive\Desktop\Fetch Assessment\Solution\Cleaned\brands_processed.csv")
receiptitems_df = pd.read_csv(r"C:\Users\embak\OneDrive\Desktop\Fetch Assessment\Solution\Databricks output\receiptitems.csv")

In [40]:
# Check for missing or null values
missing_values = {
    'Users': users_df.isnull().sum(),
    'Receipts': receipts_df.isnull().sum(),
    'Brands': brands_df.isnull().sum(),
    'ReceiptItems': receiptitems_df.isnull().sum()
}

In [41]:
missing_values

{'Users': user_id          0
 state           56
 createdDate      0
 lastLogin       62
 role             0
 active           0
 signUpSource    48
 dtype: int64,
 'Receipts': receipt_id                   0
 user_id                      0
 bonusPointsEarned          575
 bonusPointsEarnedReason    575
 createDate                   0
 dateScanned                  0
 finishedDate               551
 modifyDate                   0
 pointsAwardedDate          582
 pointsEarned               510
 purchaseDate               448
 purchasedItemCount         484
 rewardsReceiptStatus         0
 totalSpent                 435
 dtype: int64,
 'Brands': _id               0
 barcode           1
 brandCode       269
 category        156
 categoryCode    651
 name              0
 topBrand          0
 cpgId             0
 cpgRef            1
 dtype: int64,
 'ReceiptItems': receipt_id                    0
 barcode                    3851
 description                 381
 finalPrice                  174

In [42]:
# Check for duplicate primary keys
duplicate_keys = {
    'Users_duplicates': users_df.duplicated().sum(),
    'Receipts_duplicates': receipts_df.duplicated().sum(),
    'Brands_duplicates': brands_df.duplicated().sum(),
}

In [43]:
duplicate_keys

{'Users_duplicates': 283, 'Receipts_duplicates': 0, 'Brands_duplicates': 0}

In [44]:
# Check for foreign key violations
missing_user_ids = receipts_df[~receipts_df['user_id'].isin(users_df['user_id'])]['user_id'].unique()
missing_cpg_pointsPayerId = receiptitems_df[~receiptitems_df['pointsPayerId'].isin(brands_df['cpgId'])]['pointsPayerId'].unique()

In [45]:
foreign_key_violations = {
    'Receipts_missing_user_ids': len(missing_user_ids),
    'ReceiptItems_missing_brands': len(missing_cpg_pointsPayerId)
}

In [47]:
foreign_key_violations

{'Receipts_missing_user_ids': 117, 'ReceiptItems_missing_brands': 3}

In [48]:
# Check for duplicate rows
duplicate_rows = {
    'Users': users_df.duplicated().sum(),
    'Receipts': receipts_df.duplicated().sum(),
    'Brands': brands_df.duplicated().sum(),
    'ReceiptItems': receiptitems_df.duplicated().sum()
}

In [49]:
duplicate_rows

{'Users': 283, 'Receipts': 0, 'Brands': 0, 'ReceiptItems': 0}

In [50]:
# Combine all issues into a single DataFrame
data_quality_issues = {
    'Missing Values': missing_values,
    'Duplicate Keys': duplicate_keys,
    'Foreign Key Violations': foreign_key_violations,
    'Diplicate Rows': duplicate_rows
}

In [51]:
data_quality_issues

{'Missing Values': {'Users': user_id          0
  state           56
  createdDate      0
  lastLogin       62
  role             0
  active           0
  signUpSource    48
  dtype: int64,
  'Receipts': receipt_id                   0
  user_id                      0
  bonusPointsEarned          575
  bonusPointsEarnedReason    575
  createDate                   0
  dateScanned                  0
  finishedDate               551
  modifyDate                   0
  pointsAwardedDate          582
  pointsEarned               510
  purchaseDate               448
  purchasedItemCount         484
  rewardsReceiptStatus         0
  totalSpent                 435
  dtype: int64,
  'Brands': _id               0
  barcode           1
  brandCode       269
  category        156
  categoryCode    651
  name              0
  topBrand          0
  cpgId             0
  cpgRef            1
  dtype: int64,
  'ReceiptItems': receipt_id                    0
  barcode                    3851
  descriptio

In [52]:
# Convert to DataFrame for better readability
data_quality_issues_df = pd.DataFrame.from_dict(data_quality_issues, orient='index').T
print(data_quality_issues_df)

                                                                Missing Values  \
Users                        user_id          0
state           56
createdD...   
Receipts                     receipt_id                   0
user_id        ...   
Brands                       _id               0
barcode           1
brandC...   
ReceiptItems                 receipt_id                    0
barcode       ...   
Users_duplicates                                                           NaN   
Receipts_duplicates                                                        NaN   
Brands_duplicates                                                          NaN   
Receipts_missing_user_ids                                                  NaN   
ReceiptItems_missing_brands                                                NaN   

                            Diplicate Rows Duplicate Keys  \
Users                                  283            NaN   
Receipts                                 0            NaN