In [1]:
import os
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load environment variables from .env file
load_dotenv()

# Get credentials from environment variables
db_user = os.getenv("db_user")
db_password = os.getenv("db_password")
db_host = os.getenv("db_host")
db_name = os.getenv("db_name")

# Fail early with a clear message: os.getenv returns None for an unset
# variable, which would otherwise be interpolated into the URL as the
# literal text "None".
missing_db_settings = [
    setting_name
    for setting_name, setting_value in (
        ("db_user", db_user),
        ("db_password", db_password),
        ("db_host", db_host),
        ("db_name", db_name),
    )
    if setting_value is None
]
if missing_db_settings:
    raise RuntimeError(
        "Missing database environment variables: " + ", ".join(missing_db_settings)
    )

# Percent-encode the user name and password so reserved URL characters such as
# '@', ':' or '/' in a credential cannot corrupt the connection URL.
# NOTE(review): this assumes the .env file stores the raw (not already
# percent-encoded) credentials - confirm.
db_user_encoded = quote(db_user, safe="")
db_password_encoded = quote(db_password, safe="")

# Create the database engine. db_host is inserted unchanged so that a
# "host:port" value keeps working.
engine = create_engine(
    f"mysql+mysqlconnector://{db_user_encoded}:{db_password_encoded}@{db_host}/{db_name}"
)

In [5]:
# Count 'customers' rows where any of customer_id, name, email or phone is NULL.
customers_null_check = pd.read_sql(
    sql="""
    SELECT COUNT(*) AS null_count
    FROM customers
    WHERE customer_id IS NULL OR name IS NULL OR email IS NULL OR phone IS NULL
""",
    con=engine,
)
print("Customers NULL check:\n", customers_null_check)

Customers NULL check:
    null_count
0           0


In [6]:
# Count 'orders' rows where any of order_id, customer_id, order_date or
# order_status is NULL.
orders_null_check = pd.read_sql(
    sql="""
    SELECT COUNT(*) AS null_count
    FROM orders
    WHERE order_id IS NULL OR customer_id IS NULL OR order_date IS NULL OR order_status IS NULL
""",
    con=engine,
)
print("Orders NULL check:\n", orders_null_check)

Orders NULL check:
    null_count
0           0


In [7]:
# Count 'order_items' rows where any of order_item_id, order_id, product_id or
# quantity is NULL.
order_items_null_check = pd.read_sql(
    sql="""
    SELECT COUNT(*) AS null_count
    FROM order_items
    WHERE order_item_id IS NULL OR order_id IS NULL OR product_id IS NULL OR quantity IS NULL
""",
    con=engine,
)
print("Order Items NULL check:\n", order_items_null_check)

Order Items NULL check:
    null_count
0           0


In [8]:
# List every customer_id that occurs on more than one 'customers' row,
# together with the number of rows it occurs on.
customers_duplicates_check = pd.read_sql(
    sql="""
    SELECT customer_id, COUNT(*) AS duplicate_count
    FROM customers
    GROUP BY customer_id
    HAVING duplicate_count > 1
""",
    con=engine,
)
print("Duplicate customers check:\n", customers_duplicates_check)

Duplicate customers check:
 Empty DataFrame
Columns: [customer_id, duplicate_count]
Index: []


In [9]:
# List every order_id that occurs on more than one 'orders' row,
# together with the number of rows it occurs on.
orders_duplicates_check = pd.read_sql(
    sql="""
    SELECT order_id, COUNT(*) AS duplicate_count
    FROM orders
    GROUP BY order_id
    HAVING duplicate_count > 1
""",
    con=engine,
)
print("Duplicate orders check:\n", orders_duplicates_check)

Duplicate orders check:
 Empty DataFrame
Columns: [order_id, duplicate_count]
Index: []


In [10]:
# List every order_item_id that occurs on more than one 'order_items' row,
# together with the number of rows it occurs on.
order_items_duplicates_check = pd.read_sql(
    sql="""
    SELECT order_item_id, COUNT(*) AS duplicate_count
    FROM order_items
    GROUP BY order_item_id
    HAVING duplicate_count > 1
""",
    con=engine,
)
print("Duplicate order items check:\n", order_items_duplicates_check)

Duplicate order items check:
 Empty DataFrame
Columns: [order_item_id, duplicate_count]
Index: []


In [11]:
# Referential check order_items -> orders: the LEFT JOIN keeps every
# order_items row, and the WHERE clause retains only those with no matching
# row in 'orders'.
valid_order_items_check = pd.read_sql(
    sql="""
    SELECT oi.order_id
    FROM order_items oi
    LEFT JOIN orders o ON oi.order_id = o.order_id
    WHERE o.order_id IS NULL
""",
    con=engine,
)
print("Invalid order references in order items:\n", valid_order_items_check)

Invalid order references in order items:
 Empty DataFrame
Columns: [order_id]
Index: []


In [12]:
# Referential check order_items -> products: the LEFT JOIN keeps every
# order_items row, and the WHERE clause retains only those with no matching
# row in 'products'.
valid_order_items_product_check = pd.read_sql(
    sql="""
    SELECT oi.product_id
    FROM order_items oi
    LEFT JOIN products p ON oi.product_id = p.product_id
    WHERE p.product_id IS NULL
""",
    con=engine,
)
print("Invalid product references in order items:\n", valid_order_items_product_check)

Invalid product references in order items:
 Empty DataFrame
Columns: [product_id]
Index: []


In [13]:
# Referential check orders -> customers: the LEFT JOIN keeps every orders row,
# and the WHERE clause retains only those with no matching row in 'customers'.
valid_orders_check = pd.read_sql(
    sql="""
    SELECT o.order_id
    FROM orders o
    LEFT JOIN customers c ON o.customer_id = c.customer_id
    WHERE c.customer_id IS NULL
""",
    con=engine,
)
print("Invalid customer references in orders:\n", valid_orders_check)

Invalid customer references in orders:
 Empty DataFrame
Columns: [order_id]
Index: []


In [14]:
# Referential check orders -> shippers, matched on shipper_name: the LEFT JOIN
# keeps every orders row, and the WHERE clause retains only those with no
# matching row in 'shippers'.
valid_orders_shipper_check = pd.read_sql(
    sql="""
    SELECT o.order_id
    FROM orders o
    LEFT JOIN shippers s ON o.shipper_name = s.shipper_name
    WHERE s.shipper_name IS NULL
""",
    con=engine,
)
print("Invalid shipper references in orders:\n", valid_orders_shipper_check)

Invalid shipper references in orders:
 Empty DataFrame
Columns: [order_id]
Index: []


In [15]:
# Range check on products.price: list the products whose price is below zero.
invalid_product_prices = pd.read_sql(
    sql="""
    SELECT product_id, product_name, price
    FROM products
    WHERE price < 0
""",
    con=engine,
)
print("Invalid product prices:\n", invalid_product_prices)

Invalid product prices:
 Empty DataFrame
Columns: [product_id, product_name, price]
Index: []


In [16]:
# Range check on orders.order_amount: list the orders whose amount is below zero.
invalid_order_amounts = pd.read_sql(
    sql="""
    SELECT order_id, order_amount
    FROM orders
    WHERE order_amount < 0
""",
    con=engine,
)
print("Invalid order amounts:\n", invalid_order_amounts)

Invalid order amounts:
 Empty DataFrame
Columns: [order_id, order_amount]
Index: []


In [17]:
# Range check on order_items.quantity: list the order items whose quantity is
# below zero.
invalid_order_items_quantity = pd.read_sql(
    sql="""
    SELECT order_item_id, quantity
    FROM order_items
    WHERE quantity < 0
""",
    con=engine,
)
print("Invalid order items quantities:\n", invalid_order_items_quantity)

Invalid order items quantities:
 Empty DataFrame
Columns: [order_item_id, quantity]
Index: []


In [18]:
# Domain check on orders.order_status: list the orders whose status is not one
# of the five allowed values.
# Rows with a NULL order_status are not matched by NOT IN; NULLs in that column
# are counted by the 'orders' NULL check instead.
invalid_order_status = pd.read_sql(
    sql="""
    SELECT order_id, order_status
    FROM orders
    WHERE order_status NOT IN ('Pending', 'Shipped', 'Delivered', 'Cancelled', 'Returned')
""",
    con=engine,
)
print("Invalid order status values:\n", invalid_order_status)

Invalid order status values:
 Empty DataFrame
Columns: [order_id, order_status]
Index: []


In [19]:
# Check for invalid values in payment method (payments table).
# NULL is tested explicitly: `NULL NOT IN (...)` evaluates to NULL rather than
# TRUE, so rows with a missing payment_method would otherwise be skipped by
# this check.
invalid_payment_method = pd.read_sql("""
    SELECT payment_id, payment_method
    FROM payments
    WHERE payment_method IS NULL
       OR payment_method NOT IN ('Credit Card', 'PayPal', 'Bank Transfer', 'Mobile Money')
""", engine)
print("Invalid payment method values:\n", invalid_payment_method)

Invalid payment method values:
 Empty DataFrame
Columns: [payment_id, payment_method]
Index: []


In [20]:
# Check for invalid values in payment status (payments table).
# NULL is tested explicitly: `NULL NOT IN (...)` evaluates to NULL rather than
# TRUE, so rows with a missing payment_status would otherwise be skipped by
# this check.
invalid_payment_status = pd.read_sql("""
    SELECT payment_id, payment_status
    FROM payments
    WHERE payment_status IS NULL
       OR payment_status NOT IN ('Completed', 'Pending', 'Failed')
""", engine)
print("Invalid payment status values:\n", invalid_payment_status)

Invalid payment status values:
 Empty DataFrame
Columns: [payment_id, payment_status]
Index: []
