## Data Profiling Analysis

Comprehensive statistical analysis and data exploration.

In [None]:
# Data profiling imports
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Ignore all warnings process-wide so library notices do not clutter the
# report output.
# NOTE(review): with no category or module argument this also hides
# deprecation warnings from pandas and friends — confirm that is intended.
warnings.filterwarnings("ignore")

# Set up plotting: Matplotlib's "default" style sheet plus seaborn's "husl"
# colour palette for any charts drawn later in the session.
plt.style.use("default")
sns.set_palette("husl")

In [None]:
# Load Excel data
#
# Reads one worksheet of the workbook at `file_path` into `df`. If the read
# fails for any reason, the error is printed and `df` becomes an empty
# DataFrame, so later code can simply test `df.empty`.
file_path = r"test_assets/collection/business-accounting/Business Accounting.xlsx"
sheet_name = "Yiriden Transactions 2025"

try:
    # Keep the guarded region to the single call that can actually fail.
    df = pd.read_excel(file_path, sheet_name=sheet_name)
except Exception as e:  # deliberately broad: best-effort load with a safe fallback
    print(f"❌ Error loading data: {e}")
    df = pd.DataFrame()  # Empty fallback
else:
    # Report the sheet that was really requested rather than a duplicated literal.
    print(f"✅ Successfully loaded {len(df)} rows and {len(df.columns)} columns from {sheet_name}")
    print(f"📊 Data shape: {df.shape}")

In [None]:
# Comprehensive Data Profiling for Yiriden Transactions 2025
#
# Prints a text report about `df`: shape and memory usage, a per-column
# summary (dtype, nulls, unique values), descriptive statistics for numeric
# columns, a missing-values table, the dtype distribution, head/tail previews
# and a few quick quality checks. When `df` is empty only a notice is printed.
#
# NOTE(review): `display` is not imported here; presumably it is supplied by
# the IPython/Jupyter kernel — confirm before running this as a plain script.

if not df.empty:
    row_count = len(df)

    # Per-column aggregates, computed once and reused by several sections.
    null_counts = df.isnull().sum()
    unique_counts = df.nunique()

    print("=" * 60)
    print("📋 DATA PROFILING REPORT - Yiriden Transactions 2025")
    print("=" * 60)

    # Basic Information
    print("\n🔍 BASIC INFORMATION")
    print(f"Shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Column Information
    print("\n📊 COLUMN ANALYSIS")
    for col, dtype, null_count, unique_count in zip(df.columns, df.dtypes, null_counts, unique_counts):
        null_pct = (null_count / row_count) * 100

        # `!s` converts the label to text before padding. Without it a
        # datetime header treats "<30" as a strftime pattern (printing the
        # literal "<30") and a tuple header raises TypeError.
        print(f"  {col!s:<30} | {dtype!s:<12} | Nulls: {null_count:>6} ({null_pct:>5.1f}%) | Unique: {unique_count:>6}")

    # Statistical Summary
    print("\n📈 STATISTICAL SUMMARY")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        display(df[numeric_cols].describe())
    else:
        print("No numeric columns found for statistical analysis")

    # Missing Values Analysis
    print("\n❓ MISSING VALUES ANALYSIS")
    missing_summary = pd.DataFrame(
        {
            "Column": df.columns,
            "Missing_Count": null_counts,
            "Missing_Percentage": (null_counts / row_count) * 100,
        }
    ).sort_values("Missing_Percentage", ascending=False)

    # Only columns that actually contain missing values are shown.
    display(missing_summary[missing_summary["Missing_Count"] > 0])

    # Data Types Distribution
    print("\n🏷️ DATA TYPES DISTRIBUTION")
    dtype_counts = df.dtypes.value_counts()
    display(dtype_counts)

    # Sample Data Preview
    print("\n👀 SAMPLE DATA PREVIEW")
    print("First 5 rows:")
    display(df.head())

    # With five rows or fewer the tail would just repeat the head.
    if row_count > 5:
        print("\nLast 5 rows:")
        display(df.tail())

    # Quick Quality Checks
    print("\n✅ QUICK QUALITY CHECKS")

    # Duplicate rows
    duplicate_count = df.duplicated().sum()
    print(f"Duplicate rows: {duplicate_count} ({duplicate_count / row_count * 100:.1f}%)")

    # Empty rows
    empty_rows = df.isnull().all(axis=1).sum()
    print(f"Completely empty rows: {empty_rows}")

    # Potential ID columns: every value present and distinct.
    potential_ids = [
        col
        for col, unique_count, null_count in zip(df.columns, unique_counts, null_counts)
        if unique_count == row_count and null_count == 0
    ]
    if potential_ids:
        print(f"Potential ID columns: {potential_ids}")

else:
    print("❌ No data available for profiling")

---

*End of data_profiling*

---