## Data Profiling Analysis

Comprehensive statistical analysis and data exploration.

In [None]:
# Data profiling imports
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Install a blanket "ignore" filter: from here on no warning of any category
# is shown (presumably to keep the printed report free of library notices).
warnings.filterwarnings("ignore")

# Set up plotting
# Order matters: the matplotlib style is applied first, then seaborn's "husl"
# palette is set on top of it.
plt.style.use("default")
sns.set_palette("husl")

In [None]:
# Load CSV data
# Reads the inventory CSV into the notebook-wide DataFrame `df`, which the
# profiling and outlier cells below consume.
# NOTE(review): the path is absolute and machine-specific; on any other
# machine the read fails and the empty fallback below is used instead.
file_path = r"/Users/cheickberthe/PycharmProjects/spreadsheet-analyzer/test_data/inventory_tracking.csv"

try:
    df = pd.read_csv(file_path)
    print(f"✅ Successfully loaded {len(df)} rows and {len(df.columns)} columns")
    print(f"📊 Data shape: {df.shape}")
except Exception as e:
    # Any failure inside the try body is reported rather than raised, and `df`
    # is bound to an empty frame so later cells can branch on `df.empty`.
    print(f"❌ Error loading data: {e}")
    df = pd.DataFrame()  # Empty fallback

In [None]:
# Comprehensive Data Profiling for inventory_tracking
#
# Prints a text report about the notebook-wide DataFrame `df`: shape and memory
# footprint, per-column dtype / null / cardinality figures, descriptive
# statistics for numeric columns, a missing-values table, the dtype
# distribution, head/tail previews and a few quick quality checks.
# Rich tables are rendered with IPython's `display`.

if not df.empty:
    # The loading cell never defines `sheet_name`, so referencing it directly
    # raised NameError. Use it when some earlier cell did define it, otherwise
    # fall back to the name of the dataset being profiled.
    report_name = globals().get("sheet_name", "inventory_tracking")

    # `df.empty` is False here, so there is at least one row and the
    # percentage divisions below cannot divide by zero.
    row_count = len(df)
    # Null counts per column, computed once and reused by several sections.
    null_counts = df.isnull().sum()

    print("=" * 60)
    print(f"📋 DATA PROFILING REPORT - {report_name}")
    print("=" * 60)

    # Basic Information
    print("\n🔍 BASIC INFORMATION")
    print(f"Shape: {df.shape}")
    # deep=True also measures the contents of object columns, not just pointers.
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Column Information
    print("\n📊 COLUMN ANALYSIS")
    for col in df.columns:
        dtype = df[col].dtype
        null_count = null_counts[col]
        null_pct = (null_count / row_count) * 100
        unique_count = df[col].nunique()

        print(f"  {col:<30} | {dtype!s:<12} | Nulls: {null_count:>6} ({null_pct:>5.1f}%) | Unique: {unique_count:>6}")

    # Statistical Summary
    print("\n📈 STATISTICAL SUMMARY")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        display(df[numeric_cols].describe())
    else:
        print("No numeric columns found for statistical analysis")

    # Missing Values Analysis
    print("\n❓ MISSING VALUES ANALYSIS")
    missing_summary = pd.DataFrame(
        {
            "Column": df.columns,
            "Missing_Count": null_counts,
            "Missing_Percentage": (null_counts / row_count) * 100,
        }
    ).sort_values("Missing_Percentage", ascending=False)

    # Only columns that actually have missing values are shown.
    display(missing_summary[missing_summary["Missing_Count"] > 0])

    # Data Types Distribution
    print("\n🏷️ DATA TYPES DISTRIBUTION")
    dtype_counts = df.dtypes.value_counts()
    display(dtype_counts)

    # Sample Data Preview
    print("\n👀 SAMPLE DATA PREVIEW")
    print("First 5 rows:")
    display(df.head())

    # With 5 rows or fewer the tail would just repeat the head.
    if row_count > 5:
        print("\nLast 5 rows:")
        display(df.tail())

    # Quick Quality Checks
    print("\n✅ QUICK QUALITY CHECKS")

    # Duplicate rows
    duplicate_count = df.duplicated().sum()
    print(f"Duplicate rows: {duplicate_count} ({duplicate_count / row_count * 100:.1f}%)")

    # Empty rows
    empty_rows = df.isnull().all(axis=1).sum()
    print(f"Completely empty rows: {empty_rows}")

    # Potential ID columns: every value distinct and none missing.
    potential_ids = [col for col in df.columns if df[col].nunique() == row_count and not df[col].isnull().any()]
    if potential_ids:
        print(f"Potential ID columns: {potential_ids}")

else:
    print("❌ No data available for profiling")

---

*End of data_profiling*

---

## Outlier Detection

Statistical analysis to identify outliers and anomalies in the data.

In [None]:
# Statistical Outlier Detection
#
# For every numeric column of the notebook-wide DataFrame `df`, counts outliers
# with two methods (1.5 * IQR fences and |z-score| > 3), prints details for the
# affected columns, shows a summary table and draws one box plot per affected
# column.


def summarize_column_outliers(name, values):
    """Compute IQR and z-score outlier statistics for one numeric column.

    Args:
        name: Column label stored under ``"Column"`` in the summary row.
        values: pandas Series holding the column's numeric data. Missing
            values never compare as outliers.

    Returns:
        A ``(summary, details)`` pair of dicts. ``summary`` is the row for the
        summary table (``Column``, ``IQR_Outliers``, ``ZScore_Outliers``,
        ``Min_Value``, ``Max_Value``, ``Mean``, ``Std``). ``details`` holds
        ``lower_bound`` and ``upper_bound`` (the IQR fences) and
        ``iqr_outlier_values`` (array of the values outside the fences).
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    # IQR method: anything beyond 1.5 * IQR from the quartiles is an outlier.
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    iqr_mask = (values < lower_bound) | (values > upper_bound)

    # Z-score method: more than 3 standard deviations away from the mean.
    # A zero or undefined std yields NaN/inf scores; NaN never exceeds 3.
    mean = values.mean()
    std = values.std()
    zscore_mask = np.abs((values - mean) / std) > 3

    summary = {
        "Column": name,
        "IQR_Outliers": int(iqr_mask.sum()),
        "ZScore_Outliers": int(zscore_mask.sum()),
        "Min_Value": values.min(),
        "Max_Value": values.max(),
        "Mean": mean,
        "Std": std,
    }
    details = {
        "lower_bound": lower_bound,
        "upper_bound": upper_bound,
        "iqr_outlier_values": values[iqr_mask].values,
    }
    return summary, details


# Assuming 'df' DataFrame is available from previous cells
if "df" in locals() and not df.empty:
    print("=" * 60)
    print("🔍 OUTLIER DETECTION ANALYSIS")
    print("=" * 60)

    # Select numeric columns only
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    if len(numeric_cols) == 0:
        print("❌ No numeric columns found for outlier detection")
    else:
        print(f"\n📊 Analyzing {len(numeric_cols)} numeric columns for outliers...")

        total_rows = len(df)
        outlier_summary = []

        for col in numeric_cols:
            # Columns without a single value have no statistics to compute.
            if df[col].isnull().all():
                continue

            summary, details = summarize_column_outliers(col, df[col])
            # Append the plain dict row; wrapping it in another pair of braces
            # builds a set of dicts, which raises TypeError.
            outlier_summary.append(summary)

            iqr_count = summary["IQR_Outliers"]
            zscore_count = summary["ZScore_Outliers"]

            # Print detailed analysis for columns with outliers
            if iqr_count > 0 or zscore_count > 0:
                print(f"\n🚨 OUTLIERS DETECTED in '{col}':")
                print(f"  IQR Method: {iqr_count} outliers ({iqr_count / total_rows * 100:.1f}%)")
                print(f"  Z-Score Method: {zscore_count} outliers ({zscore_count / total_rows * 100:.1f}%)")
                print(f"  Value range: {summary['Min_Value']:.2f} to {summary['Max_Value']:.2f}")
                print(f"  Expected range (IQR): {details['lower_bound']:.2f} to {details['upper_bound']:.2f}")

                # Show some outlier values
                if iqr_count > 0:
                    outlier_values = details["iqr_outlier_values"]
                    print(f"  Sample outlier values: {outlier_values[:5].tolist()}")

        # Summary table
        if outlier_summary:
            print("\n📋 OUTLIER SUMMARY TABLE")
            outlier_df = pd.DataFrame(outlier_summary)
            display(outlier_df)

            # Visualization for columns with outliers
            has_outliers = (outlier_df["IQR_Outliers"] > 0) | (outlier_df["ZScore_Outliers"] > 0)
            cols_with_outliers = outlier_df.loc[has_outliers, "Column"].tolist()

            if cols_with_outliers:
                print("\n📈 OUTLIER VISUALIZATIONS")

                # Create box plots on a grid that is at most 3 plots wide.
                n_cols = min(3, len(cols_with_outliers))
                n_rows = (len(cols_with_outliers) + n_cols - 1) // n_cols

                # squeeze=False always returns a 2-D array of axes, so one
                # ravel() covers the 1x1, single-row and multi-row layouts.
                fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
                axes = axes.ravel()

                # The grid has at least as many cells as columns to plot.
                for ax, col in zip(axes, cols_with_outliers):
                    df[col].plot(kind="box", ax=ax, title=f"Box Plot: {col}")
                    ax.grid(True, alpha=0.3)

                # Hide empty subplots
                for ax in axes[len(cols_with_outliers):]:
                    ax.set_visible(False)

                plt.tight_layout()
                plt.show()
            else:
                print("\n✅ No significant outliers detected in any numeric columns!")
        else:
            # Reached when every numeric column was entirely null.
            print("\n❌ No numeric data available for outlier analysis")

else:
    print("❌ No DataFrame 'df' available. Please run data loading cells first.")

---

*End of outlier_detection*

---