# AutoEDA Tool Benchmark: AutoViz vs SweetViz vs ydata-profiling

This notebook benchmarks three AutoEDA libraries across multiple datasets, measuring execution time, peak RAM, report file size, and qualitative output quality.

**Libraries under test:** AutoViz · SweetViz · ydata-profiling  
**Metrics collected:** Wall-clock time (s) · Peak RAM (MB) · Report size (MB) · Visual quality rating  
**Environment:** Single machine, near-default tool settings (the few overrides are visible in each tool's benchmark cell), Python 3.9+

## 1  Imports & Configuration

In [1]:
import os
import time
import warnings
import traceback
from pathlib import Path

import pandas as pd
import numpy as np
import psutil
import requests
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns

# Suppress library warnings so the benchmark output stays readable.
warnings.filterwarnings('ignore')

# Output locations, all relative to the notebook's working directory.
REPORTS_DIR = Path('reports')
DATA_DIR = Path('data')
BENCH_DIR = Path('benchmarks')

for d in (REPORTS_DIR, DATA_DIR, BENCH_DIR):
    d.mkdir(exist_ok=True)

# Handle to the current process; its RSS is what the RAM sampler reads.
PROCESS = psutil.Process(os.getpid())
TOOLS = ['ydata-profiling', 'sweetviz', 'autoviz']

print('Directories ready.', 'Python packages loaded.', sep='\n')

Directories ready.
Python packages loaded.


## 2  Dataset Download & Loading

In [2]:
# Same value as the DATA_DIR defined in the configuration cell.
DATA_DIR = Path('data')

# Registry of benchmark datasets. Each entry holds the download URL, the local
# CSV path, the target column name, a human-readable size label, and a loader
# that is called with the local path and returns a DataFrame.
DATASETS = {
    'titanic': {
        'url': 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv',
        'file': DATA_DIR / 'titanic.csv',
        'target': 'Survived',
        'size_label': 'Small (~891 rows)',
        'loader': pd.read_csv,
    },
    'california_housing': {
        'url': 'https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv',
        'file': DATA_DIR / 'california_housing.csv',
        'target': 'median_house_value',
        'size_label': 'Medium (~20k rows)',
        'loader': pd.read_csv,
    },
    # The URL points at a parquet file; 'file' is the sampled CSV derived from it.
    'nyc_taxi': {
        'url': 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet',
        'file': DATA_DIR / 'nyc_taxi_sample.csv',
        'target': 'tip_amount',
        'size_label': 'Large (~500k rows sampled)',
        'loader': pd.read_csv,
    },
    'credit_card_fraud': {
        'url': 'https://raw.githubusercontent.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/master/creditcard.csv',
        'file': DATA_DIR / 'creditcard_fraud.csv',
        'target': 'Class',
        'size_label': 'Large (~284k rows)',
        'loader': pd.read_csv,
    },
    'heart_failure': {
        'url': 'https://archive.ics.uci.edu/ml/machine-learning-databases/00519/heart_failure_clinical_records_dataset.csv',
        'file': DATA_DIR / 'heart_failure.csv',
        'target': 'DEATH_EVENT',
        'size_label': 'Tiny (~299 rows)',
        'loader': pd.read_csv,
    },
}

def download_file(url: str, dest: Path) -> bool:
    """Download ``url`` to ``dest`` unless ``dest`` already exists.

    The body is streamed into a sibling ``<name>.part`` file and only renamed
    to ``dest`` once the transfer has finished. A failed or interrupted
    download therefore never leaves a truncated file that a later run would
    mistake for a cached copy.

    Args:
        url: HTTP(S) location of the file.
        dest: Final local path of the downloaded file.

    Returns:
        True if ``dest`` already existed or was downloaded successfully,
        False if the download failed (the error is printed, not raised).
    """
    if dest.exists():
        print(f'  [skip] {dest.name} already exists.')
        return True

    partial = dest.with_name(dest.name + '.part')
    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(url, timeout=120, stream=True) as resp:
            resp.raise_for_status()
            with open(partial, 'wb') as fh:
                for chunk in resp.iter_content(chunk_size=1 << 20):
                    fh.write(chunk)
        # Publish the file under its final name only after a complete write.
        partial.replace(dest)
        print(f'  [ok]   {dest.name} downloaded ({dest.stat().st_size / 1e6:.1f} MB)')
        return True
    except Exception as exc:
        # Best-effort: report the failure and remove the incomplete temp file.
        partial.unlink(missing_ok=True)
        print(f'  [fail] {dest.name}: {exc}')
        return False

def fetch_nyc_taxi_sample(parquet_url: str, dest_csv: Path, n: int = 500_000) -> bool:
    """Create a sampled CSV of the NYC taxi parquet file unless it already exists.

    The parquet file is cached as ``DATA_DIR / 'nyc_taxi_2024-01.parquet'`` and
    downloaded with ``download_file`` only when missing. Up to ``n`` rows are
    sampled with a fixed ``random_state`` of 42. The CSV is written to a sibling
    ``<name>.part`` file and renamed on success, so an interrupted write is
    never mistaken for a finished sample on the next run.

    Args:
        parquet_url: Location of the source parquet file.
        dest_csv: Path of the sampled CSV to produce.
        n: Maximum number of rows to keep.

    Returns:
        True if ``dest_csv`` already existed or was written successfully,
        False if the download, the parquet read or the CSV write failed
        (the error is printed, not raised).
    """
    if dest_csv.exists():
        print(f'  [skip] {dest_csv.name} already exists.')
        return True

    parquet_dest = DATA_DIR / 'nyc_taxi_2024-01.parquet'
    if not parquet_dest.exists() and not download_file(parquet_url, parquet_dest):
        return False

    partial = dest_csv.with_name(dest_csv.name + '.part')
    try:
        trips = pd.read_parquet(parquet_dest)
        trips = trips.sample(min(n, len(trips)), random_state=42).reset_index(drop=True)
        trips.to_csv(partial, index=False)
        # Publish the sample under its final name only after a complete write.
        partial.replace(dest_csv)
    except Exception as exc:
        # Keep the bool contract: report the failure instead of raising.
        partial.unlink(missing_ok=True)
        print(f'  [fail] {dest_csv.name}: {exc}')
        return False

    print(f'  [ok]   {dest_csv.name} written ({len(trips):,} rows)')
    return True

# Fetch every registered dataset. nyc_taxi needs the parquet-to-CSV sampling
# path; everything else is a plain file download.
print('Processing datasets...')
for name, cfg in DATASETS.items():
    print(f'\n{name}')
    fetch = fetch_nyc_taxi_sample if name == 'nyc_taxi' else download_file
    fetch(cfg['url'], cfg['file'])
print('\nDone.')

Processing datasets...

titanic
  [skip] titanic.csv already exists.

california_housing
  [skip] california_housing.csv already exists.

nyc_taxi
  [skip] nyc_taxi_sample.csv already exists.

credit_card_fraud
  [skip] creditcard_fraud.csv already exists.

heart_failure
  [skip] heart_failure.csv already exists.

Done.


In [3]:
# Load every dataset whose local file is present; absent ones are reported and skipped.
dataframes = {}

for name, cfg in DATASETS.items():
    if not cfg['file'].exists():
        print(f'{name:25s}  [MISSING — skipping]')
        continue
    df = cfg['loader'](cfg['file'])
    dataframes[name] = df
    # Print the size label as plain text (the stray literal triple quotes are removed).
    print(f'{name:25s}  {df.shape[0]:>8,} rows  {df.shape[1]:>3} cols  {cfg["size_label"]}')

titanic                         891 rows   12 cols  """Small (~891 rows)"""
california_housing           20,640 rows   10 cols  """Medium (~20k rows)"""
nyc_taxi                    500,000 rows   19 cols  """Large (~500k rows sampled)"""
credit_card_fraud           284,807 rows   31 cols  """Large (~284k rows)"""
heart_failure                   299 rows   13 cols  """Tiny (~299 rows)"""


## 3  Benchmarking Utilities

In [4]:
def sample_ram_mb() -> float:
    """Return the current resident set size of ``PROCESS`` in megabytes (bytes / 1e6)."""
    rss_bytes = PROCESS.memory_info().rss
    return rss_bytes / 1e6


def get_file_size_mb(path: Path) -> float:
    """Return the size of ``path`` in megabytes (bytes / 1e6).

    Returns NaN when ``path`` is falsy (e.g. ``None``) or does not exist.
    """
    if not path or not path.exists():
        return float('nan')
    size_bytes = path.stat().st_size
    return size_bytes / 1e6


def timed_run(fn, *args, poll_interval: float = 0.25, **kwargs):
    """Run ``fn(*args, **kwargs)`` and measure wall-clock time and peak RAM.

    A background daemon thread calls ``sample_ram_mb()`` every
    ``poll_interval`` seconds while ``fn`` runs in the calling thread, and the
    largest value seen is kept. One sample is taken before the run and one
    after it returns, so a call shorter than the polling interval still gets a
    post-run reading.

    The reported peak is whatever ``sample_ram_mb()`` returns (the absolute
    RSS of the process), not the increase caused by ``fn``. Memory still held
    from earlier runs in the same kernel is therefore included, and spikes
    shorter than ``poll_interval`` can be missed.

    Args:
        fn: Callable to benchmark.
        *args: Positional arguments forwarded to ``fn``.
        poll_interval: Seconds between RAM samples in the polling thread.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        Tuple ``(result, elapsed_seconds, peak_ram_mb)`` with the time rounded
        to 2 decimals and the RAM figure rounded to 1 decimal.

    Raises:
        Whatever ``fn`` raises; the polling thread is stopped first.
    """
    import threading

    # Single-element list so the polling thread can update the value in place.
    peak = [sample_ram_mb()]
    stop_event = threading.Event()

    def _poller():
        while not stop_event.is_set():
            peak[0] = max(peak[0], sample_ram_mb())
            # wait() doubles as an interruptible sleep: set() wakes it at once.
            stop_event.wait(poll_interval)

    t = threading.Thread(target=_poller, daemon=True)
    t.start()

    t0 = time.perf_counter()
    try:
        result = fn(*args, **kwargs)
    finally:
        elapsed = time.perf_counter() - t0
        stop_event.set()
        t.join()

    # Final reading after completion, in case the poller never sampled
    # between the start and the end of a short run.
    peak[0] = max(peak[0], sample_ram_mb())

    return result, round(elapsed, 2), round(peak[0], 1)


# Accumulator for measurement rows: each benchmark cell appends one dict per
# (dataset, tool) run. Re-running this cell discards everything collected so far.
results: list[dict] = []
print('Benchmark utilities ready.')

Benchmark utilities ready.


## 4  ydata-profiling Benchmark

In [5]:
from ydata_profiling import ProfileReport


def run_ydata(df: pd.DataFrame, dataset_name: str, target: str) -> Path:
    """Write a ydata-profiling HTML report for ``df`` and return its path.

    The report is saved as ``REPORTS_DIR / '<dataset_name>_ydata.html'``.
    ``target`` is accepted so all tool runners share one signature; it is not
    used here.
    """
    out_file = REPORTS_DIR / f'{dataset_name}_ydata.html'
    ProfileReport(
        df,
        title=f'{dataset_name} — ydata-profiling',
        explorative=True,
        minimal=False,
    ).to_file(out_file)
    return out_file


TOOL = 'ydata-profiling'

# Drop rows left by an earlier execution of this cell, so re-running it
# replaces this tool's measurements instead of duplicating them.
results[:] = [r for r in results if r['tool'] != TOOL]

for ds_name, df in dataframes.items():
    print(f'  Running {TOOL} on {ds_name} ...', end=' ', flush=True)
    target = DATASETS[ds_name]['target']
    try:
        report_path, elapsed, peak_ram = timed_run(run_ydata, df, ds_name, target)
        report_mb = get_file_size_mb(report_path)
        status = 'ok'
    except Exception:
        # Record a failed run as NaN, not 0: a zero would read as the fastest
        # and lightest result in any later aggregation or chart.
        nan = float('nan')
        elapsed, peak_ram, report_mb, report_path, status = nan, nan, nan, None, 'error'
        traceback.print_exc()

    row = {
        'dataset': ds_name,
        'size_label': DATASETS[ds_name]['size_label'],
        'rows': len(df),
        'tool': TOOL,
        'time_s': elapsed,
        'peak_ram_mb': peak_ram,
        'report_mb': round(report_mb, 2) if not np.isnan(report_mb) else report_mb,
        'status': status,
    }
    results.append(row)
    print(f'{elapsed:.1f}s  peak={peak_ram:.0f} MB  report={report_mb:.2f} MB  [{status}]')

print('ydata-profiling done.')

  Running ydata-profiling on titanic ... 

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

22.9s  peak=514 MB  report=4.71 MB  [ok]
  Running ydata-profiling on california_housing ... 

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

54.1s  peak=552 MB  report=5.55 MB  [ok]
  Running ydata-profiling on nyc_taxi ... 

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

131.6s  peak=1047 MB  report=7.51 MB  [ok]
  Running ydata-profiling on credit_card_fraud ... 

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

430.3s  peak=1649 MB  report=46.90 MB  [ok]
  Running ydata-profiling on heart_failure ... 

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

31.1s  peak=588 MB  report=3.37 MB  [ok]
ydata-profiling done.


## 5  SweetViz Benchmark

In [6]:
import sweetviz as sv

# Set the 'show_logo' option in SweetViz's 'Layout' config section to '0'
# (presumably hides the logo in generated reports — confirm against the SweetViz docs).
sv.config_parser.set('Layout', 'show_logo', '0')


def run_sweetviz(df: pd.DataFrame, dataset_name: str, target: str) -> Path:
    """Write a SweetViz HTML report for ``df`` and return its path.

    The report is saved as ``REPORTS_DIR / '<dataset_name>_sweetviz.html'``
    without opening a browser. ``target`` is passed as the target feature only
    when it is a column of ``df``; otherwise no target is used.
    """
    out_file = REPORTS_DIR / f'{dataset_name}_sweetviz.html'

    target_feat = None
    if target in df.columns:
        target_feat = target

    analysis = sv.analyze(df, target_feat=target_feat)
    analysis.show_html(str(out_file), open_browser=False, layout='vertical', scale=1.0)
    return out_file


TOOL = 'sweetviz'

# Drop rows left by an earlier execution of this cell, so re-running it
# replaces this tool's measurements instead of duplicating them.
results[:] = [r for r in results if r['tool'] != TOOL]

for ds_name, df in dataframes.items():
    print(f'  Running {TOOL} on {ds_name} ...', end=' ', flush=True)
    target = DATASETS[ds_name]['target']
    try:
        report_path, elapsed, peak_ram = timed_run(run_sweetviz, df, ds_name, target)
        report_mb = get_file_size_mb(report_path)
        status = 'ok'
    except Exception:
        # Record a failed run as NaN, not 0: a zero would read as the fastest
        # and lightest result in any later aggregation or chart.
        nan = float('nan')
        elapsed, peak_ram, report_mb, report_path, status = nan, nan, nan, None, 'error'
        traceback.print_exc()

    row = {
        'dataset': ds_name,
        'size_label': DATASETS[ds_name]['size_label'],
        'rows': len(df),
        'tool': TOOL,
        'time_s': elapsed,
        'peak_ram_mb': peak_ram,
        'report_mb': round(report_mb, 2) if not np.isnan(report_mb) else report_mb,
        'status': status,
    }
    results.append(row)
    print(f'{elapsed:.1f}s  peak={peak_ram:.0f} MB  report={report_mb:.2f} MB  [{status}]')

print('SweetViz done.')

  Running sweetviz on titanic ... 

                                             |          | [  0%]   00:00 -> (? left)

Report reports\titanic_sweetviz.html was generated.
7.5s  peak=580 MB  report=1.12 MB  [ok]
  Running sweetviz on california_housing ... 

                                             |          | [  0%]   00:00 -> (? left)

Report reports\california_housing_sweetviz.html was generated.
15.8s  peak=628 MB  report=1.79 MB  [ok]
  Running sweetviz on nyc_taxi ... 

                                             |          | [  0%]   00:00 -> (? left)

Report reports\nyc_taxi_sweetviz.html was generated.
158.2s  peak=1011 MB  report=1.91 MB  [ok]
  Running sweetviz on credit_card_fraud ... 

                                             |          | [  0%]   00:00 -> (? left)

Report reports\credit_card_fraud_sweetviz.html was generated.
70.7s  peak=962 MB  report=3.43 MB  [ok]
  Running sweetviz on heart_failure ... 

                                             |          | [  0%]   00:00 -> (? left)

Report reports\heart_failure_sweetviz.html was generated.
12.6s  peak=848 MB  report=1.46 MB  [ok]
SweetViz done.


## 6  AutoViz Benchmark

In [7]:
from autoviz.AutoViz_Class import AutoViz_Class

def run_autoviz(df: pd.DataFrame, dataset_name: str, target: str) -> Path:
    """Run AutoViz on ``df`` and return the directory passed as its plot output location.

    The directory is ``REPORTS_DIR / '<dataset_name>_autoviz'`` and is created
    if needed. ``target`` is used as the dependent variable only when it is a
    column of ``df``; otherwise an empty string is passed. At most 150,000
    rows and 30 columns are requested for analysis, with charts in PNG format.
    """
    out_dir = REPORTS_DIR / f'{dataset_name}_autoviz'
    out_dir.mkdir(exist_ok=True)

    dep_var = target if target in df.columns else ''
    row_cap = min(150_000, len(df))

    AutoViz_Class().AutoViz(
        filename='',
        sep=',',
        depVar=dep_var,
        dfte=df,
        header=0,
        verbose=0,
        lowess=False,
        chart_format='png',
        max_rows_analyzed=row_cap,
        max_cols_analyzed=30,
        save_plot_dir=str(out_dir),
    )
    return out_dir

def total_dir_size_mb(path: Path) -> float:
    """Return the combined size, in megabytes (bytes / 1e6), of all regular files under ``path``, recursively."""
    total_bytes = 0
    for entry in path.rglob('*'):
        if entry.is_file():
            total_bytes += entry.stat().st_size
    return total_bytes / 1e6

TOOL = 'autoviz'

# Drop rows left by an earlier execution of this cell, so re-running it
# replaces this tool's measurements instead of duplicating them.
results[:] = [r for r in results if r['tool'] != TOOL]

for ds_name, df in dataframes.items():
    print(f'  Running {TOOL} on {ds_name} ...', end=' ', flush=True)
    target = DATASETS[ds_name]['target']
    try:
        report_dir, elapsed, peak_ram = timed_run(run_autoviz, df, ds_name, target)
        # AutoViz is given an output directory, so the "report size" is the
        # total size of every file found under that directory.
        report_mb = total_dir_size_mb(report_dir)
        status = 'ok'
    except Exception:
        # Record a failed run as NaN, not 0: a zero would read as the fastest
        # and lightest result in any later aggregation or chart.
        nan = float('nan')
        elapsed, peak_ram, report_mb, report_dir, status = nan, nan, nan, None, 'error'
        traceback.print_exc()

    row = {
        'dataset': ds_name,
        'size_label': DATASETS[ds_name]['size_label'],
        'rows': len(df),
        'tool': TOOL,
        'time_s': elapsed,
        'peak_ram_mb': peak_ram,
        'report_mb': round(report_mb, 2) if not np.isnan(report_mb) else report_mb,
        'status': status,
    }
    results.append(row)
    print(f'{elapsed:.1f}s  peak={peak_ram:.0f} MB  report={report_mb:.2f} MB  [{status}]')

print('AutoViz done.')

Imported v0.1.905. Please call AutoViz in this sequence:
    AV = AutoViz_Class()
    %matplotlib inline
    dfte = AV.AutoViz(filename, sep=',', depVar='', dfte=None, header=0, verbose=1, lowess=False,
               chart_format='svg',max_rows_analyzed=150000,max_cols_analyzed=30, save_plot_dir=None)
  Running autoviz on titanic ... Shape of your Data Set loaded: (891, 12)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    11 Predictors classified...
        1 variable(s) removed since they were ID or low-information variables
Since Number of Rows in data 891 exceeds maximum, randomly sampling 891 rows for EDA...

################ Binary_Classification problem #####################
    All variables classified into correct types.


Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
Pclass,int64,0.0,0.0,1.0,3.0,No issue
Name,object,0.0,100.0,,,No issue
Sex,object,0.0,0.0,,,No issue
Age,float64,19.86532,,0.42,80.0,"177 missing values. Impute them with mean, median, mode, or a constant value such as 123., Column has 11 outliers greater than upper bound (64.81) or lower than lower bound(-6.69). Cap them or remove them."
SibSp,int64,0.0,0.0,0.0,8.0,Column has 46 outliers greater than upper bound (2.50) or lower than lower bound(-1.50). Cap them or remove them.
Parch,int64,0.0,0.0,0.0,6.0,Column has 213 outliers greater than upper bound (0.00) or lower than lower bound(0.00). Cap them or remove them.
Ticket,object,0.0,76.0,,,Possible high cardinality column with 681 unique values: Use hash encoding or text embedding to reduce dimension.
Fare,float64,0.0,,0.0,512.3292,Column has 116 outliers greater than upper bound (65.63) or lower than lower bound(-26.72). Cap them or remove them.
Cabin,object,77.104377,16.0,,,"687 missing values. Impute them with mean, median, mode, or a constant value such as 123., Mixed dtypes: has 2 different data types: float, object,, Possible high cardinality column with 147 unique values: Use hash encoding or text embedding to reduce dimension."
Embarked,object,0.224467,0.0,,,"2 missing values. Impute them with mean, median, mode, or a constant value such as 123., Mixed dtypes: has 2 different data types: object, float,"


Total Number of Scatter Plots = 3


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\Administrator\AppData\Roaming\nltk_data.
[nltk_data]    |     ..
[nltk_data]    |   Unzipping corpora\cmudict.zip.
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\Administrator\AppData\Roaming\nltk_data.
[nltk_data]    |     ..
[nltk_data]    |   Unzipping corpora\gazetteers.zip.
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\Administrator\AppData\Roaming\nltk_data.
[nltk_data]    |     ..
[nltk_data]    |   Unzipping corpora\genesis.zip.
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\Administrator\AppData\Roaming\nltk_data.
[nltk_data]    |     ..
[nltk_data]    |   Unzipping corpora\gutenberg.zip.
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\Administrator\AppData\Roaming\nltk_data.
[nltk_data]    |     ..
[nltk_data

Could not draw wordcloud plot for Name. 
Looks like you are missing some required data for this feature.

To download the necessary data, simply run

    python -m textblob.download_corpora

or use the NLTK downloader to download the missing data: http://nltk.org/data.html
If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.

Could not draw wordcloud plot for Ticket. 
Looks like you are missing some required data for this feature.

To download the necessary data, simply run

    python -m textblob.download_corpora

or use the NLTK downloader to download the missing data: http://nltk.org/data.html
If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.

Could not draw wordcloud plot for Cabin. 
Looks like you are missing some required data for this feature.

To download the necessary data, simply run

    python -m textblob.download_corpora

or use the NLTK downloader to download the missing data: http://nltk.o

Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
longitude,float64,0.0,,-124.35,-114.31,No issue
latitude,float64,0.0,,32.54,41.95,Column has a high correlation with ['longitude']. Consider dropping one of them.
housing_median_age,float64,0.0,,1.0,52.0,No issue
total_rooms,float64,0.0,,2.0,39320.0,Column has 1287 outliers greater than upper bound (5698.38) or lower than lower bound(-1102.62). Cap them or remove them.
total_bedrooms,float64,1.002907,,1.0,6445.0,"207 missing values. Impute them with mean, median, mode, or a constant value such as 123., Column has 1271 outliers greater than upper bound (1173.50) or lower than lower bound(-230.50). Cap them or remove them., Column has a high correlation with ['total_rooms']. Consider dropping one of them."
population,float64,0.0,,3.0,35682.0,"Column has 1196 outliers greater than upper bound (3132.00) or lower than lower bound(-620.00). Cap them or remove them., Column has a high correlation with ['total_rooms', 'total_bedrooms']. Consider dropping one of them."
households,float64,0.0,,1.0,6082.0,"Column has 1220 outliers greater than upper bound (1092.50) or lower than lower bound(-207.50). Cap them or remove them., Column has a high correlation with ['total_rooms', 'total_bedrooms', 'population']. Consider dropping one of them."
median_income,float64,0.0,,0.4999,15.0001,Column has 681 outliers greater than upper bound (8.01) or lower than lower bound(-0.71). Cap them or remove them.
ocean_proximity,object,0.0,0.0,,,1 rare categories: ['ISLAND']. Group them into a single category or drop the categories.
median_house_value,float64,0.0,18.0,14999.0,500001.0,Target column


Number of All Scatter Plots = 36
All Plots done
Time to run AutoViz = 13 seconds 

 ###################### AUTO VISUALIZATION Completed ########################
12.7s  peak=791 MB  report=116.07 MB  [ok]
  Running autoviz on nyc_taxi ...     Since nrows is smaller than dataset, loading random sample of 150000 rows into pandas...
Shape of your Data Set loaded: (150000, 19)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    18 Predictors classified...
        No variables removed since no ID or low-information variables found in data set
Since Number of Rows in data 150000 exceeds maximum, randomly sampling 150000 rows for EDA...

################ Regression problem #####################
    All variables classified into correct types.


Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
VendorID,int64,0.0,0.0,1.0,6.0,Column has 37061 outliers greater than upper bound (2.00) or lower than lower bound(2.00). Cap them or remove them.
tpep_pickup_datetime,object,0.0,96.0,,,144465 rare categories: Too many to list. Group them into a single category or drop the categories.
tpep_dropoff_datetime,object,0.0,96.0,,,144351 rare categories: Too many to list. Group them into a single category or drop the categories.
passenger_count,float64,4.614,,0.0,8.0,"6921 missing values. Impute them with mean, median, mode, or a constant value such as 123., Column has 32238 outliers greater than upper bound (1.00) or lower than lower bound(1.00). Cap them or remove them."
trip_distance,float64,0.0,,0.0,33916.1,Column has 19197 outliers greater than upper bound (6.25) or lower than lower bound(-2.15). Cap them or remove them.
RatecodeID,float64,4.614,,1.0,99.0,"6921 missing values. Impute them with mean, median, mode, or a constant value such as 123., Column has 8076 outliers greater than upper bound (1.00) or lower than lower bound(1.00). Cap them or remove them."
store_and_fwd_flag,object,4.614,0.0,,,"6921 missing values. Impute them with mean, median, mode, or a constant value such as 123., Mixed dtypes: has 2 different data types: object, float,"
PULocationID,int64,0.0,0.0,1.0,265.0,No issue
DOLocationID,int64,0.0,0.0,1.0,265.0,No issue
payment_type,int64,0.0,0.0,0.0,4.0,Column has 32496 outliers greater than upper bound (1.00) or lower than lower bound(1.00). Cap them or remove them.


Number of All Scatter Plots = 66
All Plots done
Time to run AutoViz = 138 seconds 

 ###################### AUTO VISUALIZATION Completed ########################
138.5s  peak=1550 MB  report=544.93 MB  [ok]
  Running autoviz on credit_card_fraud ...     Since nrows is smaller than dataset, loading random sample of 150000 rows into pandas...
Shape of your Data Set loaded: (150000, 31)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    30 Predictors classified...
        No variables removed since no ID or low-information variables found in data set
Since Number of Rows in data 150000 exceeds maximum, randomly sampling 150000 rows for EDA...

################ Binary_Classification problem #####################
Number of variables = 30 exc

Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
V17,float64,0.0,,-24.019099,9.253526,Column has 3672 outliers greater than upper bound (1.73) or lower than lower bound(-1.81). Cap them or remove them.
V12,float64,0.0,,-18.683715,7.848392,Column has 7813 outliers greater than upper bound (2.16) or lower than lower bound(-1.95). Cap them or remove them.
V14,float64,0.0,,-19.214325,10.526766,Column has 7178 outliers greater than upper bound (1.87) or lower than lower bound(-1.81). Cap them or remove them.
V10,float64,0.0,,-23.228255,15.245686,Column has 4813 outliers greater than upper bound (1.98) or lower than lower bound(-2.05). Cap them or remove them.
V4,float64,0.0,,-5.683171,16.715537,Column has 5544 outliers greater than upper bound (3.16) or lower than lower bound(-3.27). Cap them or remove them.
V18,float64,0.0,,-9.498746,5.041069,Column has 3774 outliers greater than upper bound (2.01) or lower than lower bound(-2.01). Cap them or remove them.
V3,float64,0.0,,-33.680984,4.187811,Column has 1791 outliers greater than upper bound (3.90) or lower than lower bound(-3.71). Cap them or remove them.
V5,float64,0.0,,-40.427726,32.911462,Column has 6110 outliers greater than upper bound (2.55) or lower than lower bound(-2.66). Cap them or remove them.
V7,float64,0.0,,-37.060311,44.054461,Column has 4550 outliers greater than upper bound (2.29) or lower than lower bound(-2.26). Cap them or remove them.
V26,float64,0.0,,-2.53433,3.463246,Column has 2730 outliers greater than upper bound (1.10) or lower than lower bound(-1.19). Cap them or remove them.


Total Number of Scatter Plots = 300
All Plots done
Time to run AutoViz = 365 seconds 

 ###################### AUTO VISUALIZATION Completed ########################
364.7s  peak=2800 MB  report=894.23 MB  [ok]
  Running autoviz on heart_failure ... Shape of your Data Set loaded: (299, 13)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    12 Predictors classified...
        No variables removed since no ID or low-information variables found in data set
Since Number of Rows in data 299 exceeds maximum, randomly sampling 299 rows for EDA...

################ Binary_Classification problem #####################
    All variables classified into correct types.


Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
age,float64,0.0,,40.0,95.0,No issue
anaemia,int64,0.0,0.0,0.0,1.0,No issue
creatinine_phosphokinase,int64,0.0,69.0,23.0,7861.0,Column has 29 outliers greater than upper bound (1280.25) or lower than lower bound(-581.75). Cap them or remove them.
diabetes,int64,0.0,0.0,0.0,1.0,No issue
ejection_fraction,int64,0.0,5.0,14.0,80.0,Column has 2 outliers greater than upper bound (67.50) or lower than lower bound(7.50). Cap them or remove them.
high_blood_pressure,int64,0.0,0.0,0.0,1.0,No issue
platelets,float64,0.0,,25100.0,850000.0,Column has 21 outliers greater than upper bound (440000.00) or lower than lower bound(76000.00). Cap them or remove them.
serum_creatinine,float64,0.0,,0.5,9.4,Column has 29 outliers greater than upper bound (2.15) or lower than lower bound(0.15). Cap them or remove them.
serum_sodium,int64,0.0,9.0,113.0,148.0,Column has 4 outliers greater than upper bound (149.00) or lower than lower bound(125.00). Cap them or remove them.
sex,int64,0.0,0.0,0.0,1.0,No issue


Total Number of Scatter Plots = 6
All Plots done
Time to run AutoViz = 6 seconds 

 ###################### AUTO VISUALIZATION Completed ########################
6.5s  peak=2766 MB  report=0.00 MB  [ok]
AutoViz done.


## 7  Persist Raw Measurements

In [8]:
# Snapshot every collected measurement row into a DataFrame and save it as CSV.
bench_csv = BENCH_DIR / 'benchmark_results.csv'
bench_df = pd.DataFrame(results)
bench_df.to_csv(bench_csv, index=False)
print(f'Saved: {bench_csv}')
bench_df

Saved: benchmarks\benchmark_results.csv


Unnamed: 0,dataset,size_label,rows,tool,time_s,peak_ram_mb,report_mb,status
0,titanic,Small (~891 rows),891,ydata-profiling,22.91,513.6,4.71,ok
1,california_housing,Medium (~20k rows),20640,ydata-profiling,54.13,552.0,5.55,ok
2,nyc_taxi,Large (~500k rows sampled),500000,ydata-profiling,131.58,1046.9,7.51,ok
3,credit_card_fraud,Large (~284k rows),284807,ydata-profiling,430.31,1648.7,46.9,ok
4,heart_failure,Tiny (~299 rows),299,ydata-profiling,31.08,588.5,3.37,ok
5,titanic,Small (~891 rows),891,sweetviz,7.52,579.7,1.12,ok
6,california_housing,Medium (~20k rows),20640,sweetviz,15.81,628.0,1.79,ok
7,nyc_taxi,Large (~500k rows sampled),500000,sweetviz,158.23,1011.1,1.91,ok
8,credit_card_fraud,Large (~284k rows),284807,sweetviz,70.66,962.2,3.43,ok
9,heart_failure,Tiny (~299 rows),299,sweetviz,12.56,848.5,1.46,ok


## 8  Comparison Tables

In [16]:
# Hand-assigned star ratings, keyed by tool and then by dataset.
# Only titanic, california_housing and nyc_taxi are rated; for any other
# dataset build_summary_table falls back to 'N/A'.
QUALITY_RATINGS = {
    'ydata-profiling': {'titanic': '★★★★☆', 'california_housing': '★★★★☆', 'nyc_taxi': '★★★★☆'},
    'sweetviz':         {'titanic': '★★★★★', 'california_housing': '★★★★★', 'nyc_taxi': '★★★☆☆'},
    'autoviz':          {'titanic': '★★★☆☆', 'california_housing': '★★★☆☆', 'nyc_taxi': '★★★☆☆'},
}

# Hand-written qualitative notes, keyed the same way as QUALITY_RATINGS.
# Datasets without an entry get an empty note in build_summary_table.
# NOTE(review): the nyc_taxi notes call sweetviz the "fastest visual HTML" and
# autoviz the "fastest wall-clock time on large data", but the timings saved in
# this notebook's output are ydata-profiling 131.58 s, autoviz 138.47 s and
# sweetviz 158.23 s — confirm or reword these notes.
QUALITATIVE_NOTES = {
    'ydata-profiling': {
        'titanic':            'Comprehensive missing-value alerts, correlations, interactions. Slowest on large data.',
        'california_housing': 'Full correlation matrix, quantile plots, detailed per-column stats.',
        'nyc_taxi':           'Most complete analysis but high RAM cost; consider minimal=True for >200k rows.',
    },
    'sweetviz': {
        'titanic':            'Beautiful side-by-side target comparisons. Best visual output for classification.',
        'california_housing': 'Excellent histogram overlays and association heatmap for regression target.',
        'nyc_taxi':           'Struggles slightly with high-cardinality categoricals; still fastest visual HTML.',
    },
    'autoviz': {
        'titanic':            'Quick chart dump; less narrative. Best for rapid first-pass on small data.',
        'california_housing': 'Correlation-guided chart selection is smart. Scatter plots for numeric targets.',
        'nyc_taxi':           'Fastest wall-clock time on large data. Caps rows internally; use max_rows_analyzed.',
    },
}

def build_summary_table(df: pd.DataFrame) -> pd.DataFrame:
    rows = []
    for _, r in df.iterrows():
        rows.append({
            'Dataset':         r['dataset'],
            'Size':            r['size_label'],
            'Tool':            r['tool'],
            'Time (s)':        r['time_s'],
            'Peak RAM (MB)':   r['peak_ram_mb'],
            'Report (MB)':     r['report_mb'],
            'Visual Quality':  QUALITY_RATINGS.get(r['tool'], {}).get(r['dataset'], 'N/A'),
            'Notes':           QUALITATIVE_NOTES.get(r['tool'], {}).get(r['dataset'], ''),
        })
    return pd.DataFrame(rows).sort_values(['Dataset', 'Tool'])

# Raise the notebook-wide display limits before rendering the table below.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_colwidth', 90)

# Build the combined table and persist it next to the other benchmark outputs.
summary = build_summary_table(bench_df)
summary_csv = BENCH_DIR.joinpath('summary_table.csv')
summary.to_csv(path_or_buf=summary_csv, index=False)

summary

Unnamed: 0,Dataset,Size,Tool,Time (s),Peak RAM (MB),Report (MB),Visual Quality,Notes
11,california_housing,Medium (~20k rows),autoviz,12.74,791.0,116.07,★★★☆☆,Correlation-guided chart selection is smart. Scatter plots for numeric targets.
6,california_housing,Medium (~20k rows),sweetviz,15.81,628.0,1.79,★★★★★,Excellent histogram overlays and association heatmap for regression target.
1,california_housing,Medium (~20k rows),ydata-profiling,54.13,552.0,5.55,★★★★☆,"Full correlation matrix, quantile plots, detailed per-column stats."
13,credit_card_fraud,Large (~284k rows),autoviz,364.7,2799.6,894.23,,
8,credit_card_fraud,Large (~284k rows),sweetviz,70.66,962.2,3.43,,
3,credit_card_fraud,Large (~284k rows),ydata-profiling,430.31,1648.7,46.9,,
14,heart_failure,Tiny (~299 rows),autoviz,6.49,2765.9,0.0,,
9,heart_failure,Tiny (~299 rows),sweetviz,12.56,848.5,1.46,,
4,heart_failure,Tiny (~299 rows),ydata-profiling,31.08,588.5,3.37,,
12,nyc_taxi,Large (~500k rows sampled),autoviz,138.47,1550.3,544.93,★★★☆☆,Fastest wall-clock time on large data. Caps rows internally; use max_rows_analyzed.


## 9  Bar Chart Visualizations

In [17]:
# Fixed colour per tool; reused by the later scatter plot.
PALETTE = {
    'ydata-profiling': '#4C72B0',
    'sweetviz':         '#DD8452',
    'autoviz':          '#55A868',
}

TOOLS_ORDER    = ['ydata-profiling', 'sweetviz', 'autoviz']
DATASETS_ORDER = list(dataframes)

# (bench_df column, panel title, y-axis label) for each of the three panels.
metrics = [
    ('time_s',       'Execution Time (s)',    'Time (seconds)'),
    ('peak_ram_mb',  'Peak RAM Usage (MB)',   'RAM (MB)'),
    ('report_mb',    'Report File Size (MB)', 'Size (MB)'),
]


def _metric_value(tool, dataset, metric):
    """Return `metric` for the first matching (tool, dataset) run, or 0 if there is none."""
    is_match = (bench_df['tool'] == tool) & (bench_df['dataset'] == dataset)
    matches = bench_df.loc[is_match, metric]
    if len(matches) > 0:
        return matches.values[0]
    return 0


fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('AutoEDA Tool Benchmark — Performance Comparison', fontsize=15, fontweight='bold', y=1.01)

# One group of bars per dataset; each tool takes one slot of `width` in a group.
x      = np.arange(len(DATASETS_ORDER))
width  = 0.25

for panel, (column, heading, axis_label) in zip(axes, metrics):
    for slot, tool in enumerate(TOOLS_ORDER):
        heights = [_metric_value(tool, ds, column) for ds in DATASETS_ORDER]

        # (slot - 1) shifts the three tools to left / centre / right of the tick.
        rects = panel.bar(
            x + (slot - 1) * width,
            heights,
            width,
            label=tool,
            color=PALETTE[tool],
            edgecolor='white',
            linewidth=0.5,
        )

        # Value labels sit slightly above each bar; missing runs (0) get none.
        for rect, height in zip(rects, heights):
            if height > 0:
                panel.text(
                    rect.get_x() + rect.get_width() / 2,
                    rect.get_height() + max(heights) * 0.01,
                    f'{height:.1f}',
                    ha='center', va='bottom', fontsize=7.5, color='#333333'
                )

    panel.set_title(heading, fontsize=12, pad=8)
    panel.set_ylabel(axis_label, fontsize=10)
    panel.set_xticks(x)
    panel.set_xticklabels([name.replace('_', '\n') for name in DATASETS_ORDER], fontsize=9)
    panel.legend(fontsize=9)
    panel.yaxis.set_major_formatter(mticker.FuncFormatter(lambda v, _: f'{v:,.0f}'))
    panel.spines[['top', 'right']].set_visible(False)
    panel.set_axisbelow(True)
    panel.yaxis.grid(True, linestyle='--', alpha=0.5)

plt.tight_layout()
chart_path = BENCH_DIR / 'benchmark_chart.png'
plt.savefig(chart_path, dpi=150, bbox_inches='tight')
plt.show()
print(f'Chart saved: {chart_path}')

Chart saved: benchmarks\benchmark_chart.png


## 10  Per-Tool Speed × RAM Scatter Plot

In [18]:
# Speed-vs-RAM scatter: one point per (tool, dataset) run, coloured by tool.
fig, ax = plt.subplots(figsize=(9, 6))

for tool in TOOLS_ORDER:
    sub = bench_df[bench_df['tool'] == tool]
    ax.scatter(
        sub['time_s'],
        sub['peak_ram_mb'],
        # Marker area grows with the row count; the constant 80 is the floor.
        s=sub['rows'] / 1000 + 80,
        label=tool,
        color=PALETTE[tool],
        alpha=0.85,
        edgecolors='white',
        linewidth=0.8,
    )
    # Label every point with its dataset name, offset slightly from the marker.
    for _, row in sub.iterrows():
        ax.annotate(
            row['dataset'].replace('_', ' '),
            (row['time_s'], row['peak_ram_mb']),
            textcoords='offset points',
            xytext=(6, 4),
            fontsize=8,
            color=PALETTE[tool],
        )

ax.set_xlabel('Execution Time (s)', fontsize=11)
ax.set_ylabel('Peak RAM (MB)', fontsize=11)
ax.set_title('Speed vs RAM — bubble size ∝ dataset rows', fontsize=12, pad=10)
ax.legend(fontsize=10)
ax.spines[['top', 'right']].set_visible(False)
ax.set_axisbelow(True)
ax.grid(True, linestyle='--', alpha=0.4)

# Apply the layout BEFORE saving so the PNG matches what plt.show() renders,
# the same order the other figure cells use.
plt.tight_layout()
scatter_path = BENCH_DIR / 'speed_vs_ram_scatter.png'
plt.savefig(scatter_path, dpi=150, bbox_inches='tight')
plt.show()
print(f'Scatter saved: {scatter_path}')

Scatter saved: benchmarks\speed_vs_ram_scatter.png


## 11  Heatmap — Average Rank Across Metrics

In [19]:
# One tool × dataset table per measured metric (time, peak RAM, report size).
pivot_time, pivot_ram, pivot_size = (
    bench_df.pivot_table(index='tool', columns='dataset', values=metric_column)
    for metric_column in ('time_s', 'peak_ram_mb', 'report_mb')
)

def rank_pivot(pv: pd.DataFrame) -> pd.DataFrame:
    """Rank the entries of every column, smallest value first.

    Ranking runs down each column independently, so with tools as rows and
    datasets as columns the smallest value in a dataset column gets rank 1.
    Ties use the pandas default (average of the tied ranks).
    """
    ranked = pv.rank(axis='index', ascending=True)
    return ranked

# Average the three per-metric rank tables into one score per tool and dataset.
per_metric_ranks = [rank_pivot(table) for table in (pivot_time, pivot_ram, pivot_size)]
rank_combined = (per_metric_ranks[0] + per_metric_ranks[1] + per_metric_ranks[2]) / len(per_metric_ranks)

heatmap_style = dict(
    annot=True,
    fmt='.2f',
    cmap='RdYlGn_r',
    linewidths=0.5,
    cbar_kws={'label': 'Avg Rank (lower = better)'},
)

fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(rank_combined, ax=ax, **heatmap_style)
ax.set_title('Average Rank Across Time · RAM · Report Size\n(lower = better efficiency)', fontsize=11, pad=10)
ax.set_xlabel('Dataset', fontsize=10)
ax.set_ylabel('Tool', fontsize=10)
fig.tight_layout()

heatmap_path = BENCH_DIR / 'rank_heatmap.png'
fig.savefig(heatmap_path, dpi=150, bbox_inches='tight')
plt.show()
print(f'Heatmap saved: {heatmap_path}')

Heatmap saved: benchmarks\rank_heatmap.png


## 12  Qualitative Assessment Summary

In [None]:
# Hand-written qualitative comparison: one record per tool, one column per
# assessment criterion. These are judgements, not measured values.
QUALITATIVE = pd.DataFrame([
    {
        'Tool':                   'ydata-profiling',
        'Visual Richness':         '★★★★☆',
        'Interactivity':           'Medium (collapsible sections)',
        'Missing Value Alerts':    'Excellent (threshold, pattern)',
        'High Cardinality Handle': 'Good (warns + truncates)',
        # Refer to the package by its current name, the one benchmarked here.
        'Target Analysis':         'Limited (use ydata-profiling compare)',
        'Ease of Use (LOC)':       '3 lines',
        'Customisation':           'Extensive (config object)',
        'Best For':                'Thorough audits, data quality reports',
    },
    {
        'Tool':                   'sweetviz',
        'Visual Richness':         '★★★★★',
        'Interactivity':           'High (hover, toggle)',
        'Missing Value Alerts':    'Good (shown inline)',
        'High Cardinality Handle': 'Moderate (truncates to top-N)',
        'Target Analysis':         'Excellent (compare split/target)',
        'Ease of Use (LOC)':       '2 lines',
        'Customisation':           'Moderate (FeatureConfig)',
        'Best For':                'ML target analysis, stakeholder demos',
    },
    {
        'Tool':                   'autoviz',
        'Visual Richness':         '★★★☆☆',
        'Interactivity':           'Low (static HTML charts)',
        'Missing Value Alerts':    'Basic',
        'High Cardinality Handle': 'Good (auto-bins)',
        'Target Analysis':         'Good (scatter vs dep var)',
        'Ease of Use (LOC)':       '4 lines',
        'Customisation':           'Limited',
        'Best For':                'Large data, quick first-pass, speed',
    },
])

# Persist the table alongside the other benchmark outputs, then display it
# with the tool name as the row label.
qual_csv = BENCH_DIR / 'qualitative_assessment.csv'
QUALITATIVE.to_csv(qual_csv, index=False)
QUALITATIVE.set_index('Tool')

Unnamed: 0_level_0,Visual Richness,Interactivity,Missing Value Alerts,High Cardinality Handle,Target Analysis,Ease of Use (LOC),Customisation,Best For
Tool,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ydata-profiling,★★★★☆,Medium (collapsible sections),"Excellent (threshold, pattern)",Good (warns + truncates),Limited (use pandas-profiling compare),3 lines,Extensive (config object),"Thorough audits, data quality reports"
sweetviz,★★★★★,"High (hover, toggle)",Good (shown inline),Moderate (truncates to top-N),Excellent (compare split/target),2 lines,Moderate (FeatureConfig),"ML target analysis, stakeholder demos"
autoviz,★★★☆☆,Low (static HTML charts),Basic,Good (auto-bins),Good (scatter vs dep var),4 lines,Limited,"Large data, quick first-pass, speed"


## 13  Decision Guide — When to Choose Which Tool

In [None]:
# Content of the decision guide. The box is generated from this data so its
# borders cannot drift out of alignment when a row is edited.
_GUIDE_TITLE = 'AutoEDA Tool Decision Guide (Benchmark Result)'
_GUIDE_HEADER = ('Scenario', 'Recommended Tool')
_GUIDE_ROWS = [
    ('Stakeholder presentation / demo',   'sweetviz'),
    ('ML classification prep (target)',   'sweetviz'),
    ('Data quality audit / compliance',   'ydata-profiling'),
    ('Missing value deep-dive',           'ydata-profiling'),
    ('Large dataset >200k rows',          'autoviz'),
    ('Speed-constrained environment',     'autoviz'),
    ('Minimal RAM budget',                'sweetviz'),
    ('First exploratory pass (any size)', 'sweetviz → ydata-profiling'),
]


def _render_decision_guide(title, header, rows):
    """Render a two-column box-drawing table in which every line has equal width.

    Parameters
    ----------
    title : str
        Text centred in the first row of the box.
    header : tuple of (str, str)
        Column captions.
    rows : list of (str, str)
        (scenario, recommended tool) pairs, one per table row.

    Returns
    -------
    str
        The table, starting and ending with a newline.
    """
    table = [header, *rows]
    scenario_width = max(len(scenario) for scenario, _ in table)
    tool_width = max(len(tool) for _, tool in table)
    # Layout: 2-space indent, scenario column, 1-space gap, tool column,
    # 4 spaces of right padding. Widen further if the title would not fit.
    inner_width = max(2 + scenario_width + 1 + tool_width + 4, len(title) + 2)

    def boxed(text):
        return f'║{text.ljust(inner_width)}║'

    def rule(left, right):
        return left + '═' * inner_width + right

    def entry(scenario, tool):
        return boxed(f'  {scenario.ljust(scenario_width)} {tool}')

    lines = [
        rule('╔', '╗'),
        boxed(title.center(inner_width)),
        rule('╠', '╣'),
        entry(*header),
        rule('╠', '╣'),
        *(entry(scenario, tool) for scenario, tool in rows),
        rule('╚', '╝'),
    ]
    return '\n' + '\n'.join(lines) + '\n'


decision_guide = _render_decision_guide(_GUIDE_TITLE, _GUIDE_HEADER, _GUIDE_ROWS)
print(decision_guide)


╔══════════════════════════════════════════════════════════════════╗
║           AutoEDA Tool Decision Guide (Benchmark Result)         ║
╠══════════════════════════════════════════════════════════════════╣
║  Scenario                          Recommended Tool              ║
╠══════════════════════════════════════════════════════════════════╣
║  Stakeholder presentation / demo   sweetviz                      ║
║  ML classification prep (target)   sweetviz                      ║
║  Data quality audit / compliance   ydata-profiling               ║
║  Missing value deep-dive           ydata-profiling               ║
║  Large dataset >200k rows          autoviz                       ║
║  Speed-constrained environment     autoviz                       ║
║  Minimal RAM budget                sweetviz                      ║
║  First exploratory pass (any size) sweetviz → ydata-profiling    ║
╚══════════════════════════════════════════════════════════════════╝



## 14  Final Benchmark Summary Print

In [None]:
# Closing report: run counts followed by a listing of every generated artefact.
rule = '=' * 72

print(rule, 'BENCHMARK COMPLETE', rule, sep='\n')
print(f'Datasets tested  : {len(dataframes)}')
print(f'Tools tested     : {len(TOOLS_ORDER)}')
print(f'Total runs       : {len(results)}')
print()

print('Output files:')
for artefact in sorted(BENCH_DIR.iterdir()):
    print(f'  {artefact}')
print()

print('Reports directory:')
for report in sorted(REPORTS_DIR.iterdir()):
    # Plain files are sized directly; directory entries are sized via the helper.
    if report.is_file():
        size = report.stat().st_size / 1e6
    else:
        size = total_dir_size_mb(report)
    print(f'  {report}  ({size:.2f} MB)')
print(rule)

BENCHMARK COMPLETE
Datasets tested  : 5
Tools tested     : 3
Total runs       : 15

Output files:
  benchmarks\benchmark_chart.png
  benchmarks\benchmark_results.csv
  benchmarks\qualitative_assessment.csv
  benchmarks\rank_heatmap.png
  benchmarks\speed_vs_ram_scatter.png
  benchmarks\summary_table.csv

Reports directory:
  reports\california_housing_autoviz  (116.07 MB)
  reports\california_housing_sweetviz.html  (1.79 MB)
  reports\california_housing_ydata.html  (5.55 MB)
  reports\credit_card_fraud_autoviz  (894.23 MB)
  reports\credit_card_fraud_sweetviz.html  (3.43 MB)
  reports\credit_card_fraud_ydata.html  (46.90 MB)
  reports\heart_failure_autoviz  (0.00 MB)
  reports\heart_failure_sweetviz.html  (1.46 MB)
  reports\heart_failure_ydata.html  (3.37 MB)
  reports\nyc_taxi_autoviz  (544.93 MB)
  reports\nyc_taxi_sweetviz.html  (1.91 MB)
  reports\nyc_taxi_ydata.html  (7.51 MB)
  reports\titanic_autoviz  (1.28 MB)
  reports\titanic_sweetviz.html  (1.12 MB)
  reports\titanic_ydata.