# Chapter 5: Analysis & Visualization

**Data-Juicer User Guide**

- Git Commit: `v1.0.5`
- Commit Date: 2026-01-16
- Repository: https://github.com/datajuicer/data-juicer

# Table of Contents

1. [Setup](#setup)
2. [Create Sample Dataset](#create-sample-dataset)
3. [Run Data Analysis](#run-data-analysis)
4. [Interpret Analysis Results](#interpret-analysis-results)
5. [Compare Before and After Processing](#compare-before-and-after-processing)
6. [Further Reading](#further-reading)

## Setup

In [None]:
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image

## Create Sample Dataset

In [None]:
# Build a small demo corpus whose texts range from a single character to a
# full sentence, so the length-based statistics have something to show.
samples = [
    {"text": text}
    for text in (
        "Short text.",
        "This is a medium-length text sample for analysis.",
        "This is a longer text sample that contains more words and provides better content for statistical analysis of text length distribution.",
        "Another medium sample.",
        "Very short.",
        "Machine learning and artificial intelligence are transforming how we process and analyze large-scale datasets.",
        "Data quality is crucial.",
        "This text has a good balance of length and content quality for demonstration purposes.",
        "x",
        "The quick brown fox jumps over the lazy dog.",
    )
]

# Persist the corpus as JSON Lines: one JSON object per line.
os.makedirs('./data', exist_ok=True)
with open('./data/analysis_demo.jsonl', 'w') as f:
    f.writelines(json.dumps(sample) + '\n' for sample in samples)

print(f"Created dataset with {len(samples)} samples")

## Run Data Analysis

In [None]:
# Location of the recipe file that the command-line cells below pass
# via --config.
config_dir = './configs'
config_path = './configs/analysis.yaml'

# YAML recipe: input dataset, export location, worker count (np) and the
# three filter operators listed under `process`.
analysis_config = """project_name: 'analysis_demo'
dataset_path: './data/analysis_demo.jsonl'
export_path: './outputs/analysis_demo/analyzed.jsonl'
np: 2

export_original_dataset: true # Keep original dataset for further processing

# Operators to compute statistics
process:
  - language_id_score_filter:
      lang: 'en'
      min_score: 0.5
  - text_length_filter:
      min_len: 5
      max_len: 500
  - alphanumeric_filter:
      min_ratio: 0.3
"""

# Write the recipe to disk, creating the folder on first run.
os.makedirs(config_dir, exist_ok=True)
with open(config_path, 'w') as config_file:
    config_file.write(analysis_config)

print("Analysis configuration created")

In [None]:
# Run analysis
# Invoke the dj-analyze command-line tool through the notebook's `!` shell
# escape, pointing it at the recipe stored in ./configs/analysis.yaml.
# The cells that follow look for its results (overall.csv and PNG charts)
# under ./outputs/analysis_demo/analysis.
!dj-analyze --config ./configs/analysis.yaml

## Interpret Analysis Results

In [None]:
# Read the summary table (overall.csv) from the analysis output folder and
# print it; fall back to a hint when the file is not there.
stats_file = './outputs/analysis_demo/analysis/overall.csv'
if not os.path.exists(stats_file):
    print("Statistics file not found. Analysis may still be running.")
else:
    overall_stats = pd.read_csv(stats_file)
    print("Overall Statistics:")
    print(overall_stats)

In [None]:
# Display generated histograms
# Preview up to three of the PNG charts found in the analysis output folder.
analysis_dir = './outputs/analysis_demo/analysis'
# isdir (not exists) so that a stray file with this name is reported as
# "not found" instead of making os.listdir() raise.
if os.path.isdir(analysis_dir):
    # os.listdir() returns entries in arbitrary order; sort so the same
    # charts are selected on every run, matching the comparison cell below.
    png_files = sorted(f for f in os.listdir(analysis_dir) if f.endswith('.png'))

    if png_files:
        print(f"Found {len(png_files)} visualization(s)\n")

        for png_file in png_files[:3]:  # Show first 3
            img_path = os.path.join(analysis_dir, png_file)

            # The context manager releases the file handle once the
            # figure has been rendered.
            with Image.open(img_path) as img:
                plt.figure(figsize=(10, 6))
                plt.imshow(img)
                plt.axis('off')
                plt.title(png_file)
                plt.tight_layout()
                plt.show()
    else:
        print("No visualization files found")
else:
    print("Analysis directory not found")

## Compare Before and After Processing

In [None]:
# Run the dj-process command-line tool with the same recipe file, but pass a
# different --dataset_path (the file exported by the analysis step) and a
# different --export_path (./outputs/process_demo/processed.jsonl) than the
# values written in the recipe.
# NOTE(review): --keep_stats_in_res_ds presumably keeps the computed stats
# fields in the exported samples — confirm against the Data-Juicer docs.
!dj-process --config ./configs/analysis.yaml \
    --dataset_path ./outputs/analysis_demo/analyzed.jsonl \
    --export_path ./outputs/process_demo/processed.jsonl \
    --keep_stats_in_res_ds true

In [None]:
# Run dj-analyze a second time with the same recipe file, now on the
# processed dataset. The export path is moved to ./outputs/processed_analyzed
# so the first run's results under ./outputs/analysis_demo stay available;
# the comparison cell below reads charts from
# ./outputs/processed_analyzed/analysis.
!dj-analyze --config ./configs/analysis.yaml \
    --dataset_path ./outputs/process_demo/processed.jsonl \
    --export_path ./outputs/processed_analyzed/analyzed.jsonl

In [None]:
# Compare Before and After Processing
# Show charts from the two analysis runs side by side, pairing them by
# file name.
import matplotlib.gridspec as gridspec

before_dir = './outputs/analysis_demo/analysis'
after_dir = './outputs/processed_analyzed/analysis'


def _list_pngs(directory):
    """Return the sorted PNG file names in *directory*, or [] if it is missing."""
    if not os.path.isdir(directory):
        return []
    return sorted(f for f in os.listdir(directory) if f.endswith('.png'))


# Get PNG files from both directories
before_files = _list_pngs(before_dir)
after_files = _list_pngs(after_dir)

# Pair charts by file name rather than by list position: if one run produced
# a chart the other did not, positional zipping would put unrelated charts
# next to each other.
after_names = set(after_files)
common_files = [name for name in before_files if name in after_names]
max_pairs = 2  # keep the notebook output short

if common_files:
    shown_files = common_files[:max_pairs]
    # Report the number of comparisons actually displayed.
    print(f"Comparing {len(shown_files)} visualizations before and after processing\n")

    for file_name in shown_files:
        before_path = os.path.join(before_dir, file_name)
        after_path = os.path.join(after_dir, file_name)

        # Context managers release both file handles once the figure is shown.
        with Image.open(before_path) as before_img, Image.open(after_path) as after_img:
            # Create side-by-side comparison
            fig = plt.figure(figsize=(16, 6))
            gs = gridspec.GridSpec(1, 2, figure=fig, wspace=0.3)

            # Before processing
            ax1 = fig.add_subplot(gs[0, 0])
            ax1.imshow(before_img)
            ax1.set_title(f'Before Processing\n{file_name}', fontsize=12, fontweight='bold')
            ax1.axis('off')

            # After processing
            ax2 = fig.add_subplot(gs[0, 1])
            ax2.imshow(after_img)
            ax2.set_title(f'After Processing\n{file_name}', fontsize=12, fontweight='bold')
            ax2.axis('off')

            plt.tight_layout()
            plt.show()
elif before_files and after_files:
    # Both folders contain charts, but none share a file name.
    print("No visualization files with matching names found in both directories")
else:
    print("Missing analysis directories or no visualization files found")
    if not before_files:
        print(f"  - Before: {before_dir} not found or empty")
    if not after_files:
        print(f"  - After: {after_dir} not found or empty")

## Further Reading

- [Analysis Tools](https://github.com/datajuicer/data-juicer/blob/main/tools/analyze_data.py)
- [Operators Documentation](https://datajuicer.github.io/data-juicer/en/main/docs/Operators.html)