# Dataset Analysis and Exploration

This notebook provides a comprehensive analysis of the text classification datasets used in the preprocessing research.

In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Add the project root and src/ to the import path.
# The imports below use the `src.` package prefix, which Python resolves from
# the directory that CONTAINS `src/` (the project root, '../..'), not from
# `src/` itself. '../../src' is kept so prefix-less imports keep working.
# The membership check keeps sys.path free of duplicates when the cell is re-run.
for _import_root in ('../..', '../../src'):
    if _import_root not in sys.path:
        sys.path.append(_import_root)

from src.data.loaders import DatasetLoader
from src.utils.visualization import VisualizationUtils
from src.utils.reproducibility import set_random_seeds

# Seed the random number generators through the project helper (reproducibility)
set_random_seeds(42)

# Global plot appearance: seaborn style sheet, HUSL colour palette, large default canvas
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

## Load Configuration and Initialize Components

In [None]:
# Load dataset configuration.
# The encoding is pinned because open() otherwise falls back to the platform's
# locale encoding, which is not UTF-8 on every system.
with open('../../config/datasets.yaml', 'r', encoding='utf-8') as f:
    datasets_config = yaml.safe_load(f)

# Fail early with a clear message: an empty file parses to None, and every
# later cell indexes datasets_config['datasets'].
if not isinstance(datasets_config, dict) or not isinstance(datasets_config.get('datasets'), dict):
    raise ValueError(
        "'../../config/datasets.yaml' must contain a top-level 'datasets' mapping"
    )

# Initialize components
loader = DatasetLoader(cache_dir='../../datasets/processed')
visualizer = VisualizationUtils(output_dir='../../experiments/results/plots')

print("Configuration loaded successfully")
print(f"Available dataset categories: {list(datasets_config['datasets'].keys())}")

## Dataset Overview

In [None]:
# Flatten the category -> dataset -> config mapping into one dict keyed by dataset name.
# Each entry is a shallow copy tagged with its category, so the dicts inside
# datasets_config stay untouched for the later cells that pass them to the loader.
all_datasets = {
    dataset_name: {**config, 'category': category}
    for category, datasets in datasets_config['datasets'].items()
    for dataset_name, config in (datasets or {}).items()  # a category with no entries parses as None
}

# Create overview DataFrame: one row per configured dataset
overview_data = [
    {
        'Dataset': name,
        'Category': config['category'],
        'Source': config['source'],
        'Task Type': config['task_type'],
        'Max Samples': config['max_samples'],
        'Target Column': config['target_column'],
        'Text Columns': ', '.join(config['text_columns']),
        'Quality Issues': ', '.join(config['quality_issues'])
    }
    for name, config in all_datasets.items()
]

overview_df = pd.DataFrame(overview_data)
print("Dataset Overview:")
display(overview_df)

## Dataset Distribution Analysis

In [None]:
# Four-panel figure summarising how the configured datasets are distributed
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Dataset Distribution Analysis', fontsize=16, fontweight='bold')
(ax_category, ax_source), (ax_task, ax_size) = axes

# Top-left: share of datasets per category (pie)
category_counts = overview_df['Category'].value_counts()
ax_category.pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%')
ax_category.set_title('Datasets by Quality Issue Category')

# Top-right and bottom-left: dataset counts per source and per task type (bars)
source_counts = overview_df['Source'].value_counts()
task_counts = overview_df['Task Type'].value_counts()
bar_panels = [
    (ax_source, source_counts, 'lightcoral', 'Datasets by Source'),
    (ax_task, task_counts, 'lightblue', 'Datasets by Task Type'),
]
for panel, counts, colour, panel_title in bar_panels:
    panel.bar(counts.index, counts.values, color=colour, alpha=0.7)
    panel.set_title(panel_title)
    panel.set_ylabel('Number of Datasets')

# Bottom-right: histogram of the configured sample caps
ax_size.hist(overview_df['Max Samples'], bins=10, color='lightgreen', alpha=0.7)
ax_size.set_title('Distribution of Dataset Sizes')
ax_size.set_xlabel('Max Samples')
ax_size.set_ylabel('Number of Datasets')

plt.tight_layout()
plt.show()

## Individual Dataset Analysis

Let's analyze each dataset category in detail.

In [None]:
def analyze_dataset(dataset_name, config, category):
    """
    Analyze a single dataset and print a data-quality report.

    Loads the train/test splits through the notebook-level ``loader``,
    concatenates them, and prints the class distribution, per-column text
    statistics, exact-duplicate counts and missing-value counts.

    Missing text values are excluded from the length, word-count and
    duplicate statistics (they are reported in the missing-values section
    instead), so they are not measured as the literal strings "nan"/"None".

    Args:
        dataset_name: Name passed to ``loader.load_dataset`` and shown in the header.
        config: Dataset configuration; must provide 'target_column' and
            'text_columns'. It is also passed to the loader.
        category: Label shown in the report header.

    Returns:
        The combined (train + test) DataFrame, or None if loading or analysis
        raised an exception (the error is printed, not re-raised).
    """
    print(f"\n{'='*50}")
    print(f"ANALYZING: {dataset_name.upper()} ({category})")
    print(f"{'='*50}")

    try:
        # Load dataset
        train_data, test_data = loader.load_dataset(dataset_name, config)

        print("Dataset loaded successfully:")
        print(f"  - Training samples: {len(train_data):,}")
        print(f"  - Test samples: {len(test_data):,}")
        print(f"  - Total samples: {len(train_data) + len(test_data):,}")

        # Combine for analysis
        full_data = pd.concat([train_data, test_data], ignore_index=True)
        total_rows = len(full_data)

        # Every percentage and ratio below divides by the row count; with no
        # rows they degrade to NaN and would be reported as "balanced"/"low".
        if total_rows == 0:
            print("\n⚠️  Dataset is empty; skipping analysis")
            return full_data

        # Basic statistics
        target_col = config['target_column']
        text_cols = config['text_columns']

        print(f"\nTarget column: {target_col}")
        print(f"Text columns: {text_cols}")

        # Class distribution
        if target_col in full_data.columns:
            class_dist = full_data[target_col].value_counts().sort_index()
            print("\nClass distribution:")
            for class_val, count in class_dist.items():
                percentage = (count / total_rows) * 100
                print(f"  - Class {class_val}: {count:,} ({percentage:.1f}%)")

            # Check for imbalance
            min_class_ratio = class_dist.min() / class_dist.max()
            print(f"\nImbalance ratio (min/max): {min_class_ratio:.3f}")
            if min_class_ratio < 0.1:
                print("  ⚠️  Severe class imbalance detected")
            elif min_class_ratio < 0.3:
                print("  ⚠️  Moderate class imbalance detected")
            else:
                print("  ✅ Classes are relatively balanced")

        # Text analysis
        for text_col in text_cols:
            if text_col in full_data.columns:
                print(f"\nText analysis for '{text_col}':")

                # Drop missing entries BEFORE casting to str: astype(str) would
                # otherwise turn them into "nan"/"None" and skew every statistic.
                present_values = full_data[text_col].dropna()
                if present_values.empty:
                    print("  - No non-missing text values to analyze")
                    continue
                texts = present_values.astype(str)

                # Text length statistics
                text_lengths = texts.str.len()
                print(f"  - Mean length: {text_lengths.mean():.1f} characters")
                print(f"  - Median length: {text_lengths.median():.1f} characters")
                print(f"  - Min length: {text_lengths.min()} characters")
                print(f"  - Max length: {text_lengths.max()} characters")
                print(f"  - Std deviation: {text_lengths.std():.1f} characters")

                # Word count statistics (whitespace-delimited tokens)
                word_counts = texts.str.split().str.len()
                print(f"  - Mean word count: {word_counts.mean():.1f} words")
                print(f"  - Median word count: {word_counts.median():.1f} words")

                # Check for duplicates among non-missing texts only, so that
                # missing rows are not counted as duplicates of each other.
                duplicate_count = present_values.duplicated().sum()
                duplicate_percentage = (duplicate_count / len(present_values)) * 100
                print(f"  - Exact duplicates: {duplicate_count:,} ({duplicate_percentage:.1f}%)")

                if duplicate_percentage > 10:
                    print("    ⚠️  High duplicate content detected")
                elif duplicate_percentage > 5:
                    print("    ⚠️  Moderate duplicate content detected")
                else:
                    print("    ✅ Low duplicate content")

        # Missing values
        missing_info = full_data.isnull().sum()
        if missing_info.sum() > 0:
            print("\nMissing values:")
            for col, missing_count in missing_info.items():
                if missing_count > 0:
                    percentage = (missing_count / total_rows) * 100
                    print(f"  - {col}: {missing_count:,} ({percentage:.1f}%)")
        else:
            print("\n✅ No missing values detected")

        return full_data

    except Exception as e:
        # Deliberately best-effort: one failing dataset must not stop the
        # notebook. Note this also catches errors raised during analysis,
        # not only during loading.
        print(f"❌ Error loading dataset: {str(e)}")
        return None

### Redundancy Category Datasets

In [None]:
# Run the per-dataset report for each redundancy dataset, keeping only the
# ones for which analyze_dataset returned a frame (it returns None on failure)
redundancy_datasets = datasets_config['datasets']['redundancy']
redundancy_data = {
    name: frame
    for name, cfg in redundancy_datasets.items()
    if (frame := analyze_dataset(name, cfg, 'redundancy')) is not None
}

### Imbalance Category Datasets

In [None]:
# Run the per-dataset report for each imbalance dataset, keeping only the
# ones for which analyze_dataset returned a frame (it returns None on failure)
imbalance_datasets = datasets_config['datasets']['imbalance']
imbalance_data = {
    name: frame
    for name, cfg in imbalance_datasets.items()
    if (frame := analyze_dataset(name, cfg, 'imbalance')) is not None
}

### Noise Category Datasets

In [None]:
# Run the per-dataset report for each noise dataset, keeping only the
# ones for which analyze_dataset returned a frame (it returns None on failure)
noise_datasets = datasets_config['datasets']['noise']
noise_data = {
    name: frame
    for name, cfg in noise_datasets.items()
    if (frame := analyze_dataset(name, cfg, 'noise')) is not None
}

### Outliers Category Datasets

In [None]:
# Run the per-dataset report for each outlier dataset, keeping only the
# ones for which analyze_dataset returned a frame (it returns None on failure)
outlier_datasets = datasets_config['datasets']['outliers']
outlier_data = {
    name: frame
    for name, cfg in outlier_datasets.items()
    if (frame := analyze_dataset(name, cfg, 'outliers')) is not None
}

## Summary and Recommendations

Based on the analysis above, we can make the following observations and recommendations:

In [None]:
# Report banner
banner = "=" * 60
print("\n" + banner)
print("DATASET ANALYSIS SUMMARY")
print(banner)

# Loaded frames per category, in the order they are reported
loaded_by_category = {
    'Redundancy': redundancy_data,
    'Imbalance': imbalance_data,
    'Noise': noise_data,
    'Outliers': outlier_data,
}

# Loaded vs. configured dataset counts
total_loaded = sum(len(frames) for frames in loaded_by_category.values())
total_configured = sum(len(datasets) for datasets in datasets_config['datasets'].values())

print(f"\nDatasets successfully loaded: {total_loaded}/{total_configured}")
print("\nCategory breakdown:")
for label, frames in loaded_by_category.items():
    print(f"  - {label}: {len(frames)} datasets")

# Follow-up work, printed as a numbered list
next_steps = [
    "Run baseline experiments on successfully loaded datasets",
    "Implement data quality detection for automatic issue identification",
    "Apply targeted preprocessing strategies",
    "Compare preprocessing vs baseline performance",
    "Generate comprehensive benchmark report",
]
print("\n📋 NEXT STEPS:")
for step_number, step in enumerate(next_steps, start=1):
    print(f"{step_number}. {step}")

# Troubleshooting hints, shown only when at least one dataset is missing
if total_configured > total_loaded:
    print("\n⚠️  Some datasets failed to load. Consider:")
    for hint in (
        "Checking internet connectivity for downloads",
        "Setting up Kaggle API credentials",
        "Using synthetic data for initial testing",
    ):
        print(f"   - {hint}")