# Data Preprocessing and Analysis

This notebook analyzes feature distributions, correlations, and preprocessing effects.

In [1]:
import os
import sys
from pathlib import Path

# Make the project's ``src`` directory importable.
#
# A bare '../src' entry is resolved against the kernel's working directory,
# so it only works when the notebook is started from a folder that sits next
# to ``src``; from anywhere else the import below raises
# ``ModuleNotFoundError: No module named 'preprocessing'``.  Instead, walk
# upward from the working directory and use the first ``src`` folder that
# actually contains the ``preprocessing`` package.
_cwd = Path.cwd()
_src_dir = next(
    (
        base / 'src'
        for base in (_cwd, *_cwd.parents)
        if (base / 'src' / 'preprocessing').is_dir()
    ),
    None,
)
# Fall back to the original relative entry when nothing is found, so the
# import below still reports the failure exactly as before.
_src_path = str(_src_dir) if _src_dir is not None else '../src'
if _src_path not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(_src_path)

import pandas as pd
from preprocessing.feature_analyzer import FeatureAnalyzer

# Load your dataset (tab-separated; the path is relative to the working directory)
data = pd.read_csv('test_set.tsv', sep='\t')

# Define feature and label columns
feature_cols = ['mH2', 'mHD', 'mAD', 'mHDp', 'alpha', 'L2', 'L8', 'vs', 'm22sq']
label_cols = ['valid_BFB', 'valid_Uni', 'valid_STU', 'valid_Higgs']

# Initialize analyzer (used by every later cell in this notebook)
analyzer = FeatureAnalyzer(data, feature_cols, label_cols)

# Create output directory; exist_ok=True makes re-running the cell harmless
output_dir = 'preprocessing_analysis'
os.makedirs(output_dir, exist_ok=True)

ModuleNotFoundError: No module named 'preprocessing'

## Feature Statistics and Distributions

In [None]:
# Ask the analyzer for its feature statistics and keep them in ``stats``.
stats = analyzer.generate_feature_stats()
# Show the statistics as a transposed DataFrame; as the last expression of
# the cell, this is what the notebook renders.
pd.DataFrame(stats).transpose()

In [None]:
# Plot feature distributions
# ``output_dir`` is the 'preprocessing_analysis' folder created in the setup
# cell; presumably the figures are saved there -- confirm against
# FeatureAnalyzer.plot_feature_distributions.
analyzer.plot_feature_distributions(output_dir)

## Feature Correlations

In [None]:
# Plot correlation matrix
# ``output_dir`` is the 'preprocessing_analysis' folder created in the setup
# cell; presumably the figure is saved there -- confirm against
# FeatureAnalyzer.plot_correlation_matrix.
analyzer.plot_correlation_matrix(output_dir)

## Preprocessing Effects

In [None]:
# Analyze preprocessing effects
preprocessed_data = analyzer.analyze_preprocessing_effects()

# Display summary statistics for each preprocessing approach.
# The loop variable is deliberately not named ``data``: notebook cells share
# one namespace, so reusing that name would overwrite the dataset loaded in
# the setup cell and silently change what earlier cells see when re-run.
for name, frame in preprocessed_data.items():
    print(f"\n{name.upper()} Statistics:")
    print(frame.describe())