# Battery Data Exploration

This notebook helps you explore and inspect battery charging data.

**Use this notebook instead of creating separate Python scripts for data exploration.**

## Setup

In [None]:
from pyspark.sql import SparkSession
import yaml

# Create the Spark session shared by every cell below (an already-running
# session is reused by getOrCreate).
spark = (
    SparkSession.builder
    .appName("DataExploration")
    .getOrCreate()
)

print("✓ Spark session initialized")

## Load Configuration

In [None]:
# Load config to get data paths.
# The file is read explicitly as UTF-8 so that non-ASCII characters in the
# configured paths decode the same way on every platform (the default text
# encoding of open() is locale-dependent).
# NOTE: the relative path is resolved against the notebook's working directory.
with open('../config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Both paths come from the `data` section of the config file.
input_path = config['data']['input']
output_path = config['data']['output']

print(f"Input:  {input_path}")
print(f"Output: {output_path}")

## 1. Explore Raw Parquet Files

In [None]:
# Load one raw parquet file and print its schema, a five-row preview and
# the total row count.
# Replace with actual path to a specific parquet file
sample_path = f"{input_path}/car_example/file.parquet"

try:
    df_raw = spark.read.parquet(sample_path)
    print("✓ Parquet file loaded successfully\n")

    # Column names and types
    print("Schema:")
    df_raw.printSchema()

    # Preview of the first rows
    print("\nSample data (first 5 rows):")
    df_raw.show(n=5)

    # Total number of rows in the file
    row_count = df_raw.count()
    print(f"\nTotal rows: {row_count:,}")

except Exception as exc:
    # Any failure (bad path, unreadable file, ...) is reported with a hint
    print(f"Error: {exc}")
    print("\nTip: Update 'sample_path' with an actual parquet file path")

## 2. Check Column Names

In [None]:
# List all columns of the raw DataFrame, numbered from 1.
# The loop variable is deliberately not called `col`: notebook loop variables
# stay in the global namespace, and `col` would shadow the commonly imported
# pyspark.sql.functions.col helper in later cells.
print("Available columns:")
for position, column_name in enumerate(df_raw.columns, 1):
    print(f"{position:2d}. {column_name}")

## 3. Basic Statistics

In [None]:
# Summary statistics (as produced by DataFrame.describe) for a fixed set
# of columns from the raw data.
stat_columns = [
    'bms_total_voltage',
    'bms_total_current',
    'bms_soc',
    'bms_temp_max_value',
    'odo',
]
df_raw.select(stat_columns).describe().show()

## 4. Explore Processed Output

In [None]:
# Load the processed Delta Lake output and print its schema, a five-row
# sample, and the overall vehicle / window counts.
try:
    delta_reader = spark.read.format("delta")
    df_output = delta_reader.load(output_path)
    print("✓ Processed data loaded successfully\n")

    # Column names and types
    print("Schema:")
    df_output.printSchema()

    # Untruncated preview so nested values are fully visible
    print("\nSample windows:")
    df_output.show(5, truncate=False)

    # One row per window; vehicles are the distinct car_name values
    total_windows = df_output.count()
    distinct_cars = df_output.select("car_name").distinct()
    total_cars = distinct_cars.count()

    print(f"\nTotal vehicles: {total_cars}")
    print(f"Total windows: {total_windows:,}")

except Exception as exc:
    # Any failure (missing output, unreadable table, ...) is reported with a hint
    print(f"Error: {exc}")
    print("\nTip: Run data_process.py first to generate output data")

## 5. Windows per Vehicle

In [None]:
# Count windows per vehicle and show the 20 vehicles with the most windows.
# Requires `df_output` from the "Explore Processed Output" cell.
try:
    (
        df_output.groupBy("car_name")
        .count()
        .orderBy("count", ascending=False)
        .show(20, truncate=False)
    )
except NameError:
    # df_output only exists once the loading cell has run successfully
    print("Run previous cell first to load output data")
except Exception as e:
    # Report genuine query failures instead of masking them with the hint
    # above; KeyboardInterrupt is no longer swallowed, so the cell can be
    # interrupted.
    print(f"Error: {e}")

## 6. Inspect Window Data Structure

In [None]:
# Inspect the first processed window: print its metadata fields, then the
# length of its time series and the first three timesteps.
# Requires `df_output` from the "Explore Processed Output" cell (cell 4).
try:
    sample_window = df_output.first()

    if sample_window is None:
        # first() returns None when the DataFrame has no rows
        print("No windows found in output data")
    else:
        print("Sample window metadata:")
        print(f"  Car: {sample_window['car_name']}")
        print(f"  Charge session: {sample_window['charge_number']}")
        print(f"  Window ID: {sample_window['window_id']}")
        print(f"  Label: {sample_window['label']}")
        print(f"  SOC range: {sample_window['soc_range']}")
        print(f"  Voltage range: {sample_window['volt_range']}")

        # Treat a null window_data value like an empty series
        window_data = sample_window['window_data'] or []

        print("\nWindow data (time series):")
        print(f"  Length: {len(window_data)} timesteps")
        if window_data:
            # Only index into the series when it has at least one timestep
            print(f"  Features per timestep: {len(window_data[0])}")
            print("\n  First 3 timesteps:")
            for i, timestep in enumerate(window_data[:3]):
                print(f"    {i}: {timestep}")

except NameError:
    # df_output only exists once the loading cell has run successfully
    print("Run cell 4 first to load output data")
except Exception as e:
    # Report genuine failures (missing field, Spark error, ...) instead of
    # masking them with the "run cell 4" hint.
    print(f"Error: {e}")

## 7. Filter and Query Data

In [None]:
# Example: Find windows for a specific car.
# Requires `df_output` from the "Explore Processed Output" cell (cell 4).
car_name = "car_example"  # Replace with actual car name

try:
    # Compare against the column directly rather than building a SQL string,
    # so car names containing quotes or other SQL syntax are matched literally.
    car_windows = df_output.filter(df_output["car_name"] == car_name)
    print(f"Windows for {car_name}: {car_windows.count()}")
    car_windows.show(10)
except NameError:
    # df_output only exists once the loading cell has run successfully
    print("Update car_name and run cell 4 first")
except Exception as e:
    # Report genuine query failures instead of masking them with the hint
    print(f"Error: {e}")

## 8. Custom Analysis

Add your own exploration code below:

In [None]:
# Your custom code here
