# Data Checks

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import pandas as pd
import os

# Result files to inspect; the config label is derived from each file name.
files = [
    "../data/text_generation/gpt2/gpt2_cpu4-mem32_results.parquet",
    "../data/text_generation/gpt2/gpt2_cpu8-mem16_results.parquet",
    "../data/text_generation/gpt2/gpt2_cpu8-mem32_results.parquet",
    "../data/text_generation/gpt2/gpt2_gpu40_results.parquet",
    "../data/text_generation/gpt2/gpt2_gpu80_results.parquet",
]

# Columns to preview; each file is narrowed to the ones it actually contains.
key_columns = [
    'runtime_sec',
    'tokens_per_second',
    'total_estimated_power_watts',
    'gpu_power_watts_after',
    'cpu_cores',
    'memory_total_gb',
    'batch_size',
]
power_column = 'total_estimated_power_watts'
max_sample_rows = 10
section_rule = "-" * 50

print("🔍 SAMPLING 10 ROWS FROM EACH DATAFRAME:")
print("=" * 60)

for i, file_path in enumerate(files, start=1):
    # Strip the "_results.parquet" and "gpt2_" parts of the file name to get
    # the label used in the printed report.
    file_name = os.path.basename(file_path)
    config_name = file_name.replace('_results.parquet', '').replace('gpt2_', '')

    # Loading and every inspection step share one try block, so any failure
    # for a file is reported below and the loop continues with the next file.
    try:
        df = pd.read_parquet(file_path)
        total_rows = len(df)

        print(f"\n📊 CONFIG {i}: {config_name.upper()}")
        print(f"Total rows: {total_rows}")
        print(f"Columns: {list(df.columns)}")

        # Cap the sample at the frame's length; the fixed random_state makes
        # the same rows come back on every run.
        sample_size = min(max_sample_rows, total_rows)
        sample_df = df.sample(n=sample_size, random_state=42)

        # Keep the preview columns in key_columns order, skipping absent ones.
        available_columns = [name for name in key_columns if name in df.columns]

        print(f"\nSample data ({sample_size} rows):")
        print(sample_df[available_columns].round(2))

        # Count NaN entries in the power column, when the file has one.
        if power_column in df.columns:
            missing_power = df[power_column].isna().sum()
            print(f"Missing power data: {missing_power}/{total_rows} rows")

        print(section_rule)

    except Exception as e:
        print(f"❌ Error loading {config_name}: {e}")
        print(section_rule)

print("\n✅ Sample inspection complete!")
