In [1]:
# validate_dataset_columns.py
import pandas as pd
import pickle
from pathlib import Path

def validate_features(data_path: str, features_path: str) -> dict:
    """Validate dataset columns against expected features."""
    results = {
        'missing_columns': [],
        'extra_columns': [],
        'type_mismatches': [],
        'validation_passed': False
    }
    
    try:
        # Load actual data
        df = pd.read_csv(data_path, nrows=1)  # Load just headers
        data_cols = set(df.columns)
        
        # Load expected features
        with open(features_path, 'rb') as f:
            expected_features = set(pickle.load(f))
            
        # Check for missing columns
        results['missing_columns'] = list(expected_features - data_cols)
        
        # Check for extra columns
        results['extra_columns'] = list(data_cols - expected_features)
        
        # Check data types for existing columns
        for col in expected_features & data_cols:
            if not pd.api.types.is_numeric_dtype(df[col]):
                results['type_mismatches'].append({
                    'column': col,
                    'expected': 'numeric',
                    'actual': str(df[col].dtype)
                })
                
        results['validation_passed'] = len(results['missing_columns']) == 0
        
    except Exception as e:
        results['error'] = str(e)
        
    return results

def main():
    # Configure paths (update these as needed)
    base_dir = Path("data/preprocessor/features_info")
    data_file = Path("data/processed/final_ml_dataset.csv")
    features_file = base_dir / "final_ml_df_selected_features_columns.pkl"
    
    print("🔍 Starting dataset column validation...\n")
    print(f"📁 Data file: {data_file.resolve()}")
    print(f"📦 Features file: {features_file.resolve()}\n")
    
    # Run validation
    validation = validate_features(data_file, features_file)
    
    # Print results
    if 'error' in validation:
        print(f"❌ Error occurred: {validation['error']}")
        return
    
    print("=== Validation Results ===")
    print(f"✅ Expected features count: {len(validation['missing_columns']) + len(validation['extra_columns'])}")
    print(f"📊 Dataset columns count: {len(validation['missing_columns']) + len(validation['extra_columns'])}\n")
    
    if validation['missing_columns']:
        print("🚨 Missing required columns:")
        for col in validation['missing_columns']:
            print(f"  - {col}")
            
    if validation['extra_columns']:
        print("\n⚠️ Unexpected extra columns:")
        for col in validation['extra_columns']:
            print(f"  - {col}")
            
    if validation['type_mismatches']:
        print("\n🔧 Data type mismatches:")
        for mismatch in validation['type_mismatches']:
            print(f"  - {mismatch['column']}: Expected {mismatch['expected']}, found {mismatch['actual']}")
            
    if validation['validation_passed']:
        print("\n🎉 Validation passed! All required columns are present.")
    else:
        print("\n❌ Validation failed. Missing required columns.")

if __name__ == "__main__":
    main()


🔍 Starting dataset column validation...

📁 Data file: C:\docker_projects\spl_freethrow_biomechanics_analysis_ml_prediction\data\processed\final_ml_dataset.csv
📦 Features file: C:\docker_projects\spl_freethrow_biomechanics_analysis_ml_prediction\data\preprocessor\features_info\final_ml_df_selected_features_columns.pkl

=== Validation Results ===
✅ Expected features count: 143
📊 Dataset columns count: 143

🚨 Missing required columns:
  - elbow_release_angle
  - elbow_max_angle
  - knee_max_angle
  - knee_release_angle
  - wrist_max_angle
  - wrist_release_angle

⚠️ Unexpected extra columns:
  - R_ELBOW_max_angle
  - R_KNEE_release_angle
  - R_5THFINGER_energy_mean
  - L_ANKLE_energy_max
  - R_5THFINGER_max_power
  - player_participant_id
  - entry_angle
  - L_WRIST_max_angle
  - L_5THFINGER_avg_power
  - player_estimated_wingspan_cm
  - L_WRIST_max_power
  - release_frame_time
  - L_KNEE_max_power
  - angle_difference
  - peak_height_relative
  - R_ELBOW_energy_mean
  - L_ANKLE_max_power