In [None]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GroupShuffleSplit
from common.data_preparation import load_and_preprocess_data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pickle

def organize_features(df):
    """Return the static/dynamic feature-name groups and the target column.

    The column names are hard-coded here; ``df`` is accepted as part of the
    interface but is not inspected.

    Returns:
        tuple: ``(static_features, dynamic_features, target)`` where the first
        two items are newly built lists of column names and ``target`` is the
        name of the column to predict.
    """
    # --- static groups -----------------------------------------------------
    driver_characteristics = [
        'driver_overall_skill',
        'driver_circuit_skill',
        'driver_consistency',
        'driver_aggression',
        'driver_reliability',
        'driver_risk_taking',
        'driver_adaptability',
    ]
    constructor_information = [
        'constructorId',
        'constructor_performance',
        'constructor_nationality',
        'constructor_position',
    ]
    circuit_characteristics = ['circuit_type_encoded', 'alt']
    historical_performance = [
        'fp1_median_time',
        'fp2_median_time',
        'fp3_median_time',
        'quali_time',
    ]
    race_start_configuration = ['grid']

    # --- dynamic groups ----------------------------------------------------
    timing_and_position = [
        'lap',
        'position',
        'positionOrder',
        'track_position',
        'cumulative_milliseconds',
        'seconds_from_start',
        'GapToLeader_ms',
        'IntervalToPositionAhead_ms',
    ]
    car_state = ['tire_compound', 'tire_age', 'fuel_load', 'is_pit_lap']
    environmental_conditions = ['TrackTemp', 'AirTemp', 'Humidity', 'TrackStatus']

    # Concatenation builds fresh lists on every call, so callers may modify
    # the returned lists without affecting later calls.
    static_features = (
        driver_characteristics
        + constructor_information
        + circuit_characteristics
        + historical_performance
        + race_start_configuration
    )
    dynamic_features = timing_and_position + car_state + environmental_conditions

    return static_features, dynamic_features, 'milliseconds'

def encode_categorical_features(train_df, test_df, static_features, dynamic_features):
    """One-hot encode the known categorical columns of both splits.

    For each of ``nationality``, ``country``, ``status`` and
    ``constructor_nationality`` that is present in ``train_df``, a
    ``OneHotEncoder`` is fitted on the training column only and then applied
    to the test column. The original column is replaced in both frames by the
    generated one-hot columns.

    Args:
        train_df: Training frame; encoders are fitted on this frame.
        test_df: Test frame; must contain every encoded column that
            ``train_df`` contains.
        static_features: Names of static feature columns.
        dynamic_features: Names of dynamic feature columns.

    Returns:
        tuple: ``(train_df, test_df, static_features, dynamic_features,
        encoders)``. The two feature lists are new lists in which each encoded
        column name has been removed and its one-hot column names appended at
        the end. ``encoders`` is ``{'onehot_encoders': {column: encoder}}``.
    """
    # Work on copies so the caller's lists are never modified as a side
    # effect; only the returned lists reflect the encoding.
    static_features = list(static_features)
    dynamic_features = list(dynamic_features)

    encoders = {'onehot_encoders': {}}
    onehot_encode_features = ['nationality', 'country', 'status', 'constructor_nationality']

    for feature in onehot_encode_features:
        if feature not in train_df.columns:
            continue

        # handle_unknown='ignore' lets transform() accept categories in the
        # test split that were not seen when fitting on the training split.
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        train_encoded = encoder.fit_transform(train_df[[feature]])
        feature_names = encoder.get_feature_names_out([feature])
        test_encoded = encoder.transform(test_df[[feature]])

        # Reuse each frame's index so the column-wise concat lines up row by row.
        train_encoded_df = pd.DataFrame(train_encoded, columns=feature_names, index=train_df.index)
        test_encoded_df = pd.DataFrame(test_encoded, columns=feature_names, index=test_df.index)

        train_df = pd.concat([train_df.drop(feature, axis=1), train_encoded_df], axis=1)
        test_df = pd.concat([test_df.drop(feature, axis=1), test_encoded_df], axis=1)

        # Swap the raw column name for its one-hot columns in whichever group
        # listed it; columns listed in neither group are encoded but untracked.
        if feature in static_features:
            static_features.remove(feature)
            static_features.extend(feature_names)
        elif feature in dynamic_features:
            dynamic_features.remove(feature)
            dynamic_features.extend(feature_names)

        encoders['onehot_encoders'][feature] = encoder

    return train_df, test_df, static_features, dynamic_features, encoders

def split_data(df, test_size=0.2, random_state=42):
    """Partition ``df`` into train and test frames without splitting a race.

    Rows are grouped by the ``raceId`` column, so every row sharing a
    ``raceId`` ends up in the same partition.

    Args:
        df: Frame containing a ``raceId`` column.
        test_size: Passed to ``GroupShuffleSplit`` as ``test_size``.
        random_state: Seed passed to ``GroupShuffleSplit``.

    Returns:
        tuple: ``(train_df, test_df)`` selected positionally from ``df``.
    """
    group_splitter = GroupShuffleSplit(
        n_splits=1,
        test_size=test_size,
        random_state=random_state,
    )
    # n_splits=1, so the first (and only) yielded pair is the split we want.
    train_rows, test_rows = next(group_splitter.split(df, groups=df['raceId']))
    train_part = df.iloc[train_rows]
    test_part = df.iloc[test_rows]
    return train_part, test_part

def scale_features(train_data, test_data, scaler_type='standard'):
    """Fit a scaler on the training data and apply it to both splits.

    Args:
        train_data: Data the scaler is fitted on.
        test_data: Data transformed with the already-fitted scaler.
        scaler_type: ``'standard'`` selects ``StandardScaler``; any other
            value selects ``MinMaxScaler``.

    Returns:
        tuple: ``(scaled_train, scaled_test, scaler)``.
    """
    if scaler_type == 'standard':
        fitted_scaler = StandardScaler()
    else:
        fitted_scaler = MinMaxScaler()

    # Statistics come from the training data only; the test data is merely
    # transformed with them.
    train_scaled = fitted_scaler.fit_transform(train_data)
    test_scaled = fitted_scaler.transform(test_data)
    return train_scaled, test_scaled, fitted_scaler

def prepare_race_data(df, static_features, dynamic_features):
    """Slice ``df`` into model inputs, target and identifying metadata.

    Args:
        df: Frame containing the feature columns plus ``milliseconds``,
            ``raceId``, ``driverId`` and ``lap``.
        static_features: Static feature column names.
        dynamic_features: Dynamic feature column names.

    Returns:
        tuple: ``(X, y, metadata)`` where ``X`` holds the static columns
        followed by the dynamic columns, ``y`` is the ``milliseconds`` column
        and ``metadata`` holds ``raceId``, ``driverId`` and ``lap``.
    """
    # Race, driver and lap identifiers are kept alongside the features so
    # sequences can be rebuilt later.
    identifier_columns = ['raceId', 'driverId', 'lap']
    metadata = df[identifier_columns]

    y = df['milliseconds']

    input_columns = static_features + dynamic_features
    X = df[input_columns]

    return X, y, metadata

def save_processed_data(data, filename):
    """Pickle ``data`` into ``filename``, replacing any existing file.

    Args:
        data: Any picklable object.
        filename: Path of the file to write (opened in binary write mode).
    """
    with open(filename, mode='wb') as out_file:
        pickle.Pickler(out_file).dump(data)

# Script entry point: build the processed dataset and pickle it.
# The split happens before encoding and scaling, so the encoders and the
# scaler are fitted on the training split only and merely applied to the
# test split.
if __name__ == "__main__":
    # Load and preprocess data
    df = load_and_preprocess_data()
    
    # Get feature groups (column-name lists plus the target column name)
    static_features, dynamic_features, target = organize_features(df)
    
    # Split data (keeping races together: rows are grouped by raceId)
    train_df, test_df = split_data(df)
    
    # Reset indices to ensure alignment: the encoding step builds its one-hot
    # frames on each split's index and concatenates column-wise, so both
    # splits get a clean 0..n-1 index first
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)
    
    # Encode categorical features; the feature lists are rebound to the
    # returned lists, which name the generated one-hot columns
    train_df, test_df, static_features, dynamic_features, encoders = encode_categorical_features(
        train_df, test_df, static_features, dynamic_features
    )
    
    # Prepare data while maintaining temporal structure: features, target and
    # (raceId, driverId, lap) metadata are kept as separate, row-aligned pieces
    train_X, train_y, train_metadata = prepare_race_data(train_df, static_features, dynamic_features)
    test_X, test_y, test_metadata = prepare_race_data(test_df, static_features, dynamic_features)

    # Scale features (default scaler_type='standard'); targets are left unscaled
    scaled_train_X, scaled_test_X, feature_scaler = scale_features(train_X, test_X)
    
    # Create final processed dataset dictionary
    processed_data = {
        'train': {
            'features': scaled_train_X,
            'targets': train_y,
            'metadata': train_metadata
        },
        'test': {
            'features': scaled_test_X,
            'targets': test_y,
            'metadata': test_metadata
        },
        # Column order of the scaled feature arrays: static names first,
        # then dynamic names
        'feature_info': {
            'static_features': static_features,
            'dynamic_features': dynamic_features
        },
        # Despite the plural key, this is the single fitted feature scaler
        'scalers': feature_scaler,
        'encoders': encoders
    }
    
    # Save processed data (pickle file in the current working directory)
    save_processed_data(processed_data, 'processed_race_data.pkl')


In [None]:
import pickle
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
import logging

# Configure logging: INFO level and above, each line formatted as
# "<YYYY-MM-DD HH:MM:SS> - <LEVEL> - <message>"
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

def load_processed_data(filepath):
    """Unpickle and return the object stored at ``filepath``.

    Progress is logged at INFO level. Any exception raised while opening or
    unpickling the file is logged at ERROR level and then re-raised.

    NOTE: unpickling can execute arbitrary code, so only point this at files
    from a trusted source.
    """
    logging.info(f"Attempting to load data from {filepath}")
    try:
        with open(filepath, mode='rb') as source:
            payload = pickle.Unpickler(source).load()
    except Exception as err:
        logging.error(f"Error loading data: {str(err)}")
        raise
    logging.info("Data loaded successfully")
    return payload

def prepare_data_for_profiling(features, targets, metadata, feature_info):
    """Combine features, targets, and metadata into a single DataFrame for profiling.

    Args:
        features: 2-D feature data whose columns are ordered as the static
            feature names followed by the dynamic feature names.
        targets: Target values, one per feature row (must expose ``.shape``).
        metadata: DataFrame of identifying columns, one row per feature row.
            A ``lap`` column, if present, is dropped.
        feature_info: Mapping with ``'static_features'`` and
            ``'dynamic_features'`` lists of column names.

    Returns:
        pandas.DataFrame: feature columns, then the remaining metadata
        columns, then a ``target_milliseconds`` column, on a fresh 0..n-1
        index.

    Raises:
        ValueError: If ``features``, ``targets`` and ``metadata`` do not all
            have the same number of rows.
    """
    logging.info("Starting data preparation for profiling")

    try:
        # Log shapes before processing
        logging.info(f"Features shape: {features.shape}")
        logging.info(f"Targets shape: {targets.shape}")
        logging.info(f"Metadata shape: {metadata.shape}")

        # A column-wise concat pads shorter inputs with NaN instead of
        # failing, which would silently corrupt the profile; reject
        # mismatched inputs up front.
        row_count = len(features)
        if len(targets) != row_count or len(metadata) != row_count:
            raise ValueError(
                "Row count mismatch: "
                f"features={row_count}, targets={len(targets)}, metadata={len(metadata)}"
            )

        static_features = feature_info['static_features']
        dynamic_features = feature_info['dynamic_features']
        all_features = list(static_features) + list(dynamic_features)

        logging.info(f"Number of features: Static={len(static_features)}, Dynamic={len(dynamic_features)}")

        # Create feature DataFrame with reset index
        feature_df = pd.DataFrame(features, columns=all_features).reset_index(drop=True)
        logging.info(f"Feature DataFrame created with shape: {feature_df.shape}")

        # Reset metadata index and remove the lap column; errors='ignore'
        # keeps this working for metadata that carries no lap column.
        metadata_df = metadata.reset_index(drop=True).drop(columns='lap', errors='ignore')

        # Create targets series with matching index
        targets_series = pd.Series(targets, name='target_milliseconds').reset_index(drop=True)

        # Combine data with aligned indices
        df = pd.concat([
            feature_df,
            metadata_df,
            targets_series
        ], axis=1)

        logging.info(f"Final combined DataFrame shape: {df.shape}")
        logging.info(f"Final columns: {df.columns.tolist()}")

        return df

    except Exception as e:
        logging.error(f"Error in data preparation: {str(e)}")
        raise

def generate_profile_reports(data):
    """Build and save HTML profile reports for the train and test splits.

    Args:
        data: Mapping with ``'train'`` and ``'test'`` entries (each holding
            ``'features'``, ``'targets'`` and ``'metadata'``) plus a
            ``'feature_info'`` entry.

    The reports are written to ``train_profile.html`` and
    ``test_profile.html`` in the current working directory. Any exception is
    logged at ERROR level and re-raised.
    """
    logging.info("Starting profile report generation")

    def _frame_for(split_name):
        # Assemble the profiling frame for one split of ``data``.
        split = data[split_name]
        return prepare_data_for_profiling(
            split['features'],
            split['targets'],
            split['metadata'],
            data['feature_info'],
        )

    try:
        # Both frames are prepared before any report is built.
        logging.info("Preparing training dataset")
        train_df = _frame_for('train')

        logging.info("Preparing test dataset")
        test_df = _frame_for('test')

        logging.info("Generating training dataset profile")
        train_profile = ProfileReport(train_df, title="Training Dataset Profile")

        logging.info("Generating test dataset profile")
        test_profile = ProfileReport(test_df, title="Test Dataset Profile")

        logging.info("Saving profile reports to files")
        outputs = (
            (train_profile, "train_profile.html"),
            (test_profile, "test_profile.html"),
        )
        for report, destination in outputs:
            report.to_file(destination)

    except Exception as err:
        logging.error(f"Error in profile generation: {str(err)}")
        raise

def main():
    """Load the processed dataset, write profile reports and print a summary.

    Reads ``processed_race_data.pkl`` from the current working directory.
    Any exception is logged at ERROR level and re-raised.
    """
    logging.info("Starting main execution")

    try:
        data = load_processed_data('processed_race_data.pkl')

        generate_profile_reports(data)
        logging.info("Profile reports generated successfully")

        # Basic dataset information, printed one item at a time so a missing
        # key fails at the same point in the output as before.
        print("\nDataset Information:")

        train_shape = data['train']['features'].shape
        print(f"Training Features Shape: {train_shape}")

        test_shape = data['test']['features'].shape
        print(f"Test Features Shape: {test_shape}")

        static_count = len(data['feature_info']['static_features'])
        print(f"\nNumber of static features: {static_count}")

        dynamic_count = len(data['feature_info']['dynamic_features'])
        print(f"Number of dynamic features: {dynamic_count}")

        metadata_columns = ', '.join(data['train']['metadata'].columns)
        print(f"\nMetadata columns: {metadata_columns}")

    except Exception as err:
        logging.error(f"Main execution failed: {str(err)}")
        raise

# Run the profiling workflow only when this code is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()