In [1]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GroupShuffleSplit
from common.data_preparation import load_and_preprocess_data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pickle

def organize_features(df):
    """Organize features into static and dynamic groups based on simulation requirements."""
    static_features = [
        # Driver characteristics
        'driver_overall_skill',
        'driver_circuit_skill',
        'driver_consistency',
        'driver_aggression',
        'driver_reliability',
        'driver_risk_taking',
        'driver_adaptability',
        
        # Constructor information
        'constructorId',
        'constructor_performance',
        'constructor_nationality',
        'constructor_position',
        
        # Circuit characteristics
        'circuit_type_encoded',
        'alt',
        
        # Historical performance
        'fp1_median_time',
        'fp2_median_time',
        'fp3_median_time',
        'quali_time',
        
        # Race start configuration
        'grid'
    ]
    
    dynamic_features = [
        # Timing and position
        'lap',
        'position',
        'cumulative_milliseconds',
        'seconds_from_start',
        'GapToLeader_ms',
        'IntervalToPositionAhead_ms',
        
        # Car state
        'tire_compound',
        'tire_age',
        'fuel_load',
        'is_pit_lap',
        
        # Environmental conditions
        'TrackTemp',
        'AirTemp',
        'Humidity',
        'TrackStatus'
    ]
    
    target = 'milliseconds'
    
    return static_features, dynamic_features, target

def encode_categorical_features(train_df, test_df, static_features, dynamic_features):
    """Encode categorical features using one-hot encoding."""
    encoders = {'onehot_encoders': {}}
    onehot_encode_features = ['nationality', 'country', 'status', 'constructor_nationality', 'TrackStatus', 'date', 'code' , 'round', 'circuitId', 'circuit_type_encoded', 'circuit_type' 'constructorId']
    
    for feature in onehot_encode_features:
        if feature not in train_df.columns:
            continue
            
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        train_encoded = encoder.fit_transform(train_df[[feature]])
        feature_names = encoder.get_feature_names_out([feature])
        test_encoded = encoder.transform(test_df[[feature]])
        
        train_encoded_df = pd.DataFrame(train_encoded, columns=feature_names, index=train_df.index)
        test_encoded_df = pd.DataFrame(test_encoded, columns=feature_names, index=test_df.index)
        
        train_df = pd.concat([train_df.drop(feature, axis=1), train_encoded_df], axis=1)
        test_df = pd.concat([test_df.drop(feature, axis=1), test_encoded_df], axis=1)
        
        if feature in static_features:
            static_features.remove(feature)
            static_features.extend(feature_names)
        elif feature in dynamic_features:
            dynamic_features.remove(feature)
            dynamic_features.extend(feature_names)
            
        encoders['onehot_encoders'][feature] = encoder
    
    return train_df, test_df, static_features, dynamic_features, encoders

def split_data(df, test_size=0.2, random_state=42):
    """Split data while keeping races together."""
    splitter = GroupShuffleSplit(test_size=test_size, n_splits=1, random_state=random_state)
    split = splitter.split(df, groups=df['raceId'])
    train_idx, test_idx = next(split)
    return df.iloc[train_idx], df.iloc[test_idx]

def scale_features(train_data, test_data, scaler_type='standard'):
    """Scale features using specified scaler."""
    scaler = StandardScaler() if scaler_type == 'standard' else MinMaxScaler()
    scaled_train = scaler.fit_transform(train_data)
    scaled_test = scaler.transform(test_data)
    return scaled_train, scaled_test, scaler

def prepare_race_data(df, static_features, dynamic_features):
    """Prepare data while maintaining race and temporal structure."""
    features = static_features + dynamic_features
    X = df[features]
    y = df['milliseconds']
    
    # Keep race and driver information for later sequence creation
    metadata = df[['raceId', 'driverId', 'lap']]
    
    return X, y, metadata

def save_processed_data(data, filename):
    """Save processed data to file."""
    with open(filename, 'wb') as f:
        pickle.dump(data, f)

if __name__ == "__main__":
    # Load and preprocess data
    df = load_and_preprocess_data()
    
    # Get feature groups
    static_features, dynamic_features, target = organize_features(df)
    
    # Split data (keeping races together)
    train_df, test_df = split_data(df)
    
    # Reset indices to ensure alignment
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)
    
    # Encode categorical features
    train_df, test_df, static_features, dynamic_features, encoders = encode_categorical_features(
        train_df, test_df, static_features, dynamic_features
    )
    
    # Prepare data while maintaining temporal structure
    train_X, train_y, train_metadata = prepare_race_data(train_df, static_features, dynamic_features)
    test_X, test_y, test_metadata = prepare_race_data(test_df, static_features, dynamic_features)

    # Scale features
    scaled_train_X, scaled_test_X, feature_scaler = scale_features(train_X, test_X)
    
    # Scale targets
    target_scaler = StandardScaler()
    scaled_train_y = target_scaler.fit_transform(train_y.values.reshape(-1, 1)).ravel()
    scaled_test_y = target_scaler.transform(test_y.values.reshape(-1, 1)).ravel()
    
    # Create final processed dataset dictionary
    processed_data = {
        'train': {
            'features': scaled_train_X,
            'targets': scaled_train_y,  # Now using scaled targets
            'metadata': train_metadata
        },
        'test': {
            'features': scaled_test_X,
            'targets': scaled_test_y,   # Now using scaled targets
            'metadata': test_metadata
        },
        'feature_info': {
            'static_features': static_features,
            'dynamic_features': dynamic_features
        },
        'scalers': {
            'feature_scaler': feature_scaler,
            'target_scaler': target_scaler  # Save target scaler for inverse transform
        },
        'encoders': encoders
    }
    
    # Save processed data
    save_processed_data(processed_data, 'processed_race_data.pkl')
 

/Users/I551659/Documents/GitHub/IE650-RAMP/ie500-data-mining-group7/race_simulation_clean/notebooks
Index(['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid',
       'position', 'positionText', 'positionOrder', 'points', 'laps', 'time',
       'racetime_milliseconds', 'fastestLap', 'rank', 'fastestLapTime',
       'fastestLapSpeed', 'statusId'],
      dtype='object')





Initial data sizes:
Lap times: (586171, 6)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pa


After initial merge: (586171, 46) - All lap data merged
After year filtering (>=2018): (159538, 46)
After adding constructor info: (159538, 52)
After adding circuit info: (159538, 55)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFr

After adding weather info: (159538, 60)

Fetching weather for 92 races: [989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1069, 1070, 1071, 1072, 1073, 1096, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129]
No weather data available for race 992
No weather data available for race 1005
No weather data available for race 1013
No weather data available for race 1023
No weather data available for race 1026
No weather data available for race 1038
No weather data available for race 1057


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try usi

After adding tire info: (159538, 66)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try usi

After adding practice info: (159538, 71)
After cleaning time intervals: (159538, 71)
Calculating enhanced driver metrics...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try usi

After enhancing driver attributes: (159538, 78)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




After adding dynamic features: (159538, 80)

Calculating race intervals...
After calculating race intervals: (159538, 80)

Before outlier removal:

Before outlier removal:
Unique races: 146
Unique drivers per race: count    146.000000
mean      19.500000
std        0.888625
min       16.000000
25%       19.000000
50%       20.000000
75%       20.000000
max       20.000000
Name: driverId, dtype: float64

After removing outliers: (156890, 80)
After dropping unnecessary columns: (156890, 49)

Removed Columns:
[]
After removing duplicate columns: (156890, 49)

Before handling missing values:
Unique races: 145
Unique drivers per race: count    145.000000
mean      19.475862
std        0.905866
min       16.000000
25%       19.000000
50%       20.000000
75%       20.000000
max       20.000000
Name: driverId, dtype: float64

Initial shape: (156890, 49)
Initial unique races: 145
Initial unique drivers: 39

Columns with missing values:

Column: driver_aggression
- Missing values: 1656 (1.06%)
-

In [2]:
import pickle
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

def load_processed_data(filepath):
    """Load the processed race data from pickle file."""
    logging.info(f"Attempting to load data from {filepath}")
    try:
        with open(filepath, 'rb') as f:
            data = pickle.load(f)
        logging.info("Data loaded successfully")
        return data
    except Exception as e:
        logging.error(f"Error loading data: {str(e)}")
        raise

def prepare_data_for_profiling(features, targets, metadata, feature_info):
    """Combine features, targets, and metadata into a single DataFrame for profiling."""
    logging.info("Starting data preparation for profiling")
    
    try:
        # Log shapes before processing
        logging.info(f"Features shape: {features.shape}")
        logging.info(f"Targets shape: {targets.shape}")
        logging.info(f"Metadata shape: {metadata.shape}")
        
        static_features = feature_info['static_features']
        dynamic_features = feature_info['dynamic_features']
        all_features = static_features + dynamic_features
        
        logging.info(f"Number of features: Static={len(static_features)}, Dynamic={len(dynamic_features)}")
        
        # Create feature DataFrame with reset index
        feature_df = pd.DataFrame(features, columns=all_features).reset_index(drop=True)
        logging.info(f"Feature DataFrame created with shape: {feature_df.shape}")
        
        # Reset metadata index and remove duplicate lap column
        metadata_df = metadata.reset_index(drop=True).drop('lap', axis=1)
        
        # Create targets series with matching index
        targets_series = pd.Series(targets, name='target_milliseconds').reset_index(drop=True)
        
        # Combine data with aligned indices
        df = pd.concat([
            feature_df,
            metadata_df,
            targets_series
        ], axis=1)
        
        logging.info(f"Final combined DataFrame shape: {df.shape}")
        logging.info(f"Final columns: {df.columns.tolist()}")
        
        return df
        
    except Exception as e:
        logging.error(f"Error in data preparation: {str(e)}")
        raise

def generate_profile_reports(data):
    """Generate profile reports for both training and test datasets."""
    logging.info("Starting profile report generation")
    
    try:
        # Prepare training data
        logging.info("Preparing training dataset")
        train_df = prepare_data_for_profiling(
            data['train']['features'],
            data['train']['targets'],
            data['train']['metadata'],
            data['feature_info']
        )
        
        # Prepare test data
        logging.info("Preparing test dataset")
        test_df = prepare_data_for_profiling(
            data['test']['features'],
            data['test']['targets'],
            data['test']['metadata'],
            data['feature_info']
        )
        
        # Generate reports
        logging.info("Generating training dataset profile")
        train_profile = ProfileReport(train_df, title="Training Dataset Profile")
        
        logging.info("Generating test dataset profile")
        test_profile = ProfileReport(test_df, title="Test Dataset Profile")
        
        # Save reports
        logging.info("Saving profile reports to files")
        train_profile.to_file("train_profile.html")
        test_profile.to_file("test_profile.html")
        
    except Exception as e:
        logging.error(f"Error in profile generation: {str(e)}")
        raise

def main():
    logging.info("Starting main execution")
    
    try:
        data = load_processed_data('processed_race_data.pkl')
        
        generate_profile_reports(data)
        logging.info("Profile reports generated successfully")
        
        # Print basic dataset information
        print("\nDataset Information:")
        print(f"Training Features Shape: {data['train']['features'].shape}")
        print(f"Test Features Shape: {data['test']['features'].shape}")
        print(f"\nNumber of static features: {len(data['feature_info']['static_features'])}")
        print(f"Number of dynamic features: {len(data['feature_info']['dynamic_features'])}")
        print(f"\nMetadata columns: {', '.join(data['train']['metadata'].columns)}")
        
    except Exception as e:
        logging.error(f"Main execution failed: {str(e)}")
        raise

if __name__ == "__main__":
    main()

INFO:visions.backends:Pandas backend loaded 2.2.3
INFO:visions.backends:Numpy backend loaded 2.0.2
INFO:visions.backends:Pyspark backend NOT loaded
INFO:visions.backends:Python backend loaded
INFO:root:Starting main execution
INFO:root:Attempting to load data from processed_race_data.pkl
INFO:root:Data loaded successfully
INFO:root:Starting profile report generation
INFO:root:Preparing training dataset
INFO:root:Starting data preparation for profiling
INFO:root:Features shape: (123257, 80)
INFO:root:Targets shape: (123257,)
INFO:root:Metadata shape: (123257, 3)
INFO:root:Number of features: Static=27, Dynamic=53
INFO:root:Feature DataFrame created with shape: (123257, 80)
INFO:root:Final combined DataFrame shape: (123257, 83)
INFO:root:Final columns: ['driver_overall_skill', 'driver_circuit_skill', 'driver_consistency', 'driver_aggression', 'driver_reliability', 'driver_risk_taking', 'driver_adaptability', 'constructorId', 'constructor_performance', 'constructor_position', 'alt', 'fp1_

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:root:Profile reports generated successfully



Dataset Information:
Training Features Shape: (123257, 80)
Test Features Shape: (31069, 80)

Number of static features: 27
Number of dynamic features: 53

Metadata columns: raceId, driverId, lap


In [10]:
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_fp3_baseline(processed_data_path='processed_race_data.pkl'):
    """
    Evaluate a baseline model that uses FP3 median times as predictions,
    ensuring we use the correct FP3 time for each driver-race combination.
    """
    # Load processed data
    with open(processed_data_path, 'rb') as f:
        processed_data = pickle.load(f)
    
    # Get test data metadata and features
    test_metadata = processed_data['test']['metadata']
    test_features = processed_data['test']['features']
    test_actual_times = processed_data['test']['targets']
    
    # Get the feature index for fp3_median_time
    static_features = processed_data['feature_info']['static_features']
    fp3_index = static_features.index('fp3_median_time')
    
    # Create a DataFrame combining metadata, FP3 times, and actual times
    test_df = pd.DataFrame({
        'raceId': test_metadata['raceId'],
        'driverId': test_metadata['driverId'],
        'lap': test_metadata['lap'],
        'fp3_scaled': test_features[:, fp3_index],
        'actual_scaled': test_actual_times
    })
    
    # Get the scalers
    feature_scaler = processed_data['scalers']['feature_scaler']
    target_scaler = processed_data['scalers']['target_scaler']
    
    # Inverse transform the scaled values
    zeros = np.zeros_like(test_features)
    zeros[:, fp3_index] = test_df['fp3_scaled']
    test_df['fp3_time'] = feature_scaler.inverse_transform(zeros)[:, fp3_index]
    test_df['actual_time'] = target_scaler.inverse_transform(test_df['actual_scaled'].values.reshape(-1, 1)).ravel()
    
    # Group by race and driver to verify we're using consistent FP3 times
    grouped_predictions = test_df.groupby(['raceId', 'driverId']).agg({
        'fp3_time': 'first',  # Each driver should have same FP3 time for the race
        'actual_time': list
    }).reset_index()
    
    # Expand the predictions to match actual times
    all_predictions = []
    all_actuals = []
    for _, row in grouped_predictions.iterrows():
        predictions = [row['fp3_time']] * len(row['actual_time'])
        all_predictions.extend(predictions)
        all_actuals.extend(row['actual_time'])
    
    all_predictions = np.array(all_predictions)
    all_actuals = np.array(all_actuals)
    
    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(all_actuals, all_predictions))
    r2 = r2_score(all_actuals, all_predictions)
    mae = np.mean(np.abs(all_actuals - all_predictions))
    
    # Create visualization
    plt.figure(figsize=(10, 8))
    sns.scatterplot(x=all_actuals, y=all_predictions, alpha=0.5)
    
    # Add perfect prediction line
    min_val = min(all_actuals.min(), all_predictions.min())
    max_val = max(all_actuals.max(), all_predictions.max())
    plt.plot([min_val, max_val], [min_val, max_val], 'r--', label='Perfect Prediction')
    
    plt.xlabel('Actual Lap Time (ms)')
    plt.ylabel('FP3 Median Time (ms)')
    plt.title(f'FP3 Baseline Model Performance\nR² = {r2:.4f}, RMSE = {rmse:.2f} ms')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    
    # Print verification of consistency
    print("\nVerification of FP3 times consistency:")
    fp3_consistency = test_df.groupby(['raceId', 'driverId'])['fp3_time'].nunique()
    print(f"Number of unique FP3 times per driver-race combination: {fp3_consistency.value_counts().to_dict()}")
    
    print("\nBaseline Model Performance Metrics:")
    print(f"R² Score: {r2:.4f}")
    print(f"RMSE: {rmse:.2f} ms")
    print(f"MAE: {mae:.2f} ms")
    
    # Additional statistics
    mean_error = np.mean(all_predictions - all_actuals)
    error_std = np.std(all_predictions - all_actuals)
    print(f"Mean Error: {mean_error:.2f} ms")
    print(f"Error Std Dev: {error_std:.2f} ms")
    
    # Return metrics dictionary
    return {
        'r2': r2,
        'rmse': rmse,
        'mae': mae,
        'mean_error': mean_error,
        'error_std': error_std,
        'actual_times': all_actuals,
        'predicted_times': all_predictions,
        'predictions_df': test_df
    }

if __name__ == "__main__":
    metrics = evaluate_fp3_baseline()
    plt.show()


Verification of FP3 times consistency:
Number of unique FP3 times per driver-race combination: {1: 558}

Baseline Model Performance Metrics:
R² Score: -0.2633
RMSE: 15928.62 ms
MAE: 13221.61 ms
Mean Error: 8525.08 ms
Error Std Dev: 13455.25 ms




In [None]:
import pickle
import pandas as pd
import numpy as np

def convert_preprocessed_to_csv(pickle_path='processed_race_data.pkl', output_path='race_data.csv'):
    """
    Load preprocessed pickle data, rescale it, and save as CSV.
    
    Args:
        pickle_path (str): Path to the preprocessed pickle file
        output_path (str): Path where the CSV should be saved
    """
    # Load the preprocessed data
    with open(pickle_path, 'rb') as f:
        processed_data = pickle.load(f)
    
    # Get scalers and feature information
    feature_scaler = processed_data['scalers']['feature_scaler']
    target_scaler = processed_data['scalers']['target_scaler']
    static_features = processed_data['feature_info']['static_features']
    dynamic_features = processed_data['feature_info']['dynamic_features']
    
    # Function to create DataFrame for a dataset split
    def create_dataframe(data_split):
        # Inverse transform features and targets
        features_unscaled = feature_scaler.inverse_transform(data_split['features'])
        targets_unscaled = target_scaler.inverse_transform(data_split['targets'].reshape(-1, 1))
        
        # Create DataFrame with features
        df = pd.DataFrame(
            features_unscaled,
            columns=static_features + dynamic_features
        )
        
        # Add target
        df['milliseconds'] = targets_unscaled
        
        # Add metadata
        for col in data_split['metadata'].columns:
            df[col] = data_split['metadata'][col]
            
        return df
    
    # Create DataFrames for train and test sets
    train_df = create_dataframe(processed_data['train'])
    test_df = create_dataframe(processed_data['test'])
    
    # Add a column to identify train/test split
    train_df['dataset'] = 'train'
    test_df['dataset'] = 'test'
    
    # Combine train and test sets
    full_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
    
    # Save to CSV
    full_df.to_csv(output_path, index=False)
    
    print(f"Data saved to {output_path}")
    print("\nDataset shape:", full_df.shape)
    print("\nColumns:", full_df.columns.tolist())
    
    # Display some basic statistics
    print("\nSample of numerical statistics:")
    print(full_df.describe().round(2).head())
    
    return full_df

if __name__ == "__main__":
    df = convert_preprocessed_to_csv()