# In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def preprocess_data(data_path, output_path='../data/preprocessed_data.csv'):
    """Load raw driving data, derive features, score and categorise each row.

    Steps, in order:

    1. Read the CSV at ``data_path`` and drop every row containing a missing
       value.
    2. Derive ``Jerk`` as the row-to-row change in ``Acceleration(m/s^2)``
       divided by the row-to-row change in ``Time_Step``.
    3. Standardise speed, acceleration, heading change and jerk with
       ``StandardScaler`` fitted on this same data set, storing the results
       in ``*_Scaled`` columns.
    4. Combine the scaled columns into ``Driving_Score`` using fixed weights
       (0.4 speed, 0.3 acceleration, 0.2 heading change, 0.1 jerk).
    5. Bin the score into ``Driver_Category`` with edges at -0.5, 0.5 and
       1.5 (each bin includes its upper edge, the ``pd.cut`` default).
    6. Write the result to ``output_path`` as CSV without the index.

    Args:
        data_path: Path (or anything ``pd.read_csv`` accepts) of the raw CSV.
            It must contain the columns ``Time_Step``, ``Speed(km/h)``,
            ``Acceleration(m/s^2)`` and ``Heading_Change(degrees)``.
        output_path: Where to write the preprocessed CSV. Defaults to the
            location the function has always written to. Pass ``None`` to
            skip writing and only return the frame.

    Returns:
        The preprocessed ``DataFrame`` including the added ``Jerk``,
        ``*_Scaled``, ``Driving_Score`` and ``Driver_Category`` columns.

    Raises:
        KeyError: If any required input column is absent.
        ValueError: If no rows remain after dropping missing values.
    """
    feature_columns = [
        'Speed(km/h)',
        'Acceleration(m/s^2)',
        'Heading_Change(degrees)',
    ]
    required_columns = ['Time_Step'] + feature_columns

    # Load data
    data = pd.read_csv(data_path)

    # Fail early, naming every absent column, instead of surfacing a bare
    # KeyError for whichever column happens to be accessed first.
    missing_columns = [col for col in required_columns if col not in data.columns]
    if missing_columns:
        raise KeyError(
            "Input data is missing required column(s): "
            + ", ".join(missing_columns)
        )

    # Handle missing values
    data = data.dropna()
    if data.empty:
        # The scaler cannot be fitted on zero rows; report the real cause
        # here rather than letting it fail deeper in the pipeline.
        raise ValueError(
            "No rows left to preprocess after dropping missing values"
        )

    # Calculate Jerk (rate of change of acceleration)
    jerk = data['Acceleration(m/s^2)'].diff() / data['Time_Step'].diff()
    # The first row has no predecessor (NaN), and a repeated Time_Step gives
    # a zero denominator (NaN for 0/0, +/-inf otherwise). Infinite values
    # would make the scaler reject the input, so all of these become 0.
    data['Jerk'] = jerk.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Feature Scaling
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(data[feature_columns + ['Jerk']])
    data[['Speed_Scaled', 'Acceleration_Scaled', 'Heading_Change_Scaled', 'Jerk_Scaled']] = scaled_features

    # Calculate Driving Score
    data['Driving_Score'] = (
        0.4 * data['Speed_Scaled'] +
        0.3 * data['Acceleration_Scaled'] +
        0.2 * data['Heading_Change_Scaled'] +
        0.1 * data['Jerk_Scaled']
    )

    # Classify based on driving score
    data['Driver_Category'] = pd.cut(
        data['Driving_Score'],
        bins=[-np.inf, -0.5, 0.5, 1.5, np.inf],
        labels=['Safe', 'Moderate', 'Aggressive', 'Very Aggressive']
    )

    # Save preprocessed data
    if output_path is not None:
        data.to_csv(output_path, index=False)

    return data

if __name__ == "__main__":
    # When executed as a script, run the preprocessing pipeline on the
    # default raw data file.
    preprocess_data('../data/driving_data.csv')
