In [None]:
# ml_model/data_preprocessing.py

import pandas as pd
import numpy as np

def preprocess_data(df):
    """
    Derive extra features from driving data and label every row with a driver category.

    The frame is modified in place and also returned, so callers may use either
    the return value or the object they passed in.

    Required input columns: 'TripId', 'Time_Step', 'Acceleration(m/s^2)',
    'Speed(km/h)' and 'Heading_Change(degrees)'.

    Columns added:
        Jerk            -- change in acceleration divided by change in Time_Step,
                           computed between consecutive rows of the same TripId.
                           Rows with no predecessor in their trip, or with a zero
                           time delta, get 0.
        Usage_Per_Day   -- the largest Time_Step seen within the row's TripId.
        Driver_Category -- "Safe" (score < 0.3), "Moderate" (0.3 <= score < 0.6)
                           or "Aggressive" (everything else).

    The driving score is 0.4 * speed + 0.3 * |acceleration| + 0.3 * |heading change|,
    where each term is first divided by its largest value over the whole frame.
    The score itself is not stored in the result.

    Args:
        df: pandas DataFrame holding the columns listed above.

    Returns:
        The same DataFrame object, with NaNs replaced by 0 and the new columns added.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    def scale_by_peak(values):
        # Divide a column by its largest value. A column whose peak is 0 (or NaN,
        # e.g. an empty frame) carries no signal, so it contributes 0 rather than
        # producing 0/0 = NaN scores.
        peak = values.max()
        if pd.isna(peak) or peak == 0:
            return values * 0.0
        return values / peak

    # Jerk is differenced inside each trip so that the first sample of a trip is
    # never compared against the last sample of the previous one.
    accel_delta = df.groupby('TripId')['Acceleration(m/s^2)'].diff()
    time_delta = df.groupby('TripId')['Time_Step'].diff()
    jerk = accel_delta / time_delta
    # A repeated timestamp divides by zero; turn +/-inf into NaN so the fill
    # below maps it to 0 instead of leaving infinities in the data.
    df['Jerk'] = jerk.replace([np.inf, -np.inf], np.nan)

    # Largest Time_Step recorded for the trip each row belongs to.
    df['Usage_Per_Day'] = df.groupby('TripId')['Time_Step'].transform('max')

    # Replace every remaining NaN (including the first Jerk of each trip) with 0.
    df.fillna(0, inplace=True)

    # Score whole columns at once: normalisation must use the column-wide peak,
    # not the value of the single row being scored.
    driving_score = (
        0.4 * scale_by_peak(df['Speed(km/h)'])
        + 0.3 * scale_by_peak(df['Acceleration(m/s^2)'].abs())
        + 0.3 * scale_by_peak(df['Heading_Change(degrees)'].abs())
    )

    # Conditions are evaluated in order, so the second one only applies to
    # scores that are already >= 0.3.
    df['Driver_Category'] = np.select(
        [driving_score < 0.3, driving_score < 0.6],
        ["Safe", "Moderate"],
        default="Aggressive",
    )

    return df

if __name__ == "__main__":
    # Script entry point: read the raw CSV, run the preprocessing step and
    # write the labelled result next to it (without the index column).
    source_csv = 'ml_model/data/driving_data.csv'
    target_csv = 'ml_model/data/preprocessed_driving_data_with_category.csv'

    data = pd.read_csv(source_csv)
    preprocessed_data = preprocess_data(data)
    preprocessed_data.to_csv(target_csv, index=False)


: 