In [5]:
import os
import pandas as pd
import numpy as np
from math import radians, sin, cos, sqrt, atan2, degrees
from sklearn.preprocessing import MinMaxScaler
from scipy import stats

# Load the driving data
df = pd.read_csv('../data/driving_data.csv')
print("Rows after loading:", len(df))


# Ensure that Latitude, Longitude, and TimeStamp are in the correct format
df['Latitude'] = pd.to_numeric(df['Latitude'], errors='coerce')
df['Longitude'] = pd.to_numeric(df['Longitude'], errors='coerce')
df['TimeStamp'] = pd.to_datetime(df['TimeStamp'].str.replace('EDT', ''), errors='coerce')  # Remove timezone for simplicity

# Drop rows with missing values in important columns
df = df.dropna(subset=['Latitude', 'Longitude', 'TimeStamp'])
print("Rows after dropping NaNs:", len(df))


# Sort the data by TripID and TimeStamp to ensure trips are processed in the correct order
df = df.sort_values(by=['TripID', 'TimeStamp']).reset_index(drop=True)
print("Rows after processing trips:", len(df))

# Function for calculating distance using the Haversine formula
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of Earth in kilometers
    d_lat = radians(lat2 - lat1)
    d_lon = radians(lon2 - lon1)
    lat1 = radians(lat1)
    lat2 = radians(lat2)
    
    a = sin(d_lat / 2)**2 + cos(lat1) * cos(lat2) * sin(d_lon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    
    return R * c * 1000  # Distance in meters

# Processing each trip separately
def process_trip(trip_df):
    trip_df['Time_Diff'] = trip_df['TimeStamp'].diff().dt.total_seconds().fillna(0)

    # Shift latitude and longitude for distance calculation
    trip_df['Lat_Shifted'] = trip_df['Latitude'].shift(1)
    trip_df['Lon_Shifted'] = trip_df['Longitude'].shift(1)

    # Calculate distance between consecutive points using the Haversine formula
    trip_df['Distance(m)'] = trip_df.apply(lambda row: haversine(row['Lat_Shifted'], row['Lon_Shifted'], row['Latitude'], row['Longitude']), axis=1)

    # Calculate speed (m/s) using distance and time difference
    trip_df['Speed(m/s)'] = trip_df['Distance(m)'] / trip_df['Time_Diff'].replace(0, np.nan).fillna(1)

    # Calculate acceleration (m/s²)
    trip_df['Acceleration(m/s^2)'] = trip_df['Speed(m/s)'].diff() / trip_df['Time_Diff'].replace(0, np.nan).fillna(1)

    # Calculate jerk (m/s³)
    trip_df['Jerk(m/s^3)'] = trip_df['Acceleration(m/s^2)'].diff() / trip_df['Time_Diff'].replace(0, np.nan).fillna(1)

    # Calculate braking intensity (absolute value of negative acceleration)
    trip_df['Braking_Intensity'] = trip_df['Acceleration(m/s^2)'].apply(lambda x: abs(x) if x < 0 else 0)

    # Fill missing values after calculations
    trip_df.fillna(0, inplace=True)

    return trip_df

# Apply the processing for each TripID separately while keeping the trip order
df = df.groupby('TripID', group_keys=False).apply(process_trip)

# Ensure the DataFrame is sorted by TripID and TimeStamp after processing
df = df.sort_values(by=['TripID', 'TimeStamp']).reset_index(drop=True)

# Remove outliers using Z-score method
z_scores = np.abs(stats.zscore(df[['Speed(m/s)', 'Acceleration(m/s^2)', 'Jerk(m/s^3)', 'Braking_Intensity']].fillna(0)))
df = df[(z_scores < 3).all(axis=1)]  # Keep data within 3 standard deviations
print("Rows after outlier removal:", len(df))

# Load sensitive locations (school, hospital, etc.)
sensitive_locations = pd.read_csv('../data/sensitive_location.csv')

# Function to calculate SASV (Sensitive Area Speed Violation)
def haversine_vectorized(lat1, lon1, lat2_series, lon2_series):
    R = 6371  # Earth's radius in kilometers
    d_lat = np.radians(lat2_series - lat1)
    d_lon = np.radians(lon2_series - lon1)
    lat1 = np.radians(lat1)
    lat2_series = np.radians(lat2_series)
    a = np.sin(d_lat / 2)**2 + np.cos(lat1) * np.cos(lat2_series) * np.sin(d_lon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c * 1000  # Distance in meters

def calculate_sasv(lat, lon, speed, sensitive_locations, sasv_threshold=8.33, radius_threshold=300):
    sensitive_distances = haversine_vectorized(lat, lon, sensitive_locations['Latitude'], sensitive_locations['Longitude'])
    if np.any(sensitive_distances < radius_threshold):  # Within 300 meters of sensitive areas
        if speed > sasv_threshold:  # Speed > 30 km/h in sensitive area
            return 1
    return 0

# Apply SASV calculation
df['SASV'] = df.apply(lambda row: calculate_sasv(row['Latitude'], row['Longitude'], row['Speed(m/s)'], sensitive_locations), axis=1)

# Calculate rule violation score for exceeding general speed limit
def calculate_speed_violation(row, speed_limit=13.89):  # Default 50 km/h
    if row['Speed(m/s)'] > speed_limit:
        return 1  # Speed violation
    return 0

df['Speed_Violation'] = df.apply(calculate_speed_violation, axis=1)

# ---------- Calculate Heading and Heading Change ---------- #
def calculate_heading(lat1, lon1, lat2, lon2):
    d_lon = lon2 - lon1
    x = sin(radians(d_lon)) * cos(radians(lat2))
    y = cos(radians(lat1)) * sin(radians(lat2)) - sin(radians(lat1)) * cos(radians(lat2)) * cos(radians(d_lon))
    initial_bearing = atan2(x, y)
    initial_bearing = degrees(initial_bearing)
    return (initial_bearing + 360) % 360  # Normalize to 0-360

df['Heading'] = df.groupby('TripID', group_keys=False).apply(
    lambda group: group.apply(
        lambda row: calculate_heading(row['Lat_Shifted'], row['Lon_Shifted'], row['Latitude'], row['Longitude']), axis=1
    )
).reset_index(level=0, drop=True)
df['Heading_Change(degrees)'] = df['Heading'].diff().fillna(0)

# ---------- Driving Score Calculation ---------- #
df['Driving_Score'] = 100

# Save unnormalized values before normalization
df['Speed_Unnormalized(m/s)'] = df['Speed(m/s)']
df['Acceleration_Unnormalized(m/s^2)'] = df['Acceleration(m/s^2)']
df['Jerk_Unnormalized(m/s^3)'] = df['Jerk(m/s^3)']
df['Braking_Intensity_Unnormalized'] = df['Braking_Intensity']
df['Heading_Change_Unnormalized'] = df['Heading_Change(degrees)']

# Normalize the key features using MinMaxScaler
scaler = MinMaxScaler()
df[['Speed(m/s)', 'Acceleration(m/s^2)', 'Jerk(m/s^3)', 'Braking_Intensity', 'Heading_Change(degrees)']] = scaler.fit_transform(
    df[['Speed(m/s)', 'Acceleration(m/s^2)', 'Jerk(m/s^3)', 'Braking_Intensity', 'Heading_Change(degrees)']])

# Apply penalties based on normalized features
df['Driving_Score'] -= df['Speed(m/s)'] * 25  # Speed penalty
df['Driving_Score'] -= df['Acceleration(m/s^2)'] * 20  # Acceleration penalty
df['Driving_Score'] -= df['Jerk(m/s^3)'] * 10  # Jerk penalty
df['Driving_Score'] -= df['Braking_Intensity'] * 5  # Braking intensity penalty
df['Driving_Score'] -= df['Heading_Change(degrees)'] * 5  # Heading change penalty

# Penalty for violations
df['Driving_Score'] -= df['SASV'] * 10  # Penalty for violating sensitive areas
df['Driving_Score'] -= df['Speed_Violation'] * 15  # Penalty for general speed limit violations

# Categorize driving behavior
def categorize_driving(score):
    if score >= 80:
        return 'Safe'
    elif 60 <= score < 80:
        return 'Moderate'
    else:
        return 'Risky'

df['Driving_Category'] = df['Driving_Score'].apply(categorize_driving)

# Save the processed data to CSV, keeping only relevant columns
processed_columns = ['TripID', 'TimeStamp', 'Speed_Unnormalized(m/s)', 'Speed(m/s)',
                     'Acceleration_Unnormalized(m/s^2)', 'Acceleration(m/s^2)',
                     'Jerk_Unnormalized(m/s^3)', 'Jerk(m/s^3)', 
                     'Braking_Intensity_Unnormalized', 'Braking_Intensity', 
                     'SASV', 'Speed_Violation', 'Heading', 
                     'Heading_Change_Unnormalized', 'Heading_Change(degrees)', 
                     'Driving_Score', 'Driving_Category']

df[processed_columns].to_csv('../data/processed_data.csv', index=False)


Rows after loading: 47846
Rows after dropping NaNs: 42601
Rows after processing trips: 42601


  df = df.groupby('TripID', group_keys=False).apply(process_trip)


Rows after outlier removal: 41494


  df['Heading'] = df.groupby('TripID', group_keys=False).apply(
