In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

def ensure_directory_exists(directory):
    """Create ``directory`` (and any missing parents) if it does not exist.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first: another process may create the directory
    # between the test and the makedirs() call.
    os.makedirs(directory, exist_ok=True)

def calculate_speed(df):
    """Estimate per-row speed from consecutive latitude/longitude samples.

    This is a flat approximation, not a Haversine distance: the Euclidean
    norm of the latitude and longitude differences (in degrees) is scaled by
    111139 metres per degree and divided by the ``Time_Step`` difference
    between the two rows. The longitude term is not corrected for latitude.

    The first row has no predecessor, so its distance is 0 and its time
    delta is taken as 1, giving a speed of 0. Rows whose time delta is 0 get
    NaN instead of an infinite speed.

    Args:
        df: DataFrame with ``Latitude``, ``Longitude`` and ``Time_Step``
            columns.

    Returns:
        The same DataFrame, with a ``Speed(m/s)`` column added in place.
    """
    meters_per_degree = 111139

    latitudes = df['Latitude'].to_numpy(dtype=float)
    longitudes = df['Longitude'].to_numpy(dtype=float)

    # Prepend the first sample by position (not by index label 0) so frames
    # with a non-default index, or no rows at all, are handled correctly.
    lat_delta = np.diff(latitudes, prepend=latitudes[:1])
    lon_delta = np.diff(longitudes, prepend=longitudes[:1])
    degrees_moved = np.sqrt(lat_delta ** 2 + lon_delta ** 2)

    elapsed = df['Time_Step'].diff().fillna(1)
    # A zero time delta would produce +/-inf; mark the speed as undefined.
    elapsed = elapsed.where(elapsed != 0)

    df['Speed(m/s)'] = degrees_moved * (meters_per_degree / elapsed)
    return df

def calculate_acceleration(df):
    """Add an acceleration column derived from the speed column.

    The value is the difference between each row's ``Speed(m/s)`` and the
    previous row's; it is not divided by the elapsed time between rows.
    The first row, which has no predecessor, is set to 0.

    Returns the same DataFrame with ``Acceleration(m/s^2)`` added in place.
    """
    speed_step = df['Speed(m/s)'].diff(periods=1)
    df['Acceleration(m/s^2)'] = speed_step.fillna(0)
    return df

def calculate_heading_change(df):
    """Add the absolute change in heading between consecutive movements.

    The heading of each step is ``arctan2(d_longitude, d_latitude)`` in
    degrees, i.e. measured from the latitude axis in flat degree space. The
    heading change is the absolute difference between consecutive headings,
    wrapped so that it is the smaller turn angle in ``[0, 180]``; a swing
    from 179 to -179 degrees counts as 2 degrees, not 358.

    The first two rows have no previous heading to compare against, so they
    are set to 0, as are any other undefined values.

    Args:
        df: DataFrame with ``Latitude`` and ``Longitude`` columns.

    Returns:
        The same DataFrame, with ``Heading_Change(degrees)`` added in place.
    """
    lat_step = df['Latitude'].diff().to_numpy(dtype=float)
    lon_step = df['Longitude'].diff().to_numpy(dtype=float)
    heading = pd.Series(np.degrees(np.arctan2(lon_step, lat_step)), index=df.index)

    turn = heading.diff()
    # Wrap into [-180, 180) so crossing the +/-180 degree seam is not
    # reported as a near-full rotation.
    turn = (turn + 180.0) % 360.0 - 180.0

    df['Heading_Change(degrees)'] = turn.abs().fillna(0)
    return df

def calculate_jerk(df):
    """Add a jerk column derived from the acceleration column.

    The value is the difference between each row's ``Acceleration(m/s^2)``
    and the previous row's; it is not divided by the elapsed time between
    rows. The first row, which has no predecessor, is set to 0.

    Returns the same DataFrame with ``Jerk(m/s^3)`` added in place.
    """
    acceleration_step = df['Acceleration(m/s^2)'].diff(periods=1)
    df['Jerk(m/s^3)'] = acceleration_step.fillna(0)
    return df

def preprocess_data(file_path, output_dir='../data'):
    """Derive driving features from a CSV, then save the result and plots.

    Steps:
      1. Read ``file_path`` and add speed, acceleration, heading-change and
         jerk columns.
      2. Label every row 'Safe', 'Moderate' or 'Aggressive' from speed, jerk
         magnitude and heading change.
      3. Count rows per (``TripId``, date) and attach the count as
         ``Usage_Per_Day``.
      4. Drop the raw position/time columns and write
         ``processed_data.csv`` to ``output_dir``.
      5. Save histograms, boxplots, a pairplot and a correlation heatmap of
         the derived features to ``output_dir``.

    Args:
        file_path: CSV with ``Latitude``, ``Longitude``, ``Time_Step`` and
            ``TripId`` columns. ``Time_Step`` is interpreted as seconds
            since the Unix epoch when deriving the date.
        output_dir: Directory that receives the CSV and the plot images.
            Created if missing. Defaults to ``'../data'``.

    Returns:
        The processed DataFrame that was written to disk.
    """
    df = pd.read_csv(file_path)

    # Ensure the directory for saving files exists
    ensure_directory_exists(output_dir)

    # Calculate speed, acceleration, heading change, and jerk
    df = calculate_speed(df)
    df = calculate_acceleration(df)
    df = calculate_heading_change(df)
    df = calculate_jerk(df)

    # Categorize data. Jerk is signed, so compare its magnitude: a strongly
    # negative jerk is just as abrupt as a strongly positive one. Rows with
    # NaN in any feature fail every comparison and end up 'Aggressive'.
    speed = df['Speed(m/s)']
    jerk_magnitude = df['Jerk(m/s^3)'].abs()
    heading_change = df['Heading_Change(degrees)']
    is_safe = (speed < 10) & (jerk_magnitude < 0.5) & (heading_change < 5)
    is_moderate = (speed < 15) & (jerk_magnitude < 1) & (heading_change < 10)
    # np.select picks the first matching condition, mirroring if/elif/else.
    df['Category'] = np.select(
        [is_safe, is_moderate], ['Safe', 'Moderate'], default='Aggressive'
    )

    # Aggregate daily usage based on trips
    df['Date'] = pd.to_datetime(df['Time_Step'], unit='s').dt.date
    daily_usage = (
        df.groupby(['TripId', 'Date'])['Speed(m/s)']
        .count()
        .reset_index(name='Usage_Per_Day')
    )

    # Merge daily usage back to the main dataframe
    df = df.merge(daily_usage, on=['TripId', 'Date'], how='left')

    # Dropping unnecessary columns
    df = df.drop(columns=['Latitude', 'Longitude', 'Time_Step', 'Date'])

    # Save the preprocessed data
    df.to_csv(os.path.join(output_dir, 'processed_data.csv'), index=False)

    feature_columns = [
        'Speed(m/s)',
        'Acceleration(m/s^2)',
        'Jerk(m/s^3)',
        'Heading_Change(degrees)',
    ]

    def save_current_figure(file_name):
        """Write the current matplotlib figure to output_dir and close it."""
        target = os.path.join(output_dir, file_name)
        plt.savefig(target)
        plt.close()
        print(f"Plot saved to {target}")

    def plot_feature_distribution(df, feature_name, file_name, bins=30):
        """Save a histogram of one feature, ignoring missing values."""
        plt.figure(figsize=(12, 6))
        plt.hist(df[feature_name].dropna(), bins=bins, edgecolor='k', alpha=0.7)
        plt.title(f'Distribution of {feature_name}')
        plt.xlabel(feature_name)
        plt.ylabel('Frequency')
        plt.grid(True)
        save_current_figure(file_name)

    def plot_boxplot(df, feature_name, file_name):
        """Save a boxplot of one feature."""
        plt.figure(figsize=(12, 6))
        sns.boxplot(data=df, x=feature_name)
        plt.title(f'Boxplot of {feature_name}')
        plt.xlabel(feature_name)
        plt.grid(True)
        save_current_figure(file_name)

    def plot_pairplot(df, file_name):
        """Save a pairplot of the derived features, coloured by category."""
        # sns.pairplot creates its own figure, which becomes the current
        # one. Opening a figure with plt.figure() first would only leave an
        # empty, never-closed figure behind.
        sns.pairplot(df[feature_columns + ['Category']], hue='Category')
        save_current_figure(file_name)

    def plot_correlation_heatmap(df, file_name):
        """Save a heatmap of the correlations between the derived features."""
        plt.figure(figsize=(10, 8))
        correlation_matrix = df[feature_columns].corr()
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=.5)
        plt.title('Correlation Heatmap')
        save_current_figure(file_name)

    # Generate plots
    plot_feature_distribution(df, 'Speed(m/s)', 'speed_distribution.png')
    plot_feature_distribution(df, 'Acceleration(m/s^2)', 'acceleration_distribution.png')
    plot_feature_distribution(df, 'Jerk(m/s^3)', 'jerk_distribution.png')
    plot_feature_distribution(df, 'Heading_Change(degrees)', 'heading_change_distribution.png')

    plot_boxplot(df, 'Speed(m/s)', 'speed_boxplot.png')
    plot_boxplot(df, 'Acceleration(m/s^2)', 'acceleration_boxplot.png')
    plot_boxplot(df, 'Jerk(m/s^3)', 'jerk_boxplot.png')
    plot_boxplot(df, 'Heading_Change(degrees)', 'heading_change_boxplot.png')

    plot_pairplot(df, 'pairplot.png')
    plot_correlation_heatmap(df, 'correlation_heatmap.png')

    print("Preprocessing completed. Processed data and plots saved.")

    return df

if __name__ == "__main__":
    # Run the full pipeline on the default input file when executed directly.
    preprocess_data(file_path='../data/driving_data.csv')


Plot saved to ../data\speed_distribution.png
Plot saved to ../data\acceleration_distribution.png
Plot saved to ../data\jerk_distribution.png
Plot saved to ../data\heading_change_distribution.png
Plot saved to ../data\speed_boxplot.png
Plot saved to ../data\acceleration_boxplot.png
Plot saved to ../data\jerk_boxplot.png
Plot saved to ../data\heading_change_boxplot.png
Plot saved to ../data\pairplot.png
Plot saved to ../data\correlation_heatmap.png
Preprocessing completed. Processed data and plots saved.


<Figure size 1200x1200 with 0 Axes>