In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import matplotlib.pyplot as plt
import numpy as np

def train_model(data_path, model_save_path):
    # Load preprocessed data
    data = pd.read_csv(data_path)
    
    # Define features and target variable
    X = data[['Speed_Scaled', 'Acceleration_Scaled', 'Heading_Change_Scaled', 'Jerk_Scaled']]
    y = data['Driver_Category']
    
    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    
    # Predictions and Evaluation
    y_pred = clf.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    
    # Save the model
    joblib.dump(clf, model_save_path)
    
    return clf

def plot_learning_curve(clf, X_train, y_train):
    train_sizes, train_scores, test_scores = learning_curve(
        clf, X_train, y_train, cv=5, n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10)
    )
    
    # Plot learning curve
    plt.figure()
    plt.title("Learning Curve")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.ylim(0, 1.1)
    plt.grid()
    
    # Training score
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label="Training score")
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.2, color='r')
    
    # Cross-validation score
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label="Cross-validation score")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.2, color='g')
    
    plt.legend(loc="best")
    plt.show()

if __name__ == "__main__":
    data_path = '../data/preprocessed_data.csv'
    model_save_path = '../models/driving_behavior_model.pkl'
    clf = train_model(data_path, model_save_path)


Accuracy: 0.9941727894603496
Classification Report:
                  precision    recall  f1-score   support

     Aggressive       0.99      0.99      0.99      1567
       Moderate       0.99      1.00      1.00      4388
           Safe       0.99      1.00      0.99      1909
Very Aggressive       1.00      0.87      0.93        30

       accuracy                           0.99      7894
      macro avg       1.00      0.96      0.98      7894
   weighted avg       0.99      0.99      0.99      7894

