<a href="https://colab.research.google.com/github/chewzzz1014/csc4700-embedded-collision-detection/blob/master/src/train_collision_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [2]:
csv_path_no_collision = "is_collision = 0 data.csv"
csv_path_collision = "is_collision = 1 data.csv"

df_no_collision = pd.read_csv(csv_path_no_collision)
df_collision = pd.read_csv(csv_path_collision)
df = pd.concat([df_no_collision, df_collision], ignore_index=True)

In [3]:
df.head()

Unnamed: 0,created_at,entry_id,x,y,z,is_collision,batch_id
0,2025-01-09T02:55:56+00:00,1,-2.0,0.71,-9.02,0,483505
1,2025-01-09T02:56:13+00:00,2,-2.04,0.75,-9.06,0,483505
2,2025-01-09T02:56:30+00:00,3,-1.96,0.75,-9.02,0,483505
3,2025-01-09T02:56:48+00:00,4,-2.0,0.78,-9.02,0,483505
4,2025-01-09T02:57:05+00:00,5,-2.0,0.71,-9.06,0,483505


In [4]:
# Feature engineering: Extract statistical features for each batch_id
def extract_features(group):
    features = {
        "x_mean": group["x"].mean(),
        "x_std": group["x"].std(),
        "x_min": group["x"].min(),
        "x_max": group["x"].max(),
        "y_mean": group["y"].mean(),
        "y_std": group["y"].std(),
        "y_min": group["y"].min(),
        "y_max": group["y"].max(),
        "z_mean": group["z"].mean(),
        "z_std": group["z"].std(),
        "z_min": group["z"].min(),
        "z_max": group["z"].max(),
    }
    return pd.Series(features)

# Apply feature extraction
features = df.groupby("batch_id").apply(extract_features).reset_index()
labels = df.groupby("batch_id")["is_collision"].first().reset_index(name="is_collision")

  features = df.groupby("batch_id").apply(extract_features).reset_index()


In [5]:
data = pd.merge(features, labels, on="batch_id")

# Split data into train and test sets
X = data.drop(columns=["batch_id", "is_collision"])
y = data["is_collision"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 80 entries, 55 to 51
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x_mean  80 non-null     float64
 1   x_std   77 non-null     float64
 2   x_min   80 non-null     float64
 3   x_max   80 non-null     float64
 4   y_mean  80 non-null     float64
 5   y_std   77 non-null     float64
 6   y_min   80 non-null     float64
 7   y_max   80 non-null     float64
 8   z_mean  80 non-null     float64
 9   z_std   77 non-null     float64
 10  z_min   80 non-null     float64
 11  z_max   80 non-null     float64
dtypes: float64(12)
memory usage: 8.1 KB


In [7]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Export trained model
joblib.dump(clf, "collision_clf_model.pkl")

['collision_clf_model.pkl']

In [8]:
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.85
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.62      0.77         8
           1       0.80      1.00      0.89        12

    accuracy                           0.85        20
   macro avg       0.90      0.81      0.83        20
weighted avg       0.88      0.85      0.84        20



In [9]:
new_batch = pd.DataFrame({
    "x": [-2.04, -2.12, -1.96, -2.08, -2.24],
    "y": [0.75, 0.86, 0.67, 0.39, 0.51],
    "z": [-9.06, -9.34, -8.98, -8.90, -9.02]
})
new_features = extract_features(new_batch)
print("Prediction for new batch:", clf.predict([new_features]))

Prediction for new batch: [0]


