<a href="https://colab.research.google.com/github/gowrisankar393/vaylen-transitlk/blob/Multi-Sensor-Fusion-Crash-Detection/TransitLK_MSFCD_SDP_XGB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setup & Data Loading

In [None]:
# üì¶ Install and import libraries
!pip install xgboost scikit-learn pandas numpy matplotlib seaborn imbalanced-learn -q

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from imblearn.over_sampling import SMOTE  # For balancing crashes
import xgboost as xgb
import joblib
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ All libraries loaded successfully!")

# Upload the dataset
import io

from google.colab import files

# File name used when nothing is uploaded and the CSV is already on disk.
DATA_FILENAME = 'nthsc_telemetry_records.csv'

uploaded = files.upload()  # Prompts for the telemetry CSV; returns {saved name: file bytes}

# Load data from the bytes that were just uploaded instead of a fixed path:
# the saved name can differ from DATA_FILENAME (e.g. a repeated upload or a
# renamed file), in which case reading the fixed path would load stale data
# or fail outright.
if uploaded:
    upload_name = DATA_FILENAME if DATA_FILENAME in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[upload_name]))
else:
    # Upload skipped/cancelled: fall back to a copy already present in the runtime.
    df = pd.read_csv(DATA_FILENAME)
print(f"üìä Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

Exploratory Data Analysis (EDA)

In [None]:
print("üîç Exploratory Data Analysis")
print("="*50)

# Basic info
print("\n1. Dataset Info:")
df.info()

print("\n2. Missing Values:")
print(df.isnull().sum())

print("\n3. Class Distribution (Crash vs Non-Crash):")
crash_counts = df['crash_label'].value_counts()
print(crash_counts)
# .get() instead of [1]: a file without any crash rows has no key 1.
print(f"\nCrash rate: {crash_counts.get(1, 0)/len(df):.2%}")

# Visualize class distribution.
# value_counts() orders by frequency, whereas seaborn draws numeric categories
# in sorted order; sort by label so bar i and text label i describe the same class.
counts_by_label = crash_counts.sort_index()
plt.figure(figsize=(8,5))
sns.barplot(x=counts_by_label.index, y=counts_by_label.values, palette=['#96d46c','#ef4444'])
plt.title('Class Distribution: Crash vs Non-Crash', fontsize=14, fontweight='bold')
plt.xlabel('Crash Label (0=Normal, 1=Crash)')
plt.ylabel('Count')
# Offset relative to the tallest bar so the gap scales with the dataset size.
label_offset = 0.01 * counts_by_label.max()
for i, v in enumerate(counts_by_label.values):
    plt.text(i, v + label_offset, str(v), ha='center', va='bottom', fontweight='bold')
plt.show()

# Sensor statistics: every column except the timestamp and the label
print("\n4. Sensor Statistics:")
sensor_cols = [col for col in df.columns if col not in ['timestamp', 'crash_label']]
print(df[sensor_cols].describe())

Data Cleaning & Preprocessing

In [None]:
print("üßπ Data Cleaning & Preprocessing")
print("="*50)


def _timestamp_to_seconds(stamp):
    """Convert a ``"<minutes>:<seconds>"`` string to seconds as a float.

    Only the first two ':'-separated fields are read.
    NOTE(review): assumes the file uses mm:ss — an hh:mm:ss value would be
    misread (hours treated as minutes, seconds ignored); confirm the format.
    """
    fields = stamp.split(':')
    return float(fields[0]) * 60 + float(fields[1])


# Copy to avoid modifying original
df_clean = df.copy()

# Handle missing values (if any)
print("\n1. Checking for missing values...")
missing_before = df_clean.isnull().sum().sum()
# A missing label cannot be imputed without inventing ground truth (the median
# of a 0/1 column may even be 0.5), so rows without a label are dropped.
df_clean = df_clean.dropna(subset=['crash_label'])
# numeric_only=True: the string 'timestamp' column makes a plain .median()
# raise TypeError on pandas >= 2.0. Numeric gaps are filled with column medians.
df_clean = df_clean.fillna(df_clean.median(numeric_only=True))
missing_after = df_clean.isnull().sum().sum()
print(f"Missing values: {missing_before} ‚Üí {missing_after}")

# Convert timestamp to proper format (optional, for time-series features)
print("\n2. Parsing timestamps...")
df_clean['timestamp_seconds'] = df_clean['timestamp'].apply(_timestamp_to_seconds)
print("‚úÖ Timestamps converted to seconds")

# Separate features and target
X = df_clean.drop(columns=['timestamp', 'crash_label'])
y = df_clean['crash_label']

print(f"\n3. Feature matrix shape: {X.shape}")
print(f"   Target vector shape: {y.shape}")

# Mobile sensor subset (what your phone can actually measure)
MOBILE_FEATURES = [
    'accel_x', 'accel_y', 'accel_z',
    'gyro_x', 'gyro_y', 'gyro_z',
    'gps_lat', 'gps_lon', 'gps_speed'
]

X_mobile = X[MOBILE_FEATURES]
print(f"\n4. Mobile features selected: {len(MOBILE_FEATURES)} features")
print("   Features:", MOBILE_FEATURES)

Handle Class Imbalance

In [None]:
print("‚öñÔ∏è Balancing Crash vs Non-Crash Data")
print("="*50)

print(f"\nBefore balancing:")
print(y.value_counts())

# Use SMOTE to generate synthetic crash examples.
# NOTE(review): this resamples the full dataset. Synthetic rows are interpolated
# between real neighbours, so a hold-out set carved from X_balanced is not
# independent of the training rows — resample only training folds when the
# result is used to measure performance.
smote = SMOTE(random_state=42, k_neighbors=3)
X_balanced, y_balanced = smote.fit_resample(X_mobile, y)

print(f"\nAfter SMOTE balancing:")
print(f"Non-crash: {(y_balanced == 0).sum()}")
print(f"Crash: {(y_balanced == 1).sum()}")
print(f"New shape: {X_balanced.shape}")

# Verify the balance
plt.figure(figsize=(8,5))
# Sorted by label so bar i and text label i refer to the same class.
balanced_counts = pd.Series(y_balanced).value_counts().sort_index()
sns.barplot(x=balanced_counts.index, y=balanced_counts.values, palette=['#96d46c','#ef4444'])
plt.title('Balanced Class Distribution', fontsize=14, fontweight='bold')
plt.xlabel('Crash Label')
plt.ylabel('Count')
for i, v in enumerate(balanced_counts.values):
    # Centre the white label inside the bar; a fixed "v - 500" offset lands
    # below the axis (and is invisible) whenever a class has under ~500 rows.
    plt.text(i, v / 2, str(v), ha='center', va='center', fontweight='bold', color='white')
plt.show()

Feature Engineering & Selection

In [None]:
print("üîß Feature Engineering")
print("="*50)


def _vector_magnitude(frame, axis_columns):
    """Row-wise square root of the summed squares of ``axis_columns``."""
    return np.sqrt(sum(frame[name] ** 2 for name in axis_columns))


# Work on a copy so the balanced frame keeps only the raw sensor columns.
X_engineered = X_balanced.copy()

# 1. Combined accelerometer reading across the three axes
X_engineered['accel_magnitude'] = _vector_magnitude(
    X_engineered, ('accel_x', 'accel_y', 'accel_z')
)

# 2. Combined gyroscope reading across the three axes
X_engineered['gyro_magnitude'] = _vector_magnitude(
    X_engineered, ('gyro_x', 'gyro_y', 'gyro_z')
)

# 3. Only an instantaneous speed is available (no speed delta), so the
#    absolute value of that speed is used as the third derived column.
X_engineered['gps_speed_abs'] = X_engineered['gps_speed'].abs()

for summary_line in (
    "‚úÖ Engineered features:",
    "   - accel_magnitude (total acceleration force)",
    "   - gyro_magnitude (total rotation force)",
    "   - gps_speed_abs (absolute speed)",
):
    print(summary_line)

# Final column order: raw mobile sensors first, then the derived columns.
FINAL_FEATURES = [*MOBILE_FEATURES, 'accel_magnitude', 'gyro_magnitude', 'gps_speed_abs']
X_final = X_engineered[FINAL_FEATURES]

print(f"\nüìä Final feature matrix: {X_final.shape[1]} features")
print("   Features:", FINAL_FEATURES)

Train XGBoost Model

In [None]:
print("üöÄ Training XGBoost Crash Detection Model")
print("="*50)


def _engineer_features(sensor_frame):
    """Return a copy of ``sensor_frame`` with the derived columns added.

    Adds accel_magnitude, gyro_magnitude and gps_speed_abs using the same
    formulas as the feature-engineering cell, and returns the columns in
    FINAL_FEATURES order so every split is laid out identically.
    """
    engineered = sensor_frame.copy()
    engineered['accel_magnitude'] = np.sqrt(
        engineered['accel_x']**2 + engineered['accel_y']**2 + engineered['accel_z']**2
    )
    engineered['gyro_magnitude'] = np.sqrt(
        engineered['gyro_x']**2 + engineered['gyro_y']**2 + engineered['gyro_z']**2
    )
    engineered['gps_speed_abs'] = np.abs(engineered['gps_speed'])
    return engineered[FINAL_FEATURES]


# Split the ORIGINAL rows (80% fit, 20% test) before any resampling.
# X_final / y_balanced were resampled with SMOTE over the whole dataset, so
# splitting them would place synthetic rows — interpolated from training
# neighbours — into the test set and inflate every metric.
X_fit_raw, X_test_raw, y_fit_raw, y_test = train_test_split(
    X_mobile, y, test_size=0.2, random_state=42, stratify=y
)

# Hold out part of the fit data for early stopping so the test set is never
# used to pick the number of trees.
X_train_raw, X_val_raw, y_train_raw, y_val = train_test_split(
    X_fit_raw, y_fit_raw, test_size=0.2, random_state=42, stratify=y_fit_raw
)

# Balance the training fold only; validation and test keep the real class mix.
# (k_neighbors=3 needs at least 4 crash rows in the training fold.)
X_train_balanced, y_train = SMOTE(random_state=42, k_neighbors=3).fit_resample(
    X_train_raw, y_train_raw
)

X_train = _engineer_features(X_train_balanced)
X_val = _engineer_features(X_val_raw)
X_test = _engineer_features(X_test_raw)

print(f"Train set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Initialize XGBoost with parameters tuned for mobile sensors
model = xgb.XGBClassifier(
    n_estimators=150,           # Upper bound on trees; early stopping may use fewer
    max_depth=6,                # Allow deeper trees
    learning_rate=0.05,         # Slower learning for better generalization
    subsample=0.8,              # Use 80% of data per tree
    colsample_bytree=0.8,       # Use 80% of features per tree
    scale_pos_weight=1,         # Training fold is already balanced by SMOTE
    random_state=42,
    eval_metric='logloss',
    # Configured on the estimator: XGBoost 2.0 removed the
    # early_stopping_rounds argument of fit(), which now raises TypeError.
    early_stopping_rounds=20
)

print("\nüìà Training in progress...")
# Train with early stopping (monitored on the validation fold) to prevent overfitting
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

print(f"\n‚úÖ Training complete! Best iteration: {model.best_iteration}")

Model Evaluation & Confusion Matrix

In [None]:
from sklearn.metrics import roc_curve, auc

print("üìä Model Evaluation")
print("="*50)

class_names = ['Normal', 'Crash']

# Score the held-out rows: crash probability plus the model's class prediction
y_pred_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nüéØ Test Accuracy: {accuracy:.2%}")

# Per-class precision / recall / F1
print("\nüìã Classification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix (rows = actual, columns = predicted)
cm = confusion_matrix(y_test, y_pred)
print("\nüî¢ Confusion Matrix:")
print(cm)

# Heatmap of the confusion matrix
cm_fig, cm_ax = plt.subplots(figsize=(8,6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=cm_ax,
)
cm_ax.set_title('Confusion Matrix', fontsize=14, fontweight='bold')
cm_ax.set_ylabel('Actual')
cm_ax.set_xlabel('Predicted')
plt.show()

# ROC curve built from the predicted crash probabilities
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(8,6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve', fontsize=14, fontweight='bold')
roc_ax.legend(loc="lower right")
plt.show()

Feature Importance Analysis

In [None]:
print("üîç Feature Importance Analysis")
print("="*50)


def _corr_with_label(feature_column):
    """Correlation between one feature column and the balanced label."""
    return feature_column.corr(y_balanced)


# Pair each feature name with the importance reported by the fitted model,
# highest first
importance_table = pd.DataFrame({
    'feature': FINAL_FEATURES,
    'importance': model.feature_importances_,
})
importance_df = importance_table.sort_values(by='importance', ascending=False)

print("\nTop 5 Most Important Features:")
print(importance_df.head())

# Horizontal bar chart of the ten highest-ranked features
top_features = importance_df.head(10)
importance_fig, importance_ax = plt.subplots(figsize=(10,6))
sns.barplot(data=top_features, x='importance', y='feature', palette='viridis', ax=importance_ax)
importance_ax.set_title('Top 10 Feature Importances for Crash Detection', fontsize=14, fontweight='bold')
importance_ax.set_xlabel('Importance Score')
importance_ax.set_ylabel('Feature')
plt.show()

# Absolute correlation of every feature with the label, strongest first
print("\nüìà Correlation with Crash Label:")
correlations = X_final.apply(_corr_with_label)
correlations_sorted = correlations.abs().sort_values(ascending=False)
print(correlations_sorted.head(10))

Export Model for Android


In [None]:
import json

print("üíæ Exporting Production Model")
print("="*50)

# Final model filename (matches your GitHub branch)
MODEL_FILENAME = "TransitLK-MSFCD-SCD-XGB-1.pkl"
METADATA_FILENAME = 'model_metadata.json'
# Probability cut-off published in the metadata.
DECISION_THRESHOLD = 0.75

# Save the trained model.
# NOTE(review): a joblib pickle is tied to the Python/xgboost versions that
# wrote it; whatever loads this file should run matching versions.
joblib.dump(model, MODEL_FILENAME)

# `accuracy` was measured with model.predict(), i.e. at the default 0.5
# cut-off, while the metadata advertises DECISION_THRESHOLD. Also report the
# accuracy at the advertised cut-off so the two numbers describe the same rule.
# Assumes a crash is flagged when probability >= threshold — TODO confirm
# against the consumer of this metadata.
thresholded_pred = (y_pred_prob >= DECISION_THRESHOLD).astype(int)
accuracy_at_threshold = accuracy_score(y_test, thresholded_pred)

# Save feature order (CRITICAL for Android)
feature_order_dict = {
    'model_name': 'Multi-Sensor Fusion Crash Detection v1',
    'features': FINAL_FEATURES,
    'threshold': DECISION_THRESHOLD,
    'accuracy': float(accuracy),
    'accuracy_at_threshold': float(accuracy_at_threshold),
    'git_branch': 'Multi-Sensor-Fusion-Crash-Detection'
}

with open(METADATA_FILENAME, 'w') as f:
    json.dump(feature_order_dict, f, indent=2)

print(f"\n‚úÖ Model saved: {MODEL_FILENAME}")
print(f"‚úÖ Metadata saved: {METADATA_FILENAME}")
print(f"\nüìã FINAL FEATURE ORDER for Android:")
for i, feat in enumerate(FINAL_FEATURES, 1):
    print(f"  {i}. {feat}")

print("\nüìå NEXT STEPS:")
print("1. Download both files from Colab file panel")
print("2. Place them in your GitHub repo: vaylen-transitlk/Multi-Sensor-Fusion-Crash-Detection")
print("3. Update server.py to load this model")
print("4. Update Android app with your computer's IP address")