# US Accident Severity Prediction
Predicting the severity of US traffic accidents from weather and geolocation data using a Random Forest classifier.

## Setup

In [None]:
# !pip install pandas numpy matplotlib seaborn scikit-learn

## Load Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Load Dataset

In [None]:
# Read the raw accident records from the CSV file in the working directory.
df = pd.read_csv("US_Accidents_Dec20.csv")

# Quick look at the first rows and the column summary.
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line afterwards.
df.info()

## Select & Clean Relevant Features

In [None]:
# Target column plus the location and weather predictors used downstream.
selected_columns = ['Severity', 'Start_Lat', 'Start_Lng', 'Temperature(F)', 'Humidity(%)',
                    'Visibility(mi)', 'Wind_Speed(mph)', 'Weather_Condition']

# Select and drop incomplete rows in one chained expression that returns a
# fresh frame. Calling dropna(inplace=True) on the result of a column
# selection can trigger pandas' SettingWithCopyWarning.
df = df[selected_columns].dropna()
print("Dataset shape after cleaning:", df.shape)

## One-Hot Encode Categorical Feature

In [None]:
# Expand the categorical weather column into indicator columns; the first
# category is dropped so it acts as the implicit baseline.
categorical_columns = ['Weather_Condition']
df = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True)

## Split Features & Target

In [None]:
# 'Severity' is the prediction target; every remaining column is a feature.
target_column = 'Severity'
y = df[target_column]
X = df.drop(columns=[target_column])

## Train/Test Split

In [None]:
# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
# stratify=y keeps the proportion of each Severity class the same in the
# train and test splits, so rarer classes are not under-represented in either
# (assumes every class has at least two rows - TODO confirm on the data).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Train Random Forest Model

In [None]:
# class_weight='balanced' reweights classes inversely to their frequency to
# handle class imbalance. random_state fixes the forest's bootstrap/feature
# sampling so repeated runs give the same model; n_jobs=-1 builds the trees
# on all available cores without changing the result.
model = RandomForestClassifier(
    class_weight='balanced',  # Handles class imbalance
    random_state=42,
    n_jobs=-1,
)
model.fit(X_train, y_train)

## Evaluate the Model

In [None]:
# Predict on the held-out split, then report overall accuracy and the
# per-class precision/recall/F1 summary.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Classification Report:\n", report_text)

## Confusion Matrix

In [None]:
# Use the classifier's own class labels for both the matrix ordering and the
# heatmap ticks, so the axes show the actual Severity values rather than
# positional indices 0..k-1.
class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()

## Feature Importance

In [None]:
# Maximum number of bars to draw. One-hot encoding Weather_Condition can
# produce many columns (exact count depends on the data), and plotting all of
# them in a fixed-size figure makes the labels unreadable.
TOP_N = 20

importances = model.feature_importances_
features = X.columns
# argsort is ascending, so the last TOP_N positions are the most important
# features; keeping ascending order puts the largest bar at the top of barh.
indices = np.argsort(importances)[-TOP_N:]

plt.figure(figsize=(10, 6))
plt.title('Feature Importance')
plt.barh(range(len(indices)), importances[indices], color='skyblue', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
plt.tight_layout()
plt.show()