In [None]:
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.utils import class_weight
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb


In [None]:
# Load the crash dataset that every later cell works from.
# NOTE(review): unpickling can execute arbitrary code — only load this file if it
# comes from a trusted source.
df = pd.read_pickle('clean_crash_data.pkl')

In [None]:
# Show how many rows carry each SEVERITY value in the freshly loaded frame.
df['SEVERITY'].value_counts()

In [None]:
# Keep only the rows whose SEVERITY is not 4, then re-check the class counts.
not_severity_4 = df['SEVERITY'].ne(4)
df = df.loc[not_severity_4]
df['SEVERITY'].value_counts()

In [None]:
# Select the model inputs and the prediction target.
X = df[['SEX', 'AGE', 'HELMET_BELT_WORN', 'DAY_OF_WEEK', 'LIGHT_CONDITION', 'ROAD_GEOMETRY', 'SPEED_ZONE', 'SURFACE_COND', 'TOTAL_NO_OCCUPANTS', 'VEHICLE_YEARS_OLD']]
y = df['SEVERITY']

# One-hot encode the categorical inputs, dropping the first level of each column.
categorical_cols = ['SEX', 'HELMET_BELT_WORN', 'DAY_OF_WEEK', 'LIGHT_CONDITION', 'ROAD_GEOMETRY', 'SURFACE_COND']
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`; the old
    # `sparse` keyword was removed in 1.4 and now raises TypeError.
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    # Older scikit-learn releases only understand the `sparse` keyword.
    encoder = OneHotEncoder(drop='first', sparse=False)
X_encoded = encoder.fit_transform(X[categorical_cols])
feature_names = encoder.get_feature_names_out(input_features=categorical_cols)
X_encoded_df = pd.DataFrame(X_encoded, columns=feature_names)

# Replace the raw categorical columns with their encoded versions. Both halves get
# a fresh positional index so the column-wise concat lines up row for row.
X = pd.concat([X.drop(categorical_cols, axis=1).reset_index(drop=True), X_encoded_df.reset_index(drop=True)], axis=1)

# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)



In [None]:
# Class counts of the target series y (the SEVERITY column selected above).
y.value_counts()

**Logistic Regression**

In [None]:
# Baseline classifier: logistic regression with scikit-learn's default settings.
model = LogisticRegression().fit(X_train, y_train)

# Predict the severity of every held-out row.
y_pred = model.predict(X_test)

# Report overall accuracy, then the per-class report and the confusion matrix.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
for summary in (classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred)):
    print(summary)

In [None]:
# model.coef_ holds one row of coefficients per class (a single row for a binary
# problem). Reading only coef_[0] would rank features by their effect on the first
# class alone, so average the absolute coefficients over every class row instead.
# NOTE(review): the inputs are not standardized anywhere above, so coefficient
# magnitudes are only roughly comparable between features.
feature_importance = np.abs(model.coef_).mean(axis=0)

# Pair each training column with its importance score.
feature_names = X_train.columns
feature_importance_dict = dict(zip(feature_names, feature_importance))

# Rank features from most to least influential.
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
sorted_features

In [None]:

# Tally how often each severity appears among the true and the predicted labels.
unique_actual, counts_actual = np.unique(y_test, return_counts=True)
unique_predicted, counts_predicted = np.unique(y_pred, return_counts=True)

# Turn each (labels, counts) pair into a label -> count mapping.
actual_counts = {label: count for label, count in zip(unique_actual, counts_actual)}
predicted_counts = {label: count for label, count in zip(unique_predicted, counts_predicted)}

# Print both distributions, one under the other, for a quick comparison.
print("Actual Severity Distribution:", actual_counts, sep="\n")
print("\nPredicted Severity Distribution:", predicted_counts, sep="\n")

The baseline logistic regression seems to underpredict class 1 quite heavily, so we try to improve the model through hyperparameter tuning and class weights.

In [None]:
# Balanced class weights computed from the training labels. compute_class_weight
# documents `classes` as an ndarray, so pass an array rather than a plain list.
severity_classes = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight('balanced', classes=severity_classes, y=y_train)
class_weights = dict(zip(severity_classes.tolist(), balanced_weights))

# Create and train a logistic regression model with class weights
weighted_model = LogisticRegression(class_weight=class_weights)
weighted_model.fit(X_train, y_train)

# Regularization strengths tried for both penalty types.
c_values = [0.001, 0.01, 0.1, 1, 10, 100]

# The default lbfgs solver does not support the l1 penalty, so a flat
# {'penalty': ['l1', 'l2']} grid makes every l1 candidate fail and score NaN.
# Keep l2 on the default solver and run l1 on saga, which supports it.
param_grid = [
    {'penalty': ['l2'], 'C': c_values},
    # NOTE(review): saga tends to converge slowly on unscaled features; the higher
    # max_iter is a guess — confirm no ConvergenceWarning is raised.
    {'penalty': ['l1'], 'C': c_values, 'solver': ['saga'], 'max_iter': [1000]},
]

# Search the grid with 5-fold cross-validation, selecting on macro-averaged F1.
grid_search = GridSearchCV(LogisticRegression(class_weight=class_weights), param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

# Best hyperparameters; for an l1 winner this also carries solver and max_iter.
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
best_model = LogisticRegression(class_weight=class_weights, **best_params)
best_model.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the tuned model.
y_pred_best = best_model.predict(X_test)

# Report overall accuracy, then the per-class report and the confusion matrix.
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f'Best Model Accuracy: {accuracy_best}')
for summary in (classification_report(y_test, y_pred_best), confusion_matrix(y_test, y_pred_best)):
    print(summary)

In [None]:
# best_model.coef_ holds one row of coefficients per class (a single row for a
# binary problem). Reading only coef_[0] would rank features by their effect on
# the first class alone, so average the absolute coefficients over every class row.
# NOTE(review): the inputs are not standardized anywhere above, so coefficient
# magnitudes are only roughly comparable between features.
feature_importance = np.abs(best_model.coef_).mean(axis=0)

# Pair each training column with its importance score.
feature_names = X_train.columns
feature_importance_dict = dict(zip(feature_names, feature_importance))

# Rank features from most to least influential.
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
sorted_features

**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with balanced class weights. The seed is fixed (matching the
# grid-search forest later in the notebook) so the importances and test scores
# are reproducible from run to run.
clf = RandomForestClassifier(class_weight='balanced', random_state=42)

clf.fit(X_train, y_train)

# Get feature importance scores
feature_importance = clf.feature_importances_

# Pair each training column with its importance score.
feature_names = X_train.columns
feature_importance_dict = dict(zip(feature_names, feature_importance))

# Rank features from most to least important; the next cell prints this list.
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)


In [None]:
# Score the fitted forest on the held-out rows.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)

# Ranked importances first ...
print("Feature Importance:")
for feature, importance in sorted_features:
    print(f"{feature}: {importance:.4f}")

# ... followed by the test-set metrics.
print("\nModel Performance on Test Data:")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", confusion_mat)

In [None]:
# GridSearchCV and the metric helpers are already imported in the first cell.

# Candidate settings for the forest: 2 * 2 * 3 * 2 = 24 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2],
)

# Seeded, class-balanced forest used as the search template.
clf = RandomForestClassifier(class_weight='balanced', random_state=42)

# 5-fold search on all cores, selecting on macro-averaged F1.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

# Refit a fresh forest on the full training split with the winning settings.
best_model = RandomForestClassifier(class_weight='balanced', random_state=42, **best_params)
best_model.fit(X_train, y_train)

# Predict the held-out rows and report accuracy, per-class metrics, and confusion matrix.
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f'Best Model Accuracy: {accuracy_best}')
for summary in (classification_report(y_test, y_pred_best), confusion_matrix(y_test, y_pred_best)):
    print(summary)


In [None]:
# Importance scores of the current best_model, matched to the training columns.
feature_importance = best_model.feature_importances_
feature_names = X_train.columns
feature_importance_dict = {name: score for name, score in zip(feature_names, feature_importance)}

# Rank from most to least important and display the result.
sorted_features_rf_grid = sorted(
    feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
sorted_features_rf_grid

In [None]:


# Shift the labels down by one so they start at 0 (1, 2, 3 become 0, 1, 2).
y_train_mapped = y_train - 1
y_test_mapped = y_test - 1

# Balanced weight per class: n_samples / (n_classes * class_count).
n_classes = len(np.unique(y_train_mapped))
class_counts = np.bincount(y_train_mapped)
class_weights = len(y_train_mapped) / (n_classes * class_counts)

# Give every training row the weight of its class via one vectorized lookup.
sample_weights = class_weights[np.asarray(y_train_mapped)]

# Seeded XGBoost classifier used as the search template.
clf = xgb.XGBClassifier(random_state=42)

# Candidate settings: 2 * 2 * 2 = 8 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1],
    max_depth=[3, 4],
)

# 5-fold search on all cores, selecting on macro-averaged F1; the per-row weights
# are forwarded to each fit.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
grid_search.fit(X_train, y_train_mapped, sample_weight=sample_weights)
best_params = grid_search.best_params_

# Refit a fresh classifier on the full training split with the winning settings.
best_model = xgb.XGBClassifier(random_state=42, **best_params)
best_model.fit(X_train, y_train_mapped, sample_weight=sample_weights)

# Predict the held-out rows, then shift the labels back up to 1, 2, 3.
y_pred_best = best_model.predict(X_test)
y_pred_best_mapped = y_pred_best + 1

# Compare against the original (unshifted) test labels.
accuracy_best = accuracy_score(y_test, y_pred_best_mapped)
print(f'Best Model Accuracy: {accuracy_best}')
for summary in (classification_report(y_test, y_pred_best_mapped), confusion_matrix(y_test, y_pred_best_mapped)):
    print(summary)


In [None]:
# Importance scores of the current best_model, matched to the training columns.
feature_importance = best_model.feature_importances_
feature_names = X_train.columns

# Build the (name, score) pairs already ordered from most to least important.
feature_importance_list = sorted(
    zip(feature_names, feature_importance),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print one "name: score" line per feature.
print("Feature Importance:")
for feature, importance in feature_importance_list:
    print(f"{feature}: {importance}")

**Support Vector Classifier**

In [None]:
from sklearn.svm import SVC
# GridSearchCV and the metric helpers are already imported in the first cell.

# Candidate settings: three C values with the 'rbf' kernel (gamma is left at the
# estimator's default; it is not part of the grid).
param_grid = dict(
    C=[0.1, 1, 5],
    kernel=['rbf'],
)

# Seeded, class-balanced SVC used as the search template.
svc = SVC(class_weight='balanced', random_state=42)

# 5-fold search on all cores, selecting on macro-averaged F1.
grid_search_svc = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
grid_search_svc.fit(X_train, y_train)
best_params_svc = grid_search_svc.best_params_

# Refit a fresh SVC on the full training split with the winning settings.
best_svc_model = SVC(class_weight='balanced', random_state=42, **best_params_svc)
best_svc_model.fit(X_train, y_train)

# Predict the held-out rows and report accuracy, per-class metrics, and confusion matrix.
y_pred_best_svc = best_svc_model.predict(X_test)
accuracy_best_svc = accuracy_score(y_test, y_pred_best_svc)
print(f'Best SVC Model Accuracy: {accuracy_best_svc}')
for summary in (classification_report(y_test, y_pred_best_svc), confusion_matrix(y_test, y_pred_best_svc)):
    print(summary)


In [None]:
# Display whatever feature_names currently holds. When the cells run top to
# bottom, that is X_train.columns from the XGBoost importance cell.
feature_names

In [None]:
# Display the working frame (the loaded data with the SEVERITY == 4 rows removed).
# NOTE(review): this renders the whole frame; consider df.head() for a shared notebook.
df

In [None]:
# Display only the rows whose DAY_OF_WEEK equals 6.
df.loc[df['DAY_OF_WEEK'].eq(6)]