In [1]:
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.utils import class_weight
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb


In [2]:
df = pd.read_pickle('clean_crash_data.pkl')

In [3]:
df['SEVERITY'].value_counts()

SEVERITY
3    413351
2    195380
1      9961
4         4
Name: count, dtype: int64

In [4]:
df = df[df['SEVERITY'] != 4]
df['SEVERITY'].value_counts()

SEVERITY
3    413351
2    195380
1      9961
Name: count, dtype: int64

In [5]:
X = df[['SEX', 'AGE', 'HELMET_BELT_WORN', 'DAY_OF_WEEK', 'LIGHT_CONDITION', 'ROAD_GEOMETRY', 'SPEED_ZONE', 'SURFACE_COND', 'TOTAL_NO_OCCUPANTS', 'VEHICLE_YEARS_OLD']]
y = df['SEVERITY']
# Perform one-hot encoding for categorical variables
categorical_cols = ['SEX', 'HELMET_BELT_WORN', 'DAY_OF_WEEK', 'LIGHT_CONDITION', 'ROAD_GEOMETRY', 'SURFACE_COND']
encoder = OneHotEncoder(drop='first', sparse=False)
X_encoded = encoder.fit_transform(X[categorical_cols])
feature_names = encoder.get_feature_names_out(input_features=categorical_cols)
X_encoded_df = pd.DataFrame(X_encoded, columns=feature_names)
# Drop the original categorical columns and concatenate the encoded columns
X = pd.concat([X.drop(categorical_cols, axis=1).reset_index(drop=True), X_encoded_df.reset_index(drop=True)], axis=1)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Create and train a logistic regression model





In [6]:
y.value_counts()

SEVERITY
3    413351
2    195380
1      9961
Name: count, dtype: int64

**Logistic Regression**

In [7]:
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6688432911208269
              precision    recall  f1-score   support

           1       0.00      0.00      0.00      1992
           2       0.48      0.08      0.14     39076
           3       0.68      0.96      0.80     82671

    accuracy                           0.67    123739
   macro avg       0.39      0.35      0.31    123739
weighted avg       0.60      0.67      0.58    123739

[[    0   395  1597]
 [    0  3097 35979]
 [    0  3006 79665]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
feature_importance = abs(model.coef_[0])  # Absolute values for importance

# Map feature names to importance scores
feature_names = X_train.columns  # Replace with your actual feature names
feature_importance_dict = dict(zip(feature_names, feature_importance))

# Sort features by importance
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
sorted_features

[('TOTAL_NO_OCCUPANTS', 0.19980381069988604),
 ('ROAD_GEOMETRY_2', 0.18015537657172717),
 ('LIGHT_CONDITION_5', 0.15459870200908976),
 ('HELMET_BELT_WORN_9', 0.11223333168352376),
 ('DAY_OF_WEEK_4', 0.10836398181598901),
 ('ROAD_GEOMETRY_5', 0.09893691887044134),
 ('LIGHT_CONDITION_3', 0.09769794623456836),
 ('SURFACE_COND_2', 0.0803667586414338),
 ('DAY_OF_WEEK_3', 0.07849514765027113),
 ('DAY_OF_WEEK_5', 0.07694751931554067),
 ('HELMET_BELT_WORN_2', 0.05060355793871352),
 ('SURFACE_COND_9', 0.05016217487868456),
 ('LIGHT_CONDITION_2', 0.049207573359401735),
 ('DAY_OF_WEEK_2', 0.048780507169666046),
 ('DAY_OF_WEEK_6', 0.042126081962104336),
 ('ROAD_GEOMETRY_4', 0.024163478869716884),
 ('LIGHT_CONDITION_9', 0.020603168367946557),
 ('VEHICLE_YEARS_OLD', 0.01704147374157583),
 ('AGE', 0.014804630781804398),
 ('DAY_OF_WEEK_1', 0.010395645482561569),
 ('LIGHT_CONDITION_6', 0.008689386289348872),
 ('SEX_M', 0.008022491670732048),
 ('SPEED_ZONE', 0.007534469824564507),
 ('HELMET_BELT_WORN_8'

In [9]:

unique_actual, counts_actual = np.unique(y_test, return_counts=True)
unique_predicted, counts_predicted = np.unique(y_pred, return_counts=True)

# Create dictionaries to store the counts
actual_counts = dict(zip(unique_actual, counts_actual))
predicted_counts = dict(zip(unique_predicted, counts_predicted))

# Display the distribution of actual and predicted severities
print("Actual Severity Distribution:")
print(actual_counts)

print("\nPredicted Severity Distribution:")
print(predicted_counts)

Actual Severity Distribution:
{1: 1992, 2: 39076, 3: 82671}

Predicted Severity Distribution:
{2: 6498, 3: 117241}


Baseline Logistic Regression Seems to underpredict class 1 quite heavily, try improve our model through HyperParameter tuning and Class Weights

In [10]:
class_weights = dict(zip([1, 2, 3], class_weight.compute_class_weight('balanced', classes=[1, 2, 3], y=y_train)))

# Create and train a logistic regression model with class weights
weighted_model = LogisticRegression(class_weight=class_weights)
weighted_model.fit(X_train, y_train)
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l1', 'l2']  # Regularization type
}

# Create a GridSearchCV object
grid_search = GridSearchCV(LogisticRegression(class_weight=class_weights), param_grid, cv=5, scoring='f1_macro')

# Fit the GridSearchCV to your data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
best_model = LogisticRegression(class_weight=class_weights, **best_params)
best_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [11]:
y_pred_best = best_model.predict(X_test)

# Evaluate the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f'Best Model Accuracy: {accuracy_best}')
print(classification_report(y_test, y_pred_best))
print(confusion_matrix(y_test, y_pred_best))

Best Model Accuracy: 0.4595236748317022
              precision    recall  f1-score   support

           1       0.04      0.61      0.08      1992
           2       0.35      0.30      0.32     39076
           3       0.74      0.53      0.62     82671

    accuracy                           0.46    123739
   macro avg       0.37      0.48      0.34    123739
weighted avg       0.60      0.46      0.52    123739

[[ 1225   402   365]
 [11979 11679 15418]
 [17124 21590 43957]]


In [12]:
feature_importance = abs(best_model.coef_[0])  # Absolute values for importance

# Map feature names to importance scores
feature_names = X_train.columns  # Replace with your actual feature names
feature_importance_dict = dict(zip(feature_names, feature_importance))

# Sort features by importance
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
sorted_features

[('LIGHT_CONDITION_5', 0.8117292650506047),
 ('HELMET_BELT_WORN_2', 0.4630536853820863),
 ('DAY_OF_WEEK_4', 0.46171440564044175),
 ('SEX_M', 0.41242014626485857),
 ('SURFACE_COND_2', 0.37261681645378114),
 ('SURFACE_COND_9', 0.36070848448397147),
 ('DAY_OF_WEEK_3', 0.3157415637655155),
 ('DAY_OF_WEEK_5', 0.3034232638405772),
 ('HELMET_BELT_WORN_6', 0.2664049804989535),
 ('LIGHT_CONDITION_2', 0.22569072997420708),
 ('DAY_OF_WEEK_2', 0.21428324726674763),
 ('ROAD_GEOMETRY_4', 0.18104784336669003),
 ('LIGHT_CONDITION_9', 0.14587979484783808),
 ('ROAD_GEOMETRY_5', 0.13593550283350003),
 ('DAY_OF_WEEK_6', 0.1230298959445845),
 ('ROAD_GEOMETRY_2', 0.11045343681218386),
 ('LIGHT_CONDITION_3', 0.10171099555022661),
 ('HELMET_BELT_WORN_9', 0.1013538422430687),
 ('DAY_OF_WEEK_1', 0.09429158006375235),
 ('TOTAL_NO_OCCUPANTS', 0.07567882380314442),
 ('HELMET_BELT_WORN_7', 0.0646683261832012),
 ('LIGHT_CONDITION_6', 0.057676174726637594),
 ('ROAD_GEOMETRY_3', 0.04453533035421775),
 ('HELMET_BELT_WO

**Random Forest**

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Define class weights (adjust as needed)


# Create the classifier with class weights
clf = RandomForestClassifier(class_weight='balanced')

clf.fit(X_train, y_train)

# Get feature importance scores
feature_importance = clf.feature_importances_

# Map feature names to importance scores
feature_names = X_train.columns  # Replace with your actual feature names
feature_importance_dict = dict(zip(feature_names, feature_importance))

# Sort features by importance
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)


In [21]:
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Generate classification report
classification_rep = classification_report(y_test, y_pred)

# Generate confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred)

# Print the results
print("Feature Importance:")
for feature, importance in sorted_features:
    print(f"{feature}: {importance:.4f}")

print("\nModel Performance on Test Data:")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", confusion_mat)

Feature Importance:
AGE: 0.3378
VEHICLE_YEARS_OLD: 0.2604
SPEED_ZONE: 0.1138
TOTAL_NO_OCCUPANTS: 0.0487
HELMET_BELT_WORN_9: 0.0201
SURFACE_COND_2: 0.0195
ROAD_GEOMETRY_5: 0.0162
SEX_M: 0.0158
LIGHT_CONDITION_5: 0.0138
LIGHT_CONDITION_3: 0.0136
LIGHT_CONDITION_2: 0.0135
ROAD_GEOMETRY_2: 0.0124
DAY_OF_WEEK_6: 0.0112
DAY_OF_WEEK_5: 0.0109
DAY_OF_WEEK_2: 0.0108
DAY_OF_WEEK_3: 0.0104
DAY_OF_WEEK_4: 0.0095
DAY_OF_WEEK_1: 0.0093
DAY_OF_WEEK_7: 0.0088
HELMET_BELT_WORN_6: 0.0084
HELMET_BELT_WORN_2: 0.0069
SURFACE_COND_9: 0.0067
ROAD_GEOMETRY_4: 0.0047
LIGHT_CONDITION_6: 0.0021
HELMET_BELT_WORN_8: 0.0021
LIGHT_CONDITION_9: 0.0020
HELMET_BELT_WORN_7: 0.0018
LIGHT_CONDITION_4: 0.0018
HELMET_BELT_WORN_5: 0.0015
SURFACE_COND_5: 0.0014
SURFACE_COND_3: 0.0012
ROAD_GEOMETRY_3: 0.0010
SEX_U: 0.0009
ROAD_GEOMETRY_9: 0.0007
SURFACE_COND_4: 0.0003
ROAD_GEOMETRY_6: 0.0002
ROAD_GEOMETRY_7: 0.0000
ROAD_GEOMETRY_8: 0.0000
HELMET_BELT_WORN_4: 0.0000

Model Performance on Test Data:
Accuracy: 0.6771
Classificati

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define the hyperparameter grid for the Random Forest
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2]
}

# Create a Random Forest classifier
clf = RandomForestClassifier(class_weight='balanced', random_state=42)

# Create a GridSearchCV object
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)

# Fit the GridSearchCV to your data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
best_model = RandomForestClassifier(class_weight='balanced', random_state=42, **best_params)
best_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_best = best_model.predict(X_test)

# Evaluate the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f'Best Model Accuracy: {accuracy_best}')
print(classification_report(y_test, y_pred_best))
print(confusion_matrix(y_test, y_pred_best))
# Get feature importance scores




Best Model Accuracy: 0.6188913762031373
              precision    recall  f1-score   support

           1       0.09      0.53      0.15      1992
           2       0.48      0.39      0.43     39076
           3       0.75      0.73      0.74     82671

    accuracy                           0.62    123739
   macro avg       0.44      0.55      0.44    123739
weighted avg       0.66      0.62      0.63    123739

[[ 1049   501   442]
 [ 4644 15313 19119]
 [ 6382 16070 60219]]


In [16]:
feature_importance = best_model.feature_importances_

# Map feature names to importance scores
feature_names = X_train.columns  # Replace with your actual feature names
feature_importance_dict = dict(zip(feature_names, feature_importance))

# Sort features by importance
sorted_features_rf_grid = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
sorted_features_rf_grid

[('SPEED_ZONE', 0.22114624049849008),
 ('AGE', 0.18368122694845093),
 ('VEHICLE_YEARS_OLD', 0.15411503683806474),
 ('TOTAL_NO_OCCUPANTS', 0.05101990201154643),
 ('LIGHT_CONDITION_5', 0.036492297726746),
 ('SEX_M', 0.0361030507579531),
 ('ROAD_GEOMETRY_5', 0.033535210701789095),
 ('LIGHT_CONDITION_3', 0.025601163987977103),
 ('SURFACE_COND_2', 0.024651739289313135),
 ('HELMET_BELT_WORN_9', 0.0234404844570691),
 ('SURFACE_COND_9', 0.017515590652845567),
 ('ROAD_GEOMETRY_2', 0.016811478034374217),
 ('HELMET_BELT_WORN_2', 0.015830304406000955),
 ('DAY_OF_WEEK_6', 0.015575652579585153),
 ('HELMET_BELT_WORN_6', 0.015442360542574819),
 ('LIGHT_CONDITION_2', 0.015187949509744522),
 ('DAY_OF_WEEK_7', 0.014515185833815808),
 ('DAY_OF_WEEK_5', 0.014378913728022294),
 ('DAY_OF_WEEK_2', 0.013863689318802901),
 ('DAY_OF_WEEK_3', 0.013636767107522598),
 ('DAY_OF_WEEK_4', 0.01340176122884812),
 ('DAY_OF_WEEK_1', 0.01320260929926689),
 ('ROAD_GEOMETRY_4', 0.00596881135330032),
 ('LIGHT_CONDITION_9', 0.

In [41]:


# Map class labels to start from 0
y_train_mapped = y_train - 1  # Subtract 1 from each class label to map to 0, 1, 2
y_test_mapped = y_test - 1

# Calculate class weights for balanced classes
class_weights = len(y_train_mapped) / (len(np.unique(y_train_mapped)) * np.bincount(y_train_mapped))

# Create a custom weight array for each sample in the training data
sample_weights = np.array([class_weights[label] for label in y_train_mapped])

# Define the XGBoost classifier
clf = xgb.XGBClassifier(random_state=42)

# Define the hyperparameter grid for the XGBoost model
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 4]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)

# Fit the GridSearchCV to your data, passing the custom sample weights
grid_search.fit(X_train, y_train_mapped, sample_weight=sample_weights)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
best_model = xgb.XGBClassifier(random_state=42, **best_params)

# Fit the model using the custom sample weights
best_model.fit(X_train, y_train_mapped, sample_weight=sample_weights)

# Make predictions on the test set
y_pred_best = best_model.predict(X_test)

# Map predicted class labels back to 1, 2, 3
y_pred_best_mapped = y_pred_best + 1

# Evaluate the best model
accuracy_best = accuracy_score(y_test, y_pred_best_mapped)
print(f'Best Model Accuracy: {accuracy_best}')
print(classification_report(y_test, y_pred_best_mapped))
print(confusion_matrix(y_test, y_pred_best_mapped))


  if is_sparse(dtype):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype)
  if is_sparse(dtype):
  if is_categorical_dtype(dtype)
  elif is_categorical_dtype(dtype) and enable_categorical:
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype)
  if is_sparse(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  eli

Best Model Accuracy: 0.502921471807595
              precision    recall  f1-score   support

           1       0.05      0.66      0.09      1992
           2       0.37      0.28      0.32     39076
           3       0.74      0.60      0.67     82671

    accuracy                           0.50    123739
   macro avg       0.39      0.51      0.36    123739
weighted avg       0.61      0.50      0.55    123739

[[ 1307   350   335]
 [10983 11122 16971]
 [14622 18247 49802]]


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [42]:
feature_importance = best_model.feature_importances_

# Create a list or a DataFrame to associate feature names with importance scores
feature_names = X_train.columns  # Replace with your feature names
feature_importance_list = list(zip(feature_names, feature_importance))

# Sort the features by importance (optional)
feature_importance_list.sort(key=lambda x: x[1], reverse=True)

# Display the list of feature importance
print("Feature Importance:")
for feature, importance in feature_importance_list:
    print(f"{feature}: {importance}")

Feature Importance:
SPEED_ZONE: 0.14273853600025177
SEX_M: 0.12902940809726715
SURFACE_COND_9: 0.112106554210186
LIGHT_CONDITION_5: 0.05999951809644699
LIGHT_CONDITION_3: 0.049157850444316864
HELMET_BELT_WORN_6: 0.03957219794392586
ROAD_GEOMETRY_5: 0.03730175271630287
HELMET_BELT_WORN_2: 0.03188567981123924
DAY_OF_WEEK_7: 0.028122056275606155
SURFACE_COND_2: 0.02611190266907215
LIGHT_CONDITION_9: 0.022196322679519653
DAY_OF_WEEK_4: 0.02030378393828869
ROAD_GEOMETRY_2: 0.01974046789109707
ROAD_GEOMETRY_3: 0.01781030371785164
HELMET_BELT_WORN_7: 0.017032792791724205
AGE: 0.01696138083934784
HELMET_BELT_WORN_8: 0.01631156913936138
TOTAL_NO_OCCUPANTS: 0.015825118869543076
DAY_OF_WEEK_6: 0.015516722574830055
LIGHT_CONDITION_4: 0.013931464403867722
HELMET_BELT_WORN_9: 0.013537454418838024
ROAD_GEOMETRY_4: 0.013510229997336864
SURFACE_COND_5: 0.012903150171041489
LIGHT_CONDITION_6: 0.012310940772294998
DAY_OF_WEEK_1: 0.012018210254609585
VEHICLE_YEARS_OLD: 0.01151778269559145
LIGHT_CONDITION_

Support Vector Classifier

In [17]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define the hyperparameter grid for the SVC with 'rbf' kernel and a specific gamma value
param_grid = {
    'C': [0.1, 1, 5],
    'kernel': ['rbf'],
}

# Create an SVC classifier
svc = SVC(class_weight='balanced', random_state=42)

# Create a GridSearchCV object
grid_search_svc = GridSearchCV(svc, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)

# Fit the GridSearchCV to your data
grid_search_svc.fit(X_train, y_train)

# Get the best hyperparameters
best_params_svc = grid_search_svc.best_params_

# Train the model with the best hyperparameters
best_svc_model = SVC(class_weight='balanced', random_state=42, **best_params_svc)
best_svc_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_best_svc = best_svc_model.predict(X_test)

# Evaluate the best SVC model
accuracy_best_svc = accuracy_score(y_test, y_pred_best_svc)
print(f'Best SVC Model Accuracy: {accuracy_best_svc}')
print(classification_report(y_test, y_pred_best_svc))
print(confusion_matrix(y_test, y_pred_best_svc))


KeyboardInterrupt: 

In [45]:
feature_names

Index(['AGE', 'SPEED_ZONE', 'TOTAL_NO_OCCUPANTS', 'VEHICLE_YEARS_OLD', 'SEX_M',
       'SEX_U', 'HELMET_BELT_WORN_2', 'HELMET_BELT_WORN_4',
       'HELMET_BELT_WORN_5', 'HELMET_BELT_WORN_6', 'HELMET_BELT_WORN_7',
       'HELMET_BELT_WORN_8', 'HELMET_BELT_WORN_9', 'DAY_OF_WEEK_1',
       'DAY_OF_WEEK_2', 'DAY_OF_WEEK_3', 'DAY_OF_WEEK_4', 'DAY_OF_WEEK_5',
       'DAY_OF_WEEK_6', 'DAY_OF_WEEK_7', 'LIGHT_CONDITION_2',
       'LIGHT_CONDITION_3', 'LIGHT_CONDITION_4', 'LIGHT_CONDITION_5',
       'LIGHT_CONDITION_6', 'LIGHT_CONDITION_9', 'ROAD_GEOMETRY_2',
       'ROAD_GEOMETRY_3', 'ROAD_GEOMETRY_4', 'ROAD_GEOMETRY_5',
       'ROAD_GEOMETRY_6', 'ROAD_GEOMETRY_7', 'ROAD_GEOMETRY_8',
       'ROAD_GEOMETRY_9', 'SURFACE_COND_2', 'SURFACE_COND_3', 'SURFACE_COND_4',
       'SURFACE_COND_5', 'SURFACE_COND_9'],
      dtype='object')

In [60]:
df

Unnamed: 0,ACCIDENT_NO,SEX,AGE,HELMET_BELT_WORN,ACCIDENTTIME,DAY_OF_WEEK,Day Week Description,LIGHT_CONDITION,Light Condition Desc,ROAD_GEOMETRY,...,SEVERITY,SPEED_ZONE,SURFACE_COND,Surface Cond Desc,VEHICLE_BODY_STYLE,TOTAL_NO_OCCUPANTS,VEHICLE_YEARS_OLD,AGE_BAND,OCCUPANTS_RANGE,VEHICLE_AGE_RANGE
0,T20060000010,M,72.0,1,12:42:00,6,Friday,1,Day,1,...,3,60,1,Dry,SEDAN,2.0,24.0,"[70, 80)",3,20-25
1,T20060000010,M,72.0,1,12:42:00,6,Friday,1,Day,1,...,3,60,1,Dry,COUPE,1.0,17.0,"[70, 80)",2,15-20
2,T20060000010,M,72.0,1,12:42:00,6,Friday,1,Day,1,...,3,60,1,Dry,SEDAN,3.0,19.0,"[70, 80)",4,15-20
3,T20060000010,F,62.0,1,12:42:00,6,Friday,1,Day,1,...,3,60,1,Dry,SEDAN,2.0,24.0,"[60, 70)",3,20-25
4,T20060000010,F,62.0,1,12:42:00,6,Friday,1,Day,1,...,3,60,1,Dry,COUPE,1.0,17.0,"[60, 70)",2,15-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634032,T20200019250,M,45.0,1,18:00:00,0,Sunday,1,Day,1,...,2,60,1,Dry,SOLO,1.0,13.0,"[40, 50)",2,10-15
634033,T20200019250,M,56.0,6,18:00:00,0,Sunday,1,Day,1,...,2,60,1,Dry,SEDAN,1.0,13.0,"[50, 60)",2,10-15
634034,T20200019250,M,56.0,6,18:00:00,0,Sunday,1,Day,1,...,2,60,1,Dry,SOLO,1.0,13.0,"[50, 60)",2,10-15
634035,T20200019253,M,65.0,6,12:00:00,1,Sunday,1,Day,5,...,2,80,1,Dry,SOLO,1.0,13.0,"[60, 70)",2,10-15


In [91]:
df.loc[df['DAY_OF_WEEK']==6]

Unnamed: 0,ACCIDENT_NO,SEX,AGE,HELMET_BELT_WORN,ACCIDENTTIME,DAY_OF_WEEK,Day Week Description,LIGHT_CONDITION,Light Condition Desc,ROAD_GEOMETRY,...,SEVERITY,SPEED_ZONE,SURFACE_COND,Surface Cond Desc,VEHICLE_BODY_STYLE,TOTAL_NO_OCCUPANTS,VEHICLE_YEARS_OLD,AGE_BAND,OCCUPANTS_RANGE,VEHICLE_AGE_RANGE
0,T20060000010,M,72.0,1,12:42:00,6,Friday,1,Day,1,...,3,60,1,Dry,SEDAN,2.0,24.0,"[70, 80)",3,20-25
1,T20060000010,M,72.0,1,12:42:00,6,Friday,1,Day,1,...,3,60,1,Dry,COUPE,1.0,17.0,"[70, 80)",2,15-20
2,T20060000010,M,72.0,1,12:42:00,6,Friday,1,Day,1,...,3,60,1,Dry,SEDAN,3.0,19.0,"[70, 80)",4,15-20
3,T20060000010,F,62.0,1,12:42:00,6,Friday,1,Day,1,...,3,60,1,Dry,SEDAN,2.0,24.0,"[60, 70)",3,20-25
4,T20060000010,F,62.0,1,12:42:00,6,Friday,1,Day,1,...,3,60,1,Dry,COUPE,1.0,17.0,"[60, 70)",2,15-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
633989,T20200018649,M,34.0,7,13:55:00,6,Friday,1,Day,5,...,2,50,1,Dry,SOLO,1.0,13.0,"[30, 40)",2,10-15
634025,T20200019195,F,40.0,9,15:16:00,6,Saturday,1,Day,2,...,2,40,1,Dry,SEDAN,1.0,6.0,"[40, 50)",2,5-10
634026,T20200019195,F,40.0,9,15:16:00,6,Saturday,1,Day,2,...,2,40,1,Dry,SOLO,1.0,4.0,"[40, 50)",2,0-5
634027,T20200019195,M,25.0,6,15:16:00,6,Saturday,1,Day,2,...,2,40,1,Dry,SEDAN,1.0,6.0,"[20, 30)",2,5-10


In [None]:
AGE: 0.3378
VEHICLE_YEARS_OLD: 0.2604
SPEED_ZONE: 0.1138
TOTAL_NO_OCCUPANTS: 0.0487
W: 0.0201
SURFACE_COND_2: 0.0195
ROAD_GEOMETRY_5: 0.0162
SEX_M: 0.0158
LIGHT_CONDITION_5: 0.0138
LIGHT_CONDITION_3: 0.0136
LIGHT_CONDITION_2: 0.0135
ROAD_GEOMETRY_2: 0.0124
DAY_OF_WEEK_6: 0.0112
DAY_OF_WEEK_5: 0.0109