In [1]:
import pandas as pd
import numpy as np

In [2]:
# Import the Chapel Hill data
df_work = pd.read_csv("/Users/danoand/Documents/Companies/LambdaSchool/Build_Project_02/data/pedestrian-crashes-chapel-hill-region_imported.txt",
                     sep='\t')

In [3]:
# Profile the "work" dataframe
from pandas_profiling import ProfileReport

profile = ProfileReport(df_work, title='Pandas Profiling Report', html={'style':{'full_width':True}})
profile.to_file(output_file="/Users/danoand/Documents/Companies/LambdaSchool/Build_Project_02/data/ChapelHillAccidents.html")

In [4]:
# Define "ante" columns (data that can be reasonably known prior to the event)
# Notes:
#   - removed the lat/long column 
categories_ante = [
    'City',
    'County',
    'CrashHour',
    'CrashMonth',
    'Development',
    'LightCond',
    'Locality',
    'NumLanes',
    'RdCharacte',
    'RdClass',
    'RdConditio',
    'RdConfig',
    'RdDefects',
    'RdFeature',
    'RdSurface',
    'Region',
    'RuralUrban',
    'SpeedLimit',
    'TraffCntrl',
    'Weather',
    'Workzone',
    'PedInjury']        # Source for the target data

In [6]:
# Define a dataframe with "ante" data
df_ante = df_work[categories_ante]

df_ante.sample(10)

Unnamed: 0,City,County,CrashHour,CrashMonth,Development,LightCond,Locality,NumLanes,RdCharacte,RdClass,RdConditio,RdConfig,RdDefects,RdFeature,RdSurface,Region,RuralUrban,SpeedLimit,TraffCntrl,Weather,Workzone,PedInjury
20440,Mount Airy,Surry,15,January,Commercial,Daylight,Urban (>70% Developed),1 lane,Straight - Level,Public Vehicular Area,Snow,"One-Way, Not Divided",,Missing,Coarse Asphalt,Mountains,Urban,5 - 15 MPH,No Control Present,Cloudy,No,C: Possible Injury
23716,Wilmington,New Hanover,10,October,Institutional,Daylight,Urban (>70% Developed),2 lanes,Straight - Level,Local Street,Wet,"Two-Way, Not Divided",,T-Intersection,Smooth Asphalt,Coastal,Urban,20 - 25 MPH,Stop Sign,Cloudy,No,B: Suspected Minor Injury
30862,None - Rural Crash,Davidson,15,November,"Farms, Woods, Pastures",Daylight,Rural (<30% Developed),2 lanes,Straight - Level,NC Route,Wet,"Two-Way, Not Divided",,No Special Feature,Smooth Asphalt,Piedmont,Rural,40 - 45 MPH,"Double Yellow Line, No Passing Zone",Cloudy,No,B: Suspected Minor Injury
11313,Wilmington,New Hanover,14,January,Commercial,Daylight,Urban (>70% Developed),Unknown,Straight - Level,Public Vehicular Area,Wet,"One-Way, Not Divided",,No Special Feature,Smooth Asphalt,Coastal,Urban,5 - 15 MPH,No Control Present,Rain,No,C: Possible Injury
9137,Charlotte,Mecklenburg,16,March,Residential,Daylight,Urban (>70% Developed),2 lanes,Straight - Grade,Public Vehicular Area,Dry,"Two-Way, Not Divided",,No Special Feature,Smooth Asphalt,Piedmont,Urban,30 - 35 MPH,No Control Present,Clear,No,C: Possible Injury
6672,Rocky Mount,Edgecombe,15,March,Residential,Daylight,Urban (>70% Developed),2 lanes,Straight - Level,Local Street,Dry,"Two-Way, Not Divided",,No Special Feature,Smooth Asphalt,Coastal,Urban,30 - 35 MPH,No Control Present,Clear,No,Unknown Injury
1616,None - Rural Crash,Watauga,7,February,"Farms, Woods, Pastures",Daylight,Mixed (30% To 70% Developed),2 lanes,Straight - Grade,US Route,Wet,"Two-Way, Not Divided",,No Special Feature,Smooth Asphalt,Mountains,Rural,40 - 45 MPH,"Double Yellow Line, No Passing Zone",Clear,No,B: Suspected Minor Injury
15478,Fayetteville,Cumberland,9,May,Residential,Daylight,Urban (>70% Developed),2 lanes,Straight - Level,Public Vehicular Area,Dry,"Two-Way, Not Divided",,No Special Feature,Coarse Asphalt,Coastal,Urban,5 - 15 MPH,No Control Present,Clear,No,B: Suspected Minor Injury
4803,Morganton,Burke,8,May,Commercial,Daylight,Urban (>70% Developed),4 lanes,Straight - Level,US Route,Dry,"Two-Way, Not Divided",Work Zone,No Special Feature,Smooth Asphalt,Mountains,Urban,40 - 45 MPH,No Control Present,Cloudy,No,B: Suspected Minor Injury
32101,None - Rural Crash,Moore,19,January,"Farms, Woods, Pastures",Dark - Roadway Not Lighted,Rural (<30% Developed),2 lanes,Straight - Grade,NC Route,Wet,"Two-Way, Not Divided",,No Special Feature,Coarse Asphalt,Piedmont,Rural,50 - 55 MPH,"Double Yellow Line, No Passing Zone",Rain,No,O: No Injury


In [7]:
# Profile the df_ante dataframe
from pandas_profiling import ProfileReport

profile_ante = ProfileReport(df_ante, title='Pandas Profiling Report', html={'style':{'full_width':True}})
profile_ante.to_file(output_file="/Users/danoand/Documents/Companies/LambdaSchool/Build_Project_02/data/ChapelHillAccidents_Ante.html")

In [8]:
# Split the data into training, validation, and testing datasets
from sklearn.model_selection import train_test_split
df_ante_train, df_ante_test = train_test_split(df_ante, train_size=0.85, test_size=0.15, 
                              stratify=df_ante['PedInjury'], random_state=42)

print(f'Shape of df_ante_train: {df_ante_train.shape}; Shape of df_ante_test: {df_ante_test.shape}')

Shape of df_ante_train: (28650, 22); Shape of df_ante_test: (5057, 22)


In [9]:
# Split the train data into a training subset and a validation subset
df_ante_train, df_ante_val = train_test_split(df_ante_train, test_size=len(df_ante_test), 
                              stratify=df_ante_train['PedInjury'], random_state=42)

print(f'Shape of df_ante_train: {df_ante_train.shape}; Shape of df_ante_val: {df_ante_val.shape}; Shape of df_ante_test: {df_ante_test.shape}')

Shape of df_ante_train: (23593, 22); Shape of df_ante_val: (5057, 22); Shape of df_ante_test: (5057, 22)


In [10]:
# Breakdown the target attribute by class value
df_ante_train['PedInjury'].value_counts(normalize=True)

C: Possible Injury             0.409020
B: Suspected Minor Injury      0.354597
A: Suspected Serious Injury    0.072140
K: Killed                      0.063917
O: No Injury                   0.059975
Unknown Injury                 0.040351
Name: PedInjury, dtype: float64

In [11]:
# Wrangle the modeling data
outcome_serious = ['K: Killed', 'A: Suspected Serious Injury']

# has_fatality is a function that returns a boolean value if the inbound value represents a serious outcome (True) or not (False)
def has_serious_outcome(val):
  if (val in outcome_serious): 
    return True

  return False

# wrangle returns a dataframe with updated/created columns for a given input dataframe 
def wrangle(DF):
  X = DF.copy()

  # Create a boolean target column reflecting an outcome (serious injury or not)
  X['ped_serious_outcome'] = X['PedInjury'].apply(has_serious_outcome)

  # List columns to be dropped
  cols_drop = ['PedInjury'] # PedInjury - use the engineered 'ped_fatality' column

  # Drop undesired columns
  X = X.drop(columns=cols_drop)

  return X

In [12]:
# Wrangle the training, validation, and test datasets
df_wrgl_train = wrangle(df_ante_train)
df_wrgl_val   = wrangle(df_ante_val)
df_wrgl_test  = wrangle(df_ante_test)

print(f'Shape of df_wrgl_train: {df_wrgl_train.shape}; Shape of df_wrgl_val: {df_wrgl_val.shape}; Shape of df_wrgl_test: {df_wrgl_test.shape}')

Shape of df_wrgl_train: (23593, 22); Shape of df_wrgl_val: (5057, 22); Shape of df_wrgl_test: (5057, 22)


In [13]:
df_wrgl_train.sample(5)

Unnamed: 0,City,County,CrashHour,CrashMonth,Development,LightCond,Locality,NumLanes,RdCharacte,RdClass,RdConditio,RdConfig,RdDefects,RdFeature,RdSurface,Region,RuralUrban,SpeedLimit,TraffCntrl,Weather,Workzone,ped_serious_outcome
2538,Smithfield,Johnston,21,June,Commercial,Dark - Roadway Not Lighted,Urban (>70% Developed),3 lanes,Straight - Hillcrest,Local Street,Dry,"Two-Way, Not Divided",,No Special Feature,Coarse Asphalt,Coastal,Urban,40 - 45 MPH,No Control Present,Clear,No,False
2158,Gastonia,Gaston,8,January,Residential,Daylight,Urban (>70% Developed),4 lanes,Straight - Grade,Local Street,Dry,"Two-Way, Not Divided",,Four-Way Intersection,Smooth Asphalt,Piedmont,Urban,30 - 35 MPH,Stop And Go Signal,Clear,No,False
26993,None - Rural Crash,Moore,11,June,Residential,Daylight,Rural (<30% Developed),2 lanes,Curve - Level,State Secondary Route,Dry,"Two-Way, Not Divided",,No Special Feature,Coarse Asphalt,Piedmont,Rural,40 - 45 MPH,"Double Yellow Line, No Passing Zone",Clear,No,False
1232,Concord,Cabarrus,18,February,Commercial,Dark - Lighted Roadway,Urban (>70% Developed),Unknown,Straight - Level,Local Street,Dry,"One-Way, Not Divided",,Missing,Coarse Asphalt,Piedmont,Urban,Unknown,No Control Present,Clear,No,False
7517,Fayetteville,Cumberland,11,November,Commercial,Daylight,Urban (>70% Developed),5 lanes,Straight - Level,Local Street,Dry,"Two-Way, Divided, Unprotected Median",,Missing,Coarse Asphalt,Coastal,Urban,40 - 45 MPH,Stop And Go Signal,Clear,No,False


In [14]:
# Construct the modeling datasets
target = 'ped_serious_outcome'
X_train = df_wrgl_train.drop(columns=target)
y_train = df_wrgl_train[target]

X_val   = df_wrgl_val.drop(columns=target)
y_val   = df_wrgl_val[target]

X_test  = df_wrgl_test.drop(columns=target)
y_test  = df_wrgl_test[target]

In [15]:
# Create a baseline prediction using a dummy classifier
from sklearn.dummy import DummyClassifier
clfr_dummy = DummyClassifier(strategy="stratified")

clfr_dummy.fit(X_train, y_train)
clfr_dummy.predict(X_train)

dummy_train_y_pred = clfr_dummy.predict(X_train)

print(f'Baseline prediction score for the training set is: {round(clfr_dummy.score(X_train, y_train), 4)}')

Baseline prediction score for the training set is: 0.7683


In [16]:
from sklearn.metrics import classification_report

print(classification_report(y_train, dummy_train_y_pred))

precision    recall  f1-score   support

       False       0.86      0.86      0.86     20383
        True       0.14      0.14      0.14      3210

    accuracy                           0.77     23593
   macro avg       0.50      0.50      0.50     23593
weighted avg       0.77      0.77      0.77     23593



In [17]:
import category_encoders as ce 
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# Construct a modeling pipeline
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    RandomForestClassifier(n_jobs=-1, random_state=42)
)

# Fit the pipeline to the training dataset
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['City', 'County', 'CrashMonth',
                                      'Development', 'LightCond', 'Locality',
                                      'NumLanes', 'RdCharacte', 'RdClass',
                                      'RdConditio', 'RdConfig', 'RdDefects',
                                      'RdFeature', 'RdSurface', 'Region',
                                      'RuralUrban', 'SpeedLimit', 'TraffCntrl',
                                      'Weather', 'Workzone'],
                                drop_invariant=False, handle_missing='value',
                                handle_un...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,


In [18]:
# Determine the validation dataset's accuracy
print(f'Validation Accuracy: {round(pipeline.score(X_val, y_val), 5)}')

Validation Accuracy: 0.86296


In [19]:
# Determine the test dataset's accuracy
print(f'Validation Accuracy: {round(pipeline.score(X_test, y_test), 5)}')

Validation Accuracy: 0.86237


In [20]:
# Predict outcome variables for the validation dataset
y_pred_val = pipeline.predict(X_val)

In [21]:
from sklearn.metrics import classification_report

print(classification_report(y_val, y_pred_val))

precision    recall  f1-score   support

       False       0.87      0.98      0.93      4368
        True       0.49      0.11      0.18       689

    accuracy                           0.86      5057
   macro avg       0.68      0.54      0.55      5057
weighted avg       0.82      0.86      0.82      5057



In [22]:
# Predict outcome variables for the testing dataset
y_pred_test = pipeline.predict(X_test)

In [23]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_test))

precision    recall  f1-score   support

       False       0.87      0.98      0.92      4369
        True       0.47      0.11      0.17       688

    accuracy                           0.86      5057
   macro avg       0.67      0.54      0.55      5057
weighted avg       0.82      0.86      0.82      5057



In [26]:
# Generate the model again for use in generating the permutation importance
trans_encode = make_pipeline(
    ce.OrdinalEncoder()
)

# Encode the train and val independent attributes
X_train_trans_encode = trans_encode.fit_transform(X_train)
X_val_trans_encode   = trans_encode.fit_transform(X_val)

# Create the model 
tmp_mdl = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
tmp_mdl.fit(X_train_trans_encode, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [28]:
# Use the eli5 package to generate permutation importance
import eli5
from eli5.sklearn import PermutationImportance

# Create a permutation importance object
prmuter = PermutationImportance(
    tmp_mdl,
    scoring='accuracy',
    n_iter=5,
    random_state=42)

prmuter.fit(X_val_trans_encode, y_val)

PermutationImportance(cv='prefit',
                      estimator=RandomForestClassifier(bootstrap=True,
                                                       ccp_alpha=0.0,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=None,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       max_samples=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fr

In [30]:
# Grab the feature names
feature_names = X_val.columns.tolist()

# Display the permutation importances 
eli5.show_weights(
    prmuter,
    top=None,
    feature_names=feature_names
)

Weight,Feature
0.0014  ± 0.0012,LightCond
0.0005  ± 0.0022,CrashHour
0.0002  ± 0.0013,NumLanes
0.0001  ± 0.0027,SpeedLimit
0.0000  ± 0.0006,RdDefects
-0.0000  ± 0.0003,Workzone
-0.0000  ± 0.0017,TraffCntrl
-0.0003  ± 0.0006,RdConditio
-0.0003  ± 0.0026,RdClass
-0.0004  ± 0.0026,CrashMonth
