In [3]:
import pandas as pd
import numpy as np

In [4]:
# Import the Chapel Hill data
df_work = pd.read_csv("/Users/danoand/Documents/Companies/LambdaSchool/Build_Project_02/data/pedestrian-crashes-chapel-hill-region_imported.txt",
                     sep='\t')

In [5]:
# Profile the "work" dataframe
# from pandas_profiling import ProfileReport

# profile = ProfileReport(df_work, title='Pandas Profiling Report', html={'style':{'full_width':True}})
# profile.to_file(output_file="/Users/danoand/Documents/Companies/LambdaSchool/Build_Project_02/data/ChapelHillAccidents.html")

In [6]:
# Define "ante" columns (data that can be reasonably known prior to the event)
# Notes:
#   - removed the lat/long column 
categories_ante = [
    'City',
    'County',
    'CrashHour',
    'CrashMonth',
    'Development',
    'LightCond',
    'Locality',
    'NumLanes',
    'RdCharacte',
    'RdClass',
    'RdConditio',
    'RdConfig',
    'RdDefects',
    'RdFeature',
    'RdSurface',
    'Region',
    'RuralUrban',
    'SpeedLimit',
    'TraffCntrl',
    'Weather',
    'Workzone',
    'PedInjury']        # Source for the target data

In [7]:
# Define a dataframe with "ante" data
df_ante = df_work[categories_ante]

df_ante.sample(10)

Unnamed: 0,City,County,CrashHour,CrashMonth,Development,LightCond,Locality,NumLanes,RdCharacte,RdClass,...,RdDefects,RdFeature,RdSurface,Region,RuralUrban,SpeedLimit,TraffCntrl,Weather,Workzone,PedInjury
14724,Charlotte,Mecklenburg,16,March,Commercial,Daylight,Urban (>70% Developed),4 lanes,Straight - Level,Local Street,...,,Four-Way Intersection,Smooth Asphalt,Piedmont,Urban,20 - 25 MPH,Human Control,Clear,No,C: Possible Injury
4045,Durham,Durham,23,November,Residential,Dark - Roadway Not Lighted,Urban (>70% Developed),3 lanes,Curve - Grade,Local Street,...,,No Special Feature,Smooth Asphalt,Piedmont,Urban,30 - 35 MPH,No Control Present,Clear,No,K: Killed
23090,None - Rural Crash,Onslow,13,October,"Farms, Woods, Pastures",Daylight,Rural (<30% Developed),Unknown,Straight - Level,"Private Road, Driveway",...,,"Driveway, Private",Sand,Coastal,Rural,5 - 15 MPH,No Control Present,Clear,No,C: Possible Injury
7658,Charlotte,Mecklenburg,17,March,Residential,Daylight,Mixed (30% To 70% Developed),4 lanes,Straight - Level,Local Street,...,,T-Intersection,Coarse Asphalt,Piedmont,Urban,5 - 15 MPH,No Control Present,Clear,No,C: Possible Injury
13921,Forest City,Rutherford,19,December,Commercial,Dark - Roadway Not Lighted,Urban (>70% Developed),5 lanes,Straight - Hillcrest,Local Street,...,,Four-Way Intersection,Smooth Asphalt,Mountains,Urban,40 - 45 MPH,Stop And Go Signal,Clear,No,C: Possible Injury
12155,Thomasville,Davidson,11,October,Residential,Dawn,Mixed (30% To 70% Developed),Unknown,Straight - Level,Public Vehicular Area,...,,No Special Feature,Concrete,Piedmont,Urban,5 - 15 MPH,No Control Present,Clear,No,C: Possible Injury
4109,None - Rural Crash,Beaufort,12,April,Residential,Daylight,Mixed (30% To 70% Developed),3 lanes,Curve - Level,NC Route,...,,"Driveway, Public",Smooth Asphalt,Coastal,Rural,40 - 45 MPH,"Double Yellow Line, No Passing Zone",Clear,No,A: Suspected Serious Injury
14101,Charlotte,Mecklenburg,14,May,Commercial,Daylight,Urban (>70% Developed),4 lanes,Straight - Level,Local Street,...,,Four-Way Intersection,Smooth Asphalt,Piedmont,Urban,30 - 35 MPH,Stop And Go Signal,Clear,No,B: Suspected Minor Injury
28291,Wilmington,New Hanover,18,April,Commercial,Daylight,Urban (>70% Developed),4 lanes,Straight - Level,Local Street,...,,No Special Feature,Smooth Asphalt,Coastal,Urban,30 - 35 MPH,No Control Present,Clear,No,C: Possible Injury
6394,Fayetteville,Cumberland,2,November,Commercial,Dark - Lighted Roadway,Urban (>70% Developed),Unknown,Straight - Level,Public Vehicular Area,...,,"Driveway, Public",Coarse Asphalt,Coastal,Urban,5 - 15 MPH,No Control Present,Clear,No,C: Possible Injury


In [8]:
# Profile the df_ante dataframe
# from pandas_profiling import ProfileReport

# profile = ProfileReport(df_work, title='Pandas Profiling Report', html={'style':{'full_width':True}})
# profile.to_file(output_file="/Users/danoand/Documents/Companies/LambdaSchool/Build_Project_02/data/ChapelHillAccidents.html")

In [10]:
# Split the data into training, validation, and testing datasets
from sklearn.model_selection import train_test_split
df_ante_train, df_ante_test = train_test_split(df_ante, train_size=0.85, test_size=0.15, 
                              stratify=df_ante['PedInjury'], random_state=42)

print(f'Shape of df_ante_train: {df_ante_train.shape}; Shape of df_ante_test: {df_ante_test.shape}')

Shape of df_ante_train: (28650, 22); Shape of df_ante_test: (5057, 22)


In [11]:
# Split the train data into a training subset and a validation subset
df_ante_train, df_ante_val = train_test_split(df_ante_train, test_size=len(df_ante_test), 
                              stratify=df_ante_train['PedInjury'], random_state=42)

print(f'Shape of df_ante_train: {df_ante_train.shape}; Shape of df_ante_val: {df_ante_val.shape}; Shape of df_ante_test: {df_ante_test.shape}')

Shape of df_ante_train: (23593, 22); Shape of df_ante_val: (5057, 22); Shape of df_ante_test: (5057, 22)


In [12]:
# Breakdown the target attribute by class value
df_ante_train['PedInjury'].value_counts(normalize=True)

C: Possible Injury             0.409020
B: Suspected Minor Injury      0.354597
A: Suspected Serious Injury    0.072140
K: Killed                      0.063917
O: No Injury                   0.059975
Unknown Injury                 0.040351
Name: PedInjury, dtype: float64

In [13]:
# Wrangle the modeling data
outcome_serious = ['K: Killed', 'A: Suspected Serious Injury']

# has_fatality is a function that returns a boolean value if the inbound value represents a serious outcome (True) or not (False)
def has_serious_outcome(val):
  if (val in outcome_serious): 
    return True

  return False

# wrangle returns a dataframe with updated/created columns for a given input dataframe 
def wrangle(DF):
  X = DF.copy()

  # Create a boolean target column reflecting an outcome (serious injury or not)
  X['ped_serious_outcome'] = X['PedInjury'].apply(has_serious_outcome)

  # Convert the CrashHour column to a string (from an int)
  X['CrashHour'] = X['CrashHour'].apply(str)

  # List columns to be dropped
  cols_drop = ['PedInjury'] # PedInjury - use the engineered 'ped_fatality' column

  # Drop undesired columns
  X = X.drop(columns=cols_drop)

  return X

In [15]:
# Wrangle the training, validation, and test datasets
df_wrgl_train = wrangle(df_ante_train)
df_wrgl_val   = wrangle(df_ante_val)
df_wrgl_test  = wrangle(df_ante_test)

print(f'Shape of df_wrgl_train: {df_wrgl_train.shape}; Shape of df_wrgl_val: {df_wrgl_val.shape}; Shape of df_wrgl_test: {df_wrgl_test.shape}\n')

print(f'\nColumn Data Types:\n{df_wrgl_val.dtypes}')

Shape of df_wrgl_train: (23593, 22); Shape of df_wrgl_val: (5057, 22); Shape of df_wrgl_test: (5057, 22)


Column Data Types:
City                   object
County                 object
CrashHour              object
CrashMonth             object
Development            object
LightCond              object
Locality               object
NumLanes               object
RdCharacte             object
RdClass                object
RdConditio             object
RdConfig               object
RdDefects              object
RdFeature              object
RdSurface              object
Region                 object
RuralUrban             object
SpeedLimit             object
TraffCntrl             object
Weather                object
Workzone               object
ped_serious_outcome      bool
dtype: object


In [16]:
df_wrgl_train.sample(5)

Unnamed: 0,City,County,CrashHour,CrashMonth,Development,LightCond,Locality,NumLanes,RdCharacte,RdClass,...,RdDefects,RdFeature,RdSurface,Region,RuralUrban,SpeedLimit,TraffCntrl,Weather,Workzone,ped_serious_outcome
31219,Charlotte,Mecklenburg,18,July,Commercial,Daylight,Urban (>70% Developed),4 lanes,Straight - Level,Local Street,...,,No Special Feature,Smooth Asphalt,Piedmont,Urban,30 - 35 MPH,No Control Present,Clear,No,False
28558,Asheville,Buncombe,22,July,Commercial,Dark - Lighted Roadway,Urban (>70% Developed),4 lanes,Straight - Grade,US Route,...,,Four-Way Intersection,Smooth Asphalt,Mountains,Urban,30 - 35 MPH,Stop And Go Signal,Clear,No,False
2217,None - Rural Crash,Cumberland,0,April,Residential,Dark - Roadway Not Lighted,Rural (<30% Developed),1 lane,Curve - Grade,"Private Road, Driveway",...,,"Driveway, Private",Sand,Coastal,Rural,5 - 15 MPH,No Control Present,Clear,No,True
5131,Durham,Durham,16,January,Institutional,Daylight,Urban (>70% Developed),2 lanes,Straight - Level,Local Street,...,,No Special Feature,Coarse Asphalt,Piedmont,Urban,30 - 35 MPH,No Control Present,Cloudy,Yes,False
3399,Garysburg,Northampton,7,February,Residential,Daylight,Urban (>70% Developed),2 lanes,Straight - Level,Local Street,...,,No Special Feature,Coarse Asphalt,Coastal,Rural,20 - 25 MPH,No Control Present,Clear,No,False


In [18]:
# Construct the modeling datasets
target = 'ped_serious_outcome'
X_train = df_wrgl_train.drop(columns=target)
y_train = df_wrgl_train[target]

X_val   = df_wrgl_val.drop(columns=target)
y_val   = df_wrgl_val[target]

X_test  = df_wrgl_test.drop(columns=target)
y_test  = df_wrgl_test[target]


print(f'\nValidation Column Data Types:\n{X_val.dtypes}')


Validation Column Data Types:
City           object
County         object
CrashHour      object
CrashMonth     object
Development    object
LightCond      object
Locality       object
NumLanes       object
RdCharacte     object
RdClass        object
RdConditio     object
RdConfig       object
RdDefects      object
RdFeature      object
RdSurface      object
Region         object
RuralUrban     object
SpeedLimit     object
TraffCntrl     object
Weather        object
Workzone       object
dtype: object


In [19]:
# Create a baseline prediction using a dummy classifier
from sklearn.dummy import DummyClassifier
clfr_dummy = DummyClassifier(strategy="stratified")

clfr_dummy.fit(X_train, y_train)
clfr_dummy.predict(X_train)

dummy_train_y_pred = clfr_dummy.predict(X_train)

print(f'Baseline prediction score for the training set is: {round(clfr_dummy.score(X_train, y_train), 4)}')

Baseline prediction score for the training set is: 0.764


In [38]:
# Generate the classification report for the Dummy Classifier model
from sklearn.metrics import classification_report

print(f'Dummy (Baseline) Classification Report:\n\n{classification_report(y_train, dummy_train_y_pred)}')

Dummy (Baseline) Classification Report:

              precision    recall  f1-score   support

       False       0.86      0.86      0.86     20383
        True       0.13      0.13      0.13      3210

    accuracy                           0.76     23593
   macro avg       0.50      0.50      0.50     23593
weighted avg       0.76      0.76      0.76     23593



In [39]:
import category_encoders as ce 
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

print(X_train.dtypes)

# Construct a modeling pipeline
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    RandomForestClassifier(n_jobs=-1, random_state=42)
)

# Fit the pipeline to the training dataset
pipeline.fit(X_train, y_train)

City           object
County         object
CrashHour      object
CrashMonth     object
Development    object
LightCond      object
Locality       object
NumLanes       object
RdCharacte     object
RdClass        object
RdConditio     object
RdConfig       object
RdDefects      object
RdFeature      object
RdSurface      object
Region         object
RuralUrban     object
SpeedLimit     object
TraffCntrl     object
Weather        object
Workzone       object
dtype: object


Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['City', 'County', 'CrashHour',
                                      'CrashMonth', 'Development', 'LightCond',
                                      'Locality', 'NumLanes', 'RdCharacte',
                                      'RdClass', 'RdConditio', 'RdConfig',
                                      'RdDefects', 'RdFeature', 'RdSurface',
                                      'Region', 'RuralUrban', 'SpeedLimit',
                                      'TraffCntrl', 'Weather', 'Workzone'],
                                drop_invariant=False, handle_missing='valu...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

In [40]:
# Calculate the accuracy using the validation data
print(f'The Validation Accuracy is: {round(pipeline.score(X_val, y_val), 4)}')

The Validation Accuracy is: 0.8655


In [45]:
# Generate the classification report for the Random Forest Classifier model
from sklearn.metrics import classification_report

y_val_pred = pipeline.predict(X_val)

print(f'Random Forest Classification Report:\n\n{classification_report(y_val, y_val_pred)}')

Random Forest Classification Report:

              precision    recall  f1-score   support

       False       0.87      0.99      0.93      4368
        True       0.53      0.11      0.18       689

    accuracy                           0.87      5057
   macro avg       0.70      0.55      0.55      5057
weighted avg       0.83      0.87      0.82      5057



In [23]:
# Construct the encoding and modeling objects for use in generating a pdp plot
trans_encode = make_pipeline(
    ce.OrdinalEncoder()
)

# Encode the train and validation attributes
X_train_trans_encode = trans_encode.fit_transform(X_train)
X_val_trans_encode   = trans_encode.fit_transform(X_val)

# Recreate the model
tmp_mdl = RandomForestClassifier(n_jobs=-1, random_state=42)
tmp_mdl.fit(X_train_trans_encode, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [24]:
# Calculate the permutation importance
import eli5
from eli5.sklearn import PermutationImportance

# Create a permutation importance object
prmuter = PermutationImportance(
    tmp_mdl,
    scoring='accuracy',
    n_iter=5,
    random_state=42)

# Compute feature importance
prmuter.fit(X_val_trans_encode, y_val)

PermutationImportance(cv='prefit',
                      estimator=RandomForestClassifier(bootstrap=True,
                                                       ccp_alpha=0.0,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=None,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       max_samples=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fr

In [26]:
# List out the independent variables by importance
feature_names = X_val.columns.tolist()

# Display permutation importances
eli5.show_weights(
    prmuter,
    top=None,
    feature_names=feature_names)

Weight,Feature
0.0036  ± 0.0008,LightCond
0.0026  ± 0.0019,TraffCntrl
0.0020  ± 0.0015,CrashMonth
0.0018  ± 0.0028,RdClass
0.0014  ± 0.0014,SpeedLimit
0.0013  ± 0.0014,RdConfig
0.0011  ± 0.0011,Locality
0.0010  ± 0.0014,NumLanes
0.0008  ± 0.0028,Development
0.0008  ± 0.0026,CrashHour


In [33]:
# Generate a logistic regression model
from sklearn.linear_model import LogisticRegressionCV

# Construct a modeling pipeline
pipeline_logstc = make_pipeline(
    ce.OrdinalEncoder(),
    LogisticRegressionCV(cv=5, n_jobs=-1, random_state=42)
)

# Fit the pipeline to the training dataset
pipeline_logstc.fit(X_train, y_train)

# Calculate the accuracy using the validation data
print(f'The Validation Accuracy is: {round(pipeline_logstc.score(X_val, y_val), 4)}')


The Validation Accuracy is: 0.8638


In [35]:
pipeline_logstc[1]

LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='auto', n_jobs=-1, penalty='l2',
                     random_state=42, refit=True, scoring=None, solver='lbfgs',
                     tol=0.0001, verbose=0)

In [47]:
# Generate the classification report for the Logistic Regression model
y_val_pred_lgtc = pipeline_logstc.predict(X_val)

print(f'Logistic Regression Classification Report:\n\n{classification_report(y_val, y_val_pred_lgtc)}')

Logistic Regression Classification Report:

              precision    recall  f1-score   support

       False       0.86      1.00      0.93      4368
        True       0.00      0.00      0.00       689

    accuracy                           0.86      5057
   macro avg       0.43      0.50      0.46      5057
weighted avg       0.75      0.86      0.80      5057

