In [141]:
import pandas as pd
import numpy as np

In [142]:
# Import the Chapel Hill data
df_work = pd.read_csv("/Users/danoand/Documents/Companies/LambdaSchool/Build_Project_02/data/pedestrian-crashes-chapel-hill-region_imported.txt",
                     sep='\t')

In [143]:
# Profile the "work" dataframe

In [144]:
# Define "ante" columns (data that can be reasonably known prior to the event)
# Notes:
#   - removed the lat/long column 
categories_ante = [
    'City',
    'County',
    'CrashHour',
    'CrashMonth',
    'Development',
    'LightCond',
    'Locality',
    'NumLanes',
    'RdCharacte',
    'RdClass',
    'RdConditio',
    'RdConfig',
    'RdDefects',
    'RdFeature',
    'RdSurface',
    'Region',
    'RuralUrban',
    'SpeedLimit',
    'TraffCntrl',
    'Weather',
    'Workzone',
    'PedInjury']        # Source for the target data

In [145]:
# Define a dataframe with "ante" data
df_ante = df_work[categories_ante]

df_ante.sample(10)

Unnamed: 0,City,County,CrashHour,CrashMonth,Development,LightCond,Locality,NumLanes,RdCharacte,RdClass,...,RdDefects,RdFeature,RdSurface,Region,RuralUrban,SpeedLimit,TraffCntrl,Weather,Workzone,PedInjury
1975,Raleigh,Wake,17,February,Residential,Daylight,Urban (>70% Developed),4 lanes,Straight - Grade,Local Street,...,,No Special Feature,Smooth Asphalt,Piedmont,Urban,30 - 35 MPH,No Control Present,Cloudy,No,B: Suspected Minor Injury
12329,Charlotte,Mecklenburg,17,March,Commercial,Daylight,Urban (>70% Developed),6 lanes,Straight - Level,Local Street,...,,No Special Feature,Smooth Asphalt,Piedmont,Urban,40 - 45 MPH,No Control Present,Clear,No,B: Suspected Minor Injury
18898,Fayetteville,Cumberland,8,February,Residential,Daylight,Mixed (30% To 70% Developed),Unknown,Curve - Level,Public Vehicular Area,...,,"Driveway, Public",Coarse Asphalt,Coastal,Urban,Unknown,No Control Present,Clear,No,A: Suspected Serious Injury
25131,Raleigh,Wake,21,September,Commercial,Dark - Lighted Roadway,Urban (>70% Developed),4 lanes,Curve - Grade,Local Street,...,,No Special Feature,Smooth Asphalt,Piedmont,Urban,30 - 35 MPH,No Control Present,Clear,No,A: Suspected Serious Injury
6997,Greensboro,Guilford,12,September,Commercial,Daylight,Urban (>70% Developed),2 lanes,Straight - Level,Public Vehicular Area,...,,Other,Smooth Asphalt,Piedmont,Urban,5 - 15 MPH,No Control Present,Clear,No,A: Suspected Serious Injury
2197,Raleigh,Wake,9,March,Commercial,Dawn,Urban (>70% Developed),4 lanes,Straight - Level,Local Street,...,,Four-Way Intersection,Coarse Asphalt,Piedmont,Urban,40 - 45 MPH,Stop And Go Signal,Cloudy,No,C: Possible Injury
8184,Raleigh,Wake,12,April,Residential,Daylight,Urban (>70% Developed),1 lane,Straight - Grade,"Private Road, Driveway",...,,"Driveway, Private",Concrete,Piedmont,Urban,30 - 35 MPH,No Control Present,Clear,No,B: Suspected Minor Injury
22347,Fayetteville,Cumberland,23,October,Commercial,Dark - Lighted Roadway,Urban (>70% Developed),5 lanes,Straight - Level,Local Street,...,,Bridge Approach,Coarse Asphalt,Coastal,Urban,40 - 45 MPH,No Control Present,Clear,No,A: Suspected Serious Injury
11167,None - Rural Crash,Moore,14,October,"Farms, Woods, Pastures",Daylight,Rural (<30% Developed),2 lanes,Straight - Level,State Secondary Route,...,,Four-Way Intersection,Smooth Asphalt,Piedmont,Rural,50 - 55 MPH,Human Control,Clear,No,C: Possible Injury
7005,None - Rural Crash,Martin,18,October,"Farms, Woods, Pastures",Dark - Roadway Not Lighted,Rural (<30% Developed),2 lanes,Straight - Grade,NC Route,...,,No Special Feature,Coarse Asphalt,Coastal,Rural,50 - 55 MPH,"Double Yellow Line, No Passing Zone",Cloudy,No,A: Suspected Serious Injury


In [126]:
# Profile the df_ante dataframe

In [146]:
# Split the data into training, validation, and testing datasets
from sklearn.model_selection import train_test_split
df_ante_train, df_ante_test = train_test_split(df_ante, train_size=0.85, test_size=0.15, 
                              stratify=df_ante['PedInjury'], random_state=42)

print(f'Shape of df_ante_train: {df_ante_train.shape}; Shape of df_ante_test: {df_ante_test.shape}')

Shape of df_ante_train: (28650, 22); Shape of df_ante_test: (5057, 22)


In [147]:
# Split the train data into a training subset and a validation subset
df_ante_train, df_ante_val = train_test_split(df_ante_train, test_size=len(df_ante_test), 
                              stratify=df_ante_train['PedInjury'], random_state=42)

print(f'Shape of df_ante_train: {df_ante_train.shape}; Shape of df_ante_val: {df_ante_val.shape}; Shape of df_ante_test: {df_ante_test.shape}')

Shape of df_ante_train: (23593, 22); Shape of df_ante_val: (5057, 22); Shape of df_ante_test: (5057, 22)


In [148]:
# Breakdown the target attribute by class value
df_ante_train['PedInjury'].value_counts(normalize=True)

C: Possible Injury             0.409020
B: Suspected Minor Injury      0.354597
A: Suspected Serious Injury    0.072140
K: Killed                      0.063917
O: No Injury                   0.059975
Unknown Injury                 0.040351
Name: PedInjury, dtype: float64

In [149]:
# Wrangle the modeling data
outcome_serious = ['K: Killed', 'A: Suspected Serious Injury']

# has_fatality is a function that returns a boolean value if the inbound value represents a serious outcome (True) or not (False)
def has_serious_outcome(val):
  if (val in outcome_serious): 
    return True

  return False

# wrangle returns a dataframe with updated/created columns for a given input dataframe 
def wrangle(DF):
  X = DF.copy()

  # Create a boolean target column reflecting an outcome (serious injury or not)
  X['ped_serious_outcome'] = X['PedInjury'].apply(has_serious_outcome)

  # Convert the CrashHour column to a string (from an int)
  X['CrashHour'] = X['CrashHour'].apply(str)

  # List columns to be dropped
  cols_drop = ['PedInjury'] # PedInjury - use the engineered 'ped_fatality' column

  # Drop undesired columns
  X = X.drop(columns=cols_drop)

  return X

In [150]:
# Wrangle the training, validation, and test datasets
df_wrgl_train = wrangle(df_ante_train)
df_wrgl_val   = wrangle(df_ante_val)
df_wrgl_test  = wrangle(df_ante_test)

print(f'Shape of df_wrgl_train: {df_wrgl_train.shape}; Shape of df_wrgl_val: {df_wrgl_val.shape}; Shape of df_wrgl_test: {df_wrgl_test.shape}')

print(df_wrgl_val.dtypes)

Shape of df_wrgl_train: (23593, 22); Shape of df_wrgl_val: (5057, 22); Shape of df_wrgl_test: (5057, 22)
City                   object
County                 object
CrashHour              object
CrashMonth             object
Development            object
LightCond              object
Locality               object
NumLanes               object
RdCharacte             object
RdClass                object
RdConditio             object
RdConfig               object
RdDefects              object
RdFeature              object
RdSurface              object
Region                 object
RuralUrban             object
SpeedLimit             object
TraffCntrl             object
Weather                object
Workzone               object
ped_serious_outcome      bool
dtype: object


In [151]:
df_wrgl_train.sample(5)

Unnamed: 0,City,County,CrashHour,CrashMonth,Development,LightCond,Locality,NumLanes,RdCharacte,RdClass,...,RdDefects,RdFeature,RdSurface,Region,RuralUrban,SpeedLimit,TraffCntrl,Weather,Workzone,ped_serious_outcome
12765,Hamlet,Richmond,19,February,Residential,Dark - Lighted Roadway,Urban (>70% Developed),2 lanes,Straight - Level,Local Street,...,,No Special Feature,Smooth Asphalt,Piedmont,Urban,30 - 35 MPH,No Control Present,Clear,No,False
27127,Shelby,Cleveland,9,August,Residential,Daylight,Urban (>70% Developed),2 lanes,Straight - Level,Public Vehicular Area,...,,No Special Feature,Smooth Asphalt,Mountains,Urban,30 - 35 MPH,No Control Present,Clear,No,False
21179,None - Rural Crash,Robeson,3,October,"Farms, Woods, Pastures",Dark - Roadway Not Lighted,Rural (<30% Developed),4 lanes,Straight - Level,Interstate,...,,No Special Feature,Smooth Asphalt,Coastal,Rural,60 - 75 MPH,No Control Present,"Fog, Smog, Smoke",No,True
21525,Raleigh,Wake,17,March,Commercial,Daylight,Urban (>70% Developed),2 lanes,Straight - Level,Public Vehicular Area,...,,"Driveway, Public",Smooth Asphalt,Piedmont,Urban,30 - 35 MPH,No Control Present,Clear,No,False
11476,Raleigh,Wake,8,September,Commercial,Daylight,Mixed (30% To 70% Developed),3 lanes,Straight - Grade,Local Street,...,,No Special Feature,Smooth Asphalt,Piedmont,Urban,30 - 35 MPH,No Control Present,Clear,No,True


In [152]:
# Construct the modeling datasets
target = 'ped_serious_outcome'
X_train = df_wrgl_train.drop(columns=target)
y_train = df_wrgl_train[target]

X_val   = df_wrgl_val.drop(columns=target)
y_val   = df_wrgl_val[target]

X_test  = df_wrgl_test.drop(columns=target)
y_test  = df_wrgl_test[target]

print(X_val.dtypes)

City           object
County         object
CrashHour      object
CrashMonth     object
Development    object
LightCond      object
Locality       object
NumLanes       object
RdCharacte     object
RdClass        object
RdConditio     object
RdConfig       object
RdDefects      object
RdFeature      object
RdSurface      object
Region         object
RuralUrban     object
SpeedLimit     object
TraffCntrl     object
Weather        object
Workzone       object
dtype: object


In [153]:
# Create a baseline prediction using a dummy classifier
from sklearn.dummy import DummyClassifier
clfr_dummy = DummyClassifier(strategy="stratified")

clfr_dummy.fit(X_train, y_train)
clfr_dummy.predict(X_train)

dummy_train_y_pred = clfr_dummy.predict(X_train)

print(f'Baseline prediction score for the training set is: {round(clfr_dummy.score(X_train, y_train), 4)}')

Baseline prediction score for the training set is: 0.7638


In [154]:
from sklearn.metrics import classification_report

print(classification_report(y_train, dummy_train_y_pred))

precision    recall  f1-score   support

       False       0.86      0.87      0.87     20383
        True       0.14      0.14      0.14      3210

    accuracy                           0.77     23593
   macro avg       0.50      0.50      0.50     23593
weighted avg       0.77      0.77      0.77     23593



In [155]:
import category_encoders as ce 
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

print(X_train.dtypes)

# Construct a modeling pipeline
pipeline = make_pipeline(
    ce.TargetEncoder(),
    RandomForestClassifier(n_jobs=-1, random_state=42)
)

# Fit the pipeline to the training dataset
pipeline.fit(X_train, y_train)

City           object
County         object
CrashHour      object
CrashMonth     object
Development    object
LightCond      object
Locality       object
NumLanes       object
RdCharacte     object
RdClass        object
RdConditio     object
RdConfig       object
RdDefects      object
RdFeature      object
RdSurface      object
Region         object
RuralUrban     object
SpeedLimit     object
TraffCntrl     object
Weather        object
Workzone       object
dtype: object


Pipeline(memory=None,
         steps=[('targetencoder',
                 TargetEncoder(cols=['City', 'County', 'CrashHour',
                                     'CrashMonth', 'Development', 'LightCond',
                                     'Locality', 'NumLanes', 'RdCharacte',
                                     'RdClass', 'RdConditio', 'RdConfig',
                                     'RdDefects', 'RdFeature', 'RdSurface',
                                     'Region', 'RuralUrban', 'SpeedLimit',
                                     'TraffCntrl', 'Weather', 'Workzone'],
                               drop_invariant=False, handle_missing='value'...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                       

In [156]:
print(X_train.dtypes)

City           object
County         object
CrashHour      object
CrashMonth     object
Development    object
LightCond      object
Locality       object
NumLanes       object
RdCharacte     object
RdClass        object
RdConditio     object
RdConfig       object
RdDefects      object
RdFeature      object
RdSurface      object
Region         object
RuralUrban     object
SpeedLimit     object
TraffCntrl     object
Weather        object
Workzone       object
dtype: object


In [157]:
from pdpbox.pdp import pdp_isolate, pdp_plot

tmp_isolate = pdp_isolate(
    model          = pipeline,
    dataset        = X_val,
    model_features = X_val.columns,
    n_jobs         = -1,
    feature        = 'LightCond')

pdp_plot(isolated, feature_name='LightCond')

TypeError: can't multiply sequence by non-int of type 'float'

In [17]:
# Determine the validation dataset's accuracy
print(f'Validation Accuracy: {round(pipeline.score(X_val, y_val), 5)}')

Validation Accuracy: 0.86296


In [18]:
# Determine the test dataset's accuracy
print(f'Validation Accuracy: {round(pipeline.score(X_test, y_test), 5)}')

Validation Accuracy: 0.86237


In [19]:
# Predict outcome variables for the validation dataset
y_pred_val = pipeline.predict(X_val)

In [20]:
from sklearn.metrics import classification_report

print(classification_report(y_val, y_pred_val))

precision    recall  f1-score   support

       False       0.87      0.98      0.93      4368
        True       0.49      0.11      0.18       689

    accuracy                           0.86      5057
   macro avg       0.68      0.54      0.55      5057
weighted avg       0.82      0.86      0.82      5057



In [21]:
# Predict outcome variables for the testing dataset
y_pred_test = pipeline.predict(X_test)

In [22]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_test))

precision    recall  f1-score   support

       False       0.87      0.98      0.92      4369
        True       0.47      0.11      0.17       688

    accuracy                           0.86      5057
   macro avg       0.67      0.54      0.55      5057
weighted avg       0.82      0.86      0.82      5057



In [23]:
# Generate the model again for use in generating the permutation importance
trans_encode = make_pipeline(
    ce.OrdinalEncoder()
)

# Encode the train and val independent attributes
X_train_trans_encode = trans_encode.fit_transform(X_train)
X_val_trans_encode   = trans_encode.fit_transform(X_val)

# Create the model 
tmp_mdl = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
tmp_mdl.fit(X_train_trans_encode, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [24]:
# Use the eli5 package to generate permutation importance
import eli5
from eli5.sklearn import PermutationImportance

# Create a permutation importance object
prmuter = PermutationImportance(
    tmp_mdl,
    scoring='accuracy',
    n_iter=5,
    random_state=42)

prmuter.fit(X_val_trans_encode, y_val)

PermutationImportance(cv='prefit',
                      estimator=RandomForestClassifier(bootstrap=True,
                                                       ccp_alpha=0.0,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=None,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       max_samples=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fr

In [25]:
# Grab the feature names
feature_names = X_val.columns.tolist()

# Display the permutation importances 
eli5.show_weights(
    prmuter,
    top=None,
    feature_names=feature_names
)

Weight,Feature
0.0014  ± 0.0012,LightCond
0.0005  ± 0.0022,CrashHour
0.0002  ± 0.0013,NumLanes
0.0001  ± 0.0027,SpeedLimit
0.0000  ± 0.0006,RdDefects
-0.0000  ± 0.0003,Workzone
-0.0000  ± 0.0017,TraffCntrl
-0.0003  ± 0.0006,RdConditio
-0.0003  ± 0.0026,RdClass
-0.0004  ± 0.0026,CrashMonth


In [32]:
# Manually grab the most important features
features_org  = df_wrgl_train.columns
features_drop = []

# Define the important features as per the permutation importance analysis
features_important = [
    'LightCond',
    'CrashHour',
    'NumLanes',
    'SpeedLimit',
    'RdDefects',
    'Workzone',
    'TraffCntrl']

# Remove the "important" features to construct a drop feature list
for val in features_org:
    # Iterating over an important feature - then skip to keep
    if val in features_important:
        continue

    # Not keeping this feature - add to our drop list
    features_drop.append(val)


features_drop

['City',
 'County',
 'CrashMonth',
 'Development',
 'Locality',
 'RdCharacte',
 'RdClass',
 'RdConditio',
 'RdConfig',
 'RdFeature',
 'RdSurface',
 'Region',
 'RuralUrban',
 'Weather',
 'ped_serious_outcome']

In [34]:
# Construct modeling datasets - focusing on the "important" features (per the permuation importance analysis)
X_train_imp = df_wrgl_train.drop(columns=features_drop)
y_train_imp = df_wrgl_train[target]

X_val_imp   = df_wrgl_val.drop(columns=features_drop)
y_val_imp   = df_wrgl_val[target]

X_test_imp  = df_wrgl_test.drop(columns=features_drop)
y_test_imp  = df_wrgl_test[target]

In [36]:
# Construct a modeling pipeline
pipeline_imp = make_pipeline(
    ce.OrdinalEncoder(),
    RandomForestClassifier(n_jobs=-1, random_state=42)
)

# Fit the pipeline to the training dataset
pipeline_imp.fit(X_train_imp, y_train_imp)

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['LightCond', 'NumLanes', 'RdDefects',
                                      'SpeedLimit', 'TraffCntrl', 'Workzone'],
                                drop_invariant=False, handle_missing='value',
                                handle_unknown='value',
                                mapping=[{'col': 'LightCond',
                                          'data_type': dtype('O'),
                                          'mapping': Dark - Lighted Roadway        1
Dusk                          2
Dark - Roadway Not Lighted    3
Daylight                      4
Dawn                          5
Other                         6
Dark - Unknown Li...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                  

In [39]:
# Determine the validation dataset's accuracy
print(f'Validation Accuracy: {round(pipeline_imp.score(X_val_imp, y_val_imp), 5)}')

Validation Accuracy: 0.85584


In [40]:
# Determine the test dataset's accuracy
print(f'Validation Accuracy: {round(pipeline_imp.score(X_test_imp, y_test_imp), 5)}')

Validation Accuracy: 0.85367


In [41]:
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 72

In [45]:
from pdpbox.pdp import pdp_isolate, pdp_plot

print(f'type of pipeline[1] is: {type(pipeline[1])}')

# Iterate through the top three most important features
for i in range(3):
    featr = features_important[i]
    print(f'current featr is: {featr}')

    tmp_isolate = pdp_isolate(
        model          = pipeline[1],
        dataset        = X_val,
        model_features = X_val.columns,
        feature        = featr)

    pdp_plot(isolated, feature_name=featr);

type of pipeline[1] is: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
current featr is: LightCond


TypeError: can't multiply sequence by non-int of type 'float'