In [5]:
import pandas as pd
import numpy as np

In [6]:
# Load the tab-delimited Chapel Hill pedestrian crash export into the working dataframe.
# NOTE(review): this is an absolute, machine-specific path; it will not resolve on another machine.
df_work = pd.read_csv(
    "/Users/danoand/Documents/Companies/LambdaSchool/Build_Project_02/data/pedestrian-crashes-chapel-hill-region_imported.txt",
    sep='\t',
)

In [7]:
# Generate a profiling report for the full working dataframe and save it as HTML
from pandas_profiling import ProfileReport

profile = ProfileReport(
    df_work,
    title='Pandas Profiling Report',
    html={'style': {'full_width': True}},
)
profile.to_file(
    output_file="/Users/danoand/Documents/Companies/LambdaSchool/Build_Project_02/data/ChapelHillAccidents.html"
)

In [8]:
# "Ante" columns: data that can reasonably be known before the event occurs.
# Notes:
#   - the lat/long column was removed from this selection
#   - 'PedInjury' is kept only as the source for the target data
categories_ante = [
    'City', 'County', 'CrashHour', 'CrashMonth',
    'Development', 'LightCond', 'Locality', 'NumLanes',
    'RdCharacte', 'RdClass', 'RdConditio', 'RdConfig',
    'RdDefects', 'RdFeature', 'RdSurface', 'Region',
    'RuralUrban', 'SpeedLimit', 'TraffCntrl', 'Weather',
    'Workzone',
    'PedInjury',  # Source for the target data
]

In [11]:
# Keep only the "ante" columns of the working data, then preview ten randomly chosen rows
df_ante = df_work[categories_ante]

df_ante.sample(n=10)

Unnamed: 0,City,County,CrashHour,CrashMonth,Development,LightCond,Locality,NumLanes,RdCharacte,RdClass,...,RdDefects,RdFeature,RdSurface,Region,RuralUrban,SpeedLimit,TraffCntrl,Weather,Workzone,PedInjury
20037,Lumberton,Robeson,14,October,Commercial,Daylight,Rural (<30% Developed),Unknown,Straight - Level,Public Vehicular Area,...,,No Special Feature,Coarse Asphalt,Coastal,Urban,5 - 15 MPH,No Control Present,Clear,No,Unknown Injury
33080,None - Rural Crash,New Hanover,19,November,Institutional,Dark - Lighted Roadway,Urban (>70% Developed),Unknown,Straight - Level,Public Vehicular Area,...,,"Driveway, Public",Smooth Asphalt,Coastal,Rural,5 - 15 MPH,No Control Present,Clear,No,B: Suspected Minor Injury
28079,None - Rural Crash,Surry,8,April,"Farms, Woods, Pastures",Daylight,Rural (<30% Developed),4 lanes,Straight - Grade,US Route,...,,Bridge,Concrete,Mountains,Rural,60 - 75 MPH,No Control Present,Clear,No,B: Suspected Minor Injury
32759,Durham,Durham,8,December,Commercial,Daylight,Urban (>70% Developed),2 lanes,Straight - Level,Public Vehicular Area,...,,No Special Feature,Smooth Asphalt,Piedmont,Urban,20 - 25 MPH,No Control Present,Clear,No,B: Suspected Minor Injury
30876,None - Rural Crash,Guilford,20,March,Residential,Dark - Roadway Not Lighted,Rural (<30% Developed),2 lanes,Straight - Level,State Secondary Route,...,,No Special Feature,Smooth Asphalt,Piedmont,Rural,40 - 45 MPH,No Control Present,Clear,No,C: Possible Injury
3995,Charlotte,Mecklenburg,7,October,Commercial,Daylight,Urban (>70% Developed),1 lane,Straight - Level,Local Street,...,,No Special Feature,Smooth Asphalt,Piedmont,Urban,30 - 35 MPH,Stop And Go Signal,Clear,No,C: Possible Injury
25823,Dudley,Wayne,17,April,Residential,Daylight,Mixed (30% To 70% Developed),Unknown,Straight - Level,Public Vehicular Area,...,,"Driveway, Private",Sand,Coastal,Rural,Unknown,No Control Present,Cloudy,No,C: Possible Injury
15541,None - Rural Crash,Guilford,22,August,"Farms, Woods, Pastures",Dark - Roadway Not Lighted,Rural (<30% Developed),2 lanes,Straight - Level,US Route,...,,No Special Feature,Smooth Asphalt,Piedmont,Rural,50 - 55 MPH,No Control Present,Clear,No,B: Suspected Minor Injury
12276,Fayetteville,Cumberland,11,August,Commercial,Daylight,Urban (>70% Developed),8 lanes,Straight - Level,Local Street,...,,No Special Feature,Coarse Asphalt,Coastal,Urban,40 - 45 MPH,No Control Present,Clear,No,B: Suspected Minor Injury
25766,Rocky Mount,Edgecombe,14,January,Residential,Daylight,Mixed (30% To 70% Developed),2 lanes,Straight - Level,Public Vehicular Area,...,,"Driveway, Public",Smooth Asphalt,Coastal,Urban,5 - 15 MPH,No Control Present,Clear,No,B: Suspected Minor Injury


In [12]:
# Generate a profiling report restricted to the "ante" columns and save it as HTML
from pandas_profiling import ProfileReport

profile_ante = ProfileReport(
    df_ante,
    title='Pandas Profiling Report',
    html={'style': {'full_width': True}},
)
profile_ante.to_file(
    output_file="/Users/danoand/Documents/Companies/LambdaSchool/Build_Project_02/data/ChapelHillAccidents_Ante.html"
)

In [13]:
# Carve off a 15% hold-out test set (85% stays for training), stratified on the
# 'PedInjury' labels so both parts keep the same class mix
from sklearn.model_selection import train_test_split

df_ante_train, df_ante_test = train_test_split(
    df_ante,
    test_size=0.15,
    train_size=0.85,
    random_state=42,
    stratify=df_ante['PedInjury'],
)

print(f'Shape of df_ante_train: {df_ante_train.shape}; Shape of df_ante_test: {df_ante_test.shape}')

Shape of df_ante_train: (28650, 22); Shape of df_ante_test: (5057, 22)


In [14]:
# Take a validation subset out of the training data, with as many rows as the test set.
# NOTE: df_ante_train is rebound to the smaller remainder here, so re-running this cell
# on its own removes another validation-sized slice from the training data.
df_ante_train, df_ante_val = train_test_split(
    df_ante_train,
    test_size=len(df_ante_test),
    random_state=42,
    stratify=df_ante_train['PedInjury'],
)

print(f'Shape of df_ante_train: {df_ante_train.shape}; Shape of df_ante_val: {df_ante_val.shape}; Shape of df_ante_test: {df_ante_test.shape}')

Shape of df_ante_train: (23593, 22); Shape of df_ante_val: (5057, 22); Shape of df_ante_test: (5057, 22)


In [15]:
# Share of training rows per 'PedInjury' class (proportions rather than raw counts)
df_ante_train.loc[:, 'PedInjury'].value_counts(normalize=True)

C: Possible Injury             0.409020
B: Suspected Minor Injury      0.354597
A: Suspected Serious Injury    0.072140
K: Killed                      0.063917
O: No Injury                   0.059975
Unknown Injury                 0.040351
Name: PedInjury, dtype: float64

In [16]:
# Wrangle the modeling data

# 'PedInjury' labels that are treated as a serious outcome
outcome_serious = ['K: Killed', 'A: Suspected Serious Injury']


def has_serious_outcome(val):
    """Return True if `val` is one of the labels in `outcome_serious`, otherwise False."""
    return val in outcome_serious


def wrangle(DF):
    """Return a copy of `DF` with a boolean target column replacing 'PedInjury'.

    Parameters
    ----------
    DF : pandas.DataFrame
        Input frame containing a 'PedInjury' column. It is copied, not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of `DF` with a boolean 'ped_serious_outcome' column added (True where
        'PedInjury' is listed in `outcome_serious`) and the 'PedInjury' column dropped.

    Raises
    ------
    KeyError
        If `DF` has no 'PedInjury' column.
    """
    X = DF.copy()

    # Vectorized membership test: same True/False values as applying
    # has_serious_outcome row by row, and always bool dtype (even for an empty frame)
    X['ped_serious_outcome'] = X['PedInjury'].isin(outcome_serious)

    # 'PedInjury' is superseded by the engineered 'ped_serious_outcome' column
    cols_drop = ['PedInjury']

    return X.drop(columns=cols_drop)

In [17]:
# Run the same wrangling step over each of the three splits (train, validation, test)
df_wrgl_train, df_wrgl_val, df_wrgl_test = (
    wrangle(split) for split in (df_ante_train, df_ante_val, df_ante_test)
)

print(f'Shape of df_wrgl_train: {df_wrgl_train.shape}; Shape of df_wrgl_val: {df_wrgl_val.shape}; Shape of df_wrgl_test: {df_wrgl_test.shape}')

Shape of df_wrgl_train: (23593, 22); Shape of df_wrgl_val: (5057, 22); Shape of df_wrgl_test: (5057, 22)


In [18]:
# Preview five randomly chosen rows of the wrangled training data
df_wrgl_train.sample(n=5)

Unnamed: 0,City,County,CrashHour,CrashMonth,Development,LightCond,Locality,NumLanes,RdCharacte,RdClass,...,RdDefects,RdFeature,RdSurface,Region,RuralUrban,SpeedLimit,TraffCntrl,Weather,Workzone,ped_serious_outcome
4025,Spring Lake,Cumberland,18,October,Commercial,Daylight,Urban (>70% Developed),2 lanes,Straight - Level,Public Vehicular Area,...,,No Special Feature,Coarse Asphalt,Coastal,Urban,5 - 15 MPH,No Control Present,Clear,No,False
10212,Charlotte,Mecklenburg,17,June,Commercial,Daylight,Urban (>70% Developed),2 lanes,Straight - Grade,Public Vehicular Area,...,,No Special Feature,Smooth Asphalt,Piedmont,Urban,Unknown,No Control Present,Cloudy,No,False
30789,Boone,Watauga,11,April,Commercial,Daylight,Urban (>70% Developed),2 lanes,Straight - Level,Local Street,...,,T-Intersection,Smooth Asphalt,Mountains,Urban,20 - 25 MPH,Stop And Go Signal,Clear,No,False
962,None - Rural Crash,Wake,13,November,Commercial,Daylight,Urban (>70% Developed),9 or more lanes,Straight - Level,Public Vehicular Area,...,,"Driveway, Public",Smooth Asphalt,Piedmont,Rural,5 - 15 MPH,No Control Present,Clear,No,True
3253,Mount Holly,Gaston,19,May,Commercial,Daylight,Mixed (30% To 70% Developed),Unknown,Straight - Level,Public Vehicular Area,...,,"Driveway, Public",Coarse Asphalt,Piedmont,Urban,5 - 15 MPH,No Control Present,Clear,No,False


In [19]:
# Separate each wrangled split into a feature matrix (X_*) and a target vector (y_*)
target = 'ped_serious_outcome'

X_train, y_train = df_wrgl_train.drop(columns=[target]), df_wrgl_train[target]
X_val, y_val = df_wrgl_val.drop(columns=[target]), df_wrgl_val[target]
X_test, y_test = df_wrgl_test.drop(columns=[target]), df_wrgl_test[target]

In [30]:
# Create a baseline prediction using a dummy classifier
from sklearn.dummy import DummyClassifier

# The "stratified" strategy draws random labels following the training class
# distribution. An integer random_state makes every predict()/score() call repeat
# the same draws, so the stored predictions, the printed score, and any later
# report built from dummy_train_y_pred all describe the same baseline and are
# reproducible across re-runs.
clfr_dummy = DummyClassifier(strategy="stratified", random_state=42)

clfr_dummy.fit(X_train, y_train)

# Baseline predictions for the training set (reused by later cells)
dummy_train_y_pred = clfr_dummy.predict(X_train)

print(f'Baseline prediction score for the training set is: {round(clfr_dummy.score(X_train, y_train), 4)}')

Baseline prediction score for the training set is: 0.7657


In [31]:
# Per-class precision / recall / F1 of the dummy baseline on the training set
from sklearn.metrics import classification_report

print(
    classification_report(y_true=y_train, y_pred=dummy_train_y_pred)
)

precision    recall  f1-score   support

       False       0.86      0.86      0.86     20383
        True       0.13      0.14      0.14      3210

    accuracy                           0.76     23593
   macro avg       0.50      0.50      0.50     23593
weighted avg       0.76      0.76      0.76     23593



In [21]:
import category_encoders as ce 
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# Modeling pipeline: ordinal-encode the features, then classify with a
# random forest (fixed seed, using all available cores)
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    RandomForestClassifier(
        random_state=42,
        n_jobs=-1,
    ),
)

# Train on the training split; the fitted pipeline is the cell's displayed value
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['City', 'County', 'CrashMonth',
                                      'Development', 'LightCond', 'Locality',
                                      'NumLanes', 'RdCharacte', 'RdClass',
                                      'RdConditio', 'RdConfig', 'RdDefects',
                                      'RdFeature', 'RdSurface', 'Region',
                                      'RuralUrban', 'SpeedLimit', 'TraffCntrl',
                                      'Weather', 'Workzone'],
                                drop_invariant=False, handle_missing='value',
                                handle_un...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,


In [18]:
# Score the fitted pipeline on the validation split and report it to five decimals
print(
    f'Validation Accuracy: {round(pipeline.score(X_val, y_val), 5)}'
)

Validation Accuracy: 0.86138


In [22]:
# Determine the test dataset's accuracy.
# The label now says "Test" so this line is not mistaken for the validation score.
print(f'Test Accuracy: {round(pipeline.score(X_test, y_test), 5)}')

Validation Accuracy: 0.86237


In [25]:
# Run the fitted pipeline over the validation features to get predicted outcomes
y_pred_val = pipeline.predict(
    X_val
)

In [29]:
# Per-class precision / recall / F1 of the pipeline on the validation split
from sklearn.metrics import classification_report

print(
    classification_report(y_true=y_val, y_pred=y_pred_val)
)

precision    recall  f1-score   support

       False       0.87      0.98      0.93      4368
        True       0.49      0.11      0.18       689

    accuracy                           0.86      5057
   macro avg       0.68      0.54      0.55      5057
weighted avg       0.82      0.86      0.82      5057



In [32]:
# Run the fitted pipeline over the test features to get predicted outcomes
y_pred_test = pipeline.predict(
    X_test
)

In [33]:
# Per-class precision / recall / F1 of the pipeline on the test split
from sklearn.metrics import classification_report

print(
    classification_report(y_true=y_test, y_pred=y_pred_test)
)

precision    recall  f1-score   support

       False       0.87      0.98      0.92      4369
        True       0.47      0.11      0.17       688

    accuracy                           0.86      5057
   macro avg       0.67      0.54      0.55      5057
weighted avg       0.82      0.86      0.82      5057

