In [1]:
import pandas as pd
import numpy as np

In [2]:
# Import the Chapel Hill pedestrian-crash data (tab-separated text file).
# The data directory is held in one named constant so it is easy to spot and
# change when the notebook is run on another machine.
# NOTE(review): this is still an absolute local path; the profiling cells write
# their HTML reports into the same directory.
DATA_DIR = "/Users/danoand/Documents/Companies/LambdaSchool/Build_Project_02/data"

df_work = pd.read_csv(f"{DATA_DIR}/pedestrian-crashes-chapel-hill-region_imported.txt",
                      sep='\t')

In [3]:
# Build a pandas-profiling report for the full "work" dataframe and save it as HTML
from pandas_profiling import ProfileReport

profile = ProfileReport(
    df_work,
    title='Pandas Profiling Report',
    html={'style':{'full_width':True}},
)
profile.to_file(
    output_file="/Users/danoand/Documents/Companies/LambdaSchool/Build_Project_02/data/ChapelHillAccidents.html"
)

In [4]:
# "Ante" columns: data that can be reasonably known prior to the event.
# Notes:
#   - the lat/long column was removed
#   - the order here is the column order of any frame selected with this list
categories_ante = [
    'City', 'County', 'CrashHour', 'CrashMonth',
    'Development', 'LightCond', 'Locality', 'NumLanes',
    'RdCharacte', 'RdClass', 'RdConditio', 'RdConfig',
    'RdDefects', 'RdFeature', 'RdSurface', 'Region',
    'RuralUrban', 'SpeedLimit', 'TraffCntrl', 'Weather',
    'Workzone',
    'PedInjury',  # Source for the target data
]

In [6]:
# Restrict the working data to the "ante" columns (PedInjury is kept as the target source)
df_ante = df_work[categories_ante]

# Eyeball a random handful of rows
df_ante.sample(n=10)

Unnamed: 0,City,County,CrashHour,CrashMonth,Development,LightCond,Locality,NumLanes,RdCharacte,RdClass,...,RdDefects,RdFeature,RdSurface,Region,RuralUrban,SpeedLimit,TraffCntrl,Weather,Workzone,PedInjury
9592,Asheville,Buncombe,10,December,Residential,Daylight,Urban (>70% Developed),2 lanes,Straight - Grade,Local Street,...,,No Special Feature,Smooth Asphalt,Mountains,Urban,30 - 35 MPH,No Control Present,Cloudy,No,C: Possible Injury
4092,None - Rural Crash,Mcdowell,17,November,Residential,Dusk,Rural (<30% Developed),2 lanes,Curve - Level,State Secondary Route,...,,No Special Feature,Smooth Asphalt,Mountains,Rural,30 - 35 MPH,"Double Yellow Line, No Passing Zone",Clear,No,A: Suspected Serious Injury
20104,None - Rural Crash,Beaufort,6,October,"Farms, Woods, Pastures",Dark - Roadway Not Lighted,Mixed (30% To 70% Developed),2 lanes,Straight - Level,NC Route,...,,No Special Feature,Smooth Asphalt,Coastal,Rural,50 - 55 MPH,No Control Present,Cloudy,No,K: Killed
4018,None - Rural Crash,Craven,17,November,Residential,Daylight,Mixed (30% To 70% Developed),2 lanes,Straight - Level,NC Route,...,,No Special Feature,Coarse Asphalt,Coastal,Rural,50 - 55 MPH,No Control Present,Clear,No,B: Suspected Minor Injury
8881,None - Rural Crash,Harnett,11,October,"Farms, Woods, Pastures",Dark - Roadway Not Lighted,Rural (<30% Developed),2 lanes,Straight - Level,State Secondary Route,...,Obstruction In Roadway,No Special Feature,Coarse Asphalt,Coastal,Rural,50 - 55 MPH,"Double Yellow Line, No Passing Zone",Clear,No,A: Suspected Serious Injury
17040,Wilmington,New Hanover,17,April,Commercial,Daylight,Urban (>70% Developed),1 lane,Straight - Level,Public Vehicular Area,...,,Missing,Smooth Asphalt,Coastal,Urban,5 - 15 MPH,No Control Present,Clear,No,C: Possible Injury
15305,Raleigh,Wake,17,April,Commercial,Daylight,Urban (>70% Developed),3 lanes,Straight - Grade,Local Street,...,,No Special Feature,Smooth Asphalt,Piedmont,Urban,30 - 35 MPH,Stop And Go Signal,Clear,No,C: Possible Injury
9846,Ahoskie,Hertford,23,August,Residential,Dark - Roadway Not Lighted,Mixed (30% To 70% Developed),2 lanes,Straight - Level,NC Route,...,,No Special Feature,Coarse Asphalt,Coastal,Rural,50 - 55 MPH,"Double Yellow Line, No Passing Zone",Cloudy,No,O: No Injury
25216,Lumberton,Robeson,12,December,Residential,Daylight,Urban (>70% Developed),Unknown,Straight - Level,Public Vehicular Area,...,,"Driveway, Public",Coarse Asphalt,Coastal,Urban,5 - 15 MPH,No Control Present,Clear,No,C: Possible Injury
23323,Lumberton,Robeson,18,May,Industrial,Daylight,Urban (>70% Developed),Unknown,Missing,Interstate,...,,Missing,Missing,Coastal,Urban,Unknown,No Control Present,Clear,No,K: Killed


In [7]:
# Build a pandas-profiling report for the "ante" subset and save it as HTML
from pandas_profiling import ProfileReport

profile_ante = ProfileReport(
    df_ante,
    title='Pandas Profiling Report',
    html={'style':{'full_width':True}},
)
profile_ante.to_file(
    output_file="/Users/danoand/Documents/Companies/LambdaSchool/Build_Project_02/data/ChapelHillAccidents_Ante.html"
)

In [8]:
# Hold out a stratified 15% test set; the remaining 85% is kept in df_ante_train
# (the validation subset is carved out of it in a later cell)
from sklearn.model_selection import train_test_split

df_ante_train, df_ante_test = train_test_split(
    df_ante,
    train_size=0.85,
    test_size=0.15,
    stratify=df_ante['PedInjury'],
    random_state=42,
)

print(f'Shape of df_ante_train: {df_ante_train.shape}; Shape of df_ante_test: {df_ante_test.shape}')

Shape of df_ante_train: (28650, 22); Shape of df_ante_test: (5057, 22)


In [9]:
# Split the train data into a training subset and a validation subset.
#
# This cell rebinds df_ante_train to a smaller frame, so running it a second time
# would silently carve another validation set out of the already-reduced training
# data. The check below turns that accidental re-run into an explicit error.
assert len(df_ante_train) + len(df_ante_test) == len(df_ante), (
    'df_ante_train has already been split into train/validation; '
    're-run the train/test split cell before running this cell again'
)

# The validation subset is sized to match the test set; stratification keeps the
# PedInjury class mix the same in both subsets
df_ante_train, df_ante_val = train_test_split(df_ante_train, test_size=len(df_ante_test),
                              stratify=df_ante_train['PedInjury'], random_state=42)

print(f'Shape of df_ante_train: {df_ante_train.shape}; Shape of df_ante_val: {df_ante_val.shape}; Shape of df_ante_test: {df_ante_test.shape}')

Shape of df_ante_train: (23593, 22); Shape of df_ante_val: (5057, 22); Shape of df_ante_test: (5057, 22)


In [10]:
# Relative frequency of each PedInjury class in the training subset
(
    df_ante_train['PedInjury']
    .value_counts(normalize=True)
)

C: Possible Injury             0.409020
B: Suspected Minor Injury      0.354597
A: Suspected Serious Injury    0.072140
K: Killed                      0.063917
O: No Injury                   0.059975
Unknown Injury                 0.040351
Name: PedInjury, dtype: float64

In [11]:
# Wrangle the modeling data
# PedInjury labels that count as a serious outcome
outcome_serious = ['K: Killed', 'A: Suspected Serious Injury']

def has_serious_outcome(val):
  """Return True if `val` is one of the labels in `outcome_serious`, otherwise False."""
  return val in outcome_serious

# wrangle builds the modeling frame for a given input dataframe; the input is not modified
def wrangle(DF):
  """Return a new dataframe with 'PedInjury' replaced by a boolean target column.

  The added 'ped_serious_outcome' column holds has_serious_outcome() applied to
  each 'PedInjury' value; the raw 'PedInjury' column is then dropped.
  """
  # Boolean target: serious outcome or not
  serious_flags = DF['PedInjury'].apply(has_serious_outcome)

  # assign() returns a new frame, so the caller's dataframe is left untouched;
  # PedInjury is removed because the engineered 'ped_serious_outcome' column replaces it
  return DF.assign(ped_serious_outcome=serious_flags).drop(columns=['PedInjury'])

In [12]:
# Run the same wrangling over the training, validation, and test datasets
df_wrgl_train, df_wrgl_val, df_wrgl_test = (
    wrangle(split) for split in (df_ante_train, df_ante_val, df_ante_test)
)

print(f'Shape of df_wrgl_train: {df_wrgl_train.shape}; Shape of df_wrgl_val: {df_wrgl_val.shape}; Shape of df_wrgl_test: {df_wrgl_test.shape}')

Shape of df_wrgl_train: (23593, 22); Shape of df_wrgl_val: (5057, 22); Shape of df_wrgl_test: (5057, 22)


In [13]:
# Spot-check a few wrangled training rows (note the ped_serious_outcome column)
df_wrgl_train.sample(n=5)

Unnamed: 0,City,County,CrashHour,CrashMonth,Development,LightCond,Locality,NumLanes,RdCharacte,RdClass,...,RdDefects,RdFeature,RdSurface,Region,RuralUrban,SpeedLimit,TraffCntrl,Weather,Workzone,ped_serious_outcome
33525,Charlotte,Mecklenburg,22,July,Residential,Dark - Roadway Not Lighted,Urban (>70% Developed),1 lane,Curve - Level,Local Street,...,,No Special Feature,Concrete,Piedmont,Urban,30 - 35 MPH,No Control Present,Clear,No,False
32892,Kinston,Lenoir,21,August,Residential,Dark - Lighted Roadway,Urban (>70% Developed),2 lanes,Straight - Level,Local Street,...,,No Special Feature,Coarse Asphalt,Coastal,Urban,30 - 35 MPH,No Control Present,Clear,No,True
25520,Greensboro,Guilford,14,February,Commercial,Daylight,Urban (>70% Developed),2 lanes,Straight - Level,Public Vehicular Area,...,,No Special Feature,Smooth Asphalt,Piedmont,Urban,5 - 15 MPH,No Control Present,Clear,No,False
6946,Kannapolis,Cabarrus,7,December,Institutional,Daylight,Urban (>70% Developed),4 lanes,Straight - Level,Local Street,...,,Other,Smooth Asphalt,Piedmont,Urban,30 - 35 MPH,No Control Present,Clear,No,True
24742,Durham,Durham,18,August,Residential,Daylight,Urban (>70% Developed),2 lanes,Straight - Level,Local Street,...,,No Special Feature,Smooth Asphalt,Piedmont,Urban,30 - 35 MPH,No Control Present,Clear,No,False


In [14]:
# Separate each wrangled dataset into a feature matrix (X_*) and a target vector (y_*)
target = 'ped_serious_outcome'

X_train, y_train = df_wrgl_train.drop(columns=target), df_wrgl_train[target]
X_val,   y_val   = df_wrgl_val.drop(columns=target),   df_wrgl_val[target]
X_test,  y_test  = df_wrgl_test.drop(columns=target),  df_wrgl_test[target]

In [17]:
# Create a baseline prediction using a dummy classifier
from sklearn.dummy import DummyClassifier

# The "stratified" strategy draws random labels in proportion to the training class
# frequencies, so it is seeded (42, as in the other cells) to make the reported
# score repeatable from run to run.
# NOTE(review): going by the class breakdown shown earlier (about 0.072 + 0.064 of
# training rows are serious outcomes), always predicting the majority class would
# score roughly 0.86 accuracy - consider strategy="most_frequent" as the baseline.
clfr_dummy = DummyClassifier(strategy="stratified", random_state=42)
clfr_dummy.fit(X_train, y_train)

print(f'Baseline prediction score for the training set is: {round(clfr_dummy.score(X_train, y_train), 4)}')

Baseline prediction score for the training set is: 0.7647


In [18]:
import category_encoders as ce 
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# Modeling pipeline: ordinal encoding followed by a seeded random forest
pipeline = make_pipeline(ce.OrdinalEncoder(), RandomForestClassifier(random_state=42, n_jobs=-1))

# Train on the training dataset; the fitted pipeline is displayed as the cell's result
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['City', 'County', 'CrashMonth',
                                      'Development', 'LightCond', 'Locality',
                                      'NumLanes', 'RdCharacte', 'RdClass',
                                      'RdConditio', 'RdConfig', 'RdDefects',
                                      'RdFeature', 'RdSurface', 'Region',
                                      'RuralUrban', 'SpeedLimit', 'TraffCntrl',
                                      'Weather', 'Workzone'],
                                drop_invariant=False, handle_missing='value',
                                handle_un...
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_no

In [19]:
# Score the fitted pipeline on the validation dataset and report its accuracy
val_accuracy = pipeline.score(X_val, y_val)
print(f'Validation Accuracy: {round(val_accuracy, 5)}')