In [97]:
import pandas as pd
import numpy as np

In [98]:
# Read the crash reports dataset
crash_reports = pd.read_csv('crash_reports.csv', skiprows=11, header=0)

# Read the county coordinates dataset (used to fill in missing coordinate values in crash reports dataset)
county_coordinates = pd.read_csv('Texas_Counties_Centroid_Map.csv')

In [99]:
# Drop unnecessary columns for county_coordinates
columns_to_drop = ['CNTY_NBR', 'FIPS', 'Shape_Leng', 'Shape_Area', 'County Centroid Location']
county_coordinates = county_coordinates.drop(columns=columns_to_drop)

# Convert 'County' column to uppercase 
county_coordinates['CNTY_NM'] = county_coordinates['CNTY_NM'].str.upper()

# Replace 'DE WITT' with 'DEWITT'
county_coordinates['CNTY_NM'] = county_coordinates['CNTY_NM'].replace('DE WITT', 'DEWITT')

# Rename county_coordinates columns to match crash_reports
county_coordinates = county_coordinates.rename(columns={'CNTY_NM': 'County', 'X (Lat)': 'County_Latitude', 'Y (Long)': 'County_Longitude'})

In [100]:
# Merge the two datasets on the 'County' column
crash_data = pd.merge(crash_reports, county_coordinates[['County', 'County_Latitude', 'County_Longitude']], on='County', how='left')

# Fill missing latitude and longitude values with county center coordinates
crash_data['Latitude'] = crash_data['Latitude'].fillna(crash_data['County_Latitude'])
crash_data['Longitude'] = crash_data['Longitude'].fillna(crash_data['County_Longitude'])

In [101]:
crash_data['Crash Time'].tail()

6090    1222
6091     710
6092     710
6093    1816
6094    1816
Name: Crash Time, dtype: int64

In [102]:
# Replace 'No Data' with NaN
crash_data.replace('No Data', np.nan, inplace=True)

# Format Date and Time
crash_data['Crash Date'] = pd.to_datetime(crash_data['Crash Date'])
crash_data['Crash Time'] = crash_data['Crash Time'].apply(lambda x: f"{str(x)[:-2].zfill(2)}:{str(x)[-2:]}")


# Format Numbers
crash_data['Latitude'] = pd.to_numeric(crash_data['Latitude'], errors='coerce')
crash_data['Longitude'] = pd.to_numeric(crash_data['Longitude'], errors='coerce')
crash_data['Person Age'] = pd.to_numeric(crash_data['Person Age'], errors='coerce')
crash_data['Latitude'] = pd.to_numeric(crash_data['County_Latitude'], errors='coerce')
crash_data['Longitude'] = pd.to_numeric(crash_data['County_Longitude'], errors='coerce')

# Format Booleans
crash_data['School Bus Flag'] = crash_data['School Bus Flag'].map({'Yes': True, 'No': False})
crash_data['Commercial Motor Vehicle Flag'] = crash_data['Commercial Motor Vehicle Flag'].map({'Yes': True, 'No': False})

In [103]:
# Fill in missing ages with 9999
median_age = crash_data['Person Age'].median()
crash_data["Person Age"].fillna(9999, inplace=True)

In [104]:
# Search for missing data values
crash_data.isna().sum()

Crash ID                            0
City                                0
Commercial Motor Vehicle Flag       0
County                              0
Crash Date                          0
Crash Severity                      0
Crash Time                          0
Day of Week                         0
Latitude                            0
Longitude                           0
School Bus Flag                     0
Speed Limit                         0
Surface Condition                   0
Weather Condition                   0
CMV Sequence of Events 1         3031
CMV Sequence of Events 2         5446
CMV Sequence of Events 3         5833
CMV Sequence of Events 4         6005
Contributing Factor 1            2995
Person Age                          0
Person Gender                       0
Person Injury Severity              0
Person Type                         0
County_Latitude                     0
County_Longitude                    0
dtype: int64

In [105]:
crash_data.describe()

Unnamed: 0,Crash ID,Latitude,Longitude,Speed Limit,Person Age,County_Latitude,County_Longitude
count,6095.0,6095.0,6095.0,6095.0,6095.0,6095.0,6095.0
mean,19007990.0,31.22192,-97.558039,60.05644,157.473339,31.22192,-97.558039
std,186605.4,1.77,2.478406,15.015781,1083.803901,1.77,2.478406
min,18429000.0,26.138477,-106.235223,-1.0,0.0,26.138477,-106.235223
25%,18845760.0,29.859671,-98.278145,55.0,24.0,29.859671,-98.278145
50%,19021950.0,31.296557,-96.99259,65.0,37.0,31.296557,-96.99259
75%,19167590.0,32.766537,-95.564142,70.0,53.0,32.766537,-95.564142
max,19415280.0,36.278463,-93.74463,85.0,9999.0,36.278463,-93.74463


In [106]:
crash_data.head()

Unnamed: 0,Crash ID,City,Commercial Motor Vehicle Flag,County,Crash Date,Crash Severity,Crash Time,Day of Week,Latitude,Longitude,...,CMV Sequence of Events 2,CMV Sequence of Events 3,CMV Sequence of Events 4,Contributing Factor 1,Person Age,Person Gender,Person Injury Severity,Person Type,County_Latitude,County_Longitude
0,18674968,OUTSIDE CITY LIMITS,True,HARRIS,2022-01-01,A - SUSPECTED SERIOUS INJURY,14:49,SATURDAY,29.859671,-95.397821,...,,,,47 - ILL (EXPLAIN IN NARRATIVE),34.0,1 - MALE,A - SUSPECTED SERIOUS INJURY,1 - DRIVER,29.859671,-95.397821
1,18676126,PHARR,True,HIDALGO,2022-01-01,A - SUSPECTED SERIOUS INJURY,15:28,SATURDAY,26.396627,-98.180887,...,,,,22 - FAILED TO CONTROL SPEED,32.0,1 - MALE,C - POSSIBLE INJURY,1 - DRIVER,26.396627,-98.180887
2,18676126,PHARR,True,HIDALGO,2022-01-01,A - SUSPECTED SERIOUS INJURY,15:28,SATURDAY,26.396627,-98.180887,...,,,,,57.0,1 - MALE,A - SUSPECTED SERIOUS INJURY,5 - DRIVER OF MOTORCYCLE TYPE VEHICLE,26.396627,-98.180887
3,18676126,PHARR,True,HIDALGO,2022-01-01,A - SUSPECTED SERIOUS INJURY,15:28,SATURDAY,26.396627,-98.180887,...,,,,22 - FAILED TO CONTROL SPEED,42.0,1 - MALE,B - SUSPECTED MINOR INJURY,1 - DRIVER,26.396627,-98.180887
4,18679166,GRAND PRAIRIE,True,TARRANT,2022-01-01,A - SUSPECTED SERIOUS INJURY,21:32,SATURDAY,32.771852,-97.291165,...,,,,22 - FAILED TO CONTROL SPEED,20.0,2 - FEMALE,A - SUSPECTED SERIOUS INJURY,1 - DRIVER,32.771852,-97.291165


In [107]:
crash_data.drop(columns=['County_Latitude', 'County_Longitude'], inplace=True)
crash_data.head()

Unnamed: 0,Crash ID,City,Commercial Motor Vehicle Flag,County,Crash Date,Crash Severity,Crash Time,Day of Week,Latitude,Longitude,...,Weather Condition,CMV Sequence of Events 1,CMV Sequence of Events 2,CMV Sequence of Events 3,CMV Sequence of Events 4,Contributing Factor 1,Person Age,Person Gender,Person Injury Severity,Person Type
0,18674968,OUTSIDE CITY LIMITS,True,HARRIS,2022-01-01,A - SUSPECTED SERIOUS INJURY,14:49,SATURDAY,29.859671,-95.397821,...,2 - CLOUDY,18 - COLLISION INVOLVING FIXED OBJECT,,,,47 - ILL (EXPLAIN IN NARRATIVE),34.0,1 - MALE,A - SUSPECTED SERIOUS INJURY,1 - DRIVER
1,18676126,PHARR,True,HIDALGO,2022-01-01,A - SUSPECTED SERIOUS INJURY,15:28,SATURDAY,26.396627,-98.180887,...,1 - CLEAR,,,,,22 - FAILED TO CONTROL SPEED,32.0,1 - MALE,C - POSSIBLE INJURY,1 - DRIVER
2,18676126,PHARR,True,HIDALGO,2022-01-01,A - SUSPECTED SERIOUS INJURY,15:28,SATURDAY,26.396627,-98.180887,...,1 - CLEAR,,,,,,57.0,1 - MALE,A - SUSPECTED SERIOUS INJURY,5 - DRIVER OF MOTORCYCLE TYPE VEHICLE
3,18676126,PHARR,True,HIDALGO,2022-01-01,A - SUSPECTED SERIOUS INJURY,15:28,SATURDAY,26.396627,-98.180887,...,1 - CLEAR,13 - COLLISION INVOLVING MOTOR VEHICLE IN TRAN...,,,,22 - FAILED TO CONTROL SPEED,42.0,1 - MALE,B - SUSPECTED MINOR INJURY,1 - DRIVER
4,18679166,GRAND PRAIRIE,True,TARRANT,2022-01-01,A - SUSPECTED SERIOUS INJURY,21:32,SATURDAY,32.771852,-97.291165,...,1 - CLEAR,,,,,22 - FAILED TO CONTROL SPEED,20.0,2 - FEMALE,A - SUSPECTED SERIOUS INJURY,1 - DRIVER


In [108]:
crash_data.to_csv('cleaned_crash_data.csv')