In [13]:
import pandas as pd
import numpy as np
from pandas_geojson import to_geojson
from pandas_geojson import write_geojson

In [14]:
# Load the county centroid table; it supplies fallback coordinates for crash
# records whose own latitude/longitude are missing.
county_coordinates = pd.read_csv('Texas_Counties_Centroid_Map.csv')

# Load the crash reports export. The first 11 lines of the file are skipped and
# the next line is used as the column header.
crash_reports = pd.read_csv(
    'crash_reports.csv',
    skiprows=11,
    header=0,
)

In [15]:
# Columns of the centroid table that are not needed for the coordinate lookup.
columns_to_drop = ['CNTY_NBR', 'FIPS', 'Shape_Leng', 'Shape_Area', 'County Centroid Location']

# Reshape the centroid table so it can be joined onto the crash reports:
#   1. discard the unused columns,
#   2. upper-case the county names and spell 'DE WITT' as 'DEWITT',
#   3. rename the columns to the names used when merging with crash_reports.
# NOTE(review): the source headers pair X with Lat and Y with Long; confirm the
# values in those columns really are latitude and longitude respectively.
county_coordinates = (
    county_coordinates
    .drop(columns=columns_to_drop)
    .assign(CNTY_NM=lambda frame: frame['CNTY_NM'].str.upper().replace('DE WITT', 'DEWITT'))
    .rename(columns={
        'CNTY_NM': 'County',
        'X (Lat)': 'County_Latitude',
        'Y (Long)': 'County_Longitude',
    })
)

In [16]:
# Build the working frame in one pass:
#   - left-join the county centroid coordinates onto every crash record by 'County',
#   - turn the literal 'No Data' placeholder into NaN everywhere,
#   - fall back to the county centroid wherever a record has no coordinates.
crash_data = (
    crash_reports
    .merge(
        county_coordinates[['County', 'County_Latitude', 'County_Longitude']],
        how='left',
        on='County',
    )
    .replace('No Data', np.nan)
    .assign(
        Latitude=lambda frame: frame['Latitude'].fillna(frame['County_Latitude']),
        Longitude=lambda frame: frame['Longitude'].fillna(frame['County_Longitude']),
    )
)

In [17]:
# Format Date and Time
# Normalise crash dates to ISO 'YYYY-MM-DD' strings.
crash_data['Crash Date'] = pd.to_datetime(crash_data['Crash Date']).dt.strftime('%Y-%m-%d')

# Crash times appear to be clock readings without a separator (e.g. 1449 for
# 14:49), so leading zeros are absent for early hours. Left-pad the whole value
# to four digits before splitting into hours and minutes: padding only the hour
# part rendered a reading of 5 (00:05) as '00:5'.
crash_time_digits = crash_data['Crash Time'].astype(str).str.zfill(4)
crash_data['Crash Time'] = crash_time_digits.str[:-2] + ':' + crash_time_digits.str[-2:]

# Format Numbers
crash_data['Latitude'] = pd.to_numeric(crash_data['Latitude'])
crash_data['Longitude'] = pd.to_numeric(crash_data['Longitude'])
crash_data['Person Age'] = pd.to_numeric(crash_data['Person Age'])

# Format Booleans
# 'Yes'/'No' become True/False; any other value maps to NaN.
crash_data['School Bus Flag'] = crash_data['School Bus Flag'].map({'Yes': True, 'No': False})
crash_data['Commercial Motor Vehicle Flag'] = crash_data['Commercial Motor Vehicle Flag'].map({'Yes': True, 'No': False})

In [18]:
# Median of the known ages, taken before the placeholder below is written so the
# placeholder cannot skew it. It is not used by the fills in this cell.
median_age = crash_data['Person Age'].median()

# Default written into each column wherever a value is missing:
#   - Person Age: 9999 placeholder marking "age unknown"
#   - vehicle defect columns: 'No Vehicle Defect'
#   - CMV Vehicle Type: 'Not CMV'
#   - Contributing Factor 1: 'No Contributing Factor'
missing_value_defaults = {
    'Person Age': 9999,
    'Possible Vehicle Defect 1': 'No Vehicle Defect',
    'Vehicle Defect 1': 'No Vehicle Defect',
    'CMV Vehicle Type': 'Not CMV',
    'Contributing Factor 1': 'No Contributing Factor',
}

# Assign the filled column back explicitly. Calling fillna(..., inplace=True) on
# a column selection is a chained assignment: deprecated in pandas 2.x and a
# no-op under Copy-on-Write, which would leave the frame unchanged.
for column_name, default_value in missing_value_defaults.items():
    crash_data[column_name] = crash_data[column_name].fillna(default_value)

In [19]:
# The county centroid columns only served as a fallback source for missing
# coordinates, so remove them from the cleaned frame.
county_fallback_columns = ['County_Latitude', 'County_Longitude']
crash_data = crash_data.drop(columns=county_fallback_columns)

In [20]:
# Count the missing (NaN) values remaining in each column, as a check on the
# cleaning steps.
crash_data.isna().sum()

Crash ID                         0
City                             0
Commercial Motor Vehicle Flag    0
County                           0
Crash Date                       0
Crash Severity                   0
Crash Time                       0
Day of Week                      0
Latitude                         0
Longitude                        0
School Bus Flag                  0
Speed Limit                      0
CMV Vehicle Type                 0
Contributing Factor 1            0
Possible Vehicle Defect 1        0
Vehicle Defect 1                 0
Person Age                       0
Person Gender                    0
Person Injury Severity           0
Person Restraint Used            0
Person Type                      0
dtype: int64

In [21]:
# Summary statistics for the numeric columns.
# Person Age still contains the 9999 placeholder written for missing ages, so
# its mean, std and max are not real age statistics.
# NOTE(review): Speed Limit has a minimum of -1 -- presumably a "missing" code; confirm.
crash_data.describe()

Unnamed: 0,Crash ID,Latitude,Longitude,Speed Limit,Person Age
count,6095.0,6095.0,6095.0,6095.0,6095.0
mean,19007990.0,31.212226,-97.552249,60.05644,157.473339
std,186605.4,1.777563,2.487941,15.015781,1083.803901
min,18429000.0,25.958796,-106.610762,-1.0,0.0
25%,18845760.0,29.866313,-98.333342,55.0,24.0
50%,19021950.0,31.285347,-97.030274,65.0,37.0
75%,19167590.0,32.65377,-95.586549,70.0,53.0
max,19415280.0,36.435303,-93.729286,85.0,9999.0


In [22]:
# Inspect the column dtypes after cleaning (dates and times are stored as
# formatted strings, hence object dtype).
crash_data.dtypes

Crash ID                           int64
City                              object
Commercial Motor Vehicle Flag       bool
County                            object
Crash Date                        object
Crash Severity                    object
Crash Time                        object
Day of Week                       object
Latitude                         float64
Longitude                        float64
School Bus Flag                     bool
Speed Limit                        int64
CMV Vehicle Type                  object
Contributing Factor 1             object
Possible Vehicle Defect 1         object
Vehicle Defect 1                  object
Person Age                       float64
Person Gender                     object
Person Injury Severity            object
Person Restraint Used             object
Person Type                       object
dtype: object

In [23]:
# Preview the first five rows of the cleaned data.
crash_data.head()

Unnamed: 0,Crash ID,City,Commercial Motor Vehicle Flag,County,Crash Date,Crash Severity,Crash Time,Day of Week,Latitude,Longitude,...,Speed Limit,CMV Vehicle Type,Contributing Factor 1,Possible Vehicle Defect 1,Vehicle Defect 1,Person Age,Person Gender,Person Injury Severity,Person Restraint Used,Person Type
0,18674968,OUTSIDE CITY LIMITS,True,HARRIS,2022-01-01,A - SUSPECTED SERIOUS INJURY,14:49,SATURDAY,30.089179,-95.458757,...,65,9 - TRACTOR/SEMI TRAILER,47 - ILL (EXPLAIN IN NARRATIVE),No Vehicle Defect,No Vehicle Defect,34.0,1 - MALE,A - SUSPECTED SERIOUS INJURY,1 - SHOULDER & LAP BELT,1 - DRIVER
1,18676126,PHARR,True,HIDALGO,2022-01-01,A - SUSPECTED SERIOUS INJURY,15:28,SATURDAY,26.206012,-98.176431,...,45,Not CMV,22 - FAILED TO CONTROL SPEED,No Vehicle Defect,No Vehicle Defect,32.0,1 - MALE,C - POSSIBLE INJURY,1 - SHOULDER & LAP BELT,1 - DRIVER
2,18676126,PHARR,True,HIDALGO,2022-01-01,A - SUSPECTED SERIOUS INJURY,15:28,SATURDAY,26.206012,-98.176431,...,45,Not CMV,No Contributing Factor,No Vehicle Defect,No Vehicle Defect,57.0,1 - MALE,A - SUSPECTED SERIOUS INJURY,97 - NOT APPLICABLE,5 - DRIVER OF MOTORCYCLE TYPE VEHICLE
3,18676126,PHARR,True,HIDALGO,2022-01-01,A - SUSPECTED SERIOUS INJURY,15:28,SATURDAY,26.206012,-98.176431,...,45,9 - TRACTOR/SEMI TRAILER,22 - FAILED TO CONTROL SPEED,No Vehicle Defect,No Vehicle Defect,42.0,1 - MALE,B - SUSPECTED MINOR INJURY,1 - SHOULDER & LAP BELT,1 - DRIVER
4,18679166,GRAND PRAIRIE,True,TARRANT,2022-01-01,A - SUSPECTED SERIOUS INJURY,21:32,SATURDAY,32.778555,-97.061347,...,40,Not CMV,22 - FAILED TO CONTROL SPEED,No Vehicle Defect,No Vehicle Defect,20.0,2 - FEMALE,A - SUSPECTED SERIOUS INJURY,96 - NONE,1 - DRIVER


In [24]:
# Export to CSV
# NOTE(review): to_csv writes the DataFrame index by default, so the file gains
# an unnamed leading column of row numbers; pass index=False if consumers of
# the file do not need it.
crash_data.to_csv('cleaned_crash_data.csv')

In [25]:
# Export to GeoJson
# Columns carried as properties on each feature. Latitude and Longitude are left
# out because they are passed separately as the coordinate columns.
geojson_property_columns = [
    'Crash ID',
    'City',
    'Commercial Motor Vehicle Flag',
    'County',
    'Crash Date',
    'Crash Severity',
    'Crash Time',
    'Day of Week',
    'School Bus Flag',
    'Speed Limit',
    'CMV Vehicle Type',
    'Contributing Factor 1',
    'Possible Vehicle Defect 1',
    'Vehicle Defect 1',
    'Person Age',
    'Person Gender',
    'Person Injury Severity',
    'Person Restraint Used',
    'Person Type',
]

geo_json = to_geojson(
    df=crash_data,
    lat='Latitude',
    lon='Longitude',
    properties=geojson_property_columns,
)

In [26]:
# Display the generated GeoJSON object for inspection.
# NOTE(review): this renders the whole feature collection; consider previewing
# only a few features to keep the notebook output small.
geo_json

{'type': 'FeatureCollection',
 'features': [{'type': 'Feature',
   'properties': {'Crash ID': 18674968,
    'City': 'OUTSIDE CITY LIMITS',
    'Commercial Motor Vehicle Flag': True,
    'County': 'HARRIS',
    'Crash Date': '2022-01-01',
    'Crash Severity': 'A - SUSPECTED SERIOUS INJURY',
    'Crash Time': '14:49',
    'Day of Week': 'SATURDAY',
    'School Bus Flag': False,
    'Speed Limit': 65,
    'CMV Vehicle Type': '9 - TRACTOR/SEMI TRAILER',
    'Contributing Factor 1': '47 - ILL (EXPLAIN IN NARRATIVE)',
    'Possible Vehicle Defect 1': 'No Vehicle Defect',
    'Vehicle Defect 1': 'No Vehicle Defect',
    'Person Age': 34.0,
    'Person Gender': '1 - MALE',
    'Person Injury Severity': 'A - SUSPECTED SERIOUS INJURY',
    'Person Restraint Used': '1 - SHOULDER & LAP BELT',
    'Person Type': '1 - DRIVER'},
   'geometry': {'type': 'Point', 'coordinates': [-95.45875665, 30.08917859]}},
  {'type': 'Feature',
   'properties': {'Crash ID': 18676126,
    'City': 'PHARR',
    'Commer

In [27]:
# Write the GeoJSON object to disk, using a 4-space indent.
write_geojson(geo_json, filename='cleaned_crash_data.geojson', indent=4)