# Transform Crashes Dataset to Data X-Ray Input
Data Source: https://data.cityofchicago.org/Transportation/Traffic-Crashes-Crashes/85ca-t3if

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the first 100000 rows of the crashes export; every column is read as a
# string so no values are coerced on load.
crashes_csv_path = '../scratch/crashes.csv'
row_limit = 100000
raw_crashes = pd.read_csv(
    crashes_csv_path,
    dtype='str',
    nrows=row_limit,
)

In [4]:
# Drop the columns considered irrelevant for this analysis (identifiers,
# dates, street address parts, police bookkeeping flags, injury counts and
# the damage estimate).
irrelevant_columns = [
    'RD_NO',
    'CRASH_DATE',
    'REPORT_TYPE',
    'CRASH_DATE_EST_I',
    'STREET_NO',
    'STREET_NAME',
    'HIT_AND_RUN_I',
    'DATE_POLICE_NOTIFIED',
    'BEAT_OF_OCCURRENCE',
    'PHOTOS_TAKEN_I',
    'STATEMENTS_TAKEN_I',
    'INJURIES_TOTAL',
    'MOST_SEVERE_INJURY',
    'INJURIES_FATAL',
    'INJURIES_INCAPACITATING',
    'INJURIES_NON_INCAPACITATING',
    'INJURIES_REPORTED_NOT_EVIDENT',
    'INJURIES_NO_INDICATION',
    'INJURIES_UNKNOWN',
    'DAMAGE',
]
raw_crashes = raw_crashes.drop(columns=irrelevant_columns)

In [5]:
# Disabled experiment, kept for reference: add a column that mirrors the label.
#raw_crashes['FAKE_COL'] = raw_crashes['CRASH_TYPE'] == 'NO INJURY / DRIVE AWAY'

# Drop a second batch of columns that are not used as features.
unused_columns = [
    'NOT_RIGHT_OF_WAY_I',
    'CRASH_DAY_OF_WEEK',
    'CRASH_MONTH',
    'DOORING_I',
    'WORK_ZONE_I',
    'WORK_ZONE_TYPE',
    'WORKERS_PRESENT_I',
]
raw_crashes = raw_crashes.drop(columns=unused_columns)

# Disabled experiment, kept for reference: restrict to a three-column frame.
#raw_crashes = raw_crashes[['CRASH_TYPE', 'FAKE_COL', 'WEATHER_CONDITION']]

In [6]:
# Drop columns with too many unique values to be useful as features.
high_cardinality_columns = [
    'PRIM_CONTRIBUTORY_CAUSE',
    'SEC_CONTRIBUTORY_CAUSE',
    'LOCATION',
    'LATITUDE',
    'LONGITUDE',
]
raw_crashes = raw_crashes.drop(columns=high_cardinality_columns)

# Show which columns remain.
raw_crashes.columns

Index(['POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE',
       'TRAFFICWAY_TYPE', 'LANE_CNT', 'ALIGNMENT', 'ROADWAY_SURFACE_COND',
       'ROAD_DEFECT', 'CRASH_TYPE', 'INTERSECTION_RELATED_I',
       'STREET_DIRECTION', 'NUM_UNITS', 'CRASH_HOUR'],
      dtype='object')

In [7]:
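# bucket lat/lon/location -- disabled: LATITUDE, LONGITUDE and LOCATION are dropped in the previous cell, so the lines below would raise KeyError if re-enabled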
#raw_crashes['LATITUDE'] = raw_crashes['LATITUDE'].astype(float).round(2)
#raw_crashes['LONGITUDE'] = raw_crashes['LONGITUDE'].astype(float).round(2)

In [32]:
# Keep only rows whose categorical values are informative and whose required
# fields are present.

# Per column: the placeholder labels that disqualify a row.
uninformative_labels = {
    'ROAD_DEFECT': ['OTHER', 'UNKNOWN'],
    'ROADWAY_SURFACE_COND': ['OTHER', 'UNKNOWN'],
    'TRAFFICWAY_TYPE': ['OTHER', 'UNKNOWN'],
    'WEATHER_CONDITION': ['OTHER', 'UNKNOWN'],
    'LIGHTING_CONDITION': ['UNKNOWN'],
    'TRAFFIC_CONTROL_DEVICE': ['OTHER', 'UNKNOWN'],
    'DEVICE_CONDITION': ['OTHER', 'UNKNOWN'],
}
# Columns that must not be missing.
required_columns = ['INTERSECTION_RELATED_I', 'NUM_UNITS', 'LANE_CNT']

keep_row = raw_crashes[required_columns].notna().all(axis=1)
for column, labels in uninformative_labels.items():
    # A missing value is not one of the placeholder labels, so it passes this
    # test -- the same outcome as comparing with `!=`.
    keep_row &= ~raw_crashes[column].isin(labels)

# .copy() detaches the result from the unfiltered frame so that the column
# assignments in later cells modify a real frame instead of a slice
# (avoids pandas' SettingWithCopyWarning).
raw_crashes = raw_crashes[keep_row].copy()
raw_crashes.shape

(12632, 16)

In [33]:
def _lane_count_bucket(raw_value):
    """Return the interval label for a lane count given as a numeric string."""
    lanes = int(raw_value)
    if lanes <= 1:
        return '<=1'
    if lanes <= 3:
        return '(1;3]'
    if lanes <= 5:
        return '(3;5]'
    return '>5'


def _time_of_day(raw_value):
    """Return the part-of-day label for an hour given as a numeric string."""
    hour = int(raw_value)
    if 6 <= hour < 12:
        return 'MORNING'
    if 12 <= hour < 17:
        return 'DAY'
    if 17 <= hour < 21:
        return 'EVENING'
    # Everything before 6 and from 21 onwards.
    return 'NIGHT'


def _speed_limit_bucket(raw_value):
    """Return the interval label for a speed limit given as a numeric string."""
    limit = int(raw_value)
    if limit < 10:
        return '<10'
    if limit < 30:
        return '[10;30)'
    if limit < 50:
        return '[30;50)'
    # NOTE(review): this bucket also receives exactly 50, although the label
    # reads '>50'. Left unchanged so the generated data stays the same.
    return '>50'


# Separator placed between the two halves of a combined column; it is turned
# into '_' when the final input string is encoded.
level_separator = '--///--'

raw_crashes['LANE_CNT'] = raw_crashes['LANE_CNT'].map(_lane_count_bucket)

# Merge trafficway type and (bucketed) lane count into one two-part column,
# then remove the two source columns.
trafficway = raw_crashes['TRAFFICWAY_TYPE'].astype(str)
lane_bucket = raw_crashes['LANE_CNT'].astype(str)
raw_crashes['TRAFFICWAYTYPE_LANECOUNT'] = trafficway + level_separator + lane_bucket
raw_crashes = raw_crashes.drop(columns=['TRAFFICWAY_TYPE', 'LANE_CNT'])

# Same for traffic control device and its condition.
control_device = raw_crashes['TRAFFIC_CONTROL_DEVICE'].astype(str)
device_condition = raw_crashes['DEVICE_CONDITION'].astype(str)
raw_crashes['TRAFFICCONTROLDEVICE_DEVICECONDITION'] = control_device + level_separator + device_condition
raw_crashes = raw_crashes.drop(columns=['TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION'])

raw_crashes['CRASH_HOUR'] = raw_crashes['CRASH_HOUR'].map(_time_of_day)

raw_crashes['POSTED_SPEED_LIMIT'] = raw_crashes['POSTED_SPEED_LIMIT'].map(_speed_limit_bucket)

In [34]:
# Show the distinct values of every remaining column as (column, values) pairs.
[(column, raw_crashes[column].unique()) for column in raw_crashes.columns]

[('POSTED_SPEED_LIMIT',
  array(['[30;50)', '[10;30)', '<10', '>50'], dtype=object)),
 ('WEATHER_CONDITION',
  array(['CLEAR', 'CLOUDY/OVERCAST', 'SNOW', 'RAIN', 'SLEET/HAIL',
         'FOG/SMOKE/HAZE', 'SEVERE CROSS WIND GATE'], dtype=object)),
 ('LIGHTING_CONDITION',
  array(['DARKNESS', 'DARKNESS, LIGHTED ROAD', 'DAYLIGHT', 'DUSK', 'DAWN'],
        dtype=object)),
 ('FIRST_CRASH_TYPE', array(['ANGLE', 'REAR END', 'TURNING', 'PEDESTRIAN',
         'SIDESWIPE SAME DIRECTION', 'PARKED MOTOR VEHICLE',
         'OTHER NONCOLLISION', 'HEAD ON', 'SIDESWIPE OPPOSITE DIRECTION',
         'FIXED OBJECT', 'PEDALCYCLIST', 'OTHER OBJECT', 'ANIMAL',
         'OVERTURNED'], dtype=object)),
 ('ALIGNMENT',
  array(['STRAIGHT AND LEVEL', 'STRAIGHT ON GRADE', 'CURVE, LEVEL',
         'CURVE ON HILLCREST', 'CURVE ON GRADE', 'STRAIGHT ON HILLCREST'],
        dtype=object)),
 ('ROADWAY_SURFACE_COND',
  array(['DRY', 'SNOW OR SLUSH', 'WET', 'ICE', 'SAND, MUD, DIRT'],
        dtype=object)),
 ('ROAD_DEFECT

In [42]:
# Header vectors for the output file: one ':'-terminated entry per feature
# column, in column order.
# NOTE(review): the two trailing 2s in max_dims presumably belong to the two
# combined two-part columns, which are the last feature columns -- confirm
# against the Data X-Ray input specification.
feature_vector =   'a:a:a:a:a:a:a:a:a:a:a:a:a:'
structure_vector = '0:0:0:0:0:0:0:0:0:0:0:0:0:'
max_dims =         '1:1:1:1:1:1:1:1:1:1:1:2:2:'

# Feature columns are all columns except the label ('CRASH_TYPE') and the
# derived 'input-str' helper column, which only exists once the encoding cell
# has run. Excluding both makes the check independent of cell execution order.
feature_columns = [
    column for column in raw_crashes.columns
    if column not in ('CRASH_TYPE', 'input-str')
]

for header_vector in (feature_vector, structure_vector, max_dims):
    # Every entry ends with ':', so counting ':' counts the entries.
    # (Splitting on ':' would also count the empty string after the last ':'.)
    assert header_vector.count(':') == len(feature_columns), (
        'header vector {!r} does not have one entry per feature column ({})'
        .format(header_vector, len(feature_columns))
    )

In [36]:
# error_rate is the fraction of rows whose CRASH_TYPE is NOT
# 'NO INJURY / DRIVE AWAY'.
# Rows are counted with a boolean sum and len() rather than `.count()[0]`,
# which indexes a label-indexed Series by position (deprecated in pandas) and
# counts only the non-null values of whichever column happens to be first.
is_no_injury = raw_crashes['CRASH_TYPE'] == 'NO INJURY / DRIVE AWAY'
error_rate = 1 - (is_no_injury.sum() / len(raw_crashes))
cost = 100.0
error_rate

0.4734800506649778

In [37]:
# Header row of the output file: the feature vector, a tab, then a list of
# fields in which every field (including the last) is followed by ';'.
# The row count comes from len() rather than `.count()[0]`, which indexes a
# label-indexed Series by position (deprecated in pandas) and depends on the
# first column having no missing values.
header_fields = [
    max_dims,
    str(error_rate),
    str(cost),
    'false',
    feature_vector,
    structure_vector,
    str(len(raw_crashes)),
    '0',
    '',  # produces the trailing ';'
]
top_row = feature_vector + '\t' + ';'.join(header_fields)
top_row

'a:a:a:a:a:a:a:a:a:a:a:a:a:\t1:1:1:1:1:1:1:1:1:1:1:2:2:;0.4734800506649778;100.0;false;a:a:a:a:a:a:a:a:a:a:a:a:a:;0:0:0:0:0:0:0:0:0:0:0:0:0:;12632;0;'

In [38]:
# Label: True where the crash type is 'NO INJURY / DRIVE AWAY'.
truth_vals = raw_crashes['CRASH_TYPE'].eq('NO INJURY / DRIVE AWAY')
# Features: every column except the label, with all values as strings.
entries = raw_crashes.drop(columns='CRASH_TYPE').astype(str)

In [39]:
# Build one encoded string per row. Each value is wrapped as
# 'a--///--<value>--//--' and the wrapped values of a row are concatenated.
# The two dash/slash sequences are placeholders that become '_' and ':' at
# the very end.
row_strings = ('a--///--' + entries + '--//--').sum(axis=1)

# Replacements are applied in this order: first escape the characters this
# notebook uses as delimiters in the output file, then turn the placeholders
# into the real delimiters. Doing the escapes first keeps a literal ':' or '_'
# inside a value from being mistaken for a delimiter.
replacements = [
    (':', '/COLON/'),
    (';', '/SEMICOLON/'),
    ('=', '/EQ/'),
    ('%', '/PERCENT/'),
    ('_', '/UNDERSCORE/'),
    ('--//--', ':'),
    ('--///--', '_'),
]
for old, new in replacements:
    # regex=False forces literal matching; the default of `regex` differs
    # between pandas versions.
    row_strings = row_strings.str.replace(old, new, regex=False)

raw_crashes['input-str'] = row_strings

In [40]:
# Write the header row followed by one '<index>%<truth>%<input-str>=' record
# per row. No newline is written anywhere; the records follow the header
# directly.
with open('./data/crash-input-filtered-noNaNs.txt', 'w') as f:
    f.write(top_row)
    # Iterate both columns in lockstep instead of doing two positional .iloc
    # lookups per row, and stream the records to the file instead of building
    # a throwaway list of write() return values.
    f.writelines(
        '{}%{}%{}='.format(position, truth, encoded_row)
        for position, (truth, encoded_row)
        in enumerate(zip(truth_vals, raw_crashes['input-str']))
    )