# Transform Crashes Dataset to Data X-Ray Input
Data Source: https://data.cityofchicago.org/Transportation/Traffic-Crashes-Crashes/85ca-t3if

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read every column as text and keep only the first 50 records as a small sample.
crashes_csv = '../scratch/crashes.csv'
raw_crashes = pd.read_csv(crashes_csv, dtype='str').head(50)

In [3]:
# Columns that are removed from the frame before it is encoded.
unused_columns = [
    'RD_NO',
    'REPORT_TYPE',
    'CRASH_DATE_EST_I',
    'HIT_AND_RUN_I',
    'DATE_POLICE_NOTIFIED',
    'BEAT_OF_OCCURRENCE',
    'PHOTOS_TAKEN_I',
    'STATEMENTS_TAKEN_I',
    'INJURIES_TOTAL',
    'MOST_SEVERE_INJURY',
    'INJURIES_FATAL',
    'INJURIES_INCAPACITATING',
    'INJURIES_NON_INCAPACITATING',
    'INJURIES_REPORTED_NOT_EVIDENT',
    'INJURIES_NO_INDICATION',
    'INJURIES_UNKNOWN',
    'DAMAGE',
]
raw_crashes = raw_crashes.drop(columns=unused_columns)
# Display the remaining column names as the cell output.
raw_crashes.columns

Index(['CRASH_DATE', 'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE',
       'DEVICE_CONDITION', 'WEATHER_CONDITION', 'LIGHTING_CONDITION',
       'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'LANE_CNT', 'ALIGNMENT',
       'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'CRASH_TYPE',
       'INTERSECTION_RELATED_I', 'NOT_RIGHT_OF_WAY_I',
       'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO',
       'STREET_DIRECTION', 'STREET_NAME', 'DOORING_I', 'WORK_ZONE_I',
       'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I', 'NUM_UNITS', 'CRASH_HOUR',
       'CRASH_DAY_OF_WEEK', 'CRASH_MONTH', 'LATITUDE', 'LONGITUDE',
       'LOCATION'],
      dtype='object')

In [4]:
# Escape ':', '=', '%', ';' and '_' in the column names. Only the number of
# names is used below: each feature is labelled with the placeholder 'a'
# rather than its escaped column name.
escapes = (
    (':', '/COLON/'),
    ('=', '/EQ/'),
    ('%', '/PERCENT/'),
    (';', '/SEMICOLON/'),
    ('_', '/UNDERSCORE/'),
)
col_names = []
for name in raw_crashes.columns:
    for raw, escaped in escapes:
        name = name.replace(raw, escaped)
    col_names.append(name)

# One slot per column except one (CRASH_TYPE is split off as the label later).
n_features = len(col_names) - 1
feature_vector = ':'.join(['a'] * n_features) + ':'
structure_vector = ':'.join(['0'] * n_features) + ':'
max_dims = ':'.join(['1'] * n_features) + ':'

In [5]:
# Share of rows whose CRASH_TYPE is anything other than 'NO INJURY / DRIVE AWAY'.
# The rate is taken over rows via the boolean mask; the previous `count()[0]`
# form indexed a label-indexed Series by position (deprecated in pandas 2.x)
# and counted non-null values of the first column instead of rows.
is_no_injury = raw_crashes['CRASH_TYPE'] == 'NO INJURY / DRIVE AWAY'
error_rate = float(1 - is_no_injury.mean())
# Cost value written into the header row of the output file.
cost = 99.1
# Display the computed rate as the cell output.
error_rate

0.26

In [6]:
# Header row: the feature vector, a tab, then ';'-terminated fields in this
# order: max dims, error rate, cost, the literal 'false', feature vector,
# structure vector, number of rows, and the literal '0'.
# The row count uses len() rather than `count()[0]`, which indexed a
# label-indexed Series by position (deprecated in pandas 2.x) and counted
# non-null values of the first column instead of rows.
header_fields = [
    max_dims,
    str(error_rate),
    str(cost),
    'false',
    feature_vector,
    structure_vector,
    str(len(raw_crashes)),
    '0',
]
top_row = feature_vector + '\t' + ';'.join(header_fields) + ';'
# Display the header row as the cell output.
top_row

'a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:\t1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:;0.26;99.1;false;a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:;0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:;50;0;'

In [7]:
# Boolean label per row: True when the crash type is 'NO INJURY / DRIVE AWAY'.
label_column = 'CRASH_TYPE'
truth_vals = raw_crashes[label_column].eq('NO INJURY / DRIVE AWAY')
# Every other column, converted to strings so the values can be concatenated.
entries = raw_crashes.drop(columns=label_column).astype(str)

In [8]:
# Wrap each value as 'a--///--<value>--//--' and concatenate the pieces of a
# row into one string (summing string cells along axis 1 concatenates them).
encoded = ('a--///--' + entries + '--//--').sum(axis=1)

# First escape the reserved characters found in the data, then turn the
# temporary markers into the real separators: '--//--' becomes ':' and
# '--///--' becomes '_'. The order matters, so the markers are replaced last.
replacements = (
    (':', '/COLON/'),
    (';', '/SEMICOLON/'),
    ('=', '/EQ/'),
    ('%', '/PERCENT/'),
    ('_', '/UNDERSCORE/'),
    ('--//--', ':'),
    ('--///--', '_'),
)
for marker, substitute in replacements:
    encoded = encoded.str.replace(marker, substitute)

raw_crashes['instr'] = encoded

In [9]:
# Write the header row followed by one '<position>%<label>%<encoded row>='
# record per crash, all on a single line with no separators added.
with open('crash-input.txt', 'w') as f:
    f.write(top_row)
    # Stream the records with writelines instead of building a throwaway list
    # of write() return values; enumerate/zip avoids per-row .iloc lookups.
    f.writelines(
        '{}%{}%{}='.format(position, truth, instr)
        for position, (truth, instr) in enumerate(zip(truth_vals, raw_crashes['instr']))
    )