In [1]:
import pandas as pd
import numpy as np

In [173]:
# Load the crash CSV into a DataFrame. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv("../data/crash-merged-lat-lng.csv")

In [174]:
# Display the column labels of the loaded frame.
df.columns

Index([u'TLA NAME', u'CRASH ROAD', u'CRASH DIST', u'CRASH DIRN', u'INTSN', u'SIDE ROAD', u'CRASH ID', u'CRASH DATE', u'CRASH DOW', u'CRASH TIME', u'MVMT', u'VEHICLES', u'CAUSES', u'OBJECTS STRUCK', u'ROAD CURVE', u'ROAD WET', u'LIGHT', u'WTHRa', u'JUNC TYPE', u'TRAF CTRL', u'ROAD MARK', u'SPD LIM', u'CRASH FATAL CNT', u'CRASH SEV CNT', u'CRASH MIN CNT', u'PERS AGE1', u'PERS AGE2', u'EASTING', u'NORTHING', u'LAT', u'LNG'], dtype='object')

---

Map crash factors to general categories

In [3]:
# Crash cause-code categories as (label, first_code, last_code) tuples.
# Both bounds are inclusive: the lookup table built from this list iterates
# range(first_code, last_code + 1). Labels are later upper-cased (spaces ->
# underscores) and used as DataFrame column names, so they must stay unique.
#
# The 610-696 entries describe vehicle defects and the 700-732 entries describe
# pedestrian behaviour; they previously carried a copy-pasted "General person:"
# prefix and are now filed under "Vehicles:" / "Pedestrians:".
ranges = [
    # 100-210: driver control
    ("Driver control: Alcohol or drugs", 100, 109),
    ("Driver control: Too fast for conditions", 110, 117),
    ("Driver control: Failed to keep left", 120, 129),
    ("Driver control: Lost control", 130, 139),
    ("Driver control: Failed to signal in time", 140, 145),
    ("Driver control: Overtaking", 150, 161),
    ("Driver control: Wrong lane or turned from wrong position", 170, 179),
    ("Driver control: In line of traffic", 180, 184),
    ("Driver control: Sudden action", 190, 199),
    ("Driver control: Forbidden movements", 200, 210),
    # 300-387: vehicle conflicts
    ("Vehicle conflicts: Failed to give way", 300, 316),
    ("Vehicle conflicts: Did not stop", 320, 328),
    ("Vehicle conflicts: Inattentive: failed to notice", 330, 341),
    ("Vehicle conflicts: Attention diverted by", 350, 363),
    ("Vehicle conflicts: Did not see or look for another party until too late", 370, 379),
    ("Vehicle conflicts: Misjudged speed, distance, size or position of", 380, 387),
    # 400-448: general driver
    ("General driver: Inexperience", 400, 408),
    ("General driver: Fatigue (drowsy, tired, fell asleep)", 410, 415),
    ("General driver: Incorrect use of vehicle controls", 420, 429),
    ("General driver: Showing off", 430, 434),
    ("General driver: Parked or stopped", 440, 448),
    # 500-534: general person
    ("General person: Illness and disability", 500, 517),
    ("General person: Driver or passenger, boarding, leaving, in vehicle", 520, 527),
    ("General person: Miscellaneous person", 530, 534),
    # 600-696: vehicles
    ("Vehicles: Lights and reflectors at fault or dirty", 600, 607),
    ("Vehicles: Brakes", 610, 615),
    ("Vehicles: Steering", 620, 622),
    ("Vehicles: Tyres", 630, 634),
    ("Vehicles: Windscreen or mirror", 640, 648),
    ("Vehicles: Mechanical", 650, 653),
    ("Vehicles: Body or chassis", 660, 673),
    ("Vehicles: Load", 680, 688),
    ("Vehicles: Miscellaneous vehicle", 690, 696),
    # 700-732: pedestrians
    ("Pedestrians: Walking along road", 700, 705),
    ("Pedestrians: Crossing road", 710, 719),
    ("Pedestrians: Miscellaneous", 720, 732),
    # 800-873: road
    ("Road: Slippery", 800, 809),
    ("Road: Surface", 810, 819),
    ("Road: Obstructed", 820, 827),
    ("Road: Visibility limited", 830, 839),
    ("Road: Signs and signals", 840, 845),
    ("Road: Markings", 850, 855),
    ("Road: Street lighting", 860, 864),
    ("Road: Raised islands and roundabouts", 870, 873),
    # 900-931: miscellaneous
    ("Miscellaneous: Weather", 900, 905),
    ("Miscellaneous: Animals", 910, 915),
    ("Miscellaneous: Entering or leaving land use", 920, 931)
]

In [4]:
# Dense code -> category table: the list index is the numeric cause code and
# the value is the category label, or None where no entry of `ranges` covers
# that code.
cat_lookup = [None] * 1000
for label, first_code, last_code in ranges:
    # Both bounds of a range are inclusive.
    for code in range(first_code, last_code + 1):
        cat_lookup[code] = label

In [5]:
# Copy the CAUSES column into a plain Python list (one entry per row) so the
# loops that follow can index it by row position.
causes = df["CAUSES"].tolist()

In [21]:
# One indicator list per category label, zero-filled and as long as `causes`.
row_count = len(causes)
new_attributes = {}
for label, _first_code, _last_code in ranges:
    new_attributes[label] = [0] * row_count

In [22]:
# For every crash row, set the 0/1 flag of each category that one of the row's
# cause codes maps to through `cat_lookup`.
#
# Characters removed from a raw CAUSES value before the remaining
# whitespace-separated tokens are parsed as integers.
# NOTE(review): presumably A-D identify the party a cause is attributed to;
# confirm against the data dictionary.
strip_chars = "ABCD+"
for i, raw_causes in enumerate(causes):
    # pandas represents a missing CAUSES cell as NaN; such rows keep all-zero flags.
    if pd.isnull(raw_causes):
        continue
    cleaned = raw_causes
    for ch in strip_chars:
        cleaned = cleaned.replace(ch, "")
    this_causes = [int(token) for token in cleaned.split()]
    for code in this_causes:
        # Codes outside the table have no category. Without this bounds check
        # a code >= len(cat_lookup) raises IndexError, and a negative code
        # would silently index from the end of the list.
        if not 0 <= code < len(cat_lookup):
            continue
        category = cat_lookup[code]
        if category is not None:
            new_attributes[category][i] = 1

In [140]:
# Attach each category's indicator list to the frame. The column name is the
# category label upper-cased with spaces turned into underscores.
for category, flags in new_attributes.items():
    column_name = category.upper().replace(" ", "_")
    df[column_name] = flags

----

Create `D1_FACTOR`. **FIXME:** this attribute is not currently computed correctly.

In [179]:
# missing is assumed to be "no"
# Every row starts as "n"; a row becomes "y" when its CAUSES string contains
# at least as many "A" characters as "B" characters.
# NOTE(review): because the comparison is `>=`, a value containing neither
# letter (0 >= 0) is also marked "y" -- confirm this is intended.
d1_factor = ["n"] * len(causes)
for row, cause in enumerate(causes):
    if "nan" not in str(cause) and cause.count("A") >= cause.count("B"):
        d1_factor[row] = "y"

In [181]:
# Count how many rows were left at "n".
d1_factor.count("n")

196578

In [161]:
# Store the per-row "y"/"n" list as the D1_FACTOR column.
df["D1_FACTOR"] = d1_factor

----

Create an attribute called `PRINCIPAL_MOVEMENT`

In [74]:
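# Unused experiment, left disabled: it would split each VEHICLES value into
# (first two characters, remainder).
#vehicle_codes = df["VEHICLES"].tolist()
#vehicle_codes = [ (x[0:2], x[2::] ) for x in vehicle_codes ]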

In [147]:
# Copy the MVMT column into a plain Python list, one entry per row.
movements = df["MVMT"].tolist()

In [148]:
# Maps the leading letter of a MVMT value to a human-readable movement
# category. The letters "I" and "O" have no entry.
letter_to_movement = {
    "A": "Overtaking and lane change",
    "B": "Head on",
    "C": "Lost control or off road (straight roads)",
    "D": "Cornering",
    "E": "Collision with obstruction",
    "F": "Rear end",
    "G": "Turning versus same direction",
    "H": "Crossing (no turns)",
    "J": "Crossing (vehicle turning)",
    "K": "Merging",
    "L": "Right turn against",
    "M": "Manoeuvring",
    "N": "Pedestrians crossing road",
    "P": "Pedestrians (other)",
    "Q": "Miscellaneous"
}

In [152]:
# Per-row principal movement: the first character of the MVMT value when that
# character is a key of `letter_to_movement`, otherwise the empty string
# (including rows whose MVMT value is missing).
principal_movements = []
for movement in movements:
    code = ""
    if "nan" not in str(movement):
        first_letter = movement[0]
        if first_letter in letter_to_movement:
            code = first_letter
    principal_movements.append(code)

In [153]:
# Echo the list to eyeball the assigned letters.
# NOTE(review): this displays every element; a slice such as
# principal_movements[:10] would keep the saved output short.
principal_movements

['D',
 'D',
 'E',
 'B',
 'D',
 'M',
 'H',
 'C',
 'G',
 'E',
 'E',
 'C',
 'M',
 'H',
 'M',
 'H',
 'D',
 'J',
 'H',
 'H',
 'H',
 'P',
 'D',
 'H',
 'C',
 'M',
 'D',
 'C',
 'B',
 'E',
 'M',
 'C',
 'E',
 'H',
 'M',
 'M',
 'C',
 'M',
 'C',
 'H',
 'G',
 'D',
 'H',
 'H',
 'H',
 'H',
 'G',
 'C',
 'F',
 'D',
 'B',
 'C',
 'C',
 'G',
 'H',
 'Q',
 'B',
 'H',
 'H',
 'H',
 'D',
 'D',
 'D',
 'C',
 'L',
 'C',
 'G',
 'D',
 'E',
 'L',
 'C',
 'L',
 'C',
 'E',
 'D',
 'D',
 'H',
 'F',
 'H',
 'K',
 'C',
 'D',
 'H',
 'D',
 'K',
 'H',
 'P',
 'B',
 'Q',
 'L',
 'C',
 'C',
 'Q',
 'A',
 'C',
 'D',
 'D',
 'D',
 'C',
 'C',
 'C',
 'B',
 'M',
 'F',
 'C',
 'L',
 'L',
 'K',
 'H',
 'C',
 'H',
 'H',
 'H',
 'D',
 'F',
 'F',
 'F',
 'L',
 'C',
 'F',
 'E',
 'G',
 'J',
 'B',
 'J',
 'C',
 'C',
 'B',
 'C',
 'A',
 'C',
 'C',
 'A',
 'C',
 'A',
 'B',
 'C',
 'F',
 'A',
 'B',
 'C',
 'H',
 'B',
 'L',
 'D',
 'H',
 'E',
 'D',
 'M',
 'D',
 'D',
 'D',
 'A',
 'L',
 'J',
 'B',
 'L',
 'J',
 'D',
 'F',
 'D',
 'G',
 'J',
 'H',
 'G',
 'K',
 'Q'

In [154]:
# Store the per-row movement letters as the PRINCIPAL_MOVEMENT column.
df["PRINCIPAL_MOVEMENT"] = principal_movements

----

----

Objects struck

In [137]:
# One zero-filled indicator list per object-struck code, each with one slot
# per row of `df`. The code list skips the letters "O" and "U".
struck_codes = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'Z']
n_rows = df.shape[0]
objects_struck = {code: [0] * n_rows for code in struck_codes}

In [138]:
# For every row, set the 0/1 flag of each object code that appears as a
# character of the row's OBJECTS STRUCK value.
struck_vector = df["OBJECTS STRUCK"].tolist()
for i, strucks in enumerate(struck_vector):
    # pandas represents a missing cell as NaN; such rows keep all-zero flags.
    if pd.isnull(strucks):
        continue
    for elem in strucks:
        # Characters with no entry in the table (for example whitespace or a
        # letter the table skips) are ignored instead of raising KeyError,
        # matching how unknown cause codes and movement letters are handled.
        if elem in objects_struck:
            objects_struck[elem][i] = 1

In [139]:
# Attach each object code's indicator list to the frame as a STRUCK_<code> column.
for code, flags in objects_struck.items():
    df["STRUCK_" + code] = flags

----

Specify attributes that should be removed

In [108]:
# Columns to delete from the frame before it is written out.
should_remove = [
    "TLA NAME",
    "CRASH ROAD",
    "SIDE ROAD",
    "CRASH ID",
    "CRASH DATE",
    "LAT",
    "LNG",
    "MVMT",
    "OBJECTS STRUCK",
]

In [141]:
# Delete each listed column from the frame in place.
# NOTE(review): this mutates `df`, so running the cell a second time fails
# once the columns are gone.
for unwanted_column in should_remove:
    del df[unwanted_column]

In [159]:
# Write the transformed frame to CSV without the index column.
# NOTE(review): the destination is a hardcoded absolute path under /tmp --
# confirm this is the intended output location.
df.to_csv("/tmp/tmp.csv", index=False)