In [3]:
import os

import numpy as np
import pandas as pd

In [None]:
folder = 'data/'

# Shared reader options: case_id is kept as text and each file is parsed in a single pass.
read_options = {'dtype': {'case_id': str}, 'low_memory': False}

collisions = pd.read_csv(folder + "collisions2018.csv", **read_options)
parties = pd.read_csv(folder + "parties2018.csv", **read_options)
victims = pd.read_csv(folder + "victims2018.csv", **read_options)

## PARTIES

In [None]:
# Preview the first ten rows of the parties table.
parties.head(n=10)

In [None]:
# Print a summary of the parties frame (columns and their dtypes).
parties.info()

In [None]:
# Collect the distinct Python types stored in cellphone_use.
set(parties["cellphone_use"].map(type))

In [None]:
# Show only the cellphone_use entries that are stored as Python floats.
is_float_entry = parties['cellphone_use'].map(lambda value: isinstance(value, float))
parties.loc[is_float_entry, 'cellphone_use']

NaN values are labelled as float type

In [None]:
# Collect the distinct Python types stored in vehicle_make.
set(parties["vehicle_make"].map(type))

All the other columns of type object have missing values, but the types are the same.

### Check if column fields conform to the schema

In [None]:
# Print the value distribution (NaN included) of every parties column,
# each one preceded by a separator line.
separator = '----------------------------------------------'
for column_name in parties.columns:
    print(separator)
    print(parties[column_name].value_counts(dropna=False))

----------------------------------------------
0    3979697
1    3306909
Name: at_fault, dtype: int64
----------------------------------------------
3698263                92
1237567                76
0065479                58
3763361                34
1987342                28
                       ..
3710011201223501103     1
2314517                 1
2522615                 1
0482448                 1
0519714                 1
Name: case_id, Length: 3678058, dtype: int64
----------------------------------------------
3      2572993
NaN    2540882
D      1274423
C       795475
2        39114
B        38932
1        24787
Name: cellphone_use, dtype: int64
----------------------------------------------
Y      5249681
N       746801
O       664187
NaN     624967
E          970
Name: financial_responsibility, dtype: int64
----------------------------------------------
NaN    7269293
A        17313
Name: hazardous_materials, dtype: int64
----------------------------------------------
420

There are a few inconsistencies :

    - cellphone_use : some entries use the chars B, C or D and others integers 1, 2 or 3. As the documentation says that this field should be chars, we can map the integers to chars. By looking at the distribution of values it seems that 1 -> B, 2-> C and 3 -> D.
    - vehicle_make : There seem to be spelling mistakes in the make of the vehicle, for example TOYTA instead of TOYOTA.

In [None]:
# cellphone_use: translate the numeric string codes "1"/"2"/"3" to the
# letter codes "B"/"C"/"D"; all other entries are left as they are.
dic = {"1": "B", "2": "C", "3": "D"}

is_numeric_code = parties["cellphone_use"].isin(list(dic))
parties.loc[is_numeric_code, "cellphone_use"] = (
    parties.loc[is_numeric_code, "cellphone_use"].map(dic)
)

In [None]:
# vehicle_make: sorted array of the distinct non-null values.
distinct_makes = parties["vehicle_make"].dropna().unique()
np.sort(distinct_makes)

In [None]:
# Corrections for vehicle_make: misspellings are mapped to the intended make,
# and the entries mapped to np.nan become missing values.
typos = {
    'MAZD': 'MAZDA',
    'MERCEDES-BENZ': 'MERCEDES BENZ',
    'NISS': 'NISSAN',
    'NOT STATED': np.nan,
    'TOYTA': 'TOYOTA',
    'WHITE': np.nan,
    'WHITE GMC': 'GMC',
    'WHITE VOLVO': 'VOLVO',
}

# Values that are not listed in `typos` are kept unchanged.
parties["vehicle_make"] = parties["vehicle_make"].map(lambda make: typos.get(make, make))

Change party sex, at_fault, school_bus_related and hazardous materials to one character strings

In [None]:
# Recode party_sex, at_fault, hazardous_materials and school_bus_related
# to one-character strings.
dic_sex_fault = {"female" : "F", "male": "M", 1: "T", 0: "F"}
dic_hazardous_bus = {"A" : "T", "E" : "T"}

# party_sex / at_fault: translate the known values and leave everything else
# (NaN, or values that were already recoded) untouched.
for col in ["party_sex", "at_fault"]:
    parties[col] = parties[col].map(lambda x: dic_sex_fault.get(x, x))

# hazardous_materials / school_bus_related are flags: a known code becomes "T",
# anything else (including NaN) becomes "F". "T" is accepted as well so that
# running this cell a second time does not reset every flag to "F".
flag_codes = set(dic_hazardous_bus) | {"T"}
for col in ["hazardous_materials", "school_bus_related"]:
    parties[col] = parties[col].map(lambda x: "T" if x in flag_codes else "F")

# Show the resulting distributions as a sanity check.
for col in ["party_sex", "at_fault", "hazardous_materials", "school_bus_related"]:
    print('----------------------------------------------')
    print(parties[col].value_counts(dropna=False))

## VICTIMS

In [None]:
# Preview the first ten rows of the victims table.
victims.head(n=10)

In [None]:
# Print a summary of the victims frame (columns and their dtypes).
victims.info()

In [None]:
# Collect the distinct Python types stored in victim_safety_equipment_1.
set(victims["victim_safety_equipment_1"].map(type))

Degree of injury is of type object but contains only fields of type string, same for other columns of type object.

### Check if column fields conform to the schema

In [None]:
# Print the value distribution (NaN included) of every victims column,
# each one preceded by a separator line.
separator = '----------------------------------------------'
for column_name in victims.columns:
    print(separator)
    print(victims[column_name].value_counts(dropna=False))

----------------------------------------------
3150216    116
0828420    112
3698263     97
2671741     96
1407091     89
          ... 
1883324      1
3161149      1
2778469      1
3121687      1
1196646      1
Name: case_id, Length: 2053340, dtype: int64
----------------------------------------------
4094       1
525398     1
603248     1
640110     1
644204     1
          ..
1394693    1
1406979    1
1402881    1
2797566    1
4098       1
Name: id, Length: 4082685, dtype: int64
----------------------------------------------
1     1916883
2     1870134
3      239077
4       43874
5        8961
       ...   
62          1
54          1
60          1
32          1
46          1
Name: party_number, Length: 86, dtype: int64
----------------------------------------------
18.0     146098
NaN      142022
19.0     135319
17.0     129664
20.0     122082
          ...  
112.0         4
109.0         3
108.0         3
114.0         2
124.0         1
Name: victim_age, Length: 128, dtype: int64


There are a few inconsistencies :
    
    - victim_degree_of_injury has four entries with value 7 instead of the string "possible injury"
    - victim_ejected has four entries with value 4
 
For victim_degree_of_injury we can substitute 7 for the corresponding string value.
For victim_ejected we can put a NaN value instead of the 4.

In [None]:
# Replace the stray code '7' with its textual label.
is_code_7 = victims["victim_degree_of_injury"].eq('7')
victims.loc[is_code_7, "victim_degree_of_injury"] = "possible injury"

In [None]:
# The value 4 is treated as invalid for victim_ejected: blank it out.
is_code_4 = victims["victim_ejected"].eq(4)
victims.loc[is_code_4, "victim_ejected"] = np.nan

Change victim_sex to one character string M / F

In [None]:
# Shorten victim_sex to "F" / "M"; any other value is left unchanged.
dic = {"female" : "F", "male": "M"}
victims["victim_sex"] = victims["victim_sex"].map(lambda sex: dic.get(sex, sex))
victims["victim_sex"].value_counts(dropna=False)

## COLLISIONS

In [None]:
# Preview the first ten rows of the collisions table.
collisions.head(n=10)

In [None]:
# Print a summary of the collisions frame (columns and their dtypes).
collisions.info()

In [None]:
# Collect the distinct Python types stored in officer_id.
set(collisions["officer_id"].map(type))

In [None]:
# Show the first few officer_id entries that are stored as str.
is_str_entry = collisions["officer_id"].map(lambda value: isinstance(value, str))
collisions.loc[is_str_entry, "officer_id"].head()

All the fields of type object are either of type str or str and float. When they are of type str and float, the float values correspond to a NaN. The type object comes from the fact that when we import the data we set low_memory to false.

### Check if column fields conform to the schema

In [None]:
# Print the value distribution (NaN included) of every collisions column,
# each one preceded by a separator line.
separator = '----------------------------------------------'
for column_name in collisions.columns:
    print(separator)
    print(collisions[column_name].value_counts(dropna=False))

----------------------------------------------
1733705                1
1287912                1
2104023                1
1613894                1
2392391                1
                      ..
2708011229153889561    1
2653888                1
2382673                1
2590565                1
0217563                1
Name: case_id, Length: 3678063, dtype: int64
----------------------------------------------
2002-11-08    2918
2003-10-31    2479
2005-02-11    2340
2002-12-20    2319
2001-12-14    2264
              ... 
2018-07-20       1
2018-05-16       1
2017-10-21       1
2018-01-05       1
2018-02-08       1
Name: collision_date, Length: 2583, dtype: int64
----------------------------------------------
property damage only    2272352
pain                     846108
other injury             457931
severe injury             76085
fatal                     25587
Name: collision_severity, dtype: int64
----------------------------------------------
NaN         29645
16:00:00    27292

**SECOND MILESTONE :**
There are 3 case_id that are not unique : 97293, 373108 and 965874. We need to check if they are just duplicated rows.

**FINAL MILESTONE :** The previous case_id values were not actually duplicated; the apparent duplicates were caused by importing case_id as int instead of as a string.

In [None]:
# Inspect rows whose case_id contains '97293', one of the ids that was
# previously reported as duplicated.
contains_97293 = collisions["case_id"].str.contains('97293')
collisions.loc[contains_97293].head(10)

In [None]:
# Disabled code, kept inside a string literal so that it never runs.
# It dropped the three case_ids that looked duplicated in the second milestone;
# the notebook text states they were not real duplicates, so nothing is dropped here.
'''
duplicated_id = collisions.loc[collisions.case_id.isin(['97293', '373108', '965874'])]
duplicated_id

#Since they do not refer to the same case they seem to be a mistake. We will delete those 6 cases from the data to avoid any issues. We need to be carreful to remove also the associated data in the dataframes parties and victims.

duplicated_case_id = ['97293', '373108', '965874']

collisions.drop(collisions[collisions.case_id.isin(duplicated_case_id)].index, inplace=True)
parties.drop(parties[parties.case_id.isin(duplicated_case_id)].index, inplace=True)
victims.drop(victims[victims.case_id.isin(duplicated_case_id)].index, inplace=True)
'''

In [None]:
# True when no case_id value appears more than once in collisions.
collisions["case_id"].is_unique

In [None]:
# Length of the longest case_id string.
case_id_lengths = collisions["case_id"].apply(len)
case_id_lengths.max()

There are other inconsistencies : 

    - for hit_and_run: there is a field with value D
    - for officer_id: there are ids that are not only numbers
    - for pcf_violation_category: there is a field with value 21804
    - for pcf_violation_subsection: there are integer and string values
    - for road_surface: there is a field with value H
 
For hit_and_run, pcf_violation_category and road_surface, we can set the value of the field to NaN.
For officer_id, we can convert the field to strings.
For pcf_violation_subsection, we will either have to map the integer values to a string or vice versa, or convert all the values to strings

In [None]:
# Values that are treated as invalid for their column and blanked out.
invalid_values = {
    "hit_and_run": 'D',
    "pcf_violation_category": '21804',
    "road_surface": 'H',
}
for column_name, bad_value in invalid_values.items():
    collisions.loc[collisions[column_name] == bad_value, column_name] = np.nan

# pcf_violation_subsection: store every present value as a string.
# A plain astype(str) would also turn missing values into the literal text
# "nan", so the original missing entries are restored after the conversion.
subsection = collisions["pcf_violation_subsection"]
collisions["pcf_violation_subsection"] = subsection.astype(str).where(subsection.notna())

In [None]:
# officer_id: store every present value as a string.
# A plain astype(str) would also turn missing values into the literal text
# "nan", so the original missing entries are restored after the conversion.
officer_id = collisions["officer_id"]
collisions["officer_id"] = officer_id.astype(str).where(officer_id.notna())

Remove case_ids in collisions but not in parties :

In [None]:
# Despite its name, `todrop` is True for the collisions that DO have a
# matching case_id in parties; the rows where it is False are removed.
todrop = collisions["case_id"].isin(parties["case_id"])
orphan_index = collisions.loc[~todrop].index
collisions.drop(index=orphan_index, inplace=True)

Change tow_away to a 1 character string T/F

In [None]:
# Recode tow_away: 1.0 -> "T", 0.0 -> "F"; any other value is left unchanged.
dic = {1.0 : "T", 0.0: "F"}

collisions["tow_away"] = collisions["tow_away"].map(lambda flag: dic.get(flag, flag))
collisions["tow_away"].value_counts(dropna=False)

# Map short codes to long strings

Some fields have values encoded as integers or letters (A, B, C, ...). We map each code to its descriptive string.

## Collisions

In [None]:
# Numeric ramp_intersection codes and their descriptive labels.
dic_ramp_intersection = {
    1: 'Ramp Exit, Last 50 Feet',
    2: 'Mid-Ramp',
    3: 'Ramp Entry, First 50 Feet',
    4: 'Not State Highway, Ramp-related, Within 100 Feet',
    5: 'Intersection',
    6: 'Not State Highway, Intersection-related, Within 250 Feet',
    7: 'Highway',
    8: 'Not State Highway'
}

# Assign the result back instead of calling replace(inplace=True) on the
# column accessor: the in-place form acts on a temporary Series and is not
# guaranteed to update the DataFrame (it has no effect under Copy-on-Write).
collisions["ramp_intersection"] = collisions["ramp_intersection"].replace(dic_ramp_intersection)

In [34]:
# Distribution of the ramp_intersection labels (missing values excluded).
collisions["ramp_intersection"].value_counts()

Not State Highway                                           333975
Highway                                                     148031
Intersection                                                 63923
Not State Highway, Ramp-related, Within 100 Feet             61124
Mid-Ramp                                                     56988
Ramp Exit, Last 50 Feet                                      53884
Not State Highway, Intersection-related, Within 250 Feet     21378
Ramp Entry, First 50 Feet                                    11192
Name: ramp_intersection, dtype: int64

## Victims

In [35]:
# Numeric victim_ejected codes and their labels; code 3 is mapped to a missing value.
dic_victim_ejected = {
    0: 'Not Ejected',
    1: 'Fully Ejected',
    2: 'Partially Ejected',
    3: np.nan,
}

# Assign the result back instead of calling replace(inplace=True) on the
# column accessor: the in-place form acts on a temporary Series and is not
# guaranteed to update the DataFrame (it has no effect under Copy-on-Write).
victims["victim_ejected"] = victims["victim_ejected"].replace(dic_victim_ejected)
victims["victim_ejected"].value_counts(dropna=False)

Not Ejected          3619541
NaN                   351825
Fully Ejected          98869
Partially Ejected      12450
Name: victim_ejected, dtype: int64

In [36]:
# Numeric victim_role codes and their labels.
dic_victim_role = {
    1: 'Driver',
    2: 'Passenger',
    3: 'Pedestrian',
    4: 'Bicyclist',
    5: 'Other',
    6: 'Non-Injured Party'
}

# Assign the result back instead of calling replace(inplace=True) on the
# column accessor: the in-place form acts on a temporary Series and is not
# guaranteed to update the DataFrame (it has no effect under Copy-on-Write).
victims["victim_role"] = victims["victim_role"].replace(dic_victim_role)
victims["victim_role"].value_counts(dropna=False)

Passenger            2336957
Driver               1266063
Non-Injured Party     299496
Pedestrian            101951
Bicyclist              75111
Other                   3107
Name: victim_role, dtype: int64

In [37]:
# Letter codes for safety equipment and their labels; code 'B' is mapped to a missing value.
dic_victim_safety = {
    'A': 'None in Vehicle',
    'B': np.nan,
    'C': 'Lap Belt Used',
    'D': 'Lap Belt Not Used',
    'E': 'Shoulder Harness Used',
    'F': 'Shoulder Harness Not Used',
    'G': 'Lap/Shoulder Harness Used',
    'H': 'Lap/Shoulder Harness Not Used',
    'J': 'Passive Restraint Used',
    'K': 'Passive Restraint Not Used',
    'L': 'Air Bag Deployed',
    'M': 'Air Bag Not Deployed',
    'N': 'Other',
    'P': 'Not Required',
    'Q': 'Child Restraint in Vehicle Used',
    'R': 'Child Restraint in Vehicle Not Used',
    'S': 'Child Restraint in Vehicle, Use Unknown',
    'T': 'Child Restraint in Vehicle, Improper Use',
    'U': 'No Child Restraint in Vehicle',
    'V': 'Driver, Motorcycle Helmet Not Used',
    'W': 'Driver, Motorcycle Helmet Used',
    'X': 'Passenger, Motorcycle Helmet Not Used',
    'Y': 'Passenger, Motorcycle Helmet Used',
}

# Assign the result back instead of calling replace(inplace=True) on the
# column accessor: the in-place form acts on a temporary Series and is not
# guaranteed to update the DataFrame (it has no effect under Copy-on-Write).
for equipment_column in ["victim_safety_equipment_1", "victim_safety_equipment_2"]:
    victims[equipment_column] = victims[equipment_column].replace(dic_victim_safety)
victims["victim_safety_equipment_1"].value_counts(dropna=False)

Lap/Shoulder Harness Used                   1604459
Air Bag Not Deployed                         852496
Not Required                                 502238
Air Bag Deployed                             333893
NaN                                          311862
Child Restraint in Vehicle Used              131274
Lap Belt Used                                 80584
None in Vehicle                               52112
Lap/Shoulder Harness Not Used                 45154
Other                                         43321
Driver, Motorcycle Helmet Used                41968
Driver, Motorcycle Helmet Not Used            28304
Shoulder Harness Used                         12132
Lap Belt Not Used                             11974
Passenger, Motorcycle Helmet Used              8160
Shoulder Harness Not Used                      6602
No Child Restraint in Vehicle                  6303
Passive Restraint Used                         3747
Child Restraint in Vehicle, Improper Use       1677
Child Restra

In [38]:
# Copy victim_seating_position into a separate victim_seating_position_code column.
seating_codes = victims['victim_seating_position'].copy()
victims['victim_seating_position_code'] = seating_codes

In [39]:
# Numeric seating-position codes and their labels. Codes 2-6 all collapse to
# 'Passenger'; code 9 is mapped to a missing value.
dic_victim_seating = {
    1: 'Driver',
    2: 'Passenger',
    3: 'Passenger',
    4: 'Passenger',
    5: 'Passenger',
    6: 'Passenger',
    7: 'Station Wagon Rear',
    8: 'Rear Occupant of Truck or Van',
    9: np.nan,
    0: 'Other',
}

# Assign the result back instead of calling replace(inplace=True) on the
# column accessor: the in-place form acts on a temporary Series and is not
# guaranteed to update the DataFrame (it has no effect under Copy-on-Write).
victims["victim_seating_position"] = victims["victim_seating_position"].replace(dic_victim_seating)
victims["victim_seating_position"].value_counts(dropna=False)

Passenger                        2381654
Driver                           1326095
NaN                               209793
Other                              72234
Station Wagon Rear                 51631
Rear Occupant of Truck or Van      41278
Name: victim_seating_position, dtype: int64

## Parties

In [40]:
# Distribution of the raw other_associate_factor_1 codes, NaN included.
parties["other_associate_factor_1"].value_counts(dropna=False)

N      5410008
A       551353
NaN     479546
F       279826
G       241815
H        68380
M        66173
E        53022
I        43092
L        36958
K        19303
O        15983
J        13707
Y         4927
P          820
R          697
U          310
T          241
X          162
S          102
Q           80
V           74
W           27
Name: other_associate_factor_1, dtype: int64

In [41]:
# Letter codes for the associated factor and their labels; code 'N' is mapped to a missing value.
dic_assoc_factor = {
    'A': 'Violation',
    'E': 'Vision Obscurements',
    'F': 'Inattention',
    'G': 'Stop and Go Traffic',
    'H': 'Entering/Leaving Ramp',
    'I': 'Previous Collision',
    'J': 'Unfamiliar With Road',
    'K': 'Defective Vehicle Equipment',
    'L': 'Uninvolved Vehicle',
    'M': 'Other',
    'N': np.nan,
    'O': 'Runaway Vehicle',
    'P': 'Inattention, Cell Phone',
    'Q': 'Inattention, Electronic Equip.',
    'R': 'Inattention, Radio/CD',
    'S': 'Inattention, Smoking',
    'T': 'Inattention, Eating',
    'U': 'Inattention, Children',
    'V': 'Inattention, Animal',
    'W': 'Inattention, Personal Hygiene',
    'X': 'Inattention, Reading',
    'Y': 'Inattention, Other',
}

# Assign the result back instead of calling replace(inplace=True) on the
# column accessor: the in-place form acts on a temporary Series and is not
# guaranteed to update the DataFrame (it has no effect under Copy-on-Write).
for factor_column in ["other_associate_factor_1", "other_associate_factor_2"]:
    parties[factor_column] = parties[factor_column].replace(dic_assoc_factor)
parties["other_associate_factor_1"].value_counts(dropna=False)

NaN                               5889554
Violation                          551353
Inattention                        279826
Stop and Go Traffic                241815
Entering/Leaving Ramp               68380
Other                               66173
Vision Obscurements                 53022
Previous Collision                  43092
Uninvolved Vehicle                  36958
Defective Vehicle Equipment         19303
Runaway Vehicle                     15983
Unfamiliar With Road                13707
Inattention, Other                   4927
Inattention, Cell Phone               820
Inattention, Radio/CD                 697
Inattention, Children                 310
Inattention, Eating                   241
Inattention, Reading                  162
Inattention, Smoking                  102
Inattention, Electronic Equip.         80
Inattention, Animal                    74
Inattention, Personal Hygiene          27
Name: other_associate_factor_1, dtype: int64

In [47]:
# Party safety equipment uses the same code -> label mapping as the victims
# columns (dic_victim_safety).
# Assign the result back instead of calling replace(inplace=True) on the
# column accessor: the in-place form acts on a temporary Series and is not
# guaranteed to update the DataFrame (it has no effect under Copy-on-Write).
for equipment_column in ["party_safety_equipment_1", "party_safety_equipment_2"]:
    parties[equipment_column] = parties[equipment_column].replace(dic_victim_safety)
parties["party_safety_equipment_1"].value_counts(dropna=False)

Lap/Shoulder Harness Used                   2858794
Air Bag Not Deployed                        1781891
NaN                                         1577720
Air Bag Deployed                             459608
Not Required                                 318302
Other                                         73696
Lap Belt Used                                 47235
None in Vehicle                               37160
Driver, Motorcycle Helmet Used                36938
Lap/Shoulder Harness Not Used                 32227
Shoulder Harness Used                         20840
Passenger, Motorcycle Helmet Used             15295
Driver, Motorcycle Helmet Not Used             7947
Lap Belt Not Used                              5788
Shoulder Harness Not Used                      4876
Passive Restraint Used                         3235
No Child Restraint in Vehicle                  2370
Passive Restraint Not Used                      746
Child Restraint in Vehicle, Use Unknown         608
Child Restra

# SAVE CLEANED DATA

In [48]:
save_folder = "data_cleaned/"

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (no error if it is already there).
os.makedirs(save_folder, exist_ok=True)

# Write the three cleaned tables without the pandas index column.
parties.to_csv(save_folder+"parties_cleaned.csv", index=False)
collisions.to_csv(save_folder+"collisions_cleaned.csv", index=False)
victims.to_csv(save_folder+"victims_cleaned.csv", index=False)