## Imports

In [1]:
import pandas as pd
import numpy as np

# Notebook display limits: show up to 5000 rows and 2000 sequence items
# before pandas truncates the rendered output.
pd.set_option("display.max_rows", 5000)
pd.set_option("display.max_seq_items", 2000)

## Reading in the datasets

#### Faults dataset

In [2]:
# Load the J1939 fault records. The two timestamp columns are selected by name
# (they sit at positions 2 and 19 in the file) so the cell keeps working if the
# column order of the CSV ever changes.
# NOTE(review): `infer_datetime_format` is deprecated in pandas >= 2.0, where
# consistent-format inference is the default; drop the argument once the
# environment is upgraded.
faults = pd.read_csv('../data/J1939Faults.csv',
                     low_memory=False,
                     parse_dates=['EventTimeStamp', 'LocationTimeStamp'],
                     infer_datetime_format=True)

In [3]:
# Preview the first five fault records.
faults.head(n=5)

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,actionDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,faultValue,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp
0,1,990349,2015-02-21 10:47:13,Low (Severity Low) Engine Coolant Level,,unknown,unknown,unknown,unknown,0,111,17,True,2,,1439,105354361,38.857638,-84.626851,2015-02-21 11:34:25
1,2,990360,2015-02-21 11:34:34,,,unknown,unknown,unknown,unknown,11,629,12,True,127,,1439,105354361,38.857638,-84.626851,2015-02-21 11:35:10
2,3,990364,2015-02-21 11:35:31,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,1807,2,False,127,,1369,105336226,41.42125,-87.767361,2015-02-21 11:35:26
3,4,990370,2015-02-21 11:35:33,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,1807,2,True,127,,1369,105336226,41.421018,-87.767361,2015-02-21 11:36:08
4,5,990416,2015-02-21 11:39:41,,,22281684P01*22357957P01*22362082P01*,13063430,0USA13_13_0415_2238A,VOLVO,0,4364,17,False,2,,1674,105427130,38.416481,-89.442638,2015-02-21 11:39:37


#### Diagnostics dataset

In [4]:
# Load the onboard diagnostic readings (columns: Id, Name, Value, FaultId).
diagnostic = pd.read_csv(filepath_or_buffer='../data/VehicleDiagnosticOnboardData.csv')

In [5]:
# Preview the first five diagnostic readings (still in long format).
diagnostic.head(n=5)

Unnamed: 0,Id,Name,Value,FaultId
0,1,IgnStatus,False,1
1,2,EngineOilPressure,0,1
2,3,EngineOilTemperature,96.74375,1
3,4,TurboBoostPressure,0,1
4,5,EngineLoad,11,1


This data is in long format (one row per reading) and needs to be flattened. We'll pivot it to one row per FaultId so it is ready to join to the faults data.

In [6]:
# Reshape from long to wide: one row per FaultId, one column per reading Name.
diagnostic = diagnostic.pivot(index='FaultId', columns='Name', values='Value')
# Turn FaultId back into a regular column so it can be used as a merge key.
diagnostic = diagnostic.reset_index()

In [7]:
# Preview the first five rows of the pivoted (wide) diagnostic data.
diagnostic.head(n=5)

Name,FaultId,AcceleratorPedal,BarometricPressure,CruiseControlActive,CruiseControlSetSpeed,DistanceLtd,EngineCoolantTemperature,EngineLoad,EngineOilPressure,EngineOilTemperature,...,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,ParkingBrake,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure
0,1,0.0,14.21,False,66.48672,423178.7,100.4,11.0,0.0,96.74375,...,,False,78.8,1023,True,,0.0,3276.75,,0.0
1,2,,,,,,,,,,...,,True,,1279,,,,,,
2,3,,,,,,,,,,...,,,,1279,,,,,,
3,4,,,,,,,,,,...,,True,,1279,,,,,,
4,5,,,,,,,,,,...,,,,16639,,,,,,


## Joining the two datasets

In [8]:
# Attach the wide diagnostic readings to each fault record (RecordID matches FaultId).
# how='inner' is the pandas default, spelled out here: only faults that have a
# matching diagnostic row are kept.
faults_diagnostic = pd.merge(
    faults,
    diagnostic,
    how='inner',
    left_on='RecordID',
    right_on='FaultId',
)

In [9]:
# Preview the first five rows of the joined data.
faults_diagnostic.head(n=5)

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,actionDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,...,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,ParkingBrake,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure
0,1,990349,2015-02-21 10:47:13,Low (Severity Low) Engine Coolant Level,,unknown,unknown,unknown,unknown,0,...,,False,78.8,1023,True,,0.0,3276.75,,0.0
1,2,990360,2015-02-21 11:34:34,,,unknown,unknown,unknown,unknown,11,...,,True,,1279,,,,,,
2,3,990364,2015-02-21 11:35:31,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,...,,,,1279,,,,,,
3,4,990370,2015-02-21 11:35:33,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,...,,True,,1279,,,,,,
4,5,990416,2015-02-21 11:39:41,,,22281684P01*22357957P01*22362082P01*,13063430,0USA13_13_0415_2238A,VOLVO,0,...,,,,16639,,,,,,


## Cleaning

#### Column names

In [10]:
# My team opted to use snake case for column names.
# Original column name -> snake_case name, grouped by source dataset.
snake_case_names = {
    # Columns from the faults dataset
    "RecordID": "record_id",
    "ESS_Id": "ess_id",
    "EventTimeStamp": "event_timestamp",
    "eventDescription": "event_description",
    "actionDescription": "action_description",
    "ecuSoftwareVersion": "ecu_software_version",
    "ecuSerialNumber": "ecu_serial_number",
    "ecuModel": "ecu_model",
    "ecuMake": "ecu_make",
    "ecuSource": "ecu_source",
    "activeTransitionCount": "active_transition_count",
    "faultValue": "fault_value",
    "EquipmentID": "equipment_id",
    "MCTNumber": "mct_number",
    "Latitude": "latitude",
    "Longitude": "longitude",
    "LocationTimeStamp": "location_timestamp",
    # Columns from the pivoted diagnostics dataset
    "FaultId": "fault_id",
    "AcceleratorPedal": "accelerator_pedal",
    "BarometricPressure": "barometric_pressure",
    "CruiseControlActive": "cruise_control_active",
    "CruiseControlSetSpeed": "cruise_control_set_speed",
    "DistanceLtd": "distance_ltd",
    "EngineCoolantTemperature": "engine_coolant_temperature",
    "EngineLoad": "engine_load",
    "EngineOilPressure": "engine_oil_pressure",
    "EngineOilTemperature": "engine_oil_temperature",
    "EngineRpm": "engine_rpm",
    "EngineTimeLtd": "engine_time_ltd",
    "FuelLevel": "fuel_level",
    "FuelLtd": "fuel_ltd",
    "FuelRate": "fuel_rate",
    "FuelTemperature": "fuel_temperature",
    "IgnStatus": "ign_status",
    "IntakeManifoldTemperature": "intake_manifold_temperature",
    "LampStatus": "lamp_status",
    "ParkingBrake": "parking_brake",
    "ServiceDistance": "service_distance",
    "Speed": "speed",
    "SwitchedBatteryVoltage": "switched_battery_voltage",
    "Throttle": "throttle",
    "TurboBoostPressure": "turbo_boost_pressure",
}

faults_diagnostic = faults_diagnostic.rename(columns=snake_case_names)

In [11]:
# 'action_description' and 'fault_value' contain only null values, so both are dropped.
all_null_columns = ['action_description', 'fault_value']
faults_diagnostic = faults_diagnostic.drop(all_null_columns, axis='columns')

In [12]:
# Summarize column dtypes and non-null counts after the rename and drop.
faults_diagnostic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1187335 entries, 0 to 1187334
Data columns (total 43 columns):
 #   Column                       Non-Null Count    Dtype         
---  ------                       --------------    -----         
 0   record_id                    1187335 non-null  int64         
 1   ess_id                       1187335 non-null  int64         
 2   event_timestamp              1187335 non-null  datetime64[ns]
 3   event_description            1126490 non-null  object        
 4   ecu_software_version         891285 non-null   object        
 5   ecu_serial_number            844318 non-null   object        
 6   ecu_model                    1122577 non-null  object        
 7   ecu_make                     1122577 non-null  object        
 8   ecu_source                   1187335 non-null  int64         
 9   spn                          1187335 non-null  int64         
 10  fmi                          1187335 non-null  int64         
 11  active     

#### Column data types

In [13]:
# Preview the first five rows with the new snake_case column names.
faults_diagnostic.head(n=5)

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,...,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
0,1,990349,2015-02-21 10:47:13,Low (Severity Low) Engine Coolant Level,unknown,unknown,unknown,unknown,0,111,...,,False,78.8,1023,True,,0.0,3276.75,,0.0
1,2,990360,2015-02-21 11:34:34,,unknown,unknown,unknown,unknown,11,629,...,,True,,1279,,,,,,
2,3,990364,2015-02-21 11:35:31,Incorrect Data Steering Wheel Angle,unknown,unknown,unknown,unknown,11,1807,...,,,,1279,,,,,,
3,4,990370,2015-02-21 11:35:33,Incorrect Data Steering Wheel Angle,unknown,unknown,unknown,unknown,11,1807,...,,True,,1279,,,,,,
4,5,990416,2015-02-21 11:39:41,,22281684P01*22357957P01*22362082P01*,13063430,0USA13_13_0415_2238A,VOLVO,0,4364,...,,,,16639,,,,,,


##### Software Versions

In [14]:
# Rows whose software version contains a literal '?' character
# (regex=False: plain substring match; na=False: missing versions do not match).
has_question_mark = faults_diagnostic['ecu_software_version'].str.contains("?", regex=False, na=False)
faults_diagnostic[has_question_mark]

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,...,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
20828,21711,1878624,2015-04-14 11:38:36,,????_1284P4C_2*,________Y043718,MX,PCAR,0,5396,...,,,,17407,,,,,,
20829,21712,1878625,2015-04-14 11:38:36,,????_1284P4C_2*,________Y043718,MX,PCAR,0,5444,...,,,,17407,,,,,,
23937,24820,1933193,2015-04-16 19:49:37,,????1684P01*22357957P01*22362082P01*,13061463,0USA13_13_0415_2238A,VOLVO,0,4811,...,,True,122.0,255,,,4.361734,,17.6,1.74
23938,24821,1933194,2015-04-16 19:49:37,Incorrect Data Engine Oil Pressure,????1684P01*22357957P01*22362082P01*,13061463,0USA13_13_0415_2238A,VOLVO,0,100,...,,True,122.0,255,,,4.361734,,17.6,1.74
40226,41338,2216854,2015-05-02 14:45:37,Not Reporting Data Engine Variable Geometry Tu...,????1684P01*22357957P01*22362082P01*,13063430,0USA13_13_0415_2238A,VOLVO,0,641,...,,True,168.8,255,,,0.0,,0.0,0.0
40483,41595,2221486,2015-05-03 09:47:01,Low (Severity High) Particulate Matter Trap Mo...,????8181P01*22548975P01*22549033P01*,13063847,0USA13_13_0415_2238A,VOLVO,0,3064,...,,False,,255,,,,,,
81635,84021,2897660,2015-06-11 07:43:50,Incorrect Data Particulate Trap Outlet Pressure 1,????7106*04047537*092613211021*09300006*G1*BDR*,79723629,6X1u13D1500000000,CMMNS,0,3610,...,,True,,17407,,,,,,
90267,92653,3037645,2015-06-19 06:36:05,Low (Severity Low) Engine Coolant Level,????7106*04075952*092613211021*09300006*G1*BDR*,79731577,6X1u13D1500000000,CMMNS,0,111,...,32.0,True,89.6,1023,True,,0.0,3276.75,0.0,0.29
90273,92659,3037705,2015-06-19 06:39:59,Low (Severity Low) Engine Coolant Level,????7106*04075952*092613211021*09300006*G1*BDR*,79731577,6X1u13D1500000000,CMMNS,0,111,...,,,,1023,,,,,,
91224,93610,3054166,2015-06-20 01:31:30,Low (Severity Low) Engine Coolant Level,????0170*03015749*051914190353*09400015*G1*BDR*,79642446,6X1u13D1500000000,CMMNS,0,111,...,32.0,True,71.6,1023,True,,0.0,3276.75,0.0,0.0


##### ECU Serial Number

In [15]:
# Frequency of each ECU serial number, most common first.
faults_diagnostic.loc[:, 'ecu_serial_number'].value_counts()

unknown            298549
6U13D13             11207
79845785            10302
79856768             8158
79845329             7199
79623056             7066
79621048             6828
79845786             6349
79840984             6345
79844876             5872
79623054             5663
79623410             5578
79844877             5506
79620769             5482
79844882             5417
79614871             5326
79615187             5218
79857688             5059
79857689             5019
79614865             4374
79619434             4206
79620768             4133
79615184             3857
79845331             3819
79619117             3797
79614866             3788
S381222841           3176
79615183             3169
79845327             3149
79844880             3113
79857687             3089
79857685             3065
00000000             3062
79620774             3055
79618850             3041
79619125             2855
79607068             2798
79620764             2798
79623055    

There are 'unknown' (298549), 'Unspecified' (97), and 'NoSerial' (3) values. Let's combine them all into 'unknown'.

In [16]:
# Fold the alternative "no serial" spellings into the existing 'unknown' label.
faults_diagnostic['ecu_serial_number'] = faults_diagnostic['ecu_serial_number'].replace(
    ['Unspecified', 'NoSerial'], 'unknown'
)

##### ECU Model

In [17]:
# Distinct ECU model values in sorted order (missing values end up last).
(
    faults_diagnostic.loc[:, 'ecu_model']
    .sort_values()
    .unique()
)

array(['0USA10_13_0405_2237A', '0USA13_13_0415_2238A', '202.35.0',
       '20412511P07', '6L u13D0890000000', '6U13D13', '6X1u10D1500000000',
       '6X1u13D1500000000', '6X1u17D1500000000', '6X1u20D1500000000',
       'CE', 'CECU3-NAMUX3', 'CECU3B-NAMUX4', 'E0031', 'EC60-adv',
       'EC80ESP', 'EC80ESP AM000036', 'EC80ESP AM000038', 'EC80ESP+',
       'EEO-xxF112C', 'FAOM-xx810S-EC3', 'Gen 4 Boot Loader', 'MX',
       'MX16U13D13', 'MX16U15D13', 'Y044053', 'Y049568',
       '________Y043718', 'unknown', nan], dtype=object)

Nothing to update for models

##### ECU Make

In [18]:
# Frequency of each ECU make, most common first.
faults_diagnostic.loc[:, 'ecu_make'].value_counts()

CMMNS              433403
unknown            298549
PACCR              277021
BNDWS               71001
PCAR                20229
EATON               12612
VOLVO                7252
?????                 755
????S                 627
????R                 589
?MMNS                 289
?CAR                  152
?ACCR                  39
???CR                  20
?NDWS                  15
?????MX16U13D13         9
?????MX                 6
?ATON                   3
??MNS                   3
5516014                 1
???R                    1
??DWS                   1
Name: ecu_make, dtype: int64

In [19]:
# Repair ecu_make values whose leading characters were replaced by '?'.
# Most garbled values are mapped to the known make with the same length and
# suffix (e.g. '?MMNS' -> 'CMMNS'); a fully garbled '?????' becomes 'unknown'.
# A single mapping passed to `replace` does the same job as one reassignment
# per value: matching is on exact values, and no replacement is itself a key.
#
# '????S' (could be CMMNS or BNDWS) and '5516014' have no entry and are left
# untouched.
# NOTE(review): '?????MX16U13D13' and '?????MX' are mapped to what look like
# ecu_model values ('MX16U13D13', 'MX') rather than a make - confirm intended.
ecu_make_repairs = {
    '?????': 'unknown',
    '????R': 'PACCR',
    '?MMNS': 'CMMNS',
    '?CAR': 'PCAR',
    '?ACCR': 'PACCR',
    '???CR': 'PACCR',
    '?NDWS': 'BNDWS',
    '?????MX16U13D13': 'MX16U13D13',
    '?????MX': 'MX',
    '?ATON': 'EATON',
    '??MNS': 'CMMNS',
    '???R': 'PCAR',
    '??DWS': 'BNDWS',
}
faults_diagnostic['ecu_make'] = faults_diagnostic['ecu_make'].replace(ecu_make_repairs)

##### ECU Source

In [21]:
# Frequency of each ECU source code.
faults_diagnostic.loc[:, 'ecu_source'].value_counts()

0     528044
49    514059
11    131122
3      13484
61       626
Name: ecu_source, dtype: int64

Nothing to clean

##### SPN

In [25]:
# Frequency of each SPN value.
faults_diagnostic.loc[:, 'spn'].value_counts()

111       365489
929       256541
96         90041
829        87788
639        41062
97         26745
596        22571
50353      11773
1569       10927
1761        9981
2863        8233
789         8022
629         8012
1068        7994
791         7799
1231        6992
1067        6947
641         6292
91          6210
3226        5547
3216        5420
37          5183
792         5164
412         5044
790         5011
807         4945
886         4190
1807        4081
627         4043
802         4036
171         4036
611         4026
1059        3952
51923       3950
793         3906
3251        3905
3464        3382
4364        2618
102         2578
630         2460
157         2426
70          2268
5396        2223
1483        2178
5848        2100
3610        2021
1045        1994
907         1962
4096        1877
5444        1800
1209        1736
523531      1718
2623        1700
101         1686
110         1652
3031        1650
1808        1636
934         1613
248         16

Nothing to clean

##### FMI

In [26]:
# Frequency of each FMI value.
faults_diagnostic.loc[:, 'fmi'].value_counts()

17    326553
9     288893
3     188631
2      82334
18     53602
31     39881
4      39734
0      27577
7      23249
1      20887
15     20739
14     12699
12     12233
5       9683
16      9242
10      7437
8       5741
19      5733
20      4440
11      3847
13      2634
6       1027
23       396
21       131
29         8
22         4
Name: fmi, dtype: int64

Nothing to clean

##### Active

In [27]:
# Share of records with active == True versus False.
faults_diagnostic.loc[:, 'active'].value_counts(normalize=True)

True     0.512454
False    0.487546
Name: active, dtype: float64

Nothing to clean

##### Active Count

In [31]:
# Frequency of each active_transition_count value.
faults_diagnostic.loc[:, 'active_transition_count'].value_counts()

126    581874
1      305476
127    102512
2       45075
3       13019
4        9658
6        6880
5        6777
0        5781
7        4782
8        4088
9        3750
11       3387
10       3326
12       2713
13       2603
14       2420
15       2358
16       2158
17       2064
18       1914
20       1755
19       1753
21       1590
22       1482
23       1475
24       1459
25       1393
26       1391
28       1277
27       1276
29       1230
30       1164
33       1132
32       1128
31       1074
34       1039
36       1000
39        973
40        972
37        970
35        968
38        965
44        915
43        908
42        906
41        896
45        837
48        832
46        813
47        811
50        800
49        798
55        762
52        747
54        738
51        731
53        728
59        723
58        720
56        689
57        687
63        676
61        671
64        655
70        648
62        644
60        624
66        623
72        618
67        618
68    

Nothing to clean

##### Equipment ID

In [32]:
# Number of fault records per equipment_id.
faults_diagnostic.loc[:, 'equipment_id'].value_counts()

1641          17492
1605          16393
1646          15462
1618          14986
1606          14973
1619          14832
1625          14783
1645          14268
1644          13920
1649          13557
1610          13489
1630          13149
1647          13117
1634          13003
1609          12902
1642          12653
1623          11834
1611          11804
1814          10361
1692          10301
1612           9554
1816           8164
1809           8026
1939           7966
1622           7918
1806           7771
1601           7437
1603           7141
1469           7116
1592           7085
1749           7000
1808           6856
1815           6474
1803           6390
1437           6133
309            5991
305            5738
1600           5684
1804           5670
1589           5532
1562           5483
1620           5470
1995           5365
1559           5358
1818           5135
1820           5080
1616           5015
1810           4883
1422           4736
1873           4584


**TODO:** Per the instructions, we'll need to remove any row whose equipment_id is longer than 5 characters. The cell below only displays those rows; they still need to be dropped (look up how to drop rows that meet a condition).

In [34]:
# Display (without removing) the rows whose equipment_id is longer than five characters.
id_too_long = faults_diagnostic['equipment_id'].str.len().gt(5)
faults_diagnostic[id_too_long]

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,...,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
7069,7070,1157485,2015-03-03 09:18:42,,unknown,unknown,unknown,unknown,11,0,...,,True,,255,,,,,,
7070,7071,1157507,2015-03-03 09:19:43,Abnormal Update Rate Tire Location,unknown,unknown,unknown,unknown,49,929,...,32.0,True,73.4,1279,True,,0.0,3276.75,0.0,0.0
59121,61191,2545755,2015-05-21 08:16:17,,04993120*00027785*040213150018*07700044*I0*BBZ*,79464671,6X1u10D1500000000,CMMNS,0,4364,...,,True,114.8,1023,,,67.18576,3276.75,,11.6
59587,61657,2553312,2015-05-21 13:27:44,,04993120*00027785*040213150018*07700044*I0*BBZ*,79464671,6X1u10D1500000000,CMMNS,0,4364,...,,True,,17407,,,,,,
60728,62798,2570947,2015-05-22 10:44:37,High (Severity Medium) Aftertreatment 1 Intake...,04993120*00027785*040213150018*07700044*I0*BBZ*,79464671,6X1u10D1500000000,CMMNS,0,3216,...,,True,114.8,17407,,,50.70971,3276.75,,23.49
60911,62981,2574542,2015-05-22 13:21:38,,04993120*00027785*040213150018*07700044*I0*BBZ*,79464671,6X1u10D1500000000,CMMNS,0,5394,...,,True,111.2,17407,,,63.28277,3276.75,,11.6
62903,64973,2598094,2015-05-25 07:04:39,High (Severity High) Engine Injector Metering ...,04993120*00027785*040213150018*07700044*I0*BBZ*,79464671,6X1u10D1500000000,CMMNS,0,157,...,,True,84.2,17407,,,6.883628,3276.75,,4.93
63582,65652,2606983,2015-05-26 04:51:45,High (Severity High) Engine Injector Metering ...,04993120*00027785*040213150018*07700044*I0*BBZ*,79464671,6X1u10D1500000000,CMMNS,0,157,...,,True,82.4,17407,,,45.78729,3276.75,,1.74
63900,65970,2611699,2015-05-26 09:11:32,Condition Exists Engine Protection Torque Derate,04993120*00027785*040213150018*07700044*I0*BBZ*,79464671,6X1u10D1500000000,CMMNS,0,1569,...,,True,122.0,18431,,,66.91391,3276.75,,3.19
64069,66139,2614629,2015-05-26 10:56:57,Condition Exists Engine Protection Torque Derate,04993120*00027785*040213150018*07700044*I0*BBZ*,79464671,6X1u10D1500000000,CMMNS,0,1569,...,,True,104.0,18431,,,41.91343,3276.75,,0.87


##### MCT Number

In [37]:
# Number of fault records per mct_number.
faults_diagnostic.loc[:, 'mct_number'].value_counts()

105415080    16503
105420184    15507
105381514    15330
105415457    15119
105416377    15034
105417985    13590
105442858    13187
105420195    13141
105415391    13079
105338710    11834
105420590    11038
105357098    10719
105369518    10430
105420520     9939
105320363     9580
105430420     8590
105420005     8532
105465351     8361
105420580     8111
105434215     8096
105351093     8064
105330008     8019
105304730     8018
105438415     8014
105415630     7889
105364937     7479
105357794     7265
105411041     7190
105357909     6937
105400037     6929
105438416     6879
105355660     6724
105333857     6639
105460307     6529
105432778     6421
105356370     6406
105442799     6200
105435663     6040
105334108     5978
105392933     5872
105410475     5784
105356577     5734
105406655     5728
105338555     5726
105362919     5675
105460271     5670
105410840     5610
105412415     5316
105349500     5165
105465571     5080
105465373     5054
105411109     5009
105427242   

Nothing to clean

##### Latitude and Longitude

In [49]:
# Rows reporting a latitude of exactly 0.
faults_diagnostic.loc[faults_diagnostic['latitude'].eq(0)]

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,...,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
891610,915675,32752046,2017-11-21 11:28:06,Low (Severity Medium) Engine Coolant Level,04384413*22057890*031617122339*60701702*G1*BGT*,80012214,6X1u17D1500000000,CMMNS,0,111,...,,True,66.2,2047,True,,0.0,13.0,100.0,0.0
891672,915737,32763318,2017-11-21 13:10:27,Low (Severity Medium) Engine Coolant Level,04384413*22057890*031617122339*60701702*G1*BGT*,80012214,6X1u17D1500000000,CMMNS,0,111,...,,,,1023,,,,,,
1150008,1207232,106294736,2019-06-24 09:10:10,Low Voltage (Sensor supply voltage 2),unknown,unknown,unknown,unknown,0,3510,...,,True,107.6,17407,False,,0.3301035,,100.0,0.0
1150009,1207233,106294737,2019-06-24 09:10:09,Low Voltage (Engine Exhaust Gas Recirculation ...,unknown,unknown,unknown,unknown,0,27,...,,True,105.8,17407,False,,0.01941785,,100.0,0.0
1150010,1207234,106294738,2019-06-24 09:10:11,,,,,,11,1807,...,,True,107.6,63487,False,,0.9417657,,100.0,0.0
1150011,1207235,106294739,2019-06-24 09:10:12,Low Current Aftertreatment Fuel Injector 1,unknown,unknown,unknown,unknown,0,3556,...,,True,107.6,17407,False,,3.893279,,100.0,0.0
1172275,1231439,116418325,2020-01-09 20:50:32,High Voltage (Fuel Level),,,CECU3B-NAMUX4,PACCR,49,96,...,,True,84.2,1279,True,,0.0,,100.0,0.29
1172276,1231440,116418326,2020-01-09 20:50:32,High Voltage (Left Fuel Level Sensor),,,CECU3B-NAMUX4,PACCR,49,829,...,,True,84.2,1279,True,,0.0,,100.0,0.29


8 rows contain a latitude of 0.0

In [52]:
# Rows reporting a longitude of exactly 0.
faults_diagnostic.loc[faults_diagnostic['longitude'].eq(0)]

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,...,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
891610,915675,32752046,2017-11-21 11:28:06,Low (Severity Medium) Engine Coolant Level,04384413*22057890*031617122339*60701702*G1*BGT*,80012214,6X1u17D1500000000,CMMNS,0,111,...,,True,66.2,2047,True,,0.0,13.0,100.0,0.0
891672,915737,32763318,2017-11-21 13:10:27,Low (Severity Medium) Engine Coolant Level,04384413*22057890*031617122339*60701702*G1*BGT*,80012214,6X1u17D1500000000,CMMNS,0,111,...,,,,1023,,,,,,
1150008,1207232,106294736,2019-06-24 09:10:10,Low Voltage (Sensor supply voltage 2),unknown,unknown,unknown,unknown,0,3510,...,,True,107.6,17407,False,,0.3301035,,100.0,0.0
1150009,1207233,106294737,2019-06-24 09:10:09,Low Voltage (Engine Exhaust Gas Recirculation ...,unknown,unknown,unknown,unknown,0,27,...,,True,105.8,17407,False,,0.01941785,,100.0,0.0
1150010,1207234,106294738,2019-06-24 09:10:11,,,,,,11,1807,...,,True,107.6,63487,False,,0.9417657,,100.0,0.0
1150011,1207235,106294739,2019-06-24 09:10:12,Low Current Aftertreatment Fuel Injector 1,unknown,unknown,unknown,unknown,0,3556,...,,True,107.6,17407,False,,3.893279,,100.0,0.0
1172275,1231439,116418325,2020-01-09 20:50:32,High Voltage (Fuel Level),,,CECU3B-NAMUX4,PACCR,49,96,...,,True,84.2,1279,True,,0.0,,100.0,0.29
1172276,1231440,116418326,2020-01-09 20:50:32,High Voltage (Left Fuel Level Sensor),,,CECU3B-NAMUX4,PACCR,49,829,...,,True,84.2,1279,True,,0.0,,100.0,0.29


The same 8 rows contain a longitude of 0.0

**TODO:** Potentially remove the records at (0.0, 0.0) latitude/longitude. We also need to filter out service stations using Patti's code.

##### Location Timestamp

In [62]:
# Number of records per calendar year of location_timestamp.
location_year = faults_diagnostic['location_timestamp'].dt.year
location_year.value_counts()

2016    332403
2015    325826
2017    254923
2018    144578
2019    112204
2020     17198
2000       192
2011         8
1969         2
2014         1
Name: location_timestamp, dtype: int64

We'll need to investigate the records dated before 2015 (the years 1969, 2000, 2011 and 2014).

In [63]:
# Records whose location_timestamp falls in 1969.
faults_diagnostic.loc[faults_diagnostic['location_timestamp'].dt.year.eq(1969)]

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,...,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
891610,915675,32752046,2017-11-21 11:28:06,Low (Severity Medium) Engine Coolant Level,04384413*22057890*031617122339*60701702*G1*BGT*,80012214,6X1u17D1500000000,CMMNS,0,111,...,,True,66.2,2047,True,,0.0,13.0,100.0,0.0
891672,915737,32763318,2017-11-21 13:10:27,Low (Severity Medium) Engine Coolant Level,04384413*22057890*031617122339*60701702*G1*BGT*,80012214,6X1u17D1500000000,CMMNS,0,111,...,,,,1023,,,,,,


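Is there a way to compare location_timestamp to event_timestamp (for example, by subtracting one column from the other)? More importantly, is this worth doing? Note that both 1969 records are also among the rows with (0.0, 0.0) latitude/longitude above, so their location data is probably missing altogether.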

In [64]:
# Records whose location_timestamp falls in 2000.
faults_diagnostic.loc[faults_diagnostic['location_timestamp'].dt.year.eq(2000)]

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,...,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
1154193,1211417,108604425,2000-03-18 19:14:10,High Voltage (Left Fuel Level Sensor),,,CECU3B-NAMUX4,PACCR,49,829,...,,True,127.4,1279,False,,0.0,,100.0,0.58
1154194,1211418,108604426,2000-03-18 19:14:10,High Voltage (Fuel Level),,,CECU3B-NAMUX4,PACCR,49,96,...,,True,127.4,1279,False,,0.0,,100.0,0.58
1154195,1211419,108604487,2000-03-18 19:20:47,High Voltage (Fuel Level),,,CECU3B-NAMUX4,PACCR,49,96,...,,,,255,,,,,,
1154196,1211420,108604488,2000-03-18 19:20:47,High Voltage (Left Fuel Level Sensor),,,CECU3B-NAMUX4,PACCR,49,829,...,,,,255,,,,,,
1154198,1211422,108608408,2000-03-19 02:59:58,Not Reporting Data Wheel Sensor ABS Axle 2 Right,AAAI000032*AAAM000038*BB41275 *A82J140721A_9...,5W26153559,EC80ESP,BNDWS,11,792,...,,,,1279,,,,,,
1154199,1211423,108608954,2000-03-19 03:58:23,Not Reporting Data Wheel Sensor ABS Axle 2 Right,AAAI000032*AAAM000038*BB41275 *A82J140721A_9...,5W26153559,EC80ESP,BNDWS,11,792,...,32.0,True,82.4,1279,,,46.64167,,100.0,14.5
1154208,1211432,108615304,2000-03-19 07:32:53,Low (Severity Medium) Transmission Air Tank Pr...,5516014*202.35.0*5516502*E003.e003*5539478*25....,Z0047881,EEO-xxF112C,EATON,3,37,...,,True,118.4,50175,False,,0.1456339,,100.0,0.87
1154209,1211433,108615321,2000-03-19 07:34:31,Low (Severity Medium) Transmission Air Tank Pr...,5516014*202.35.0*5516502*E003.e003*5539478*25....,Z0047881,EEO-xxF112C,EATON,3,37,...,,,,50175,,,,,,
1154211,1211435,108620729,2000-03-19 08:40:03,High Voltage (Left Fuel Level Sensor),,,CECU3B-NAMUX4,PACCR,49,829,...,,True,116.6,1279,True,,0.0,,100.0,0.0
1154212,1211436,108620730,2000-03-19 08:40:03,High Voltage (Fuel Level),,,CECU3B-NAMUX4,PACCR,49,96,...,,True,116.6,1279,True,,0.0,,100.0,0.0


Interesting...the 2000 records appear to have the event_timestamps and location_timestamps in the same year

In [65]:
# Records whose location_timestamp falls in 2011.
faults_diagnostic.loc[faults_diagnostic['location_timestamp'].dt.year.eq(2011)]

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,...,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
1154812,1212036,108848937,2011-01-02 03:17:37,Low (Severity Low) Catalyst Tank Level,04384413*22059242*090617144354*60701715*G1*BGT*,80015667,6X1u17D1500000000,CMMNS,0,1761,...,,True,127.4,1023,True,,0.0,,100.0,0.29
1155217,1212441,109079376,2011-01-01 05:10:23,Special Instructions Transmission Clutch Actuator,5516018*202.56.0*5516502*E003.e003*5539540*25....,Z0018064,EEO-xxF112C,EATON,3,788,...,,,,50175,,,,,,
1155218,1212442,109079377,2011-01-01 05:09:43,Low (Severity Medium) Transmission Air Tank Pr...,5516018*202.56.0*5516502*E003.e003*5539540*25....,Z0018064,EEO-xxF112C,EATON,3,37,...,,True,105.8,50175,True,,0.0,,100.0,0.29
1155219,1212443,109079378,2011-01-01 05:11:26,Low (Severity Medium) Transmission Air Tank Pr...,5516018*202.56.0*5516502*E003.e003*5539540*25....,Z0018064,EEO-xxF112C,EATON,3,37,...,,,,50175,,,,,,
1155230,1212454,109079376,2011-01-01 05:10:23,Special Instructions Transmission Clutch Actuator,5516018*202.56.0*5516502*E003.e003*5539540*25....,Z0018064,EEO-xxF112C,EATON,3,788,...,,,,50175,,,,,,
1155231,1212455,109079377,2011-01-01 05:09:43,Low (Severity Medium) Transmission Air Tank Pr...,5516018*202.56.0*5516502*E003.e003*5539540*25....,Z0018064,EEO-xxF112C,EATON,3,37,...,,True,105.8,50175,True,,0.0,,100.0,0.29
1155232,1212456,109079378,2011-01-01 05:11:26,Low (Severity Medium) Transmission Air Tank Pr...,5516018*202.56.0*5516502*E003.e003*5539540*25....,Z0018064,EEO-xxF112C,EATON,3,37,...,,,,50175,,,,,,
1171320,1230484,115919584,2011-01-01 00:10:32,Low (Severity High) Transmission Air Tank Pres...,5516014*202.35.0*5516502*E003.e003*5539478*25....,Z0047881,EEO-xxF112C,EATON,3,37,...,,True,125.6,50175,False,,3.116565,,100.0,2.03


In [66]:
# Records whose location_timestamp falls in 2014.
faults_diagnostic.loc[faults_diagnostic['location_timestamp'].dt.year.eq(2014)]

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,...,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
27724,28607,1994354,2015-04-21 03:31:18,Abnormal Rate of Change Aftertreatment 1 Outle...,unknown,unknown,unknown,unknown,0,3226,...,,True,,17407,,,,,,


Given the discrepancies, it may make sense to use only one timestamp column

##### Fault ID

'fault_id' was used to join the datasets and is redundant with 'record_id'. We'll drop it now

In [69]:
# fault_id was only needed as the merge key; remove it from the joined frame.
faults_diagnostic = faults_diagnostic.drop('fault_id', axis='columns')

##### Accelerator Pedal

In [73]:
# Frequency of each accelerator_pedal reading.
faults_diagnostic.loc[:, 'accelerator_pedal'].value_counts()

0           277162
100          54097
99.6          2845
0.4           1945
27.2          1911
27.6          1867
28.4          1863
28            1824
28.8          1816
25.2          1799
26.8          1798
26.4          1795
29.6          1789
25.6          1773
26            1757
24.4          1724
30.8          1717
29.2          1717
31.6          1688
30.4          1680
32            1667
32.8          1652
22.8          1641
31.2          1632
23.6          1618
33.2          1617
33.6          1603
23.2          1596
24            1589
30            1581
32.4          1581
24.8          1571
34            1518
22            1505
21.6          1502
0.8           1499
34.4          1494
22.4          1476
35.2          1466
21.2          1420
20.8          1393
20.4          1370
36.4          1354
36.8          1340
35.6          1329
99.2          1326
36            1310
34.8          1282
37.6          1281
37.2          1273
19.6          1252
38.4          1252
19.2        

Several values have commas—this is consistent across the diagnostic data, so we'll clean those all up at once

##### Barometric Pressure

In [75]:
# Frequency of each distinct raw barometric-pressure value
faults_diagnostic.loc[:, 'barometric_pressure'].value_counts()

14.355      85896
14.4275     82204
14.2825     76928
14.5        56948
14.21       56774
14.5725     40218
14.1375     37674
14.645      32327
14.065      21697
14.7175     20403
13.9925     11526
14.79        7784
13.92        7230
13.775       6661
13.8475      6560
13.7025      6529
13.63        5858
13.5575      4354
3.5525       3326
13.485       3120
14.8625      2420
13.4125      2118
3.625        1448
13.34        1130
3.48          822
14.935        685
13.2675       599
12.2525       311
3.4075        280
13.195        237
15.0075       182
7.25          182
13.1225       155
3.6975        121
3.335         118
13.05         109
12.9775       104
12.615         87
12.8325        70
12.6875        60
12.905         51
11.6725        43
12.76          43
12.5425        40
12.325         39
12.3975        34
11.9625        33
12.18          33
11.455         31
11.6           31
12.035         30
11.745         27
11.5275        26
12.47          26
11.89          24
15.08     

This column also has values containing commas, which need to be fixed.

##### Cruise Control Active

In [77]:
# Share of records with cruise control active vs. inactive (proportions, not counts)
faults_diagnostic.loc[:, 'cruise_control_active'].value_counts(normalize=True)

False    0.911756
True     0.088244
Name: cruise_control_active, dtype: float64

In [78]:
# Frequency of each distinct cruise-control set-speed value
faults_diagnostic.loc[:, 'cruise_control_set_speed'].value_counts()

66.48672    278248
64.6226     130757
0            34698
65.24397     10955
65.86535      9688
64.00124      9305
59.65163      8075
63.37986      6705
61.51575      6642
60.89438      6638
60.27301      5859
55.30204      5712
59.03026      5303
62.75849      5230
56.54478      4817
57.78752      4581
57.16615      4322
62.13712      4264
54.68066      4243
55.92341      4235
58.40889      4225
67.10809      4060
42.25324      2271
54.05929      2213
53.43792       935
68.35083       902
52.81655       748
50.33107       742
68.97221       669
51.57381       626
49.70969       537
52.19518       516
45.3601        465
44.73873       464
47.84558       451
45.98147       440
46.60284       410
41.63187       410
44.11736       390
50.95244       374
47.22421       345
67.72946       341
49.08833       304
70.83632       275
48.46695       242
27.9617        199
70.21494       198
31.68993       191
40.38913       190
43.49598       160
29.82582       150
29.20445       139
34.79679    

Interestingly, 34,698 records have the cruise control set speed at 0 mph.

##### Distance (Lifetime To Date)

In [87]:
# Inspect the raw lifetime-distance column; check the dtype reported at the bottom
faults_diagnostic.loc[:, 'distance_ltd']

0             423178.7
1                  NaN
2                  NaN
3                  NaN
4                  NaN
              ...     
1187330            NaN
1187331       423937.9
1187332       465925.4
1187333    28606.65625
1187334            NaN
Name: distance_ltd, Length: 1187335, dtype: object

The column is stored as `object` rather than a numeric dtype, so we'll need to fix the commas before going further.

In [None]:
# Columns flagged in the sections above as needing their commas fixed
comma_columns = [
    'accelerator_pedal',
    'barometric_pressure',
    'distance_ltd',
]

In [70]:
# List the column labels of the joined frame for reference
faults_diagnostic.columns

Index(['record_id', 'ess_id', 'event_timestamp', 'event_description',
       'ecu_software_version', 'ecu_serial_number', 'ecu_model', 'ecu_make',
       'ecu_source', 'spn', 'fmi', 'active', 'active_transition_count',
       'equipment_id', 'mct_number', 'latitude', 'longitude',
       'location_timestamp', 'accelerator_pedal', 'barometric_pressure',
       'cruise_control_active', 'cruise_control_set_speed', 'distance_ltd',
       'engine_coolant_temperature', 'engine_load', 'engine_oil_pressure',
       'engine_oil_temperature', 'engine_rpm', 'engine_time_ltd', 'fuel_level',
       'fuel_ltd', 'fuel_rate', 'fuel_temperature', 'ign_status',
       'intake_manifold_temperature', 'lamp_status', 'parking_brake',
       'service_distance', 'speed', 'switched_battery_voltage', 'throttle',
       'turbo_boost_pressure'],
      dtype='object')