## Imports

In [1]:
import pandas as pd
import numpy as np

pd.options.display.max_rows = 5000
pd.set_option('display.max_columns', None)
pd.options.display.max_seq_items = 2000

## Reading in the datasets

#### Faults dataset

In [2]:
faults = pd.read_csv('../data/J1939Faults.csv',
                     low_memory=False, 
                     parse_dates=[2, 19],
                     infer_datetime_format=True)

In [3]:
faults.head()

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,actionDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,faultValue,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp
0,1,990349,2015-02-21 10:47:13,Low (Severity Low) Engine Coolant Level,,unknown,unknown,unknown,unknown,0,111,17,True,2,,1439,105354361,38.857638,-84.626851,2015-02-21 11:34:25
1,2,990360,2015-02-21 11:34:34,,,unknown,unknown,unknown,unknown,11,629,12,True,127,,1439,105354361,38.857638,-84.626851,2015-02-21 11:35:10
2,3,990364,2015-02-21 11:35:31,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,1807,2,False,127,,1369,105336226,41.42125,-87.767361,2015-02-21 11:35:26
3,4,990370,2015-02-21 11:35:33,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,1807,2,True,127,,1369,105336226,41.421018,-87.767361,2015-02-21 11:36:08
4,5,990416,2015-02-21 11:39:41,,,22281684P01*22357957P01*22362082P01*,13063430,0USA13_13_0415_2238A,VOLVO,0,4364,17,False,2,,1674,105427130,38.416481,-89.442638,2015-02-21 11:39:37


#### Diagnostics dataset

In [4]:
diagnostic = pd.read_csv('../data/VehicleDiagnosticOnboardData.csv')

In [5]:
diagnostic.head()

Unnamed: 0,Id,Name,Value,FaultId
0,1,IgnStatus,False,1
1,2,EngineOilPressure,0,1
2,3,EngineOilTemperature,96.74375,1
3,4,TurboBoostPressure,0,1
4,5,EngineLoad,11,1


##### Diagnostics Pivot
This data needs to be flattened. We'll use a pivot so it will be ready to join to the Faults data

In [6]:
diagnostic = diagnostic.pivot(index='FaultId', columns='Name', values='Value').reset_index()

In [7]:
diagnostic.head()

Name,FaultId,AcceleratorPedal,BarometricPressure,CruiseControlActive,CruiseControlSetSpeed,DistanceLtd,EngineCoolantTemperature,EngineLoad,EngineOilPressure,EngineOilTemperature,EngineRpm,EngineTimeLtd,FuelLevel,FuelLtd,FuelRate,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,ParkingBrake,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure
0,1,0.0,14.21,False,66.48672,423178.7,100.4,11.0,0.0,96.74375,0.0,1632.2,43.2,12300.907429328,0.0,,False,78.8,1023,True,,0.0,3276.75,,0.0
1,2,,,,,,,,,,,,,,,,True,,1279,,,,,,
2,3,,,,,,,,,,,,,,,,,,1279,,,,,,
3,4,,,,,,,,,,,,,,,,True,,1279,,,,,,
4,5,,,,,,,,,,,,,,,,,,16639,,,,,,


## Joining the two datasets

In [8]:
faults_diagnostic = faults.merge(diagnostic, left_on='RecordID', right_on='FaultId')

In [9]:
faults_diagnostic.head()

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,actionDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,faultValue,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp,FaultId,AcceleratorPedal,BarometricPressure,CruiseControlActive,CruiseControlSetSpeed,DistanceLtd,EngineCoolantTemperature,EngineLoad,EngineOilPressure,EngineOilTemperature,EngineRpm,EngineTimeLtd,FuelLevel,FuelLtd,FuelRate,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,ParkingBrake,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure
0,1,990349,2015-02-21 10:47:13,Low (Severity Low) Engine Coolant Level,,unknown,unknown,unknown,unknown,0,111,17,True,2,,1439,105354361,38.857638,-84.626851,2015-02-21 11:34:25,1,0.0,14.21,False,66.48672,423178.7,100.4,11.0,0.0,96.74375,0.0,1632.2,43.2,12300.907429328,0.0,,False,78.8,1023,True,,0.0,3276.75,,0.0
1,2,990360,2015-02-21 11:34:34,,,unknown,unknown,unknown,unknown,11,629,12,True,127,,1439,105354361,38.857638,-84.626851,2015-02-21 11:35:10,2,,,,,,,,,,,,,,,,True,,1279,,,,,,
2,3,990364,2015-02-21 11:35:31,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,1807,2,False,127,,1369,105336226,41.42125,-87.767361,2015-02-21 11:35:26,3,,,,,,,,,,,,,,,,,,1279,,,,,,
3,4,990370,2015-02-21 11:35:33,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,1807,2,True,127,,1369,105336226,41.421018,-87.767361,2015-02-21 11:36:08,4,,,,,,,,,,,,,,,,True,,1279,,,,,,
4,5,990416,2015-02-21 11:39:41,,,22281684P01*22357957P01*22362082P01*,13063430,0USA13_13_0415_2238A,VOLVO,0,4364,17,False,2,,1674,105427130,38.416481,-89.442638,2015-02-21 11:39:37,5,,,,,,,,,,,,,,,,,,16639,,,,,,


## Cleaning

#### Column names

In [10]:
#My team opted to use snake case for column names

faults_diagnostic = faults_diagnostic.rename(columns={
    "RecordID": "record_id",
    "ESS_Id": "ess_id",
    "EventTimeStamp": "event_timestamp",
    "eventDescription": "event_description",
    "actionDescription": "action_description",
    "ecuSoftwareVersion": "ecu_software_version",
    "ecuSerialNumber": "ecu_serial_number",
    "ecuModel": "ecu_model",
    "ecuMake": "ecu_make",
    "ecuSource": "ecu_source",
    "activeTransitionCount": "active_transition_count",
    "faultValue": "fault_value",
    "EquipmentID": "equipment_id",
    "MCTNumber": "mct_number",
    "Latitude": "latitude",
    "Longitude": "longitude",
    "LocationTimeStamp": "location_timestamp",
    "FaultId": "fault_id",
    "AcceleratorPedal": "accelerator_pedal",
    "BarometricPressure": "barometric_pressure",
    "CruiseControlActive": "cruise_control_active",
    "CruiseControlSetSpeed": "cruise_control_set_speed",
    "DistanceLtd": "distance_ltd",
    "EngineCoolantTemperature": "engine_coolant_temperature",
    "EngineLoad": "engine_load",
    "EngineOilPressure": "engine_oil_pressure",
    "EngineOilTemperature": "engine_oil_temperature",
    "EngineRpm": "engine_rpm",
    "EngineTimeLtd": "engine_time_ltd",
    "FuelLevel": "fuel_level",
    "FuelLtd": "fuel_ltd",
    "FuelRate": "fuel_rate",
    "FuelTemperature": "fuel_temperature",
    "IgnStatus": "ign_status",
    "IntakeManifoldTemperature": "intake_manifold_temperature",
    "LampStatus": "lamp_status",
    "ParkingBrake": "parking_brake",
    "ServiceDistance": "service_distance",
    "Speed": "speed",
    "SwitchedBatteryVoltage": "switched_battery_voltage",
    "Throttle": "throttle",
    "TurboBoostPressure": "turbo_boost_pressure"
})

In [11]:
# The columns 'action_description' and 'fault_value' contain all null values, so they will be dropped

faults_diagnostic = faults_diagnostic.drop(columns=['action_description', 'fault_value'])

In [12]:
faults_diagnostic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1187335 entries, 0 to 1187334
Data columns (total 43 columns):
 #   Column                       Non-Null Count    Dtype         
---  ------                       --------------    -----         
 0   record_id                    1187335 non-null  int64         
 1   ess_id                       1187335 non-null  int64         
 2   event_timestamp              1187335 non-null  datetime64[ns]
 3   event_description            1126490 non-null  object        
 4   ecu_software_version         891285 non-null   object        
 5   ecu_serial_number            844318 non-null   object        
 6   ecu_model                    1122577 non-null  object        
 7   ecu_make                     1122577 non-null  object        
 8   ecu_source                   1187335 non-null  int64         
 9   spn                          1187335 non-null  int64         
 10  fmi                          1187335 non-null  int64         
 11  active     

In [13]:
faults_diagnostic.head()

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,fmi,active,active_transition_count,equipment_id,mct_number,latitude,longitude,location_timestamp,fault_id,accelerator_pedal,barometric_pressure,cruise_control_active,cruise_control_set_speed,distance_ltd,engine_coolant_temperature,engine_load,engine_oil_pressure,engine_oil_temperature,engine_rpm,engine_time_ltd,fuel_level,fuel_ltd,fuel_rate,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
0,1,990349,2015-02-21 10:47:13,Low (Severity Low) Engine Coolant Level,unknown,unknown,unknown,unknown,0,111,17,True,2,1439,105354361,38.857638,-84.626851,2015-02-21 11:34:25,1,0.0,14.21,False,66.48672,423178.7,100.4,11.0,0.0,96.74375,0.0,1632.2,43.2,12300.907429328,0.0,,False,78.8,1023,True,,0.0,3276.75,,0.0
1,2,990360,2015-02-21 11:34:34,,unknown,unknown,unknown,unknown,11,629,12,True,127,1439,105354361,38.857638,-84.626851,2015-02-21 11:35:10,2,,,,,,,,,,,,,,,,True,,1279,,,,,,
2,3,990364,2015-02-21 11:35:31,Incorrect Data Steering Wheel Angle,unknown,unknown,unknown,unknown,11,1807,2,False,127,1369,105336226,41.42125,-87.767361,2015-02-21 11:35:26,3,,,,,,,,,,,,,,,,,,1279,,,,,,
3,4,990370,2015-02-21 11:35:33,Incorrect Data Steering Wheel Angle,unknown,unknown,unknown,unknown,11,1807,2,True,127,1369,105336226,41.421018,-87.767361,2015-02-21 11:36:08,4,,,,,,,,,,,,,,,,True,,1279,,,,,,
4,5,990416,2015-02-21 11:39:41,,22281684P01*22357957P01*22362082P01*,13063430,0USA13_13_0415_2238A,VOLVO,0,4364,17,False,2,1674,105427130,38.416481,-89.442638,2015-02-21 11:39:37,5,,,,,,,,,,,,,,,,,,16639,,,,,,


##### Software Versions

In [14]:
faults_diagnostic[faults_diagnostic['ecu_software_version'].str.contains("?", regex=False, na=False)]

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,fmi,active,active_transition_count,equipment_id,mct_number,latitude,longitude,location_timestamp,fault_id,accelerator_pedal,barometric_pressure,cruise_control_active,cruise_control_set_speed,distance_ltd,engine_coolant_temperature,engine_load,engine_oil_pressure,engine_oil_temperature,engine_rpm,engine_time_ltd,fuel_level,fuel_ltd,fuel_rate,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
20828,21711,1878624,2015-04-14 11:38:36,,????_1284P4C_2*,________Y043718,MX,PCAR,0,5396,1,False,126,302,105435601,35.737407,-84.045046,2015-04-14 11:38:32,21711,,,,,,,,,,,,,,,,,,17407,,,,,,
20829,21712,1878625,2015-04-14 11:38:36,,????_1284P4C_2*,________Y043718,MX,PCAR,0,5444,1,False,126,302,105435601,35.737407,-84.045046,2015-04-14 11:38:32,21712,,,,,,,,,,,,,,,,,,17407,,,,,,
23937,24820,1933193,2015-04-16 19:49:37,,????1684P01*22357957P01*22362082P01*,13061463,0USA13_13_0415_2238A,VOLVO,0,4811,2,True,1,1676,105427266,38.413518,-85.765185,2015-04-16 19:50:14,24820,17.2,3.625,,,171224.0,176.0,23.0,25.52,205.0813,823.875,3459.1,90.8,24246.239276664,2.007714,,True,122.0,255,,,4.361734,,17.6,1.74
23938,24821,1933194,2015-04-16 19:49:37,Incorrect Data Engine Oil Pressure,????1684P01*22357957P01*22362082P01*,13061463,0USA13_13_0415_2238A,VOLVO,0,100,2,True,1,1676,105427266,38.413518,-85.765185,2015-04-16 19:50:14,24821,17.2,3.625,,,171224.0,176.0,23.0,25.52,205.0813,823.875,3459.1,90.8,24246.239276664,2.007714,,True,122.0,255,,,4.361734,,17.6,1.74
40226,41338,2216854,2015-05-02 14:45:37,Not Reporting Data Engine Variable Geometry Tu...,????1684P01*22357957P01*22362082P01*,13063430,0USA13_13_0415_2238A,VOLVO,0,641,7,True,19,1674,105427130,33.62787,-84.392083,2015-05-02 14:46:13,41338,0.0,3.5525,,,237887.2,179.6,13.0,36.54,226.2875,652.625,3923.2,0.0,102862.389005578,0.7132668,,True,168.8,255,,,0.0,,0.0,0.0
40483,41595,2221486,2015-05-03 09:47:01,Low (Severity High) Particulate Matter Trap Mo...,????8181P01*22548975P01*22549033P01*,13063847,0USA13_13_0415_2238A,VOLVO,0,3064,1,True,1,1677,105427212,36.938935,-80.992222,2015-05-03 09:47:36,41595,,,,,,,,,,,,,,,,False,,255,,,,,,
81635,84021,2897660,2015-06-11 07:43:50,Incorrect Data Particulate Trap Outlet Pressure 1,????7106*04047537*092613211021*09300006*G1*BDR*,79723629,6X1u13D1500000000,CMMNS,0,3610,2,True,8,1722,105437870,35.155555,-90.135324,2015-06-11 07:44:27,84021,,,,,,,,,,,,,,,,True,,17407,,,,,,
90267,92653,3037645,2015-06-19 06:36:05,Low (Severity Low) Engine Coolant Level,????7106*04075952*092613211021*09300006*G1*BDR*,79731577,6X1u13D1500000000,CMMNS,0,111,17,True,55,1744,105306493,36.066342,-86.434953,2015-06-19 06:36:41,92653,0.0,14.4275,False,66.48672,140518.1,95.0,19.0,42.92,91.7375,600.75,2555.6,58.0,18077.029346308,1.162361,32.0,True,89.6,1023,True,,0.0,3276.75,0.0,0.29
90273,92659,3037705,2015-06-19 06:39:59,Low (Severity Low) Engine Coolant Level,????7106*04075952*092613211021*09300006*G1*BDR*,79731577,6X1u13D1500000000,CMMNS,0,111,17,False,55,1744,105306493,36.069398,-86.435462,2015-06-19 06:39:54,92659,,,,,,,,,,,,,,,,,,1023,,,,,,
91224,93610,3054166,2015-06-20 01:31:30,Low (Severity Low) Engine Coolant Level,????0170*03015749*051914190353*09400015*G1*BDR*,79642446,6X1u13D1500000000,CMMNS,0,111,17,True,3,1630,105329900,42.013888,-87.926898,2015-06-20 01:32:05,93610,0.0,14.2825,False,66.48672,254094.0,78.8,16.0,43.5,75.3125,600.75,4937.45,,35576.446500918,1.096317,32.0,True,71.6,1023,True,,0.0,3276.75,0.0,0.0


##### Engine Serial Number

In [15]:
faults_diagnostic['ecu_serial_number'].value_counts()

unknown            298549
6U13D13             11207
79845785            10302
79856768             8158
79845329             7199
79623056             7066
79621048             6828
79845786             6349
79840984             6345
79844876             5872
79623054             5663
79623410             5578
79844877             5506
79620769             5482
79844882             5417
79614871             5326
79615187             5218
79857688             5059
79857689             5019
79614865             4374
79619434             4206
79620768             4133
79615184             3857
79845331             3819
79619117             3797
79614866             3788
S381222841           3176
79615183             3169
79845327             3149
79844880             3113
79857687             3089
79857685             3065
00000000             3062
79620774             3055
79618850             3041
79619125             2855
79607068             2798
79620764             2798
79623055    

There are 'unknown' (298549), 'Unspecified' (97), and NoSerial (3) values. Let's combine them all into 'unknown'

In [16]:
faults_diagnostic['ecu_serial_number'] = faults_diagnostic['ecu_serial_number'].replace('Unspecified', 'unknown')
faults_diagnostic['ecu_serial_number'] = faults_diagnostic['ecu_serial_number'].replace('NoSerial', 'unknown')

##### Engine Model

In [17]:
faults_diagnostic['ecu_model'].sort_values().unique()

array(['0USA10_13_0405_2237A', '0USA13_13_0415_2238A', '202.35.0',
       '20412511P07', '6L u13D0890000000', '6U13D13', '6X1u10D1500000000',
       '6X1u13D1500000000', '6X1u17D1500000000', '6X1u20D1500000000',
       'CE', 'CECU3-NAMUX3', 'CECU3B-NAMUX4', 'E0031', 'EC60-adv',
       'EC80ESP', 'EC80ESP AM000036', 'EC80ESP AM000038', 'EC80ESP+',
       'EEO-xxF112C', 'FAOM-xx810S-EC3', 'Gen 4 Boot Loader', 'MX',
       'MX16U13D13', 'MX16U15D13', 'Y044053', 'Y049568',
       '________Y043718', 'unknown', nan], dtype=object)

Nothing to update for models

##### Engine Make

In [18]:
faults_diagnostic['ecu_make'].value_counts()

CMMNS              433403
unknown            298549
PACCR              277021
BNDWS               71001
PCAR                20229
EATON               12612
VOLVO                7252
?????                 755
????S                 627
????R                 589
?MMNS                 289
?CAR                  152
?ACCR                  39
???CR                  20
?NDWS                  15
?????MX16U13D13         9
?????MX                 6
??MNS                   3
?ATON                   3
???R                    1
??DWS                   1
5516014                 1
Name: ecu_make, dtype: int64

In [19]:
faults_diagnostic['ecu_make'] = faults_diagnostic['ecu_make'].replace('?????', 'unknown')
faults_diagnostic['ecu_make'] = faults_diagnostic['ecu_make'].replace('????S', 'unknown')
faults_diagnostic['ecu_make'] = faults_diagnostic['ecu_make'].replace('????R', 'PACCR')
faults_diagnostic['ecu_make'] = faults_diagnostic['ecu_make'].replace('?MMNS', 'CMMNS')
faults_diagnostic['ecu_make'] = faults_diagnostic['ecu_make'].replace('?CAR', 'PCAR')
faults_diagnostic['ecu_make'] = faults_diagnostic['ecu_make'].replace('?ACCR', 'PACCR')
faults_diagnostic['ecu_make'] = faults_diagnostic['ecu_make'].replace('???CR', 'PACCR')
faults_diagnostic['ecu_make'] = faults_diagnostic['ecu_make'].replace('?NDWS', 'BNDWS')
faults_diagnostic['ecu_make'] = faults_diagnostic['ecu_make'].replace('?????MX16U13D13', 'MX16U13D13')
faults_diagnostic['ecu_make'] = faults_diagnostic['ecu_make'].replace('?????MX', 'MX')
faults_diagnostic['ecu_make'] = faults_diagnostic['ecu_make'].replace('?ATON', 'EATON')
faults_diagnostic['ecu_make'] = faults_diagnostic['ecu_make'].replace('??MNS', 'CMMNS')
faults_diagnostic['ecu_make'] = faults_diagnostic['ecu_make'].replace('???R', 'PCAR')
faults_diagnostic['ecu_make'] = faults_diagnostic['ecu_make'].replace('??DWS', 'BNDWS')

##### ECU Source

In [20]:
faults_diagnostic['ecu_source'].value_counts()

0     528044
49    514059
11    131122
3      13484
61       626
Name: ecu_source, dtype: int64

Nothing to clean

##### SPN

In [21]:
faults_diagnostic['spn'].value_counts()

111       365489
929       256541
96         90041
829        87788
639        41062
97         26745
596        22571
50353      11773
1569       10927
1761        9981
2863        8233
789         8022
629         8012
1068        7994
791         7799
1231        6992
1067        6947
641         6292
91          6210
3226        5547
3216        5420
37          5183
792         5164
412         5044
790         5011
807         4945
886         4190
1807        4081
627         4043
802         4036
171         4036
611         4026
1059        3952
51923       3950
793         3906
3251        3905
3464        3382
4364        2618
102         2578
630         2460
157         2426
70          2268
5396        2223
1483        2178
5848        2100
3610        2021
1045        1994
907         1962
4096        1877
5444        1800
1209        1736
523531      1718
2623        1700
101         1686
110         1652
3031        1650
1808        1636
934         1613
248         16

Nothing to clean

##### FMI

In [22]:
faults_diagnostic['fmi'].value_counts()

17    326553
9     288893
3     188631
2      82334
18     53602
31     39881
4      39734
0      27577
7      23249
1      20887
15     20739
14     12699
12     12233
5       9683
16      9242
10      7437
8       5741
19      5733
20      4440
11      3847
13      2634
6       1027
23       396
21       131
29         8
22         4
Name: fmi, dtype: int64

Nothing to clean

##### Active

In [23]:
faults_diagnostic['active'].value_counts(normalize=True)

True     0.512454
False    0.487546
Name: active, dtype: float64

Nothing to clean

##### Active Count

In [24]:
faults_diagnostic['active_transition_count'].value_counts()

126    581874
1      305476
127    102512
2       45075
3       13019
4        9658
6        6880
5        6777
0        5781
7        4782
8        4088
9        3750
11       3387
10       3326
12       2713
13       2603
14       2420
15       2358
16       2158
17       2064
18       1914
20       1755
19       1753
21       1590
22       1482
23       1475
24       1459
25       1393
26       1391
28       1277
27       1276
29       1230
30       1164
33       1132
32       1128
31       1074
34       1039
36       1000
39        973
40        972
37        970
35        968
38        965
44        915
43        908
42        906
41        896
45        837
48        832
46        813
47        811
50        800
49        798
55        762
52        747
54        738
51        731
53        728
59        723
58        720
56        689
57        687
63        676
61        671
64        655
70        648
62        644
60        624
66        623
72        618
67        618
68    

Nothing to clean

##### Equipment ID

In [25]:
faults_diagnostic['equipment_id'].value_counts()

1641          17492
1605          16393
1646          15462
1618          14986
1606          14973
1619          14832
1625          14783
1645          14268
1644          13920
1649          13557
1610          13489
1630          13149
1647          13117
1634          13003
1609          12902
1642          12653
1623          11834
1611          11804
1814          10361
1692          10301
1612           9554
1816           8164
1809           8026
1939           7966
1622           7918
1806           7771
1601           7437
1603           7141
1469           7116
1592           7085
1749           7000
1808           6856
1815           6474
1803           6390
1437           6133
309            5991
305            5738
1600           5684
1804           5670
1589           5532
1562           5483
1620           5470
1995           5365
1559           5358
1818           5135
1820           5080
1616           5015
1810           4883
1422           4736
1873           4584


Per the instructions, we'll need to remove any equipment_id that is greater than 5 digits

In [26]:
faults_diagnostic = faults_diagnostic[faults_diagnostic['equipment_id'].str.len() <= 5]

This removes ~2k rows

##### MCT Number

In [27]:
faults_diagnostic['mct_number'].value_counts()

105415080    16503
105420184    15507
105381514    15330
105415457    15119
105416377    15034
105417985    13590
105442858    13187
105420195    13141
105415391    13079
105338710    11834
105420590    11038
105357098    10719
105369518    10427
105420520     9939
105320363     9580
105430420     8590
105420005     8532
105465351     8361
105420580     8111
105434215     8086
105351093     8064
105330008     8019
105304730     8018
105438415     8005
105415630     7889
105364937     7479
105357794     7265
105411041     7187
105357909     6937
105400037     6929
105438416     6877
105355660     6723
105333857     6639
105460307     6529
105432778     6421
105356370     6406
105442799     6200
105435663     6040
105334108     5978
105392933     5872
105410475     5784
105356577     5732
105338555     5726
105406655     5722
105362919     5675
105460271     5670
105410840     5610
105412415     5316
105349500     5164
105465571     5080
105465373     5054
105411109     5009
105427242   

Nothing to clean

##### Latitude and Longitude

In [28]:
faults_diagnostic[faults_diagnostic['latitude'] == 0]

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,fmi,active,active_transition_count,equipment_id,mct_number,latitude,longitude,location_timestamp,fault_id,accelerator_pedal,barometric_pressure,cruise_control_active,cruise_control_set_speed,distance_ltd,engine_coolant_temperature,engine_load,engine_oil_pressure,engine_oil_temperature,engine_rpm,engine_time_ltd,fuel_level,fuel_ltd,fuel_rate,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
1150008,1207232,106294736,2019-06-24 09:10:10,Low Voltage (Sensor supply voltage 2),unknown,unknown,unknown,unknown,0,3510,4,True,1,2007,105304730,0.0,0.0,2019-06-24 08:43:30,1207232,0.0,14.21,False,64.6226,288318.0,203.0,0,0.0,216.05,0.0,5910.8,60.0,39434.811406404,0.0,,True,107.6,17407,False,,0.3301035,,100,0.0
1150009,1207233,106294737,2019-06-24 09:10:09,Low Voltage (Engine Exhaust Gas Recirculation ...,unknown,unknown,unknown,unknown,0,27,4,True,1,2007,105304730,0.0,0.0,2019-06-24 08:43:30,1207233,0.0,14.21,False,64.6226,288318.0,203.0,0,0.0,216.05,0.0,5910.8,60.4,39434.811406404,0.0,,True,105.8,17407,False,,0.01941785,,100,0.0
1150010,1207234,106294738,2019-06-24 09:10:11,,,,,,11,1807,12,True,1,2007,105304730,0.0,0.0,2019-06-24 08:43:30,1207234,0.0,14.21,False,64.6226,288318.0,203.0,0,0.0,215.9937,0.0,5910.8,61.2,39434.811406404,0.0,,True,107.6,63487,False,,0.9417657,,100,0.0
1150011,1207235,106294739,2019-06-24 09:10:12,Low Current Aftertreatment Fuel Injector 1,unknown,unknown,unknown,unknown,0,3556,5,True,1,2007,105304730,0.0,0.0,2019-06-24 08:43:30,1207235,25.2,14.21,False,64.6226,288318.0,203.0,0,0.0,215.9937,0.0,5910.8,61.6,39434.811406404,0.0,,True,107.6,17407,False,,3.893279,,100,0.0
1172275,1231439,116418325,2020-01-09 20:50:32,High Voltage (Fuel Level),,,CECU3B-NAMUX4,PACCR,49,96,3,True,126,1857,105430420,0.0,0.0,2020-01-09 20:51:09,1231439,0.0,14.5,False,66.48672,383791.8,102.2,10,40.6,102.5375,600.125,2463.15,74.0,12497.187263964,0.7925186,,True,84.2,1279,True,,0.0,,100,0.29
1172276,1231440,116418326,2020-01-09 20:50:32,High Voltage (Left Fuel Level Sensor),,,CECU3B-NAMUX4,PACCR,49,829,3,True,126,1857,105430420,0.0,0.0,2020-01-09 20:51:09,1231440,0.0,14.5,False,66.48672,383791.8,102.2,10,40.6,102.5375,600.125,2463.15,74.0,12497.187263964,0.7925186,,True,84.2,1279,True,,0.0,,100,0.29


8 rows contain a latitude of 0.0

In [29]:
faults_diagnostic[faults_diagnostic['longitude'] == 0]

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,fmi,active,active_transition_count,equipment_id,mct_number,latitude,longitude,location_timestamp,fault_id,accelerator_pedal,barometric_pressure,cruise_control_active,cruise_control_set_speed,distance_ltd,engine_coolant_temperature,engine_load,engine_oil_pressure,engine_oil_temperature,engine_rpm,engine_time_ltd,fuel_level,fuel_ltd,fuel_rate,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
1150008,1207232,106294736,2019-06-24 09:10:10,Low Voltage (Sensor supply voltage 2),unknown,unknown,unknown,unknown,0,3510,4,True,1,2007,105304730,0.0,0.0,2019-06-24 08:43:30,1207232,0.0,14.21,False,64.6226,288318.0,203.0,0,0.0,216.05,0.0,5910.8,60.0,39434.811406404,0.0,,True,107.6,17407,False,,0.3301035,,100,0.0
1150009,1207233,106294737,2019-06-24 09:10:09,Low Voltage (Engine Exhaust Gas Recirculation ...,unknown,unknown,unknown,unknown,0,27,4,True,1,2007,105304730,0.0,0.0,2019-06-24 08:43:30,1207233,0.0,14.21,False,64.6226,288318.0,203.0,0,0.0,216.05,0.0,5910.8,60.4,39434.811406404,0.0,,True,105.8,17407,False,,0.01941785,,100,0.0
1150010,1207234,106294738,2019-06-24 09:10:11,,,,,,11,1807,12,True,1,2007,105304730,0.0,0.0,2019-06-24 08:43:30,1207234,0.0,14.21,False,64.6226,288318.0,203.0,0,0.0,215.9937,0.0,5910.8,61.2,39434.811406404,0.0,,True,107.6,63487,False,,0.9417657,,100,0.0
1150011,1207235,106294739,2019-06-24 09:10:12,Low Current Aftertreatment Fuel Injector 1,unknown,unknown,unknown,unknown,0,3556,5,True,1,2007,105304730,0.0,0.0,2019-06-24 08:43:30,1207235,25.2,14.21,False,64.6226,288318.0,203.0,0,0.0,215.9937,0.0,5910.8,61.6,39434.811406404,0.0,,True,107.6,17407,False,,3.893279,,100,0.0
1172275,1231439,116418325,2020-01-09 20:50:32,High Voltage (Fuel Level),,,CECU3B-NAMUX4,PACCR,49,96,3,True,126,1857,105430420,0.0,0.0,2020-01-09 20:51:09,1231439,0.0,14.5,False,66.48672,383791.8,102.2,10,40.6,102.5375,600.125,2463.15,74.0,12497.187263964,0.7925186,,True,84.2,1279,True,,0.0,,100,0.29
1172276,1231440,116418326,2020-01-09 20:50:32,High Voltage (Left Fuel Level Sensor),,,CECU3B-NAMUX4,PACCR,49,829,3,True,126,1857,105430420,0.0,0.0,2020-01-09 20:51:09,1231440,0.0,14.5,False,66.48672,383791.8,102.2,10,40.6,102.5375,600.125,2463.15,74.0,12497.187263964,0.7925186,,True,84.2,1279,True,,0.0,,100,0.29


The same 8 rows contain a longitude of 0.0

We'll replace 0.0 with np.nan. We'll get some values from ffill and backfills before filtering out service stations by latitude and longitude.

In [30]:
lat_long = ['latitude', 'longitude']

for column in lat_long:
    faults_diagnostic[column] = faults_diagnostic[column].replace('0.0', np.nan)

##### Location Timestamp

In [31]:
faults_diagnostic['location_timestamp'].dt.year.value_counts()

2016    332288
2015    325678
2017    254860
2018    143423
2019    111520
2020     17196
2000       192
2011         8
2014         1
Name: location_timestamp, dtype: int64

We'll need to investigate the records pre-2014

Is there a way to compare location_timestamp to event_timestamp? More importantly, is this worth doing?

In [32]:
faults_diagnostic[faults_diagnostic['location_timestamp'].dt.year == 2000]

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,fmi,active,active_transition_count,equipment_id,mct_number,latitude,longitude,location_timestamp,fault_id,accelerator_pedal,barometric_pressure,cruise_control_active,cruise_control_set_speed,distance_ltd,engine_coolant_temperature,engine_load,engine_oil_pressure,engine_oil_temperature,engine_rpm,engine_time_ltd,fuel_level,fuel_ltd,fuel_rate,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
1154193,1211417,108604425,2000-03-18 19:14:10,High Voltage (Left Fuel Level Sensor),,,CECU3B-NAMUX4,PACCR,49,829,3,True,126,2015,105427130,36.935972,-86.507407,2000-03-18 19:14:46,1211417,0.0,14.4275,False,34.17542,274765.4,183.2,15.0,25.52,190.85,597.625,5673.1,68.0,37866.42193368,0.6736408,,True,127.4,1279,False,,0.0,,100.0,0.58
1154194,1211418,108604426,2000-03-18 19:14:10,High Voltage (Fuel Level),,,CECU3B-NAMUX4,PACCR,49,96,3,True,126,2015,105427130,36.935972,-86.507407,2000-03-18 19:14:46,1211418,0.0,14.4275,False,34.17542,274765.4,183.2,15.0,25.52,190.85,597.625,5673.1,68.0,37866.42193368,0.6736408,,True,127.4,1279,False,,0.0,,100.0,0.58
1154195,1211419,108604487,2000-03-18 19:20:47,High Voltage (Fuel Level),,,CECU3B-NAMUX4,PACCR,49,96,3,False,126,2015,105427130,36.92912,-86.496898,2000-03-18 19:20:43,1211419,,,,,,,,,,,,,,,,,,255,,,,,,
1154196,1211420,108604488,2000-03-18 19:20:47,High Voltage (Left Fuel Level Sensor),,,CECU3B-NAMUX4,PACCR,49,829,3,False,126,2015,105427130,36.92912,-86.496898,2000-03-18 19:20:43,1211420,,,,,,,,,,,,,,,,,,255,,,,,,
1154198,1211422,108608408,2000-03-19 02:59:58,Not Reporting Data Wheel Sensor ABS Axle 2 Right,AAAI000032*AAAM000038*BB41275 *A82J140721A_9...,5W26153559,EC80ESP,BNDWS,11,792,7,False,13,1849,105381862,36.758194,-86.17162,2000-03-19 02:59:53,1211422,,,,,,,,,,,,,,,,,,1279,,,,,,
1154199,1211423,108608954,2000-03-19 03:58:23,Not Reporting Data Wheel Sensor ABS Axle 2 Right,AAAI000032*AAAM000038*BB41275 *A82J140721A_9...,5W26153559,EC80ESP,BNDWS,11,792,7,True,14,1849,105381862,36.771851,-86.178009,2000-03-19 03:58:58,1211423,23.2,14.355,False,64.00124,418316.5,170.6,6.0,41.18,173.1875,1279.75,8909.25,92.4,58344.246888564,2.113383,32.0,True,82.4,1279,,,46.64167,,100.0,14.5
1154208,1211432,108615304,2000-03-19 07:32:53,Low (Severity Medium) Transmission Air Tank Pr...,5516014*202.35.0*5516502*E003.e003*5539478*25....,Z0047881,EEO-xxF112C,EATON,3,37,18,True,19,2283,105437713,33.796435,-81.005185,2000-03-19 07:33:29,1211432,48.0,14.7175,False,0.0,55971.24,168.8,37.0,22.62,188.375,600.0,1096.5,75.6,7238.842568904,1.056692,,True,118.4,50175,False,,0.1456339,,100.0,0.87
1154209,1211433,108615321,2000-03-19 07:34:31,Low (Severity Medium) Transmission Air Tank Pr...,5516014*202.35.0*5516502*E003.e003*5539478*25....,Z0047881,EEO-xxF112C,EATON,3,37,18,False,19,2283,105437713,33.796435,-81.005185,2000-03-19 07:34:26,1211433,,,,,,,,,,,,,,,,,,50175,,,,,,
1154211,1211435,108620729,2000-03-19 08:40:03,High Voltage (Left Fuel Level Sensor),,,CECU3B-NAMUX4,PACCR,49,829,3,True,126,2034,105385920,40.47449,-76.293657,2000-03-19 08:40:40,1211435,0.0,14.4275,False,64.6226,306348.1,186.8,14.0,19.14,208.0625,599.625,8180.75,63.6,44400.453467848,0.5283458,,True,116.6,1279,True,,0.0,,100.0,0.0
1154212,1211436,108620730,2000-03-19 08:40:03,High Voltage (Fuel Level),,,CECU3B-NAMUX4,PACCR,49,96,3,True,126,2034,105385920,40.47449,-76.293657,2000-03-19 08:40:40,1211436,0.0,14.4275,False,64.6226,306348.1,186.8,14.0,19.14,208.0625,599.625,8180.75,63.6,44400.453467848,0.5283458,,True,116.6,1279,True,,0.0,,100.0,0.0


In [33]:
faults_diagnostic[faults_diagnostic['location_timestamp'].dt.year == 2000]['equipment_id'].unique()

array(['2015', '1849', '2283', '2034', '1909', '1997', '1961', '1826',
       '1887', '1968', '1873', '2074', '1965', '2336', '1977', '1970',
       '1869', '1861', '1979', '2037', '2184', '2054', '1989', '2159',
       '2158', '2174'], dtype=object)

In [34]:
faults_diagnostic[faults_diagnostic['equipment_id'] == '2015']

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,fmi,active,active_transition_count,equipment_id,mct_number,latitude,longitude,location_timestamp,fault_id,accelerator_pedal,barometric_pressure,cruise_control_active,cruise_control_set_speed,distance_ltd,engine_coolant_temperature,engine_load,engine_oil_pressure,engine_oil_temperature,engine_rpm,engine_time_ltd,fuel_level,fuel_ltd,fuel_rate,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
768584,787340,16361053,2017-05-18 05:18:34,Low (Severity Medium) Engine Coolant Level,04358814*06162577*061516161145*09401661*G1*BDR*,79951763,6X1u13D1500000000,CMMNS,0,111,18,True,4,2015,105329960,39.948703,-86.357731,2017-05-18 05:19:10,787340,0.0,14.065,False,56.54478,37308.05,87.8,16.0,42.34,84.3125,599.5,744.6,53.2,5358.465902768,1.056692,,True,82.4,2047,True,,0.0,,100.0,0.0
768589,787345,16361224,2017-05-18 05:32:49,Low (Severity Medium) Engine Coolant Level,04358814*06162577*061516161145*09401661*G1*BDR*,79951763,6X1u13D1500000000,CMMNS,0,111,18,False,4,2015,105329960,39.94875,-86.357685,2017-05-18 05:32:45,787345,,,,,,,,,,,,,,,,,,1023,,,,,,
780578,800135,17117698,2017-06-03 16:57:41,Condition Exists Catalyst Dosing Unit Input Lines,04358814*06162577*061516161145*09401661*G1*BDR*,79951763,6X1u13D1500000000,CMMNS,0,3362,31,True,1,2015,105329960,35.07787,-89.916574,2017-06-03 16:58:16,800135,41.2,14.4275,False,58.40889,41940.92,177.8,26.0,38.86,214.4187,1175.0,846.55,66.0,5980.591085228,3.830507,,True,125.6,17407,False,,58.18559,,100.0,4.35
780595,800152,17119828,2017-06-03 17:57:41,Condition Exists Engine Protection Torque Derate,04358814*06162577*061516161145*09401661*G1*BDR*,79951763,6X1u13D1500000000,CMMNS,0,1569,31,True,1,2015,105329960,35.590185,-89.075879,2017-06-03 17:58:18,800152,100.0,14.4275,False,58.40889,42004.07,179.6,91.0,38.86,224.8813,1329.125,847.55,63.6,5987.063300502,16.36551,,True,116.6,18431,False,,65.40903,,100.0,16.53
780601,800158,17120589,2017-06-03 18:24:23,Condition Exists Engine Protection Torque Derate,04358814*06162577*061516161145*09401661*G1*BDR*,79951763,6X1u13D1500000000,CMMNS,0,1569,31,False,1,2015,105329960,35.591898,-88.929537,2017-06-03 18:24:18,800158,,,,,,,,,,,,,,,,,,17407,,,,,,
780619,800176,17122776,2017-06-03 19:41:11,Condition Exists Catalyst Dosing Unit Input Lines,04358814*06162577*061516161145*09401661*G1*BDR*,79951763,6X1u13D1500000000,CMMNS,0,3362,31,False,1,2015,105329960,35.390694,-89.410648,2017-06-03 19:41:06,800176,,,,,,,,,,,,,,,,,,1023,,,,,,
1000018,1037155,55341060,2018-07-17 05:25:26,Abnormal Update Rate Total Power Takeoff Hours,,,CECU3B-NAMUX4,PACCR,49,248,9,True,31,2015,105329960,40.340277,-88.760601,2018-07-17 05:31:22,1037155,0.0,14.21,False,64.6226,153737.2,174.2,13.0,30.74,205.5875,593.625,3234.85,100.0,21379.972512464,0.0,,True,107.6,1279,False,,5.3302,,100.0,2.03
1000020,1037157,55341160,2018-07-17 05:32:26,Abnormal Update Rate Total Power Takeoff Hours,,,CECU3B-NAMUX4,PACCR,49,248,9,False,31,2015,105329960,40.341388,-88.759675,2018-07-17 05:32:22,1037157,,,,,,,,,,,,,,,,,,255,,,,,,
1009948,1047085,58819006,2018-08-18 09:47:45,Low (Severity Medium) Engine Turbocharger 1 Speed,04358814*06162577*061516161145*09401661*G1*BDR*,79951763,6X1u13D1500000000,CMMNS,0,103,18,True,1,2015,105427130,36.167083,-86.354768,2018-08-18 09:48:22,1047085,34.4,14.355,False,66.48672,161647.3,104.0,20.0,51.04,84.5375,1292.5,3393.8,76.8,22476.022356212,3.090823,,True,95.0,17407,False,,12.86433,,100.0,3.19
1010006,1047143,58837028,2018-08-18 13:35:46,Low (Severity Medium) Engine Turbocharger 1 Speed,04358814*06162577*061516161145*09401661*G1*BDR*,79951763,6X1u13D1500000000,CMMNS,0,103,18,False,1,2015,105427130,35.992638,-87.487592,2018-08-18 13:35:42,1047143,,,,,,,,,,,,,,,,,,1023,,,,,,


Interesting...the 2000 records appear to have the event_timestamps and location_timestamps in the same year

In [35]:
faults_diagnostic[faults_diagnostic['location_timestamp'].dt.year == 2011]

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,fmi,active,active_transition_count,equipment_id,mct_number,latitude,longitude,location_timestamp,fault_id,accelerator_pedal,barometric_pressure,cruise_control_active,cruise_control_set_speed,distance_ltd,engine_coolant_temperature,engine_load,engine_oil_pressure,engine_oil_temperature,engine_rpm,engine_time_ltd,fuel_level,fuel_ltd,fuel_rate,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
1154812,1212036,108848937,2011-01-02 03:17:37,Low (Severity Low) Catalyst Tank Level,04384413*22059242*090617144354*60701715*G1*BGT*,80015667,6X1u17D1500000000,CMMNS,0,1761,17,True,5,2144,105418094,35.153101,-86.592037,2011-01-02 03:18:13,1212036,0.0,14.4275,False,0.0,228827.2,179.6,16.0,19.14,204.0688,601.125,4759.55,26.0,29654.369525208,0.6604322,,True,127.4,1023,True,,0.0,,100.0,0.29
1155217,1212441,109079376,2011-01-01 05:10:23,Special Instructions Transmission Clutch Actuator,5516018*202.56.0*5516502*E003.e003*5539540*25....,Z0018064,EEO-xxF112C,EATON,3,788,14,False,2,2218,105435718,40.226851,-76.721296,2011-01-01 05:10:19,1212441,,,,,,,,,,,,,,,,,,50175,,,,,,
1155218,1212442,109079377,2011-01-01 05:09:43,Low (Severity Medium) Transmission Air Tank Pr...,5516018*202.56.0*5516502*E003.e003*5539540*25....,Z0018064,EEO-xxF112C,EATON,3,37,18,True,5,2218,105435718,40.226851,-76.721296,2011-01-01 05:10:20,1212442,0.0,14.5,False,0.0,119422.7,132.8,20.0,35.38,138.1438,601.25,2488.75,54.8,16047.79172887,0.7925186,,True,105.8,50175,True,,0.0,,100.0,0.29
1155219,1212443,109079378,2011-01-01 05:11:26,Low (Severity Medium) Transmission Air Tank Pr...,5516018*202.56.0*5516502*E003.e003*5539540*25....,Z0018064,EEO-xxF112C,EATON,3,37,18,False,5,2218,105435718,40.226851,-76.721296,2011-01-01 05:11:22,1212443,,,,,,,,,,,,,,,,,,50175,,,,,,
1155230,1212454,109079376,2011-01-01 05:10:23,Special Instructions Transmission Clutch Actuator,5516018*202.56.0*5516502*E003.e003*5539540*25....,Z0018064,EEO-xxF112C,EATON,3,788,14,False,2,2218,105435718,40.226851,-76.721296,2011-01-01 05:10:19,1212454,,,,,,,,,,,,,,,,,,50175,,,,,,
1155231,1212455,109079377,2011-01-01 05:09:43,Low (Severity Medium) Transmission Air Tank Pr...,5516018*202.56.0*5516502*E003.e003*5539540*25....,Z0018064,EEO-xxF112C,EATON,3,37,18,True,5,2218,105435718,40.226851,-76.721296,2011-01-01 05:10:20,1212455,0.0,14.5,False,0.0,119422.7,132.8,20.0,35.38,138.1438,601.25,2488.75,54.8,16047.79172887,0.7925186,,True,105.8,50175,True,,0.0,,100.0,0.29
1155232,1212456,109079378,2011-01-01 05:11:26,Low (Severity Medium) Transmission Air Tank Pr...,5516018*202.56.0*5516502*E003.e003*5539540*25....,Z0018064,EEO-xxF112C,EATON,3,37,18,False,5,2218,105435718,40.226851,-76.721296,2011-01-01 05:11:22,1212456,,,,,,,,,,,,,,,,,,50175,,,,,,
1171320,1230484,115919584,2011-01-01 00:10:32,Low (Severity High) Transmission Air Tank Pres...,5516014*202.35.0*5516502*E003.e003*5539478*25....,Z0047881,EEO-xxF112C,EATON,3,37,1,True,3,2283,105437713,28.156342,-81.799861,2011-01-01 00:11:08,1230484,0.0,14.645,False,0.0,78880.97,183.2,0.0,34.8,202.775,709.625,1755.15,78.4,10293.067748102,0.0,,True,125.6,50175,False,,3.116565,,100.0,2.03


In [36]:
faults_diagnostic[faults_diagnostic['location_timestamp'].dt.year == 2014]

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,fmi,active,active_transition_count,equipment_id,mct_number,latitude,longitude,location_timestamp,fault_id,accelerator_pedal,barometric_pressure,cruise_control_active,cruise_control_set_speed,distance_ltd,engine_coolant_temperature,engine_load,engine_oil_pressure,engine_oil_temperature,engine_rpm,engine_time_ltd,fuel_level,fuel_ltd,fuel_rate,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
27724,28607,1994354,2015-04-21 03:31:18,Abnormal Rate of Change Aftertreatment 1 Outle...,unknown,unknown,unknown,unknown,0,3226,10,True,1,1470,105446251,36.945092,-80.944583,2014-12-30 13:44:03,28607,,,,,,,,,,,,,,,,True,,17407,,,,,,


Given the discrepancies, it may make sense to use only one timestamp column

##### Fault ID

'fault_id' was used to join the datasets and is redundant with 'record_id'. We'll drop it now

In [37]:
faults_diagnostic = faults_diagnostic.drop(columns=['fault_id'])

##### Accelerator Pedal

In [38]:
faults_diagnostic['accelerator_pedal'].value_counts()

0           276459
100          53985
99.6          2841
0.4           1939
27.2          1910
27.6          1865
28.4          1862
28            1823
28.8          1815
25.2          1797
26.8          1796
26.4          1792
29.6          1788
25.6          1771
26            1756
24.4          1722
30.8          1716
29.2          1715
31.6          1686
30.4          1677
32            1666
32.8          1649
22.8          1639
31.2          1631
23.6          1618
33.2          1615
33.6          1601
23.2          1594
24            1588
32.4          1578
30            1578
24.8          1567
34            1517
22            1503
21.6          1502
0.8           1497
34.4          1493
22.4          1475
35.2          1464
21.2          1419
20.8          1392
20.4          1369
36.4          1350
36.8          1338
99.2          1326
35.6          1325
36            1310
34.8          1282
37.6          1281
37.2          1270
19.6          1251
38.4          1249
19.2        

Several values have commas—this is consistent across the diagnostic data, so we'll clean those all up at once

##### Barometric Pressure

In [39]:
faults_diagnostic['barometric_pressure'].value_counts()

14.355      85702
14.4275     82051
14.2825     76797
14.5        56810
14.21       56652
14.5725     40160
14.1375     37609
14.645      32297
14.065      21652
14.7175     20379
13.9925     11503
14.79        7775
13.92        7209
13.775       6653
13.8475      6545
13.7025      6523
13.63        5847
13.5575      4342
3.5525       3325
13.485       3113
14.8625      2420
13.4125      2111
3.625        1448
13.34        1129
3.48          822
14.935        685
13.2675       598
12.2525       311
3.4075        280
13.195        237
15.0075       182
7.25          182
13.1225       154
3.6975        121
3.335         118
13.05         109
12.9775       104
12.615         87
12.8325        70
12.6875        60
12.905         51
11.6725        43
12.76          42
12.5425        40
12.325         39
12.3975        34
11.9625        33
12.18          33
11.455         31
11.6           31
12.035         30
11.745         27
11.5275        26
12.47          26
11.89          24
15.08     

Commas to be fixed

##### Cruise Control Active

In [40]:
faults_diagnostic['cruise_control_active'].value_counts(normalize=True)

False    0.911834
True     0.088166
Name: cruise_control_active, dtype: float64

##### Cruise Control Set Speed

In [41]:
faults_diagnostic['cruise_control_set_speed'].value_counts()

66.48672    278090
64.6226     130383
0            34280
65.24397     10952
65.86535      9675
64.00124      9302
59.65163      8065
63.37986      6705
61.51575      6639
60.89438      6628
60.27301      5857
55.30204      5697
59.03026      5294
62.75849      5226
56.54478      4805
57.78752      4575
57.16615      4319
62.13712      4262
54.68066      4237
55.92341      4223
58.40889      4218
67.10809      4060
42.25324      2270
54.05929      2212
53.43792       935
68.35083       902
52.81655       747
50.33107       742
68.97221       669
51.57381       626
49.70969       537
52.19518       508
45.3601        465
44.73873       464
47.84558       451
45.98147       440
46.60284       410
41.63187       410
44.11736       387
50.95244       373
47.22421       345
67.72946       341
49.08833       304
70.83632       275
48.46695       242
27.9617        199
70.21494       198
31.68993       191
40.38913       190
43.49598       160
29.82582       150
29.20445       139
34.79679    

There are 34,698 records that have cruise control set to 0 mph. Interesting

##### Distance (Lifetime To Date)

In [42]:
faults_diagnostic['distance_ltd']

0             423178.7
1                  NaN
2                  NaN
3                  NaN
4                  NaN
              ...     
1187330            NaN
1187331       423937.9
1187332       465925.4
1187333    28606.65625
1187334            NaN
Name: distance_ltd, Length: 1185166, dtype: object

We'll need to fix commas before going further

##### Engine Coolant Temperature

In [43]:
faults_diagnostic['engine_coolant_temperature'].value_counts()

185      69413
183.2    60148
186.8    56234
181.4    40461
188.6    36754
179.6    25905
190.4    24036
177.8    16864
192.2    14203
174.2    12874
176      12197
172.4    10172
170.6     8488
194       7097
168.8     6848
167       5953
195.8     5240
165.2     4803
163.4     4519
197.6     4374
161.6     4032
199.4     3855
159.8     3469
156.2     3460
158       3225
154.4     3157
201.2     3052
150.8     2851
152.6     2824
86        2797
87.8      2773
203       2736
91.4      2706
95        2646
98.6      2624
96.8      2605
93.2      2588
84.2      2587
100.4     2581
89.6      2536
102.2     2514
80.6      2481
149       2444
120.2     2439
78.8      2427
82.4      2394
147.2     2352
77        2344
105.8     2339
109.4     2329
204.8     2317
104       2299
138.2     2289
114.8     2288
107.6     2287
116.6     2275
75.2      2273
118.4     2212
132.8     2209
145.4     2184
111.2     2175
136.4     2169
113       2160
134.6     2145
127.4     2118
143.6     2118
73.4      

There are a few negative values and also more commas

They can go to -36 which does not occur in the dataset. 

##### Engine Load

In [44]:
faults_diagnostic['engine_load'].value_counts()

0      90770
100    34976
12     25335
14     24209
16     22158
10     18771
17     18567
13     17329
11     15876
15     15855
19     14054
21     11516
18     10894
9       9689
23      9321
8       8731
20      8206
25      7598
26      7283
28      7184
22      7046
1       6839
30      6660
7       5710
24      5589
32      5006
27      4920
29      4814
99      4590
33      4516
35      4131
31      3953
37      3776
39      3654
41      3564
42      3448
44      3354
46      3308
48      3260
51      3071
50      3005
6       2961
34      2930
53      2900
58      2733
36      2725
55      2719
5       2715
57      2714
60      2543
62      2542
66      2461
40      2456
38      2451
64      2381
69      2314
45      2284
43      2253
67      2238
47      2197
49      2116
98      2104
73      2064
52      2047
71      2033
75      2022
78      1999
76      1978
54      1914
56      1911
3       1877
80      1863
59      1783
82      1772
61      1752
63      1685
83      1647

Nothing to clean

##### Engine Oil Pressure

In [45]:
faults_diagnostic['engine_oil_pressure'].value_counts()

0           43115
38.86       29287
38.28       28559
39.44       26958
37.7        26588
37.12       24910
36.54       23697
40.02       22957
35.96       21914
35.38       20106
40.6        18270
34.8        16546
41.18       15029
34.22       13391
41.76       12058
33.64       10613
42.34        9820
20.88        9518
21.46        9394
20.3         9131
22.04        8901
19.72        8281
22.62        8055
42.92        7896
23.2         7622
33.06        7619
19.14        7007
23.78        6896
43.5         6299
24.36        6100
24.94        5358
32.48        5307
18.56        5241
44.08        5165
25.52        4826
26.1         4520
44.66        4253
31.9         4212
26.68        4176
27.26        3945
45.24        3746
17.98        3665
30.16        3617
31.32        3583
30.74        3511
27.84        3497
29.58        3426
29           3366
28.42        3270
45.82        3119
17.4         2927
46.4         2668
46.98        2276
16.82        2135
47.56        1900
48.14     

Commas to replace

##### Engine Oil Temperature

In [46]:
faults_diagnostic['engine_oil_temperature'].value_counts()

215.9937     4114
217.4563     4047
216.725      4022
213.7437     3854
214.5313     3834
218.2437     3833
219.7063     3813
210.5375     3679
215.2625     3637
213.0125     3604
211.6625     3603
218.975      3591
212.2813     3589
211.1        3467
220.4375     3438
221.1687     3220
210.0313     2982
221.9        2969
209.4688     2935
208.9063     2762
215.825      2758
217.2875     2753
211.55       2749
207.7813     2668
216.5562     2600
219.5375     2596
212.1125     2566
215.0375     2554
208.3438     2538
207.2188     2508
206.6563     2506
222.6875     2504
212.8438     2501
214.3062     2416
210.9875     2406
213.575      2373
206.0938     2371
220.2688     2352
205.5313     2349
210.425      2335
218.75       2328
218.0188     2293
204.9688     2278
221          2259
204.4063     2246
209.8625     2227
224.15       2220
203.8438     2175
223.4187     2125
209.3        2117
203.2813     2096
202.775      2030
206.5437     1937
224.8813     1929
197.6        1865
208.7375  

Have some negative values and commas

##### Engine RPM

In [47]:
faults_diagnostic['engine_rpm'].value_counts()

0          45563
600.125     5367
599.75      5192
600.375     5145
599.875     5136
           ...  
174            1
181            1
497.625        1
198.875        1
413.5          1
Name: engine_rpm, Length: 11470, dtype: int64

In [48]:
faults_diagnostic[faults_diagnostic['engine_rpm'].str.contains(",", regex=False, na=False)]

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,fmi,active,active_transition_count,equipment_id,mct_number,latitude,longitude,location_timestamp,accelerator_pedal,barometric_pressure,cruise_control_active,cruise_control_set_speed,distance_ltd,engine_coolant_temperature,engine_load,engine_oil_pressure,engine_oil_temperature,engine_rpm,engine_time_ltd,fuel_level,fuel_ltd,fuel_rate,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
17519,18402,1823305,2015-04-10 14:31:59,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,38,305,105362919,34.459583,-84.919537,2015-04-10 15:02:32,100,14355,False,0,1130179,1868,99,7598,177575,157075,243565,616,1786793716715,2068474,86,True,1202,17407,False,,2248587,327675.0,100,3335
57330,59400,2517346,2015-05-19 20:48:51,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,63,305,105362919,34.402222,-84.919398,2015-05-19 21:21:34,100,14355,False,0,1270466,1868,100,7366,1801062,17255,274405,732,19992012551256,2073757,968,True,1238,17407,False,,244228,327675.0,100,3277
98748,101134,3180493,2015-06-27 05:46:01,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,86,305,105362919,34.628472,-84.868333,2015-06-27 06:02:21,512,142825,False,0,1409274,1832,55,667,1873062,139175,305635,80,22060875976494,9959317,1004,True,1328,17407,False,,2806365,327675.0,432,1247
109858,112244,3326811,2015-07-07 05:31:43,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,87,305,105362919,34.786064,-84.824027,2015-07-07 05:32:19,100,14355,False,0,1418729,185,100,7946,172175,162325,307865,288,22192829916468,2068474,842,True,104,17407,False,,230053,327675.0,100,3335
130411,132797,3623872,2015-07-22 22:17:35,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,95,305,105362919,34.803842,-85.012731,2015-07-22 22:34:36,168,14355,False,0,146218,1886,12,7192,1896687,14145,31741,496,22817332647396,2760607,104,True,131,17407,False,,1108759,327675.0,168,203
167370,169756,4093848,2015-08-17 09:36:38,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,106,305,105362919,33.39412,-84.786435,2015-08-17 09:37:13,0,142825,False,0,155290,1814,0,7192,1877,132325,33828,528,24105303486922,0,1256,True,131,17407,False,,1368473,327675.0,0,116
190360,192746,4441365,2015-09-04 04:46:23,Abnormal Update Rate Engine Instantaneous Fuel...,unknown,unknown,unknown,unknown,49,184,9,True,126,305,105362919,34.783564,-84.824074,2015-09-04 04:46:18,4,14355,False,0,1613705,1868,0,8468,1841,171775,352075,74,24998337108708,5283457,1022,True,1202,1279,False,,7204022,327675.0,2,1189
275289,279109,5795315,2015-11-16 03:04:21,,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,5396,1,True,78,305,105362919,35.004583,-85.223148,2015-11-16 03:15:12,100,144275,False,0,1809965,1778,100,8236,2080063,169675,396145,556,27914928648814,2098854,716,True,86,17407,False,,6300121,,100,29
277270,281776,5827502,2015-11-17 08:23:45,,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,5396,1,True,81,305,105362919,34.793888,-84.821203,2015-11-17 08:24:21,0,144275,False,0,1813523,1634,15,6206,1486062,6535,39691,88,27973310672306,1624663,644,True,698,17407,True,,0,,0,87
278386,282892,5845487,2015-11-18 07:58:38,,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,5396,1,True,85,305,105362919,34.703333,-84.820185,2015-11-18 08:18:48,0,14355,False,0,1819237,1796,15,551,1639063,6505,39805,872,28054279406244,1690706,77,True,896,17407,False,,0,,0,87


More commas to replace

##### Engine Time (LTD)

In [49]:
faults_diagnostic['engine_time_ltd'].value_counts()

0           584
8890.55     253
4307.4       82
4277.55      76
4422.35      71
           ... 
12052.95      1
1181.9        1
10182.2       1
1517.65       1
4727.45       1
Name: engine_time_ltd, Length: 201163, dtype: int64

In [50]:
faults_diagnostic[faults_diagnostic['engine_time_ltd'].str.contains(",", regex=False, na=False)]

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,fmi,active,active_transition_count,equipment_id,mct_number,latitude,longitude,location_timestamp,accelerator_pedal,barometric_pressure,cruise_control_active,cruise_control_set_speed,distance_ltd,engine_coolant_temperature,engine_load,engine_oil_pressure,engine_oil_temperature,engine_rpm,engine_time_ltd,fuel_level,fuel_ltd,fuel_rate,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
17519,18402,1823305,2015-04-10 14:31:59,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,38,305,105362919,34.459583,-84.919537,2015-04-10 15:02:32,100,14355,False,0,1130179,1868,99,7598,177575.0,157075,243565,616,1786793716715.0,2068474,86.0,True,1202,17407,False,,2248587.0,327675.0,100.0,3335
57330,59400,2517346,2015-05-19 20:48:51,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,63,305,105362919,34.402222,-84.919398,2015-05-19 21:21:34,100,14355,False,0,1270466,1868,100,7366,1801062.0,17255,274405,732,19992012551256.0,2073757,968.0,True,1238,17407,False,,244228.0,327675.0,100.0,3277
85617,88003,2964433,2015-06-15 15:01:09,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,82,305,105362919,34.588935,-84.667962,2015-06-15 15:32:39,48,14355,False,0,1364678,185,0,8062,1896687.0,1446,29569,92,21407314319846.0,0,1166.0,True,131,17407,False,,2892774.0,327675.0,68.0,1798
87186,89572,2990820,2015-06-16 22:43:01,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,82,305,105362919,35.146944,-86.580462,2015-06-16 22:43:37,14,144275,False,0,1372176,1832,18,5568,1972063.0,1074,29739,656,21515096517062.0,2998362,1256.0,True,1472,17407,False,,1103419.0,327675.0,14.0,145
98748,101134,3180493,2015-06-27 05:46:01,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,86,305,105362919,34.628472,-84.868333,2015-06-27 06:02:21,512,142825,False,0,1409274,1832,55,667,1873062.0,139175,305635,80,22060875976494.0,9959317,1004.0,True,1328,17407,False,,2806365.0,327675.0,432.0,1247
106027,108413,3274660,2015-07-02 12:00:22,High (Severity Medium) Engine Speed,unknown,unknown,unknown,unknown,49,190,16,True,126,305,105362919,34.247129,-84.471111,2015-07-02 12:06:31,992,1421,False,0,1412394,1832,100,6786,2105375.0,2063,30635,612,22102086816606.0,154277,1112.0,True,1364,1279,False,,1493718.0,327675.0,988.0,1566
109858,112244,3326811,2015-07-07 05:31:43,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,87,305,105362919,34.786064,-84.824027,2015-07-07 05:32:19,100,14355,False,0,1418729,185,100,7946,172175.0,162325,307865,288,22192829916468.0,2068474,842.0,True,104,17407,False,,230053.0,327675.0,100.0,3335
118105,120491,3443634,2015-07-13 14:24:52,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,88,305,105362919,34.845324,-85.017268,2015-07-13 14:49:38,996,142825,False,0,1439218,1868,99,6786,189275.0,1426,31248,568,224942502278.0,1993184,113.0,True,1328,17407,False,,2028194.0,327675.0,996.0,3045
130411,132797,3623872,2015-07-22 22:17:35,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,95,305,105362919,34.803842,-85.012731,2015-07-22 22:34:36,168,14355,False,0,146218,1886,12,7192,1896687.0,14145,31741,496,22817332647396.0,2760607,104.0,True,131,17407,False,,1108759.0,327675.0,168.0,203
143445,145831,3788565,2015-07-31 09:59:49,Special Instructions System Diagnostic Code #1,unknown,unknown,unknown,unknown,11,611,14,True,127,305,105362919,34.793518,-84.821157,2015-07-31 10:00:25,0,14355,False,0,1494298,185,0,174,,0,32491,436,,0,,True,113,1279,True,,,327675.0,,0


Replace commas 

##### Fuel Level

In [51]:
faults_diagnostic['fuel_level'].value_counts()

100         42171
60           6898
48           6729
50           6181
67.2         6175
43.2         5777
34.8         5522
53.2         5476
72.4         5426
60.4         5425
50.8         5304
54.4         5069
64           4977
54           4843
40           4763
65.2         4600
54.8         4418
39.2         4191
56.8         4170
61.2         4148
44.8         4144
48.4         4112
51.2         4046
57.2         3993
44.4         3965
50.4         3861
59.2         3761
52.8         3718
38.8         3708
58           3667
0            3520
56.4         3512
62.4         3453
64.4         3450
48.8         3432
44           3409
46.8         3328
72           3305
63.2         3276
49.2         3217
98.4         3190
78.4         3170
60.8         3146
56           3126
82           3110
42.4         3048
65.6         3037
47.2         3031
55.2         2989
46.4         2945
49.6         2885
70.4         2868
42.8         2842
58.4         2836
78           2814
64.8      

More commas to replace

##### Fuel LTD

In [52]:
faults_diagnostic['fuel_ltd'].value_counts()

0                  2923
52878.791304736     250
29786.191379156     109
0.132086026         103
29476.185476134      73
                   ... 
32721.407048928       1
47586.368414968       1
76258.282078788       1
76703.279900382       1
62127.058587178       1
Name: fuel_ltd, Length: 328682, dtype: int64

In [53]:
faults_diagnostic[faults_diagnostic['fuel_ltd'].str.contains(',', regex=False, na=False)]

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,fmi,active,active_transition_count,equipment_id,mct_number,latitude,longitude,location_timestamp,accelerator_pedal,barometric_pressure,cruise_control_active,cruise_control_set_speed,distance_ltd,engine_coolant_temperature,engine_load,engine_oil_pressure,engine_oil_temperature,engine_rpm,engine_time_ltd,fuel_level,fuel_ltd,fuel_rate,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,service_distance,speed,switched_battery_voltage,throttle,turbo_boost_pressure
17519,18402,1823305,2015-04-10 14:31:59,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,38,305,105362919,34.459583,-84.919537,2015-04-10 15:02:32,100,14355,False,0,1130179,1868,99,7598,177575,157075,243565,616,1786793716715,2068474,86,True,1202,17407,False,,2248587,327675.0,100,3335
57330,59400,2517346,2015-05-19 20:48:51,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,63,305,105362919,34.402222,-84.919398,2015-05-19 21:21:34,100,14355,False,0,1270466,1868,100,7366,1801062,17255,274405,732,19992012551256,2073757,968,True,1238,17407,False,,244228,327675.0,100,3277
85617,88003,2964433,2015-06-15 15:01:09,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,82,305,105362919,34.588935,-84.667962,2015-06-15 15:32:39,48,14355,False,0,1364678,185,0,8062,1896687,1446,29569,92,21407314319846,0,1166,True,131,17407,False,,2892774,327675.0,68,1798
87186,89572,2990820,2015-06-16 22:43:01,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,82,305,105362919,35.146944,-86.580462,2015-06-16 22:43:37,14,144275,False,0,1372176,1832,18,5568,1972063,1074,29739,656,21515096517062,2998362,1256,True,1472,17407,False,,1103419,327675.0,14,145
98748,101134,3180493,2015-06-27 05:46:01,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,86,305,105362919,34.628472,-84.868333,2015-06-27 06:02:21,512,142825,False,0,1409274,1832,55,667,1873062,139175,305635,80,22060875976494,9959317,1004,True,1328,17407,False,,2806365,327675.0,432,1247
106027,108413,3274660,2015-07-02 12:00:22,High (Severity Medium) Engine Speed,unknown,unknown,unknown,unknown,49,190,16,True,126,305,105362919,34.247129,-84.471111,2015-07-02 12:06:31,992,1421,False,0,1412394,1832,100,6786,2105375,2063,30635,612,22102086816606,154277,1112,True,1364,1279,False,,1493718,327675.0,988,1566
109858,112244,3326811,2015-07-07 05:31:43,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,87,305,105362919,34.786064,-84.824027,2015-07-07 05:32:19,100,14355,False,0,1418729,185,100,7946,172175,162325,307865,288,22192829916468,2068474,842,True,104,17407,False,,230053,327675.0,100,3335
118105,120491,3443634,2015-07-13 14:24:52,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,88,305,105362919,34.845324,-85.017268,2015-07-13 14:49:38,996,142825,False,0,1439218,1868,99,6786,189275,1426,31248,568,224942502278,1993184,113,True,1328,17407,False,,2028194,327675.0,996,3045
130411,132797,3623872,2015-07-22 22:17:35,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,95,305,105362919,34.803842,-85.012731,2015-07-22 22:34:36,168,14355,False,0,146218,1886,12,7192,1896687,14145,31741,496,22817332647396,2760607,104,True,131,17407,False,,1108759,327675.0,168,203
167370,169756,4093848,2015-08-17 09:36:38,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,106,305,105362919,33.39412,-84.786435,2015-08-17 09:37:13,0,142825,False,0,155290,1814,0,7192,1877,132325,33828,528,24105303486922,0,1256,True,131,17407,False,,1368473,327675.0,0,116


More commas

##### Fuel Rate

In [54]:
faults_diagnostic['fuel_rate'].value_counts()

0             97883
0.6075976      6154
0.7528927      5957
0.6340149      5885
0.6604322      5711
0.5679717      5633
0.6868495      5498
0.7132668      5481
0.5415544      5476
0.77931        5348
0.8057272      4396
0.594389       4291
0.5151371      4279
0.5811803      4217
0.7661014      4143
0.6208063      3922
0.6736408      3868
0.739684       3799
0.6472235      3745
0.5547631      3738
0.7000582      3646
0.8321446      3592
0.7264754      3467
0.7925186      3458
0.5283458      3306
0.8585618      3078
0.8849792      2789
0.8189359      2781
0.4887198      2683
0.5019284      2346
0.9246051      2341
0.9510224      2201
0.8453532      2192
1.003857       2188
0.9774396      2183
1.0699         2153
1.096317       2139
1.030274       2086
0.9113964      1959
0.8717705      1908
1.122735       1806
0.8981878      1748
1.149152       1745
0.964231       1705
0.4623025      1693
0.9378137      1673
1.175569       1644
1.215195       1588
0.9906483      1573
1.017066       1542


Commas to replace

##### Fuel Temperature

In [55]:
faults_diagnostic['fuel_temperature'].value_counts()

32       282519
183.2       661
181.4       566
185         449
114.8       393
120.2       363
129.2       360
123.8       353
111.2       350
127.4       350
179.6       349
105.8       346
102.2       336
116.6       326
118.4       320
122         319
96.8        318
109.4       312
98.6        304
100.4       302
125.6       294
93.2        290
89.6        287
113         286
95          285
91.4        278
107.6       275
84.2        269
131         255
80.6        255
104         250
186.8       244
87.8        244
86          241
77          223
132.8       213
82.4        213
134.6       205
78.8        201
136.4       187
75.2        179
73.4        167
140         156
138.2       151
68          149
141.8       149
66.2        144
188.6       143
71.6        139
69.8        137
177.8       136
64.4        111
143.6       104
62.6         99
145.4        98
150.8        97
147.2        94
60.8         92
59           91
46.4         88
149          84
190.4        74
192.2   

Commas

##### IGN Status

In [56]:
faults_diagnostic['ign_status'].value_counts(normalize=True)

True     0.99566
False    0.00434
Name: ign_status, dtype: float64

##### Intake Manifold Temperature

In [57]:
faults_diagnostic['intake_manifold_temperature'].value_counts()

120.2    16720
118.4    16414
102.2    15996
116.6    15896
114.8    15511
100.4    15212
113      14813
98.6     14607
111.2    14472
109.4    13974
96.8     13828
107.6    13494
125.6    13163
105.8    12981
95       12976
123.8    12860
127.4    12839
129.2    12812
122      12780
93.2     12745
131      12626
132.8    12515
104      12426
134.6    12270
136.4    12156
84.2     11825
91.4     11813
138.2    11733
89.6     11110
82.4     10759
87.8     10716
86        9927
80.6      9814
78.8      9100
77        8284
140       8244
141.8     7925
75.2      7640
143.6     7360
73.4      6787
145.4     6626
71.6      6285
66.2      6023
147.2     5866
69.8      5614
68        5333
64.4      5133
149       4753
62.6      4509
60.8      3980
150.8     3923
59        3419
152.6     2979
57.2      2930
55.4      2730
154.4     2342
53.6      2219
51.8      2080
48.2      1803
50        1783
156.2     1679
46.4      1499
44.6      1280
42.8      1185
41        1052
158        931
39.2      

Commas and negative values

##### Lamp Status

In [58]:
faults_diagnostic['lamp_status'].value_counts()

1023     352603
1279     314278
255      301225
17407     99876
18431     28314
2047      19602
2         11667
50175      9341
63487      8220
62463      7860
16639      7620
22527      5738
51199      4373
0          3664
21503      2758
65535      1981
2035       1470
6143       1301
5119        825
17663       627
4351        484
55295       391
9           345
5375        217
18419       216
20735        60
11           33
544          26
511          23
50431        10
16895         5
22515         4
28436         4
11801         2
617           1
6131          1
65523         1
Name: lamp_status, dtype: int64

Nothing to clean!

##### Parking Brake

In [59]:
faults_diagnostic['parking_brake'].value_counts(normalize=True)

False    0.667462
True     0.332538
Name: parking_brake, dtype: float64

30% of the dataset includes trucks that are parked. Will we want to use this?

##### Service Distance

In [60]:
faults_diagnostic['service_distance'].value_counts()

-10774.576171875     8
16500.51171875       4
-99813.9609375       3
-23487.83203125      3
-23770.5546875       2
17370.431640625      2
-13235.2060546875    2
-21334.779296875     2
37953.3515625        1
42203.53125          1
5635.8369140625      1
37043.04296875       1
-22506.064453125     1
39811.25390625       1
38546.76171875       1
826.423706054688     1
-23528.220703125     1
-9214.9345703125     1
-8093.35986328125    1
-20784.8671875       1
-17808.498046875     1
-23388.412109375     1
-16332.7421875       1
37350.62109375       1
38105.58984375       1
-11678.671875        1
-21766.6328125       1
-2951.51318359375    1
37002.65625          1
39668.3359375        1
40019.41015625       1
-21760.419921875     1
38456.6640625        1
37810.4375           1
4737.95556640625     1
-17308.294921875     1
-21107.978515625     1
-16255.0703125       1
-15742.439453125     1
-23646.28125         1
-22509.171875        1
-10215.3427734375    1
37213.921875         1
-21751.0976

There are very few records within this column (~400 out of a 1.18 million), so we're safe to drop it

In [61]:
faults_diagnostic = faults_diagnostic.drop(columns='service_distance')

##### Speed

In [62]:
faults_diagnostic['speed'].value_counts()

0              199216
0.009708925      1613
0.01941785       1025
66.88478          837
66.83624          795
                ...  
64.08133            1
73.28297            1
3.720945            1
77.09857            1
31.04429            1
Name: speed, Length: 13533, dtype: int64

In [63]:
faults_diagnostic[faults_diagnostic['speed'].str.contains(',', regex=False, na=False)]

Unnamed: 0,record_id,ess_id,event_timestamp,event_description,ecu_software_version,ecu_serial_number,ecu_model,ecu_make,ecu_source,spn,fmi,active,active_transition_count,equipment_id,mct_number,latitude,longitude,location_timestamp,accelerator_pedal,barometric_pressure,cruise_control_active,cruise_control_set_speed,distance_ltd,engine_coolant_temperature,engine_load,engine_oil_pressure,engine_oil_temperature,engine_rpm,engine_time_ltd,fuel_level,fuel_ltd,fuel_rate,fuel_temperature,ign_status,intake_manifold_temperature,lamp_status,parking_brake,speed,switched_battery_voltage,throttle,turbo_boost_pressure
17519,18402,1823305,2015-04-10 14:31:59,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,38,305,105362919,34.459583,-84.919537,2015-04-10 15:02:32,100,14355,False,0,1130179,1868,99,7598,177575,157075,243565,616,1786793716715,2068474,86,True,1202,17407,False,2248587,327675.0,100,3335
57330,59400,2517346,2015-05-19 20:48:51,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,63,305,105362919,34.402222,-84.919398,2015-05-19 21:21:34,100,14355,False,0,1270466,1868,100,7366,1801062,17255,274405,732,19992012551256,2073757,968,True,1238,17407,False,244228,327675.0,100,3277
85617,88003,2964433,2015-06-15 15:01:09,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,82,305,105362919,34.588935,-84.667962,2015-06-15 15:32:39,48,14355,False,0,1364678,185,0,8062,1896687,1446,29569,92,21407314319846,0,1166,True,131,17407,False,2892774,327675.0,68,1798
87186,89572,2990820,2015-06-16 22:43:01,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,82,305,105362919,35.146944,-86.580462,2015-06-16 22:43:37,14,144275,False,0,1372176,1832,18,5568,1972063,1074,29739,656,21515096517062,2998362,1256,True,1472,17407,False,1103419,327675.0,14,145
98748,101134,3180493,2015-06-27 05:46:01,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,86,305,105362919,34.628472,-84.868333,2015-06-27 06:02:21,512,142825,False,0,1409274,1832,55,667,1873062,139175,305635,80,22060875976494,9959317,1004,True,1328,17407,False,2806365,327675.0,432,1247
106027,108413,3274660,2015-07-02 12:00:22,High (Severity Medium) Engine Speed,unknown,unknown,unknown,unknown,49,190,16,True,126,305,105362919,34.247129,-84.471111,2015-07-02 12:06:31,992,1421,False,0,1412394,1832,100,6786,2105375,2063,30635,612,22102086816606,154277,1112,True,1364,1279,False,1493718,327675.0,988,1566
109858,112244,3326811,2015-07-07 05:31:43,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,87,305,105362919,34.786064,-84.824027,2015-07-07 05:32:19,100,14355,False,0,1418729,185,100,7946,172175,162325,307865,288,22192829916468,2068474,842,True,104,17407,False,230053,327675.0,100,3335
118105,120491,3443634,2015-07-13 14:24:52,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,88,305,105362919,34.845324,-85.017268,2015-07-13 14:49:38,996,142825,False,0,1439218,1868,99,6786,189275,1426,31248,568,224942502278,1993184,113,True,1328,17407,False,2028194,327675.0,996,3045
130411,132797,3623872,2015-07-22 22:17:35,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,95,305,105362919,34.803842,-85.012731,2015-07-22 22:34:36,168,14355,False,0,146218,1886,12,7192,1896687,14145,31741,496,22817332647396,2760607,104,True,131,17407,False,1108759,327675.0,168,203
167370,169756,4093848,2015-08-17 09:36:38,Data May Be Invalid Relative Speed; Rear Axle ...,PC4__1284P4C_2*,________Y048665,MX,PCAR,0,907,19,True,106,305,105362919,33.39412,-84.786435,2015-08-17 09:37:13,0,142825,False,0,155290,1814,0,7192,1877,132325,33828,528,24105303486922,0,1256,True,131,17407,False,1368473,327675.0,0,116


Commas to replace

##### Switched Battery Voltage

In [64]:
faults_diagnostic['switched_battery_voltage'].value_counts()

3276.75     102907
14            1375
13.95         1097
13.9           962
13.85          858
14.05          803
13.8           521
13.75          441
14.1           441
13.7           322
13.6           234
13.65          227
12.5           178
14.15          157
13.55          151
12.55          142
13.5           140
12.6           131
12.75          126
12.7           126
12.8           126
12.4           115
12.25          113
12.45          113
13.05          108
13.45          102
13.25          101
12.65          100
12.85          100
13.15           99
13.1            96
13.35           95
12.9            94
12.35           93
13.3            89
13.2            86
12.95           85
13              85
12.3            84
13.4            73
12.2            62
12.15           48
12              47
11.8            47
12.1            45
12.05           45
11.85           40
14.2            34
11.7            28
11.95           25
11.75           17
11.9            16
3276,75     

Commas to replace

##### Throttle

In [65]:
faults_diagnostic['throttle'].value_counts()

100         285394
0           113358
38.4           226
37.6           225
38.8           224
40.8           214
36.4           210
38             207
39.2           202
35.2           201
33.6           199
34.4           196
34.8           195
42.8           193
34             192
37.2           192
0.4            191
28.8           189
36             187
32.8           187
39.6           184
41.2           182
36.8           182
42             182
44             178
32.4           178
33.2           177
40.4           176
44.8           175
31.6           172
40             170
41.6           169
30             167
42.4           166
30.8           165
43.6           163
35.6           162
28.4           160
32             160
46.4           158
99.6           154
46             152
44.4           152
43.2           151
45.6           150
31.2           150
30.4           150
29.2           143
27.6           141
45.2           141
29.6           140
99.2           134
47.6        

Commas

##### Turbo Boost Pressure

In [66]:
faults_diagnostic['turbo_boost_pressure'].value_counts()

0.29     94718
0        66474
0.58     52541
0.87     29056
1.16     24923
1.45     22229
1.74     19455
2.03     16948
2.32     14241
2.61     11765
2.9      10017
3.19      8799
3.48      8026
3.77      7356
4.06      6582
4.35      6018
4.64      5406
4.93      4856
5.22      4507
5.51      4167
5.8       3904
6.09      3767
6.38      3608
6.67      3333
6.96      3277
7.25      3255
7.54      3035
7.83      2878
8.12      2837
8.41      2718
8.7       2661
9.28      2599
8.99      2583
9.57      2419
10.15     2325
9.86      2294
10.73     2265
25.52     2236
11.02     2201
10.44     2185
26.68     2184
26.1      2168
26.39     2166
25.81     2162
26.97     2117
25.23     2087
11.31     2085
12.47     2081
11.89     2061
27.26     2049
11.6      2018
12.18     2005
27.55     1995
24.94     1929
13.34     1917
12.76     1893
13.63     1873
24.65     1862
13.05     1844
27.84     1834
28.13     1820
24.36     1809
14.21     1745
13.92     1729
28.42     1699
14.5      1682
14.79     

Commas

## Replacing commas and changing to floats

In [67]:
comma_columns = [
    'accelerator_pedal',
    'barometric_pressure',
    'distance_ltd',
     'engine_coolant_temperature',
     'engine_oil_pressure',
     'engine_oil_temperature',
     'engine_rpm',
     'engine_time_ltd',
     'fuel_level',
     'fuel_ltd',
     'fuel_rate',
     'fuel_temperature',
     'intake_manifold_temperature',
     'speed',
     'switched_battery_voltage',
     'throttle',
     'turbo_boost_pressure']

for column in comma_columns:
    faults_diagnostic[column] = faults_diagnostic[column].str.replace(',', '.')
    faults_diagnostic[column] = faults_diagnostic[column].astype('float64')

## Use forward fill and backfill to reduce null values

In [68]:
def input_fill(df):
    return df.sort_values(by='event_timestamp').fillna(method='ffill').fillna(method='bfill')

In [69]:
faults_diagnostic = faults_diagnostic.groupby('equipment_id').apply(input_fill)

In [70]:
faults_diagnostic.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1185166 entries, ('1327', 4967) to ('R1764', 4999)
Data columns (total 41 columns):
 #   Column                       Non-Null Count    Dtype         
---  ------                       --------------    -----         
 0   record_id                    1185166 non-null  int64         
 1   ess_id                       1185166 non-null  int64         
 2   event_timestamp              1185166 non-null  datetime64[ns]
 3   event_description            1185166 non-null  object        
 4   ecu_software_version         1185003 non-null  object        
 5   ecu_serial_number            1185003 non-null  object        
 6   ecu_model                    1185003 non-null  object        
 7   ecu_make                     1185003 non-null  object        
 8   ecu_source                   1185166 non-null  int64         
 9   spn                          1185166 non-null  int64         
 10  fmi                          1185166 non-null  int64     

## Data Types

In [71]:
string_columns = [
    'record_id',
    'ess_id',
    'ecu_source',
    'spn',
    'fmi',
    'mct_number'
]

integers = [
    'engine_load'
]

bools = [
    'cruise_control_active',
    'ign_status',
    'parking_brake'
]

floats = [
    'cruise_control_set_speed',
    'service_distance'
]

In [72]:
for column in string_columns:
    faults_diagnostic[column] = faults_diagnostic[column].apply(lambda x: str(x))
    
for column in integers:
    faults_diagnostic[column] = pd.to_numeric(faults_diagnostic[column], errors='coerce')
    
for column in bools:
    faults_diagnostic[column] = faults_diagnostic[column].apply(lambda x: bool(x))
    
for column in floats:
    faults_diagnostic[column] = faults_diagnostic[column].apply(lambda x: float(x))

KeyError: 'service_distance'

In [None]:
faults_diagnostic.info()

#### Removing service station records using Lat/Long

In [None]:
service_centers = faults_diagnostic[ 
        ((faults_diagnostic['latitude'].between(36.05, 36.07, inclusive=True)) & (faults_diagnostic['longitude'].between(-86.44, -86.42, inclusive=True))) |
        ((faults_diagnostic['latitude'].between(35.57, 35.59, inclusive=True)) & (faults_diagnostic['longitude'].between(-86.45, -86.43, inclusive=True))) |
        ((faults_diagnostic['latitude'].between(36.18, 36.21, inclusive=True)) & (faults_diagnostic['longitude'].between(-83.18, -83.16, inclusive=True)))
        ].index

In [None]:
faults_diagnostic = faults_diagnostic.drop(service_centers, axis=0)

In [None]:
faults_diagnostic.shape

#### Ungrouping by Equipment ID

This will ensure our cleaned dataset will match our unseen data

In [None]:
#equipment_id became and index and the column was retained. We drop it first to avoid an error
#after resetting the index, a 'level_1' column is created, so that is dropped as well

faults_diagnostic = faults_diagnostic.drop(columns =['equipment_id']).reset_index().drop(columns=['level_1'])

#### Dropping Active = False

In [None]:
faults_diagnostic = faults_diagnostic[faults_diagnostic['active'] == True]

## Creating Target Column

In [None]:
faults_diagnostic['5246_derate'] = faults_diagnostic['spn'].apply(lambda x: 'True' if x == '5246' else 'False') 

In [None]:
faults_diagnostic[faults_diagnostic['5246_derate'] == 'True']

In [None]:
faults_diagnostic['1569_derate'] = faults_diagnostic['spn'].apply(lambda x: 'True' if x == '1569' else 'False')

In [None]:
faults_diagnostic[faults_diagnostic['1569_derate'] == 'True']

## Export as CSV

In [None]:
faults_diagnostic.to_csv('../data/cleaned_faults_diagnostic.csv', index=False)