## Imports

In [1]:
import pandas as pd
import numpy as np

## Reading in Data

In [2]:
faults = pd.read_csv('../data/J1939Faults.csv', 
                     low_memory=False, 
                     parse_dates=[2, 19],
                     infer_datetime_format=True)

In [3]:
faults.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1187335 entries, 0 to 1187334
Data columns (total 20 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   RecordID               1187335 non-null  int64         
 1   ESS_Id                 1187335 non-null  int64         
 2   EventTimeStamp         1187335 non-null  datetime64[ns]
 3   eventDescription       1126490 non-null  object        
 4   actionDescription      0 non-null        float64       
 5   ecuSoftwareVersion     891285 non-null   object        
 6   ecuSerialNumber        844318 non-null   object        
 7   ecuModel               1122577 non-null  object        
 8   ecuMake                1122577 non-null  object        
 9   ecuSource              1187335 non-null  int64         
 10  spn                    1187335 non-null  int64         
 11  fmi                    1187335 non-null  int64         
 12  active                 11873

## Changing data types

In [4]:
#dropping columns with all null values

faults = faults.drop(columns=['actionDescription', 'faultValue'])

In [5]:
# changing certain column to strings

#str_columns = ['ESS_Id',  
               #'spn', 
               #'fmi', 
               #'EquipmentID', 
               #'MCTNumber'
              #]

#for column in str_columns:
    #faults[column] = faults[column].astype(str)

## Reviewing columns values

In [6]:
faults.RecordID.value_counts()

2047      1
983222    1
620665    1
616571    1
610428    1
         ..
653204    1
655253    1
649110    1
651159    1
2049      1
Name: RecordID, Length: 1187335, dtype: int64

In [7]:
faults.ESS_Id.value_counts().head(200)

1001059      8
1000422      8
1000318      8
1000086      8
1000978      8
            ..
6956559      2
69304349     2
109204155    2
6960261      2
109165718    2
Name: ESS_Id, Length: 200, dtype: int64

In [8]:
faults[faults['ESS_Id'] == '1000939']

  res_values = method(rvalues)


Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp


In [9]:
faults[faults['ESS_Id'] == '1000603']

  res_values = method(rvalues)


Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp


In [10]:
faults.eventDescription.value_counts().head(35)

Low (Severity Low) Engine Coolant Level                                                      320244
Abnormal Update Rate Tire Location                                                           256541
High Voltage (Fuel Level)                                                                     82818
High Voltage (Left Fuel Level Sensor)                                                         81754
Low (Severity Medium) Engine Coolant Level                                                    38866
Incorrect Data J1939 Network #1 Primary Vehicle Network ( previously SAE J1939 Data Link)     37086
Condition Exists Cruise Control Enable Switch                                                 22570
High (Severity Low) Water In Fuel Indicator                                                   17681
Condition Exists Engine Protection Torque Derate                                              10927
High Voltage (Water In Fuel Indicator)                                                         8628


In [11]:
faults.ecuSoftwareVersion.value_counts()

unknown                                                                                                        298549
BB41103*   BB41104*                                                                                             45381
P30-1011-124*1*                                                                                                 22068
PC4__1284P4C_2*                                                                                                 15082
AAAI000031*AAAM000036*BB41259   *A82J140612A_9203usadv   *AAAC000032*BB41276     *A8XL140606B_Bendix      *     11939
                                                                                                                ...  
????7106*04029075*092613211021*09300006*G1*BDR*                                                                     1
NDWS*EC80ESP *5N35150963**259   *A82J140612A_9203usadv   *AAAC000032*BB41276     *A8XL140606B_Bendix      *         1
????0170*03015995*121316112755*09401371*G1*BDR*         

In [12]:
faults.ecuSerialNumber.value_counts()

unknown     298549
6U13D13      11207
79845785     10302
79856768      8158
79845329      7199
             ...  
Z0036833         1
80199597         1
80239683         1
79979992         1
80205220         1
Name: ecuSerialNumber, Length: 1989, dtype: int64

In [13]:
faults.ecuModel.value_counts()

unknown                 298549
CECU3B-NAMUX4           277919
6X1u10D1500000000       216230
6X1u13D1500000000       203685
EC60-adv                 48816
EC80ESP                  22202
MX                       16362
6X1u17D1500000000        14499
EEO-xxF112C               8131
0USA13_13_0415_2238A      7252
FAOM-xx810S-EC3           4464
MX16U13D13                3629
MX16U15D13                 391
CE                         297
EC80ESP+                    42
6L u13D0890000000           28
Gen 4 Boot Loader           21
CECU3-NAMUX3                20
EC80ESP AM000036             7
6X1u20D1500000000            6
Y049568                      5
6U13D13                      4
Y044053                      4
E0031                        4
20412511P07                  4
________Y043718              2
EC80ESP AM000038             2
0USA10_13_0405_2237A         1
202.35.0                     1
Name: ecuModel, dtype: int64

In [14]:
faults.ecuMake.value_counts()

CMMNS              433403
unknown            298549
PACCR              277021
BNDWS               71001
PCAR                20229
EATON               12612
VOLVO                7252
?????                 755
????S                 627
????R                 589
?MMNS                 289
?CAR                  152
?ACCR                  39
???CR                  20
?NDWS                  15
?????MX16U13D13         9
?????MX                 6
?ATON                   3
??MNS                   3
???R                    1
5516014                 1
??DWS                   1
Name: ecuMake, dtype: int64

In [15]:
faults.ecuSource.value_counts()

0     528044
49    514059
11    131122
3      13484
61       626
Name: ecuSource, dtype: int64

In [16]:
faults.spn.value_counts()

111      365489
929      256541
96        90041
829       87788
639       41062
          ...  
6327          1
677           1
56503         1
37265         1
65287         1
Name: spn, Length: 450, dtype: int64

In [17]:
faults.fmi.value_counts()

17    326553
9     288893
3     188631
2      82334
18     53602
31     39881
4      39734
0      27577
7      23249
1      20887
15     20739
14     12699
12     12233
5       9683
16      9242
10      7437
8       5741
19      5733
20      4440
11      3847
13      2634
6       1027
23       396
21       131
29         8
22         4
Name: fmi, dtype: int64

In [18]:
faults.active.value_counts()

True     608454
False    578881
Name: active, dtype: int64

In [19]:
faults.activeTransitionCount.value_counts()

126    581874
1      305476
127    102512
2       45075
3       13019
        ...  
120       371
123       363
124       362
107       361
118       338
Name: activeTransitionCount, Length: 128, dtype: int64

In [20]:
faults.EquipmentID.value_counts()

1641         17492
1605         16393
1646         15462
1618         14986
1606         14973
             ...  
105322231        1
105427271        1
105340140        1
105355660        1
105349493        1
Name: EquipmentID, Length: 1122, dtype: int64

In [21]:
faults.MCTNumber.value_counts()

105415080    16503
105420184    15507
105381514    15330
105415457    15119
105416377    15034
             ...  
105329945        2
108608400        2
108605642        2
108609466        1
108616776        1
Name: MCTNumber, Length: 768, dtype: int64

#### Definitely some null values we'll need to fix (some are listed as 'nan' and others as 'unknown'). A lot of these columns are pretty messy and they don't necessarily appear to contain information that will prove to be valuable. We may not clean all as they may be dropped when considering feature selections.

## Faults Cleaning

In [22]:
faults['ecuSoftwareVersion'] = faults['ecuSoftwareVersion'].replace('unknown', np.nan)

In [23]:
faults['ecuSerialNumber'] = faults['ecuSerialNumber'].replace('unknown', np.nan)

In [24]:
faults['ecuModel'] = faults['ecuModel'].replace('unknown', np.nan)

In [25]:
faults['ecuMake'] = faults['ecuMake'].replace('unknown', np.nan)
faults['ecuMake'] = faults['ecuMake'].replace('?????', np.nan)
faults['ecuMake'] = faults['ecuMake'].replace('????R', 'PACCR')
faults['ecuMake'] = faults['ecuMake'].replace('?MMNS', 'CMMNS')
faults['ecuMake'] = faults['ecuMake'].replace('?CAR', 'PCAR')
faults['ecuMake'] = faults['ecuMake'].replace('?ACCR', 'PACCR')
faults['ecuMake'] = faults['ecuMake'].replace('???CR', 'PACCR')
faults['ecuMake'] = faults['ecuMake'].replace('?NDWS', 'BNDWS')
faults['ecuMake'] = faults['ecuMake'].replace('?????MX16U13D13', 'MX')
faults['ecuMake'] = faults['ecuMake'].replace('?????MX', 'MX')
faults['ecuMake'] = faults['ecuMake'].replace('?ATON', 'EATON')
faults['ecuMake'] = faults['ecuMake'].replace('??MNS', 'CMMNS')
faults['ecuMake'] = faults['ecuMake'].replace('???R', 'PCAR')
faults['ecuMake'] = faults['ecuMake'].replace('??DWS', 'BNDWS')

In [26]:
faults['ecuMake'].value_counts()

CMMNS      433695
PACCR      277669
BNDWS       71017
PCAR        20382
EATON       12615
VOLVO        7252
????S         627
MX             15
5516014         1
Name: ecuMake, dtype: int64

## Diagnostic Data

In [27]:
diagnostic = pd.read_csv('../data/VehicleDiagnosticOnboardData.csv')

In [28]:
diagnostic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12821626 entries, 0 to 12821625
Data columns (total 4 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   Id       int64 
 1   Name     object
 2   Value    object
 3   FaultId  int64 
dtypes: int64(2), object(2)
memory usage: 391.3+ MB


In [29]:
diagnostic.head()

Unnamed: 0,Id,Name,Value,FaultId
0,1,IgnStatus,False,1
1,2,EngineOilPressure,0,1
2,3,EngineOilTemperature,96.74375,1
3,4,TurboBoostPressure,0,1
4,5,EngineLoad,11,1


#### This is in long format and we'll want to pivot it to wide format. The 'FaultId' column in the Diagnostics dataframe will be used to join to the Fault dataframe where 'FaultId' = 'RecordId'

In [30]:
diagnostic = diagnostic.pivot(index='FaultId', columns='Name', values='Value').reset_index()

In [31]:
diagnostic.head()

Name,FaultId,AcceleratorPedal,BarometricPressure,CruiseControlActive,CruiseControlSetSpeed,DistanceLtd,EngineCoolantTemperature,EngineLoad,EngineOilPressure,EngineOilTemperature,...,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,ParkingBrake,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure
0,1,0.0,14.21,False,66.48672,423178.7,100.4,11.0,0.0,96.74375,...,,False,78.8,1023,True,,0.0,3276.75,,0.0
1,2,,,,,,,,,,...,,True,,1279,,,,,,
2,3,,,,,,,,,,...,,,,1279,,,,,,
3,4,,,,,,,,,,...,,True,,1279,,,,,,
4,5,,,,,,,,,,...,,,,16639,,,,,,


In [32]:
diagnostic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1187335 entries, 0 to 1187334
Data columns (total 25 columns):
 #   Column                     Non-Null Count    Dtype 
---  ------                     --------------    ----- 
 0   FaultId                    1187335 non-null  int64 
 1   AcceleratorPedal           531889 non-null   object
 2   BarometricPressure         585976 non-null   object
 3   CruiseControlActive        574916 non-null   object
 4   CruiseControlSetSpeed      576458 non-null   object
 5   DistanceLtd                585819 non-null   object
 6   EngineCoolantTemperature   586071 non-null   object
 7   EngineLoad                 585621 non-null   object
 8   EngineOilPressure          586244 non-null   object
 9   EngineOilTemperature       583912 non-null   object
 10  EngineRpm                  586921 non-null   object
 11  EngineTimeLtd              581366 non-null   object
 12  FuelLevel                  502795 non-null   object
 13  FuelLtd                    

## Changing Data Types

In [33]:
diagnostic['AcceleratorPedal'].value_counts(ascending=True).head(10)

4,8        1
51,2       1
14,4       1
16,8       1
99,2       1
4,4        1
20,4       1
99,6       1
2,8        1
101.6    122
Name: AcceleratorPedal, dtype: int64

In [34]:
diagnostic[diagnostic['AcceleratorPedal'].str.contains(',', na=False)]

Name,FaultId,AcceleratorPedal,BarometricPressure,CruiseControlActive,CruiseControlSetSpeed,DistanceLtd,EngineCoolantTemperature,EngineLoad,EngineOilPressure,EngineOilTemperature,...,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,ParkingBrake,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure
85617,88003,48,14355,False,0,1364678,185,0,8062,1896687,...,1166,True,131,17407,False,,2892774,327675.0,68,1798
98748,101134,512,142825,False,0,1409274,1832,55,667,1873062,...,1004,True,1328,17407,False,,2806365,327675.0,432,1247
106027,108413,992,1421,False,0,1412394,1832,100,6786,2105375,...,1112,True,1364,1279,False,,1493718,327675.0,988,1566
118105,120491,996,142825,False,0,1439218,1868,99,6786,189275,...,113,True,1328,17407,False,,2028194,327675.0,996,3045
130411,132797,168,14355,False,0,146218,1886,12,7192,1896687,...,104,True,131,17407,False,,1108759,327675.0,168,203
282142,286648,144,144275,False,0,1831614,1562,17,8932,1457375,...,644,True,662,1279,False,,2330142,,144,145
384770,392271,44,14355,False,0,7249709,1814,10,4176,1727375,...,86,True,95,1279,False,,4553486,,4,174
461898,471015,204,14355,False,0,9316862,176,36,522,1657063,...,896,True,932,1279,False,,4835044,,204,203
533850,545727,28,142825,False,0,119005,185,0,4988,2111,...,1148,True,986,1279,False,,6447697,,24,2059


#### Looks like we have several columns that need to be integers that have comma issues. We can address those and data types all at once.

In [35]:
to_float_columns = [
    "AcceleratorPedal", 
    "BarometricPressure",
    "CruiseControlSetSpeed",
    "DistanceLtd",
    "EngineCoolantTemperature",
    "EngineLoad",
    "EngineOilPressure",
    "EngineOilTemperature",
    "EngineRpm",
    "EngineTimeLtd",
    "FuelLevel",
    "FuelLtd",
    "FuelRate",
    "FuelTemperature",
    "IntakeManifoldTemperature",
    "ServiceDistance",
    "Speed",
    "SwitchedBatteryVoltage",
    "Throttle",
    "TurboBoostPressure"
]

for col in to_float_columns:
    diagnostic[col] = diagnostic[col].str.replace(',', '.')
    diagnostic[col] = diagnostic[col].astype("float64")

#### The remaining columns are booleans. We'll change those now

In [36]:
to_bool_columns = [
    'CruiseControlActive',
    'IgnStatus',
    'ParkingBrake'
]

for col in to_bool_columns:
    diagnostic[col] = diagnostic[col].astype('bool')

In [37]:
diagnostic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1187335 entries, 0 to 1187334
Data columns (total 25 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   FaultId                    1187335 non-null  int64  
 1   AcceleratorPedal           531889 non-null   float64
 2   BarometricPressure         585976 non-null   float64
 3   CruiseControlActive        1187335 non-null  bool   
 4   CruiseControlSetSpeed      576458 non-null   float64
 5   DistanceLtd                585819 non-null   float64
 6   EngineCoolantTemperature   586071 non-null   float64
 7   EngineLoad                 585621 non-null   float64
 8   EngineOilPressure          586244 non-null   float64
 9   EngineOilTemperature       583912 non-null   float64
 10  EngineRpm                  586921 non-null   float64
 11  EngineTimeLtd              581366 non-null   float64
 12  FuelLevel                  502795 non-null   float64
 13  FuelLtd     

In [38]:
diagnostic.head()

Name,FaultId,AcceleratorPedal,BarometricPressure,CruiseControlActive,CruiseControlSetSpeed,DistanceLtd,EngineCoolantTemperature,EngineLoad,EngineOilPressure,EngineOilTemperature,...,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,ParkingBrake,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure
0,1,0.0,14.21,True,66.48672,423178.7,100.4,11.0,0.0,96.74375,...,,True,78.8,1023,True,,0.0,3276.75,,0.0
1,2,,,True,,,,,,,...,,True,,1279,True,,,,,
2,3,,,True,,,,,,,...,,True,,1279,True,,,,,
3,4,,,True,,,,,,,...,,True,,1279,True,,,,,
4,5,,,True,,,,,,,...,,True,,16639,True,,,,,


## Joining Fault and Diagnostic data

In [39]:
faults.head()

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp
0,1,990349,2015-02-21 10:47:13,Low (Severity Low) Engine Coolant Level,,,,,0,111,17,True,2,1439,105354361,38.857638,-84.626851,2015-02-21 11:34:25
1,2,990360,2015-02-21 11:34:34,,,,,,11,629,12,True,127,1439,105354361,38.857638,-84.626851,2015-02-21 11:35:10
2,3,990364,2015-02-21 11:35:31,Incorrect Data Steering Wheel Angle,,,,,11,1807,2,False,127,1369,105336226,41.42125,-87.767361,2015-02-21 11:35:26
3,4,990370,2015-02-21 11:35:33,Incorrect Data Steering Wheel Angle,,,,,11,1807,2,True,127,1369,105336226,41.421018,-87.767361,2015-02-21 11:36:08
4,5,990416,2015-02-21 11:39:41,,22281684P01*22357957P01*22362082P01*,13063430.0,0USA13_13_0415_2238A,VOLVO,0,4364,17,False,2,1674,105427130,38.416481,-89.442638,2015-02-21 11:39:37


In [40]:
diagnostic.head()

Name,FaultId,AcceleratorPedal,BarometricPressure,CruiseControlActive,CruiseControlSetSpeed,DistanceLtd,EngineCoolantTemperature,EngineLoad,EngineOilPressure,EngineOilTemperature,...,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,ParkingBrake,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure
0,1,0.0,14.21,True,66.48672,423178.7,100.4,11.0,0.0,96.74375,...,,True,78.8,1023,True,,0.0,3276.75,,0.0
1,2,,,True,,,,,,,...,,True,,1279,True,,,,,
2,3,,,True,,,,,,,...,,True,,1279,True,,,,,
3,4,,,True,,,,,,,...,,True,,1279,True,,,,,
4,5,,,True,,,,,,,...,,True,,16639,True,,,,,


In [41]:
fault_diagnostic = faults.merge(diagnostic, left_on='RecordID', right_on='FaultId').drop('FaultId', axis=1)

In [42]:
fault_diagnostic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1187335 entries, 0 to 1187334
Data columns (total 42 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   RecordID                   1187335 non-null  int64         
 1   ESS_Id                     1187335 non-null  int64         
 2   EventTimeStamp             1187335 non-null  datetime64[ns]
 3   eventDescription           1126490 non-null  object        
 4   ecuSoftwareVersion         592736 non-null   object        
 5   ecuSerialNumber            545769 non-null   object        
 6   ecuModel                   824028 non-null   object        
 7   ecuMake                    823273 non-null   object        
 8   ecuSource                  1187335 non-null  int64         
 9   spn                        1187335 non-null  int64         
 10  fmi                        1187335 non-null  int64         
 11  active                     1187335 no

## Dealing with Nulls

In [44]:
def input_fill(df):
    return df.sort_values(by='EventTimeStamp').fillna(method='ffill').fillna(method='bfill')

In [46]:
fault_diagnostic = fault_diagnostic.groupby('EquipmentID').apply(input_fill)

In [47]:
fault_diagnostic.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1187335 entries, ('0105309016', 1008925) to ('R1764', 4999)
Data columns (total 42 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   RecordID                   1187335 non-null  int64         
 1   ESS_Id                     1187335 non-null  int64         
 2   EventTimeStamp             1187335 non-null  datetime64[ns]
 3   eventDescription           1187330 non-null  object        
 4   ecuSoftwareVersion         1179010 non-null  object        
 5   ecuSerialNumber            1179014 non-null  object        
 6   ecuModel                   1179023 non-null  object        
 7   ecuMake                    1179023 non-null  object        
 8   ecuSource                  1187335 non-null  int64         
 9   spn                        1187335 non-null  int64         
 10  fmi                        1187335 non-null  int64         
 11  active 