In [2]:
from __future__ import print_function, division
import time, os
import numpy as np
import matplotlib.pyplot as plt
import sys
import networkx as nx
import pandas

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [8]:
repairs = pandas.read_csv('/home/cs231n/data/repairs.csv')

In [5]:
# 6/14/2016 20:17:07
s_to_date = lambda x: pandas.to_datetime(x, format='%m/%d/%Y %H:%M:%S')
snapshots_df = pandas.read_csv('/home/cs231n/data/Field_Snaps_With_warranty.txt', converters={'Event DateTime':s_to_date})

In [8]:
# 2/3/2016
r_to_date = lambda x: pandas.to_datetime(x, format='%m/%d/%Y')
repairs_df = pandas.read_csv('/home/cs231n/data/repairs.csv', converters={'Rpr_Dt':r_to_date})

In [25]:
selected_repairs = repairs_df[[
 'Chassis\nReference\nNumber',
 'Model Vehicle',
 'Build_Dt',
 'Dlvry_Dt',
 'In Service Date',
 'Miles',
 'Rpr_Dt',
 'ATA3',
 'ATA3Desc',
 'ATA6',
 'ATA6Desc',
 'ATA9',
 'ATA9Desc',
 'Fail Type',
 'Repair Cost']]
selected_repairs = selected_repairs[selected_repairs['Chassis\nReference\nNumber'].notnull()]

In [27]:
high_repairs = selected_repairs[selected_repairs['Repair Cost'].isin(['medium', 'high', 'very high'])]#.sort_values(by='Rpr_Dt')
veh_ids = high_repairs['Chassis\nReference\nNumber'].unique()

In [32]:
selected_repairs = high_repairs['Chassis\nReference\nNumber'].isin(veh_ids)

In [28]:
selected_snapshots = snapshots_df['Veh Ref ID'].isin(veh_ids)
selected_snapshots = snapshots_df[['Veh Ref ID',
 'Event DateTime',
 'Event Type Description',
 'Acc Pedal Position',
 'Ambient Air Temp',
 'Barometric Press',
 'Brake Switch',
 'Bus Utilization',
 'Cat Intake Gas Temp',
 'Cat Outlet Gas Temp',
 'Clutch Switch',
 'Cmd Eng Fuel Press',
 'Cruise Status',
 'Dpf Regen Inhibit Sw',
 'Dpf Thermal Mngmnt',
 'Drvr Demand Torque',
 'Eng Air Flow Rate',
 'Eng Avg Fuel Econ',
 'Eng Coolant Level',
 'Eng Coolant Temp',
 'Eng Demand Torque',
 'Eng DPF Intake Press',
 'Eng Egr Valve Pos',
 'Eng Exhaust Gas Temp',
 'Eng Fuel Del Press',
 'EngFuelTemp1',
 'Engine Speed',
 'Eng Man Abs Pressure',
 'Eng Oil Pressure',
 'EngInjRail1Press',
 'EngIntakeMan1Temp',
 'EngOilTemp1',
 'Eng Percent Torque',
 'EngTurbo1Boost',
 'EngTurbo1Pos',
 'EngTurbo1Speed',
 'Event - All Lamps On Time Hr',
 'Event - Amber Lamp Time Hr',
 'Event - Mil Lamp Time Hr',
 'Event - Red Lamp Time Hr',
 'Exhaust Tank Level',
 'Exhaust Tank Temp',
 'Fan Speed',
 'Keyswitch Bat Pot',
 'Part Trap Diff Press',
 'Part Trap Out Temp',
 'Scr Intake Gas Temp',
 'Scr Outlet Gas Temp',
 'Vehicle Speed',
 'Population',
 'DTCID',
 'Trip Distance',
 'Trip Idle Time',
 'Trip Run Time',
 'Altitude',
 'Engine Start Ambient',
 'Engine Start Coolant',
 'Ignition Cycle Counter',
 'Latitude',
 'Longitude',
 'Lifetime Idle Hours',
 'Lifetime Idle Fuel',
 'Lifetime Fuel',
 'Lifetime Distance',
 'Lifetime Engine Hours']]

In [35]:
def get_012_dates(end_date):
    ## split into 0 (10+), 1 (5-10), 2(0-5)
    two_end = end_date
    one_end = pandas.to_datetime(two_end) + pandas.DateOffset(days=-5)
    zero_end = one_end + pandas.DateOffset(days=-5)
    return (zero_end, one_end, two_end)

In [36]:
def get_repair_slices(veh_ids, snapshots, repairs, code='ATA9'):
    repair_slices = {}
    for veh_id in veh_ids:
        v_snapshots = snapshots[snapshots['Veh Ref ID'] == veh_id].sort_values(by='Event DateTime')
        v_repairs = repairs[repairs['Chassis\nReference\nNumber'] == veh_id].sort_values(by='Rpr_Dt')

        start_date = pandas.to_datetime('1/1/2000') ## in past so first snapshot is captured
        event_dt_key = 'Event DateTime'

        repair_slices[veh_id] = {}
        veh_slices = repair_slices[veh_id]

        ## Best indicator of repair type is the ATA9 code
        ## Iterate over each repair type and append slices
        for repair_type, repair_group in veh_repairs.groupby([code]):
            start = start_date
            end = -1

            ## dates of all of the repairs in that group
            repair_dates = repair_group.sort_values(by='Rpr_Dt')['Rpr_Dt']
            r_size = len(repair_dates)

            ## for each repair type, grab slices of snapshots
            veh_slices_repair = {0:[],1:[],2:[]}
            for repair in repair_group.itertuples():
                end = repair[7] ##['Rpr_Dt']

                (zero_end, one_end, two_end) = get_012_dates(end)

                ## grab a slice of snapshots from end of last repair to start of current repair
                #mask = (veh_snapshots['Event DateTime'] >= start) & (veh_snapshots['Event DateTime'] < end)
                #snapshot_slice = v_snapshots.loc[mask]
                
                # divide up snapshots into 
                two_mask = (v_snapshots[event_dt_key] > one_end) & (v_snapshots[event_dt_key] <= end)
                print(repair_type, start, end, zero_end, one_end, two_end)
                one_mask = (v_snapshots[event_dt_key] > zero_end) & (v_snapshots[event_dt_key] <= one_end)
                zero_mask = (v_snapshots[event_dt_key] >= start) & (v_snapshots[event_dt_key] <= zero_end)
                two_slices = v_snapshots.loc[two_mask]
                one_slices = v_snapshots.loc[one_mask]
                zero_slices = v_snapshots.loc[zero_mask]

                if len(two_slices) > 0:
                    veh_slices_repair[2].append(two_slices)
                if len(one_slices) > 0:
                    veh_slices_repair[1].append(one_slices)
                if len(zero_slices) > 0:
                    veh_slices_repair[0].append(zero_slices)

                ## reset start to end for next iteration
                start = end
            
            if len(veh_slices_repair[0]) > 0 or len(veh_slices_repair[1]) > 0 or len(veh_slices_repair[2]) > 0:
                veh_slices[repair_type] = veh_slices_repair

    return repair_slices

In [30]:
veh_snapshots = selected_snapshots[selected_snapshots['Veh Ref ID'] == 797].sort_values(by='Event DateTime')
veh_repairs = selected_repairs[selected_repairs['Chassis\nReference\nNumber'] == 797]

In [37]:
repair_slices = get_repair_slices([797], veh_snapshots, veh_repairs)

43006007 2000-01-01 00:00:00 2016-07-18 00:00:00 2016-07-08 00:00:00 2016-07-13 00:00:00 2016-07-18 00:00:00
44004001 2000-01-01 00:00:00 2016-05-27 00:00:00 2016-05-17 00:00:00 2016-05-22 00:00:00 2016-05-27 00:00:00
45021003 2000-01-01 00:00:00 2016-06-23 00:00:00 2016-06-13 00:00:00 2016-06-18 00:00:00 2016-06-23 00:00:00
101001001 2000-01-01 00:00:00 2016-06-13 00:00:00 2016-06-03 00:00:00 2016-06-08 00:00:00 2016-06-13 00:00:00


In [44]:
print(len(repair_slices[797][45021003][0][0]))
print(len(repair_slices[797][45021003][1][0]))
print(len(repair_slices[797][45021003][2][0]))

70
56
34


In [None]:
repair_slices_all = get_repair_slices(veh_ids, selected_snapshots, selected_repairs)

In [9]:
repairs.columns

Index(['Dealer Cd', 'Chassis Division', 'Chassis\nReference\nNumber',
       'Plant Name', 'Model Vehicle', 'Build_Dt', 'Dlvry_Dt',
       'In Service Date', 'Miles', 'Rpr_Dt', 'ATA3', 'ATA3Desc', 'ATA6',
       'ATA6Desc', 'ATA9', 'ATA9Desc', 'Fail Type', 'Repair Cost', 'Dlr Story',
       'Chassis Build Month', 'Engine Build Month', 'Engine Manufacture Date',
       'Engine Displacement Liter Quantity', 'Engine Horsepower Number'],
      dtype='object')

In [10]:
ex_w = pandas.read_csv('/home/cs231n/data/Field_Snaps_With_warranty.txt')

In [13]:
print(min(repairs['Chassis\nReference\nNumber'].unique()), max(repairs['Chassis\nReference\nNumber'].unique()))

1.0 853.0


In [14]:
print(min(ex_nw['Veh Ref ID'].unique()), max(ex_nw['Veh Ref ID'].unique()))

NameError: name 'ex_nw' is not defined

In [15]:
print(min(ex_w['Veh Ref ID'].unique()), max(ex_w['Veh Ref ID'].unique()))

1.0 813.0


In [16]:
selected_repairs = repairs[[
 'Chassis\nReference\nNumber',
 'Model Vehicle',
 'Build_Dt',
 'Dlvry_Dt',
 'In Service Date',
 'Miles',
 'Rpr_Dt',
 'ATA3',
 'ATA3Desc',
 'ATA6',
 'ATA6Desc',
 'ATA9',
 'ATA9Desc',
 'Fail Type',
 'Repair Cost']]
selected_repairs = selected_repairs[selected_repairs['Chassis\nReference\nNumber'].notnull()]

In [17]:
selected_repairs.iloc[850].values

array([628.0, 'T680   ', '6/24/2016', '7/27/2016', '7/27/2016', 406,
       '7/26/2016', 45, 'POWER PLANT (045)', 45021,
       'ELECTRONIC ENGINE CONTROL', 45021003,
       'ELECTRONIC CONTROL MODULE (ECU)', 'REFLASH ECU/ICU', 'very low'], dtype=object)

In [18]:
len(ex_w)

1581892

In [19]:
selected_w = ex_w[['Veh Ref ID',
 'Event DateTime',
 'Event Type Description',
 'Acc Pedal Position',
 'Ambient Air Temp',
 'Barometric Press',
 'Brake Switch',
 'Bus Utilization',
 'Cat Intake Gas Temp',
 'Cat Outlet Gas Temp',
 'Clutch Switch',
 'Cmd Eng Fuel Press',
 'Cruise Status',
 'Dpf Regen Inhibit Sw',
 'Dpf Thermal Mngmnt',
 'Drvr Demand Torque',
 'Eng Air Flow Rate',
 'Eng Avg Fuel Econ',
 'Eng Coolant Level',
 'Eng Coolant Temp',
 'Eng Demand Torque',
 'Eng DPF Intake Press',
 'Eng Egr Valve Pos',
 'Eng Exhaust Gas Temp',
 'Eng Fuel Del Press',
 'EngFuelTemp1',
 'Engine Speed',
 'Eng Man Abs Pressure',
 'Eng Oil Pressure',
 'EngInjRail1Press',
 'EngIntakeMan1Temp',
 'EngOilTemp1',
 'Eng Percent Torque',
 'EngTurbo1Boost',
 'EngTurbo1Pos',
 'EngTurbo1Speed',
 'Event - All Lamps On Time Hr',
 'Event - Amber Lamp Time Hr',
 'Event - Mil Lamp Time Hr',
 'Event - Red Lamp Time Hr',
 'Exhaust Tank Level',
 'Exhaust Tank Temp',
 'Fan Speed',
 'Keyswitch Bat Pot',
 'Part Trap Diff Press',
 'Part Trap Out Temp',
 'Scr Intake Gas Temp',
 'Scr Outlet Gas Temp',
 'Vehicle Speed',
 'Population',
 'DTCID',
 'Trip Distance',
 'Trip Idle Time',
 'Trip Run Time',
 'Altitude',
 'Engine Start Ambient',
 'Engine Start Coolant',
 'Ignition Cycle Counter',
 'Latitude',
 'Longitude',
 'Lifetime Idle Hours',
 'Lifetime Idle Fuel',
 'Lifetime Fuel',
 'Lifetime Distance',
 'Lifetime Engine Hours']]

In [20]:
ex_w_indexed = selected_w[:100000].set_index('Veh Ref ID')

In [21]:
repairs_indexed = selected_repairs.set_index('Chassis\nReference\nNumber')

In [22]:
joined_repairs_snapshots = ex_w_indexed.join(repairs_indexed)

In [61]:
high_repairs = selected_repairs[selected_repairs['Repair Cost'].isin(['medium', 'high', 'very high'])]

In [62]:
veh_ids = high_repairs['Chassis\nReference\nNumber'].unique()

In [63]:
veh_ids

array([ 208.,  761.,  765.,  158.,  123.,  301.,   60.,  182.,  543.,
        319.,  320.,  616.,   27.,  460.,   18.,  270.,   25.,  651.,
        245.,  233.,  558.,  605.,  429.,  317.,  703.,  441.,  165.,
        178.,   75.,  448.,  578.,  472.,  597.,  467.,  772.,  184.,
        236.,  283.,  735.,  809.,  724.,  571.,  679.,  673.,  555.,
        218.,  633.,  340.,  168.,  674.,   73.,  612.,  226.,  249.,
        297.,  634.,  259.,  409.,  690.,  719.,  584.,  556.,  192.,
        495.,  179.,  436.,  291.,  180.,  602.,  516.,  515.,  118.,
        211.,  113.,  752.,  424.,  730.,  810.,  808.,   71.,  412.,
        722.,  453.,  751.,  692.,  163.,  382.,  408.,  428.,  137.,
        801.,   43.,  120.,    4.,  373.,  725.,  748.,  791.,  491.,
         40.,  797.,  646.])

In [83]:
veh_snapshots = selected_w[selected_w['Veh Ref ID'] == 797].sort_values(by='Event DateTime')

In [65]:
v0 = veh_snapshots.iloc[0]

In [81]:
veh_repairs = selected_repairs[selected_repairs['Chassis\nReference\nNumber'] == 797]

In [82]:
veh_repairs

Unnamed: 0,Chassis Reference Number,Model Vehicle,Build_Dt,Dlvry_Dt,In Service Date,Miles,Rpr_Dt,ATA3,ATA3Desc,ATA6,ATA6Desc,ATA9,ATA9Desc,Fail Type,Repair Cost
612,797.0,T680,1/19/2016,2/9/2016,2/9/2016,38236,6/23/2016,45,POWER PLANT (045),45021,ELECTRONIC ENGINE CONTROL,45021003,ELECTRONIC CONTROL MODULE (ECU),CALIBRATE WRONG,very low
652,797.0,W900B,3/16/2016,5/4/2016,5/4/2016,4735,6/13/2016,101,MX DIAG FAULT CODES (101),101001,MX DIAG FAULT CODES (001),101001001,PACCAR MX DIAGNOSTIC CODES,REFLASH ECU/ICU,very low
836,797.0,567,5/11/2016,6/22/2016,6/22/2016,529,5/27/2016,44,FUEL SYSTEM (044),44004,INJECTORS (004),44004001,INJECTOR ASSEMBLY,WEAK,medium
837,797.0,T680,2/5/2016,4/11/2016,4/11/2016,38030,7/18/2016,43,EXHAUST SYSTEM (043),43006,EMISSIONS (006),43006007,"SENSOR - DIFFERENTIAL PRESSURE, PARTICULATE TR...",CHECK CONDITION,very low


In [127]:
repair_slices = get_repair_slices([797], veh_snapshots, veh_repairs)

43006007 1/1/2000 7/18/2016 07/08/2016 07/13/2016 7/18/2016
44004001 1/1/2000 5/27/2016 05/17/2016 05/22/2016 5/27/2016
45021003 1/1/2000 6/23/2016 06/13/2016 06/18/2016 6/23/2016
101001001 1/1/2000 6/13/2016 06/03/2016 06/08/2016 6/13/2016


In [130]:
event_dt_key = 'Event DateTime'
len(veh_snapshots.loc[(veh_snapshots[event_dt_key] > '06/13/2016 00:00:00') & (veh_snapshots[event_dt_key] <= '06/18/2016 00:00:00')])

0

In [118]:
repair_slices[797].keys()

dict_keys([101001001, 45021003, 43006007])

In [119]:
len(repair_slices[797][45021003])

3

In [123]:
len(repair_slices[797][45021003][0])

0

In [131]:
repair_slices[797][45021003][2][0]['Event DateTime'].unique()

array(['6/1/2016 0:11:20', '6/1/2016 0:12:26', '6/1/2016 12:04:49',
       '6/1/2016 12:11:48', '6/1/2016 12:23:05', '6/1/2016 13:23:05',
       '6/1/2016 14:23:05', '6/1/2016 14:46:07', '6/1/2016 14:47:13',
       '6/1/2016 15:03:12', '6/1/2016 16:14:19', '6/1/2016 17:14:18',
       '6/1/2016 1:12:26', '6/1/2016 1:23:53', '6/1/2016 1:31:42',
       '6/1/2016 1:40:45', '6/1/2016 1:43:34', '6/1/2016 20:27:15',
       '6/1/2016 23:27:16', '6/1/2016 23:29:42', '6/1/2016 23:33:05',
       '6/1/2016 2:43:34', '6/1/2016 3:43:34', '6/1/2016 4:11:45',
       '6/10/2016 15:24:16', '6/13/2016 15:53:13', '6/13/2016 15:58:19',
       '6/13/2016 16:39:17', '6/13/2016 16:39:47', '6/13/2016 16:55:33',
       '6/13/2016 17:11:46', '6/13/2016 17:34:33', '6/13/2016 20:18:39',
       '6/13/2016 20:23:57', '6/14/2016 19:18:41', '6/14/2016 19:20:06',
       '6/14/2016 20:07:14', '6/14/2016 20:07:49', '6/14/2016 20:21:28',
       '6/14/2016 20:27:44', '6/14/2016 22:19:35', '6/14/2016 22:23:17',
       '6/14

In [35]:
veh_snapshots = selected_w[selected_w['Veh Ref ID'].isin(veh_ids)]
veh_repairs = selected_repairs[selected_repairs['Chassis\nReference\nNumber'].isin(veh_ids)]

In [48]:
repair_slices_all = get_repair_slices(veh_ids, veh_snapshots, veh_repairs)

In [37]:
len(repair_slices_all)

13

In [38]:
repair_slices_all.keys()

dict_keys([801.0, 25.0, 616.0, 340.0, 555.0, 751.0, 40.0, 180.0, 373.0, 249.0, 724.0, 495.0, 27.0])

In [40]:
len(repair_slices_all[801][45021003][0])

142

In [53]:
repair_slices_all[801][45021003][0].iloc[0]

Veh Ref ID                                     801
Event DateTime                  6/14/2016 20:17:07
Event Type Description                  trip_start
Acc Pedal Position                            36.8
Ambient Air Temp                             31.87
Barometric Press                                98
Brake Switch                                     0
Bus Utilization                                 65
Cat Intake Gas Temp                         102.59
Cat Outlet Gas Temp                          56.09
Clutch Switch                                    0
Cmd Eng Fuel Press                           46.44
Cruise Status                                    0
Dpf Regen Inhibit Sw                             0
Dpf Thermal Mngmnt                               0
Drvr Demand Torque                              38
Eng Air Flow Rate                            133.1
Eng Avg Fuel Econ                             0.87
Eng Coolant Level                              100
Eng Coolant Temp               