In [1]:
from __future__ import print_function, division
import time, os
import numpy as np
import matplotlib.pyplot as plt
import sys
import networkx as nx
import pandas

%matplotlib inline
# Notebook-wide matplotlib defaults, applied in a single call.
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),       # default size of plots
    'image.interpolation': 'nearest',    # nearest-neighbour interpolation for images
    'image.cmap': 'gray',                # default colormap for images
})

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [3]:
# Snapshot timestamps look like: 6/14/2016 20:17:07
def s_to_date(x):
    """Parse 'Event DateTime' text (a single value or a whole column) into pandas datetimes."""
    return pandas.to_datetime(x, format='%m/%d/%Y %H:%M:%S')

# Parse the timestamp column once, after loading. A read_csv converter would call
# pandas.to_datetime separately for every row, which is very slow on a table this size.
snapshots_df = pandas.read_csv('/home/cs231n/data/Field_Snaps_With_warranty.txt')
snapshots_df['Event DateTime'] = s_to_date(snapshots_df['Event DateTime'])
#snapshots_df = pandas.read_csv('Field Snaps - With warranty.txt')  # alternative path

In [5]:
# Repair dates look like: 2/3/2016
def r_to_date(x):
    """Parse 'Rpr_Dt' text (a single value or a whole column) into pandas datetimes."""
    return pandas.to_datetime(x, format='%m/%d/%Y')

# Parse the date column once, after loading, rather than through a per-value
# read_csv converter that calls pandas.to_datetime separately for every row.
repairs_df = pandas.read_csv('/home/cs231n/data/repairs.csv')
repairs_df['Rpr_Dt'] = r_to_date(repairs_df['Rpr_Dt'])
#repairs_df = pandas.read_csv('repairs.csv')  # alternative path

In [13]:
# Repair attributes carried forward into the analysis (15 columns).
repair_columns = [
    'Chassis\nReference\nNumber', 'Model Vehicle',
    'Build_Dt', 'Dlvry_Dt', 'In Service Date',
    'Miles', 'Rpr_Dt',
    'ATA3', 'ATA3Desc', 'ATA6', 'ATA6Desc', 'ATA9', 'ATA9Desc',
    'Fail Type', 'Repair Cost',
]
selected_repairs = repairs_df[repair_columns]

# Discard rows that carry no chassis reference number.
has_chassis = selected_repairs['Chassis\nReference\nNumber'].notnull()
selected_repairs = selected_repairs[has_chassis]
selected_repairs.shape  #before: 1128x15, after: 853x15

(853, 15)

In [27]:
# Moved to a later cell: this filter has to run after selected_repairs has been cleaned
# ZY: 'Repair Cost' category is 'high ', instead of 'high'
#high_repairs = selected_repairs[selected_repairs['Repair Cost'].isin(['medium', 'high ', 'very high'])]
#veh_ids = high_repairs['Chassis\nReference\nNumber'].unique()

In [32]:
# ZY: is this not needed?
#selected_repairs = high_repairs['Chassis\nReference\nNumber'].isin(veh_ids)

In [8]:
#selected_snapshots = snapshots_df['Veh Ref ID'].isin(veh_ids)  #ZY: is this not needed?
# Snapshot columns carried forward into the analysis (65 columns, original order preserved).
snapshot_columns = [
    'Veh Ref ID', 'Event DateTime', 'Event Type Description',
    'Acc Pedal Position', 'Ambient Air Temp', 'Barometric Press', 'Brake Switch', 'Bus Utilization',
    'Cat Intake Gas Temp', 'Cat Outlet Gas Temp', 'Clutch Switch', 'Cmd Eng Fuel Press', 'Cruise Status',
    'Dpf Regen Inhibit Sw', 'Dpf Thermal Mngmnt', 'Drvr Demand Torque',
    'Eng Air Flow Rate', 'Eng Avg Fuel Econ', 'Eng Coolant Level', 'Eng Coolant Temp', 'Eng Demand Torque',
    'Eng DPF Intake Press', 'Eng Egr Valve Pos', 'Eng Exhaust Gas Temp', 'Eng Fuel Del Press',
    'EngFuelTemp1', 'Engine Speed', 'Eng Man Abs Pressure', 'Eng Oil Pressure',
    'EngInjRail1Press', 'EngIntakeMan1Temp', 'EngOilTemp1', 'Eng Percent Torque',
    'EngTurbo1Boost', 'EngTurbo1Pos', 'EngTurbo1Speed',
    'Event - All Lamps On Time Hr', 'Event - Amber Lamp Time Hr', 'Event - Mil Lamp Time Hr', 'Event - Red Lamp Time Hr',
    'Exhaust Tank Level', 'Exhaust Tank Temp', 'Fan Speed', 'Keyswitch Bat Pot',
    'Part Trap Diff Press', 'Part Trap Out Temp', 'Scr Intake Gas Temp', 'Scr Outlet Gas Temp',
    'Vehicle Speed', 'Population', 'DTCID',
    'Trip Distance', 'Trip Idle Time', 'Trip Run Time',
    'Altitude', 'Engine Start Ambient', 'Engine Start Coolant', 'Ignition Cycle Counter',
    'Latitude', 'Longitude',
    'Lifetime Idle Hours', 'Lifetime Idle Fuel', 'Lifetime Fuel', 'Lifetime Distance', 'Lifetime Engine Hours',
]
selected_snapshots = snapshots_df[snapshot_columns]

In [9]:
#---------------------------------------
# Data-Cleaning - selected_snapshots (part 1)
#---------------------------------------
# Collapse rows that are exact duplicates of an earlier row.
selected_snapshots = selected_snapshots.drop_duplicates()
selected_snapshots.shape  # 1581892 x 65 before removing duplicate rows, 1015071 x 65 after

# Count missing entries per column, then report the fraction of rows missing
# for only those columns that have at least one gap.
nullTable = selected_snapshots.isnull().sum()
nullTable.loc[nullTable.gt(0)] / len(selected_snapshots)

Eng Avg Fuel Econ         0.001537
Eng DPF Intake Press      0.000031
Eng Fuel Del Press        0.000062
EngInjRail1Press          0.000052
EngIntakeMan1Temp         0.000003
Fan Speed                 0.000003
Ignition Cycle Counter    0.691675
dtype: float64

In [10]:
#---------------------------------------
# Data-Cleaning - selected_snapshots (part 2)
#---------------------------------------
# Remove column 'Ignition Cycle Counter' ~69.2% missing values  --> 1015071 x 64
selected_snapshots = selected_snapshots.drop('Ignition Cycle Counter', axis=1)

# Impute missing values using the mean for the same vehicle, i.e. E(value | vehicle id).
fields = ['Eng Avg Fuel Econ', 'Eng DPF Intake Press', 'Eng Fuel Del Press', 'EngInjRail1Press', 'EngIntakeMan1Temp', 'Fan Speed']
for f in fields:
    # Per-row copy of the owning vehicle's mean for field f (NaNs are ignored by mean).
    veh_mean = selected_snapshots.groupby('Veh Ref ID')[f].transform('mean')
    # fillna aligns on the row index, so every gap is filled from its *own* vehicle's
    # mean rather than from one value shared across all vehicles.
    filled = selected_snapshots[f].fillna(veh_mean)
    # A vehicle with no recorded value for f has no mean of its own; fall back to the
    # column-wide mean so that no gaps are left behind.
    selected_snapshots[f] = filled.fillna(selected_snapshots[f].mean())

# Re-check: fraction of rows still missing, per column (expected to be empty).
nullTable = selected_snapshots.isnull().sum()
nullTable[nullTable > 0]/len(selected_snapshots)

Series([], dtype: float64)

In [17]:
#---------------------------------------
# Data-Cleaning - selected_repairs
#---------------------------------------
#filter away vehicles that broke down before delivery & service (veh 616: accident during undecking)
selected_repairs = selected_repairs[selected_repairs['Miles']>0]
selected_repairs.shape   # before: 853x15, after: 852x15

# identify and remove rows with no snapshot data prior to repair date
# A repair has snapshot data on or before its date exactly when the vehicle's earliest
# snapshot is <= the repair date, so one groupby replaces a per-vehicle scan of the
# whole snapshot table.
first_snapshot = selected_snapshots.groupby('Veh Ref ID')['Event DateTime'].min()
first_snapshot_of_veh = selected_repairs['Chassis\nReference\nNumber'].map(first_snapshot)
# Vehicles without any snapshot map to NaT; NaT comparisons are False, so they are removed too.
has_prior_snapshot = first_snapshot_of_veh <= selected_repairs['Rpr_Dt']

# Record of the (vehicle, repair date) rows that were removed, one entry per removed row.
nodata_veh_repair = selected_repairs.loc[
    ~has_prior_snapshot, ['Chassis\nReference\nNumber', 'Rpr_Dt']
].reset_index(drop=True)
selected_repairs = selected_repairs[has_prior_snapshot]
selected_repairs.shape  # before: 852x15, after: 713x15

# check for other null entries:
# ('Chassis\nReference\nNumber', 'ATA9', 'Rpr_Dt') has no null entries. Sufficient to link to snapshot data
selected_repairs.isnull().sum()

Chassis\nReference\nNumber    0
Model Vehicle                 0
Build_Dt                      0
Dlvry_Dt                      0
In Service Date               0
Miles                         0
Rpr_Dt                        0
ATA3                          0
ATA3Desc                      0
ATA6                          0
ATA6Desc                      1
ATA9                          0
ATA9Desc                      0
Fail Type                     4
Repair Cost                   0
dtype: int64

In [25]:
# Persist both cleaned tables as pickles so they can be reloaded quickly in future.
for cleaned_df, pkl_name in ((selected_snapshots, 'cleaned_selected_snapshots.pkl'),
                             (selected_repairs, 'cleaned_selected_repairs.pkl')):
    cleaned_df.to_pickle(pkl_name)

# To restore the cleaned tables from disk, uncomment:
#selected_snapshots = pandas.read_pickle('cleaned_selected_snapshots.pkl')
#selected_repairs = pandas.read_pickle('cleaned_selected_repairs.pkl')

In [18]:
# ZY: the raw 'Repair Cost' labels can carry stray whitespace ('high ' instead of 'high'),
# so strip each label before matching; both spellings are then selected.
high_repairs = selected_repairs[
    selected_repairs['Repair Cost'].str.strip().isin(['medium', 'high', 'very high'])
]
# Vehicles that have at least one medium / high / very high cost repair.
veh_ids = high_repairs['Chassis\nReference\nNumber'].unique()

In [19]:
def get_012_dates(end_date, window_days=5):
    """Return the end dates of the 0 / 1 / 2 windows that lead up to ``end_date``.

    With the default ``window_days=5`` the windows are:
    2 = the 0-5 days before ``end_date``, 1 = 5-10 days before, 0 = 10+ days before.

    Parameters
    ----------
    end_date : anything accepted by ``pandas.to_datetime``
        End of the most recent window.
    window_days : int, optional
        Length in days of window 1 and of window 2 (default 5).

    Returns
    -------
    tuple
        ``(zero_end, one_end, two_end)``. ``two_end`` is ``end_date`` exactly as
        passed in; ``one_end`` and ``zero_end`` are ``window_days`` and
        ``2 * window_days`` days earlier.
    """
    two_end = end_date
    one_end = pandas.to_datetime(two_end) - pandas.DateOffset(days=window_days)
    zero_end = one_end - pandas.DateOffset(days=window_days)
    return (zero_end, one_end, two_end)

In [22]:
def get_repair_slices(veh_ids, snapshots, repairs, code='ATA9'):
    """Group each vehicle's snapshots into 0 / 1 / 2 windows ahead of every repair.

    Parameters
    ----------
    veh_ids : iterable
        Vehicle ids to process; matched against ``snapshots['Veh Ref ID']`` and
        ``repairs['Chassis\\nReference\\nNumber']``.
    snapshots : pandas.DataFrame
        Must contain 'Veh Ref ID' and 'Event DateTime'.
    repairs : pandas.DataFrame
        Must contain 'Chassis\\nReference\\nNumber', 'Rpr_Dt' and the ``code`` column.
    code : str, optional
        Repair column used to group repairs into repair types (default 'ATA9').

    Returns
    -------
    dict
        ``{veh_id: {repair_type: {0: [DataFrame, ...], 1: [...], 2: [...]}}}``.
        For each repair of a type (in date order) the non-empty snapshot windows
        are appended: 2 = (one_end, repair date], 1 = (zero_end, one_end],
        0 = [start, zero_end], with the window ends taken from ``get_012_dates``.
        ``start`` is 1/1/2000 for the first repair of a type and the previous
        repair date afterwards. Repair types with no snapshots in any window
        are omitted; every requested vehicle gets an entry (possibly empty).
    """
    event_dt_key = 'Event DateTime'
    start_date = pandas.to_datetime('1/1/2000') ## in past so first snapshot is captured

    repair_slices = {}
    for veh_id in veh_ids:
        v_snapshots = snapshots[snapshots['Veh Ref ID'] == veh_id].sort_values(by=event_dt_key)
        v_repairs = repairs[repairs['Chassis\nReference\nNumber'] == veh_id].sort_values(by='Rpr_Dt')
        event_times = v_snapshots[event_dt_key]

        veh_slices = {}
        repair_slices[veh_id] = veh_slices

        ## Best indicator of repair type is the ATA9 code
        ## Group on the bare column name (not a one-element list) so each
        ## repair_type key is the code value itself rather than a 1-tuple.
        for repair_type, repair_group in v_repairs.groupby(code):
            start = start_date

            ## for each repair type, grab slices of snapshots
            veh_slices_repair = {0: [], 1: [], 2: []}

            ## Read the repair date by column name; v_repairs is already sorted by
            ## 'Rpr_Dt', so the dates arrive in ascending order.
            for end in repair_group['Rpr_Dt']:
                zero_end, one_end, _ = get_012_dates(end)

                # divide up snapshots into 0, 1, 2 slices
                # NOTE(review): the 0-window is inclusive at `start`, so a snapshot stamped
                # exactly at the previous repair date also falls in that repair's 2-window
                # -- confirm whether this overlap is intended.
                window_masks = (
                    (2, (event_times > one_end) & (event_times <= end)),
                    (1, (event_times > zero_end) & (event_times <= one_end)),
                    (0, (event_times >= start) & (event_times <= zero_end)),
                )
                for label, mask in window_masks:
                    window = v_snapshots.loc[mask]
                    if len(window) > 0:
                        veh_slices_repair[label].append(window)

                ## reset start to end for next iteration
                start = end

            # keep the repair type only if at least one window captured snapshots
            if any(veh_slices_repair.values()):
                veh_slices[repair_type] = veh_slices_repair

    return repair_slices

In [23]:
# Build the 0/1/2 snapshot windows for every selected vehicle, grouped by the default repair code.
repair_slices_all = get_repair_slices(
    veh_ids=veh_ids,
    snapshots=selected_snapshots,
    repairs=selected_repairs,
)