In [1]:
import numpy as np
import pandas
import math
from hmmlearn import hmm
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from create_train_test_val_maps import *

In [2]:
train_map_revived = open_map('/home/cs231n/data/train')
val_map_revived = open_map('/home/cs231n/data/val')
test_map_revived = open_map('/home/cs231n/data/test')
selected_codes = [45021, 44004, 43004, 45008, 45002, 45007]

In [None]:
train_map_revived[797][43006][0][0]

In [3]:
# get global index from (row, col) index
def sub2ind(array_shape, row, col):
    ind = row*array_shape[1] + col
    if row < 0 or row >= array_shape[0]:
        ind = -1
    if col < 0 or col >= array_shape[1]:
        ind = -1
    return ind

# get (row, col) index from global index
def ind2sub(array_shape, ind):
    row = int(ind) / array_shape[1]
    col = ind % array_shape[1]
    if ind < 0:
        row = -1
        col = -1
    if ind >=  array_shape[0]*array_shape[1]:
        row = -1
        col = -1
    return (row, col)

def softmax(x):
    """
    Compute softmax function for input. 
    Use tricks from previous assignment to avoid overflow
    """
    ### YOUR CODE HERE
    xshift = np.max(x, axis = 1)
    xshift = xshift.reshape((x.shape[0],1))
    x = x - xshift
    s = np.exp(x) / np.sum(np.exp(x),axis = 1).reshape((x.shape[0],1))
    ### END YOUR CODE
    return s

In [16]:
# TRAIN
num_states = 2
num_time_windows = 10
num_iter = 100
models = {}

codes = []
code_window_map = {}
for vehicleID in train_map_revived.keys():
    for ATA6code in train_map_revived[vehicleID].keys():
        if ATA6code not in selected_codes:
            continue
        if ATA6code not in code_window_map:
            code_window_map[ATA6code] = {}
        for time_window in train_map_revived[vehicleID][ATA6code].keys():
            if time_window not in code_window_map[ATA6code]:
                code_window_map[ATA6code][time_window] = []
            for sequence_of_snapshots in train_map_revived[vehicleID][ATA6code][time_window]:
                code_window_map[ATA6code][time_window].append(sequence_of_snapshots)
                
for ATA6code in code_window_map.keys():
    for time_window in code_window_map[ATA6code].keys():
        lengths = []
        listofsequences = code_window_map[ATA6code][time_window]
        X = pandas.concat(listofsequences).as_matrix()
        for sequence in listofsequences:
            lengths.append(sequence.shape[0]) 
        models[(ATA6code, time_window)] = hmm.GaussianHMM(n_components=num_states, n_iter=num_iter).fit(X.astype(float),lengths)
        


In [None]:
models[(32002,9)].transmat_

In [17]:
# VALIDATE
num_codes = len(selected_codes)
lengths = []
for vehicleID in val_map_revived.keys():
    for ATA6code in val_map_revived[vehicleID].keys():
        if ATA6code not in selected_codes:
            continue
        for time_window in val_map_revived[vehicleID][ATA6code].keys():
            lengths.append(len(val_map_revived[vehicleID][ATA6code][time_window]))
            #for sequence_of_snapshots in val_map_revived[vehicleID][ATA6code][time_window]:
            #    lengths.append(sequence_of_snapshots.shape[0])
num_val = sum(lengths)

sample = 0
labels = []
log_likelihoods = np.zeros((num_val, num_codes, num_time_windows))
for vehicleID in val_map_revived.keys():
    for ATA6code in val_map_revived[vehicleID].keys():
        if ATA6code not in selected_codes:
            continue
        for time_window in val_map_revived[vehicleID][ATA6code].keys():
            for sequence_of_snapshots in val_map_revived[vehicleID][ATA6code][time_window]:
                labels.append([selected_codes.index(ATA6code),time_window])
                x = sequence_of_snapshots.as_matrix() 
                for pair in models.keys():
                    c = selected_codes.index(pair[0])
                    w = pair[1]
                    #print(pair[0])
                    #print(sample,c,w)
                    log_likelihoods[sample,c,w] = models[pair].score(x.astype(float))
                    #print('ok')
        sample = sample+1 
labels = np.asarray(labels)

In [None]:
np.asarray(["unknown"]).astype(float)

In [18]:
#accuracy
errors = np.zeros((labels.shape))
correct_codes = np.zeros((labels.shape))
probabilities = np.zeros(log_likelihoods.shape)
time_counts = 0
for s in range(0,num_val):
    probabilities_temp = softmax(log_likelihoods[s,:,:].reshape(1,num_codes*num_time_windows))
    probabilities[s] = probabilities_temp.reshape(num_codes, num_time_windows) 
    max_index = np.unravel_index(probabilities[s].argmax(), probabilities[s].shape)
    errors[s] = max_index - np.array(labels[s,:])
    correct_codes[s] = max_index == np.array(labels[s,:])
    if correct_codes[s,0] == True and correct_codes[s,1] == True:
        time_counts = time_counts + 1        
error_sum = sum(errors)
correct_sum = sum(correct_codes)
code_accuracy = correct_sum[0]/num_val
k_accuracy = time_counts/num_val
mse = (1.0/num_val)*sum(np.power(errors[:,0],2))


print('ATA6: Mean Squared Error',mse)
print('ATA6: Accuracy', code_accuracy)
print('K: Accuracy', k_accuracy)


ATA6: Mean Squared Error 4.23981900452
ATA6: Accuracy 0.420814479638
K: Accuracy 0.058823529411764705


In [20]:
code_probabilities = np.zeros((num_val, num_codes))
predicted_codes = np.zeros(num_val)
for s in range(0,num_val):
    probabilities_temp = softmax(log_likelihoods[s,:,:].reshape(1,num_codes*num_time_windows))
    code_probabilities[s,:] = np.sum(probabilities_temp.reshape(num_codes, num_time_windows),axis=1)
    predicted_codes[s] = np.argmax(code_probabilities[s,:])
code_accuracy = np.sum(predicted_codes == labels[:,0])/num_val
print("Marginalized Predicted Code Accuracy: ", code_accuracy)

Marginalized Predicted Code Accuracy:  0.420814479638


In [21]:
selected_repairs = pandas.read_pickle('cleaned_selected_repairs.pkl')
veh_repair_map = {}
for veh_id, repairs in selected_repairs.groupby(['Chassis\nReference\nNumber']):
    veh_repair_map[veh_id] = repairs['ATA6'].values

In [22]:
veh_repair_map

{1.0: array([45021]),
 2.0: array([43006]),
 3.0: array([43006]),
 4.0: array([44004]),
 5.0: array([43006]),
 6.0: array([44003, 43001, 43007, 45021]),
 8.0: array([43006]),
 9.0: array([44002]),
 10.0: array([43006]),
 11.0: array([42003]),
 12.0: array([45021]),
 13.0: array([43001]),
 14.0: array([45007]),
 15.0: array([45021]),
 16.0: array([43004]),
 17.0: array([43006]),
 18.0: array([44004]),
 19.0: array([43002]),
 20.0: array([43006, 43007, 42004]),
 21.0: array([45021]),
 22.0: array([45002]),
 24.0: array([43007]),
 25.0: array([45007]),
 27.0: array([45002]),
 28.0: array([43006]),
 29.0: array([43006]),
 30.0: array([43006]),
 31.0: array([44004]),
 32.0: array([101001]),
 33.0: array([43002]),
 35.0: array([44002]),
 37.0: array([43006]),
 39.0: array([45021]),
 41.0: array([43006]),
 45.0: array([45021]),
 46.0: array([43006]),
 47.0: array([43006]),
 48.0: array([43006]),
 49.0: array([44001]),
 51.0: array([43004]),
 54.0: array([45002]),
 55.0: array([44004]),
 57.0: