In [126]:
import numpy as np
import pandas
import math
from hmmlearn import hmm
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from create_train_test_val_maps import *

In [2]:
# Root directory holding the train/val/test splits. Kept in one place so the
# notebook can be pointed at another location without editing every call.
DATA_DIR = '/home/cs231n/data'

# open_map is not defined in this notebook; presumably it comes from the
# star import of create_train_test_val_maps -- TODO confirm.
train_map_revived = open_map(DATA_DIR + '/train')
val_map_revived = open_map(DATA_DIR + '/val')
test_map_revived = open_map(DATA_DIR + '/test')

# Codes used as the first-level keys of the three maps (iterated as
# `ATA6code` in the cells below).
codes = [45021, 44004, 43004, 45008, 45002, 45007]

In [3]:
# get global index from (row, col) index
def sub2ind(array_shape, row, col):
    """Convert a (row, col) subscript into a flat row-major index.

    Args:
        array_shape: sequence whose first two entries are the number of
            rows and the number of columns.
        row: row subscript.
        col: column subscript.

    Returns:
        ``row * array_shape[1] + col`` when the subscript lies inside the
        array, otherwise ``-1``.
    """
    n_rows, n_cols = array_shape[0], array_shape[1]
    # Any subscript outside the array bounds maps to the sentinel -1.
    if row < 0 or row >= n_rows or col < 0 or col >= n_cols:
        return -1
    return row * n_cols + col

# get (row, col) index from global index
def ind2sub(array_shape, ind):
    """Convert a flat row-major index into a (row, col) subscript.

    Args:
        array_shape: sequence whose first two entries are the number of
            rows and the number of columns.
        ind: flat index; it is truncated with ``int()`` before conversion.

    Returns:
        A tuple ``(row, col)`` of Python ints, or ``(-1, -1)`` when ``ind``
        is negative or not smaller than ``array_shape[0] * array_shape[1]``.
    """
    n_rows, n_cols = array_shape[0], array_shape[1]
    # Reject out-of-range indices before dividing, so an empty shape cannot
    # trigger a division by zero.
    if ind < 0 or ind >= n_rows * n_cols:
        return (-1, -1)
    # Floor division: plain `/` is true division in Python 3 and produced a
    # fractional row such as 2.3 instead of 2.
    row, col = divmod(int(ind), n_cols)
    return (row, col)

def softmax(x):
    """
    Row-wise softmax of a 2-D array.

    Each row has its maximum subtracted before exponentiation so that large
    inputs do not overflow; the subtraction does not change the result.

    Args:
        x: 2-D numpy array; the softmax is taken along axis 1.

    Returns:
        Array with the same shape as ``x`` whose rows each sum to 1.
    """
    n_rows = x.shape[0]
    # Column vector of per-row maxima, shaped for broadcasting against x.
    row_max = np.max(x, axis=1).reshape((n_rows, 1))
    exps = np.exp(x - row_max)
    row_totals = np.sum(exps, axis=1).reshape((n_rows, 1))
    return exps / row_totals

In [4]:
# Categorical Data to Numeric Format

# Columns that are converted to integer category codes.
category_var = ['Veh Ref ID','Event Type Description','Brake Switch','Clutch Switch','Cruise Status','Dpf Regen Inhibit Sw', 
                'Dpf Thermal Mngmnt','Eng Coolant Level','DTCID']


def encode_categorical_columns(data_map, ata6_codes, columns):
    """Replace the given columns of every frame in ``data_map`` with category codes.

    ``data_map[code][key]`` is iterated as a sequence of DataFrames; each
    frame is modified in place.

    Args:
        data_map: mapping indexed first by code, then by key, holding
            sequences of DataFrames.
        ata6_codes: the first-level keys to process.
        columns: names of the columns to convert.

    NOTE(review): codes are derived independently for every frame, so the
    same raw value may receive different integers in different frames or
    splits -- confirm whether a shared category vocabulary is required.
    """
    for code in ata6_codes:
        # Iterate the keys themselves (as the training cell does) rather
        # than assuming they are exactly 0..len-1.
        for key in data_map[code].keys():
            for frame in data_map[code][key]:
                for column in columns:
                    frame[column] = frame[column].astype('category').cat.codes


# Train, validation and test data all receive the same treatment.
for split_map in (train_map_revived, val_map_revived, test_map_revived):
    encode_categorical_columns(split_map, codes, category_var)

In [5]:
# TRAIN
num_states = 2
num_codes = len(codes)
num_time_steps = 10
num_iter = 10
random_seed = 0  # fixed seed so repeated runs fit the same models
models = []

# One HMM is fitted per (code, time-step key) pair and appended to `models`.
# The validation cell looks models up positionally with
# sub2ind([num_codes, num_time_steps], i, k), so the list must be ordered by
# code and then by ascending key; sorting the keys makes that order explicit
# instead of relying on dict iteration order.
# NOTE(review): the positional lookup also assumes every code has exactly the
# keys 0..num_time_steps-1 -- confirm against the data.
for ATA6code in codes:
    for j in sorted(train_map_revived[ATA6code].keys()):
        sequences = train_map_revived[ATA6code][j]
        # Stack all sequences for this code/key; ignore first two columns
        # (veh id, timestamp). `.values` replaces `as_matrix()`, which was
        # removed in pandas 1.0.
        X = pandas.concat(sequences).values[:, 2:]
        # Row count of each sequence, so fit() can split X back apart.
        lengths = [sequence_of_snapshots.shape[0] for sequence_of_snapshots in sequences]
        # Make an HMM instance and execute fit (i.e. train)
        model = hmm.GaussianHMM(n_components=num_states, n_iter=num_iter,
                                random_state=random_seed)
        models.append(model.fit(X, lengths))

In [29]:
# VALIDATE
# Get number of validation samples (sequences)
lengths = []
for ATA6code in codes:
    for j in val_map_revived[ATA6code].keys():
        lengths.append(len(val_map_revived[ATA6code][j]))
num_val = int(np.sum(lengths))


# Compute log likelihoods of every validation sequence under every model.
labels = []
s = 0
# Start from -inf so that a (code, k) pair that is never scored cannot win
# the argmax; a default of 0 could outrank real log likelihoods.
log_likelihoods = np.full((num_val, num_codes, num_time_steps), -np.inf)
for true_ATA6code in codes:
    # Time-step keys of the *true* code. The previous version read the keys
    # of a leaked loop variable (`ATA6code`), i.e. of whichever code an
    # earlier loop happened to finish on.
    for true_k in val_map_revived[true_ATA6code].keys():
        # for sequence of snapshots
        for sample in val_map_revived[true_ATA6code][true_k]:
            # get true labels
            labels.append([true_ATA6code, true_k])
            # Feature matrix without the first two columns; computed once
            # per sample because it does not depend on the model scored.
            x = sample.values[:, 2:]
            for i, ATA6code in enumerate(codes):
                for k in val_map_revived[ATA6code].keys():
                    model_index = sub2ind([num_codes, num_time_steps], i, k)
                    log_likelihoods[s, i, k] = models[model_index].score(x)
            s = s + 1
labels = np.array(labels).reshape((num_val, 2))

In [137]:
# Accuracy: turn each sample's log likelihoods into probabilities, pick the
# most probable (code index, time step) pair and compare it with the label.
errors = np.zeros(labels.shape)
correct_codes = np.zeros(labels.shape)
probabilities = np.zeros(log_likelihoods.shape)
time_counts = 0
flat_size = num_codes * num_time_steps
for s in range(num_val):
    # softmax works on rows, so flatten this sample's scores into one row.
    sample_probs = softmax(log_likelihoods[s].reshape(1, flat_size))
    probabilities[s] = sample_probs.reshape(num_codes, num_time_steps)
    predicted = np.unravel_index(probabilities[s].argmax(), probabilities[s].shape)
    # True label expressed as (position of the code in `codes`, time step).
    expected = np.array([codes.index(labels[s, 0]), labels[s, 1]])
    errors[s] = predicted - expected
    correct_codes[s] = predicted == expected
    # Count samples where both the code and the time step are right.
    if correct_codes[s].all():
        time_counts += 1
error_sum = errors.sum(axis=0)
correct_sum = correct_codes.sum(axis=0)
code_accuracy = correct_sum[0] / num_val
# NOTE(review): this is the fraction of samples with code AND time step both
# correct, although it is printed as 'K: Accuracy' -- confirm intent.
k_accuracy = time_counts / num_val
# NOTE(review): squared error of the code *index* (column 0), not of the
# time step -- confirm this is the intended quantity.
mse = (1 / num_val) * np.sum(errors[:, 0] ** 2)


print('ATA6: Mean Squared Error', mse)
print('ATA6: Accuracy', code_accuracy)
print('K: Accuracy', k_accuracy)


ATA6: Mean Squared Error 4.11299435028
ATA6: Accuracy 0.381355932203
K: Accuracy 0.0423728813559
