In [1]:
import os
import argparse
import pandas as pd
import numpy as np
import pickle
from utils_test import *

In [2]:
from jellyfish._jellyfish import damerau_levenshtein_distance
import distance
from sklearn import metrics

In [3]:
from keras.models import load_model
from theano.ifelse import ifelse #added this

Using Theano backend.


In [9]:
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializations

In [10]:
class AttLayer(Layer):
    def __init__(self, **kwargs):
        self.init = initializations.get('normal')
        #self.input_spec = [InputSpec(ndim=3)]
        super(AttLayer, self).__init__(** kwargs)

    def build(self, input_shape):
        assert len(input_shape)==3
        #self.W = self.init((input_shape[-1],1))
        self.W = self.init((input_shape[-1],))
        #self.input_spec = [InputSpec(shape=input_shape)]
        self.trainable_weights = [self.W]
        super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!

    def call(self, x, mask=None):
        eij = K.tanh(K.dot(x, self.W))

        ai = K.exp(eij)
        weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')

        weighted_input = x*weights.dimshuffle(0,1,'x')
        return weighted_input.sum(axis=1)

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], input_shape[-1])

In [13]:
name = 'helpdesk'
sub_name = 'at'
args = {
    'inputdir': '../input/{}/'.format(name),   
    'outputdir': './output_files/{0}_{1}/'.format(name, sub_name),
    'modelname': 'model_00-1.76.h5',
}

args = argparse.Namespace(**args)

In [6]:
test = pd.read_csv(args.inputdir+'test.csv')
test = transformDf(test)

In [7]:
with open(args.inputdir+'parameters.pkl', "rb") as f:
    maxlen = pickle.load(f)
    num_features = pickle.load(f)
    chartoindice = pickle.load(f)
    targetchartoindice = pickle.load(f)
    divisor = pickle.load(f)
    divisor2 = pickle.load(f)

In [8]:
with open(args.inputdir+'preprocessed_data.pkl', "rb") as f:
    X = pickle.load(f)
    y_a = pickle.load(f)
    y_t = pickle.load(f)
    X_test = pickle.load(f)
    y_a_test = pickle.load(f)
    y_t_test = pickle.load(f)

# Utils

In [12]:
model = load_model(args.outputdir+args.modelname, custom_objects={'AttLayer': AttLayer})

AttributeError: 'Namespace' object has no attribute 'modelname'

In [10]:
test_groupByCase = test.groupby(['CaseID'])

#get features all data
sentences, sentences_t, sentences_t2, sentences_t3, sentences_t4 = getFeature(test_groupByCase)

#get output all data
next_chars, next_chars_t, next_chars_t2, next_chars_t3, next_chars_t4 = getOutput(test_groupByCase)

# Check longest case --> suffix: 2 to 7
test_len = findLongestLength(test_groupByCase)
test_len - 5

8

In [None]:
def evalAct(true_label, pred_prob, targetchartoindice):
    #for log-loss: get probabilities
    gt_a = one_hot_encode(true_label, targetchartoindice)
    log_loss = metrics.log_loss(gt_a, pred_prob)
    print 'Log-loss: {}'.format(log_loss)
    
    #for accuracy: get labels
    gt_a_label = true_label
    pred_a_label = getLabel(pred_prob, targetchartoindice)
    acc = metrics.accuracy_score(gt_a_label, pred_a_label)
    print 'Accuracy: {}%'.format(acc*100)
    acc_3 = get_top3_accuracy(pred_prob, gt_a_label, targetchartoindice)
    print 'Top 3 accuracy: {}%'.format(acc_3*100)

In [None]:
def evalTime(true_time, pred_time, divisor):
    #get time
    gt_t = true_time
    pred_t = inverseTime(pred_time, divisor)
    
    mse = metrics.mean_squared_error(gt_t, pred_t)
    mae = metrics.mean_absolute_error(gt_t, pred_t)
    median = metrics.median_absolute_error(gt_t, pred_t)
    
    print 'Mean Squared Error: {0}s\t| {1} days'.format(mse, mse/86400)
    print 'Mean Absolute Error: {0}s\t| {1} days'.format(mae, mae/86400)
    print 'Median Absolute Error: {0}s\t| {1} days'.format(median, median/86400)

In [None]:
def getSuffix(suffix, sentences, sentences_t, sentences_t2, sentences_t3, sentences_t4, next_chars, next_chars_t):
    sentences_2 = []
    sentences_t_2 = []
    sentences_t2_2 = []
    sentences_t3_2 = []
    sentences_t4_2 = []
    next_chars_2 = []
    next_chars_t_2 = []

    for i in range(len(sentences)):
        if len(sentences[i]) == suffix:
            sentences_2.append(sentences[i])
            sentences_t_2.append(sentences_t[i])
            sentences_t2_2.append(sentences_t2[i])
            sentences_t3_2.append(sentences_t3[i])
            sentences_t4_2.append(sentences_t4[i])
            next_chars_2.append(next_chars[i])
            next_chars_t_2.append(next_chars_t[i])
    return sentences_2, sentences_t_2, sentences_t2_2, sentences_t3_2, sentences_t4_2, next_chars_2, next_chars_t_2

In [None]:
# DL distance
#distance.nlevenshtein(gt_a_label, pred_a_label) #0.1731066460587326

#damerau_levenshtein_distance(gt_a_label, pred_a_label) #784

#distance.jaccard(gt_a_label, pred_a_label) #0.4444444444444444

# All data

## Predict

In [11]:
pred = model.predict(X_test, verbose=0)
pred_a = pred[0]
pred_t = pred[1]

In [12]:
pred[0].shape, pred[1].shape

((4529, 10), (4529, 1))

## Evaluate

### Activity

In [16]:
evalAct(next_chars, pred[0], targetchartoindice)

Log-loss: 0.67863853056
Accuracy: 79.6423051446%
Top 3 accuracy: 0.986089644513


### Time

In [17]:
evalTime(next_chars_t, pred_t, divisor)

Mean Squared Error: 2.42233271614e+11s| 2803625.82887 days
Mean Absolute Error: 69413.1875s| 0.803393373843 days
Median Absolute Error: 69413.1875s| 0.803393373843 days


# Suffix

## Predict

In [32]:
sentences_2, sentences_t_2, sentences_t2_2, sentences_t3_2, sentences_t4_2, next_chars_2, next_chars_t_2 =  getSuffix(2, sentences, sentences_t, sentences_t2, sentences_t3, sentences_t4, next_chars, next_chars_t)

In [39]:
len(sentences_2), len(next_chars_2)

(1268, 1268)

In [40]:
X_test = vectorizeInput(sentences_2, sentences_t_2, sentences_t2_2, sentences_t3_2, sentences_t4_2, 
                        maxlen, num_features, chartoindice, 
                        divisor, divisor2, divisor3=86400, divisor4=7)

In [41]:
pred = model.predict(X_test, verbose=0)
pred_a = pred[0]
pred_t = pred[1]

In [42]:
pred[0].shape, pred[1].shape

((1268, 10), (1268, 1))

## Evaluate

### Activity

In [43]:
evalAct(next_chars_2, pred[0], targetchartoindice)

Log-loss: 0.791890852142
Accuracy: 74.3690851735%
Top 3 accuracy: 0.980283911672


### Time

In [48]:
evalTime(next_chars_t_2, pred_t, divisor)

Mean Squared Error: 3.13218006133e+11s	| 3625208.40432 days
Mean Absolute Error: 293667.710269s	| 3.39893183182 days
Median Absolute Error: 118244.105469s	| 1.36856603552 days
