This script takes the output of create_df as input and prepare data for training model

# Load data

In [1]:
import os
import argparse
import pandas as pd
import numpy as np
#from utils import * 

In [2]:
%cd ../utils/
from utils_1 import *

/Users/hoangnguyen/Dropbox/process-sequence/utils


In [48]:
name = 'helpdesk'
parser = {
    'datafile': name + '.csv',
    'inputdir': './../input/{}/'.format(name),   
    'utils': 'utils_1'
}

dirs = argparse.Namespace(**parser)

In [4]:
data = pd.read_csv(dirs.inputdir+'full_data.csv')
train = pd.read_csv(dirs.inputdir+'train.csv')
test = pd.read_csv(dirs.inputdir+'test.csv')

In [5]:
data = transformDf(data)
train = transformDf(train)
test = transformDf(test)

In [6]:
data['ActivityID'].unique()

array(['1.0', '8.0', '6.0', '3.0', '9.0', '2.0', '4.0', '5.0', '7.0'], dtype=object)

In [7]:
data['TimeSinceMidnight'].unique()

array([ 60938.,  60953.,  62152., ...,  77531.,   1811.,   7660.])

In [8]:
data.shape, train.shape, test.shape #13710, 9181, 4529

((13710, 7), (9181, 7), (4529, 7))

# Parameters

Parameters for preprocessing

In [9]:
groupByCase = data.groupby(['CaseID'])

In [10]:
# define the denominator for normalization
divisor = data['Duration'].mean()
divisor2 = data['CumDuration'].mean()

In [11]:
#find len of longest case
maxlen = findLongestLength(groupByCase)
maxlen

15

In [12]:
#define number of features
if dirs.utils == 'utils':
    features = ['number_of_past_activitiy', 'duration', 'cumduration', 'time_from_midnight', 'day_of_week']
else:
    features = ['number_of_past_activitiy', 'duration', 'cumduration', 'time_from_midnight', 
              'Mon', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun']
num_features = len(data['ActivityID'].unique()) + len(features)
num_features

20

In [13]:
unique_chars = data['ActivityID'].unique().tolist()
target_chars = unique_chars + ['EOS']

In [14]:
chartoindice = char_indice_dict(unique_chars)
targetchartoindice = char_indice_dict(target_chars)

In [15]:
targetchartoindice

{'1.0': 0,
 '2.0': 5,
 '3.0': 3,
 '4.0': 6,
 '5.0': 7,
 '6.0': 2,
 '7.0': 8,
 '8.0': 1,
 '9.0': 4,
 'EOS': 9}

# Train

## Input

In [16]:
train_groupByCase = train.groupby(['CaseID'])

In [17]:
sentences, sentences_t, sentences_t2, sentences_t3, sentences_t4 = getFeature(train_groupByCase)

In [18]:
len(sentences), len(sentences_t), len(sentences_t2), len(sentences_t3), len(sentences_t4)

(9181, 9181, 9181, 9181, 9181)

In [19]:
X = vectorizeInput(sentences, sentences_t, sentences_t2, sentences_t3, sentences_t4, maxlen, num_features, chartoindice, divisor, divisor2,divisor3=86400,divisor4=7)

In [20]:
X.shape

(9181, 15, 20)

## Output

In [21]:
next_chars, next_chars_t, next_chars_t2, next_chars_t3, next_chars_t4 = getOutput(train_groupByCase)

In [22]:
len(next_chars), len(next_chars_t), len(next_chars_t2), len(next_chars_t3), len(next_chars_t4)

(9181, 9181, 9181, 9181, 9181)

### y_a

In [23]:
next_chars[:10]

['8.0', '6.0', 'EOS', '8.0', '6.0', 'EOS', '8.0', '6.0', 'EOS', '8.0']

In [24]:
next_chars, next_chars_t, next_chars_t2, next_chars_t3, next_chars_t4 = getOutput(groupByCase)

In [25]:
y_a = one_hot_encode(next_chars, targetchartoindice)

In [26]:
y_a.shape

(13710, 10)

In [27]:
y_a[:10]

array([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]], dtype=float32)

### y_t 

In [28]:
y_t = normalize(next_chars_t, divisor)

In [29]:
y_t

array([  7.11185218e-05,   8.24970111e-01,   0.00000000e+00, ...,
         2.96890433e+00,   2.77314823e-02,   0.00000000e+00])

In [30]:
y_t.shape

(13710,)

# Test

In [31]:
test_groupByCase = test.groupby(['CaseID'])

In [32]:
df_test = pd.DataFrame(columns=['CaseID', 'ActivityID', 'CompleteTimestamp', 'Duration', 'CumDuration', 'TimeSinceMidnight', 'WeekDay'])
for case, group in test_groupByCase:
    if group.shape[0] > 1:     
        df_test = df_test.append(group)

In [33]:
df_test.shape #No case with one activity

(4529, 7)

In [34]:
test.shape

(4529, 7)

## Input

In [35]:
sentences, sentences_t, sentences_t2, sentences_t3, sentences_t4 = getFeature(test_groupByCase)

In [36]:
len(sentences), len(sentences_t), len(sentences_t2), len(sentences_t3), len(sentences_t4)

(4529, 4529, 4529, 4529, 4529)

In [37]:
X_test = vectorizeInput(sentences, sentences_t, sentences_t2, sentences_t3, sentences_t4, maxlen, num_features, chartoindice, divisor, divisor2)

In [38]:
X.shape

(9181, 15, 20)

## Output

In [39]:
next_chars, next_chars_t, next_chars_t2, next_chars_t3, next_chars_t4 = getOutput(test_groupByCase)

In [40]:
len(next_chars), len(next_chars_t), len(next_chars_t2), len(next_chars_t3), len(next_chars_t4)

(4529, 4529, 4529, 4529, 4529)

### y_a

One-hot encode for y_a

In [41]:
next_chars[:10]

['8.0', '6.0', 'EOS', '1.0', '8.0', '9.0', '8.0', '6.0', 'EOS', '8.0']

In [42]:
y_a_test = one_hot_encode(next_chars, targetchartoindice)

In [43]:
y_a_test.shape

(4529, 10)

In [44]:
y_a[:10]

array([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]], dtype=float32)

### y_t

Scale/Normalize data. This can be done by using [sklearn](http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-scaler)

In [45]:
y_t_test = normalize(next_chars_t, divisor)

In [46]:
y_t_test

array([ 1.3322443 ,  1.21594656,  0.        , ...,  2.96890433,
        0.02773148,  0.        ])

In [47]:
y_t_test.shape

(4529,)

# Save data

In [None]:
import pickle

In [None]:
with open(dirs.inputdir + 'parameters.pkl', 'wb') as f:
    pickle.dump(maxlen, f, protocol=2)
    pickle.dump(num_features, f, protocol=2)
    pickle.dump(chartoindice, f, protocol=2)
    pickle.dump(targetchartoindice, f, protocol=2)
    pickle.dump(divisor, f, protocol=2)
    pickle.dump(divisor2, f, protocol=2)

In [None]:
with open(dirs.inputdir + 'preprocessed_data.pkl', 'wb') as f:
    pickle.dump(X, f, protocol=2)
    pickle.dump(y_a, f, protocol=2)
    pickle.dump(y_t, f, protocol=2)
    pickle.dump(X_test, f, protocol=2)
    pickle.dump(y_a_test, f, protocol=2)
    pickle.dump(y_t_test, f, protocol=2)