This script takes the output of create_df as input and prepare data for training model

# Load data

In [1]:
import os
import argparse
import pandas as pd
import numpy as np
from utils import * 

In [2]:
name = 'helpdesk'
args = {
    'datafile': name + '.csv',
    'inputdir': '../input/{}/'.format(name),   
}

args = argparse.Namespace(**args)

In [3]:
data = pd.read_csv(args.inputdir+'full_data.csv')
train = pd.read_csv(args.inputdir+'train.csv')
test = pd.read_csv(args.inputdir+'test.csv')

In [4]:
data = transformDf(data)
train = transformDf(train)
test = transformDf(test)

In [5]:
data['ActivityID'].unique()

array(['1.0', '8.0', '6.0', '3.0', '9.0', '2.0', '4.0', '5.0', '7.0'], dtype=object)

In [6]:
data['TimeSinceMidnight'].unique()

array([ 60938.,  60953.,  62152., ...,  77531.,   1811.,   7660.])

In [7]:
data.shape, train.shape, test.shape #13710, 9181, 4529

((13710, 7), (9181, 7), (4529, 7))

# Parameters

Parameters for preprocessing

In [8]:
groupByCase = data.groupby(['CaseID'])

In [9]:
# define the denominator for normalization
divisor = data['Duration'].mean()
divisor2 = data['CumDuration'].mean()

In [10]:
#find len of longest case
maxlen = findLongestLength(groupByCase)
maxlen

15

In [11]:
#define number of features
num_features = len(data['ActivityID'].unique()) + 5
num_features

14

In [12]:
unique_chars = data['ActivityID'].unique().tolist()
target_chars = unique_chars + ['EOS']

In [13]:
chartoindice = char_indice_dict(unique_chars)
targetchartoindice = char_indice_dict(target_chars)

In [14]:
targetchartoindice

{'1.0': 0,
 '2.0': 5,
 '3.0': 3,
 '4.0': 6,
 '5.0': 7,
 '6.0': 2,
 '7.0': 8,
 '8.0': 1,
 '9.0': 4,
 'EOS': 9}

# Train

## Input

In [15]:
train_groupByCase = train.groupby(['CaseID'])

In [16]:
sentences, sentences_t, sentences_t2, sentences_t3, sentences_t4 = getFeature(train_groupByCase)

In [17]:
len(sentences), len(sentences_t), len(sentences_t2), len(sentences_t3), len(sentences_t4)

(9181, 9181, 9181, 9181, 9181)

In [18]:
X = vectorizeInput(train_groupByCase, maxlen, num_features, chartoindice, divisor, divisor2)

In [19]:
X.shape

(9181, 15, 14)

## Output

In [20]:
next_chars, next_chars_t, next_chars_t2, next_chars_t3, next_chars_t4 = getOutput(train_groupByCase)

In [21]:
len(next_chars), len(next_chars_t), len(next_chars_t2), len(next_chars_t3), len(next_chars_t4)

(9181, 9181, 9181, 9181, 9181)

### y_a

In [22]:
next_chars[:10]

['8.0', '6.0', 'EOS', '8.0', '6.0', 'EOS', '8.0', '6.0', 'EOS', '8.0']

In [23]:
y_a = one_hot_encode(train_groupByCase, targetchartoindice)

In [24]:
y_a.shape

(9181, 10)

In [25]:
y_a[:10]

array([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]], dtype=float32)

### y_t 

In [26]:
next_chars_t = np.asarray(next_chars_t)

In [27]:
y_t = normalize(train_groupByCase, divisor)

In [28]:
y_t

array([  7.11185218e-05,   8.24970111e-01,   0.00000000e+00, ...,
         7.11185218e-05,   4.12487426e-04,   0.00000000e+00])

In [29]:
y_t.shape

(9181,)

# Test

In [30]:
test_groupByCase = test.groupby(['CaseID'])

In [31]:
df_test = pd.DataFrame(columns=['CaseID', 'ActivityID', 'CompleteTimestamp', 'Duration', 'CumDuration', 'TimeSinceMidnight', 'WeekDay'])
for case, group in test_groupByCase:
    if group.shape[0] > 1:     
        df_test = df_test.append(group)

In [32]:
df_test.shape #No case with one activity

(4529, 7)

## Input

In [33]:
sentences, sentences_t, sentences_t2, sentences_t3, sentences_t4 = getFeature(test_groupByCase)

In [34]:
len(sentences), len(sentences_t), len(sentences_t2), len(sentences_t3), len(sentences_t4)

(4529, 4529, 4529, 4529, 4529)

In [35]:
X_test = vectorizeInput(test_groupByCase, maxlen, num_features, chartoindice, divisor, divisor2)

In [36]:
X.shape

(9181, 15, 14)

## Output

In [37]:
next_chars, next_chars_t, next_chars_t2, next_chars_t3, next_chars_t4 = getOutput(test_groupByCase)

In [38]:
len(next_chars), len(next_chars_t), len(next_chars_t2), len(next_chars_t3), len(next_chars_t4)

(4529, 4529, 4529, 4529, 4529)

### y_a

One-hot encode for y_a

In [39]:
next_chars[:10]

['8.0', '6.0', 'EOS', '1.0', '8.0', '9.0', '8.0', '6.0', 'EOS', '8.0']

In [40]:
y_a_test = one_hot_encode(test_groupByCase, targetchartoindice)

In [41]:
y_a_test.shape

(4529, 10)

In [42]:
y_a[:10]

array([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]], dtype=float32)

### y_t

Scale/Normalize data. This can be done by using [sklearn](http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-scaler)

In [43]:
y_t_test = normalize(test_groupByCase, divisor)

In [44]:
y_t_test

array([ 1.3322443 ,  1.21594656,  0.        , ...,  2.96890433,
        0.02773148,  0.        ])

In [45]:
y_t_test.shape

(4529,)

# Save data

In [46]:
import pickle

In [47]:
with open(args.inputdir + 'parameters.pkl', 'wb') as f:
    pickle.dump(maxlen, f, protocol=2)
    pickle.dump(num_features, f, protocol=2)
    pickle.dump(chartoindice, f, protocol=2)
    pickle.dump(targetchartoindice, f, protocol=2)
    pickle.dump(divisor, f, protocol=2)
    pickle.dump(divisor2, f, protocol=2)

In [48]:
with open(args.inputdir + 'preprocessed_data.pkl', 'wb') as f:
    pickle.dump(X, f, protocol=2)
    pickle.dump(y_a, f, protocol=2)
    pickle.dump(y_t, f, protocol=2)
    pickle.dump(X_test, f, protocol=2)
    pickle.dump(y_a_test, f, protocol=2)
    pickle.dump(y_t_test, f, protocol=2)