This script takes the output of create_df as input and prepare data for training model

# Load data

In [1]:
import os
import argparse
import pandas as pd
import numpy as np
import sys
#from utils import * 

In [2]:
%pwd

'/home/hoang/Dropbox/process-sequence/data_preprocessing'

In [3]:
sys.path.insert(0, './../utils/')
from utils_ma import *

In [4]:
name = 'helpdesk'
parser = {
    'datafile': name + '.csv',
    'inputdir': './../input/{}/'.format(name),   
#    'utils': 'utils_1'
}

dirs = argparse.Namespace(**parser)

In [5]:
data = pd.read_csv(dirs.inputdir+'full_data_newfeatures.csv')
train = pd.read_csv(dirs.inputdir+'train_newfeatures.csv')
test = pd.read_csv(dirs.inputdir+'test_newfeatures.csv')

In [6]:
data = transformDf(data)
train = transformDf(train)
test = transformDf(test)

In [7]:
data['ActivityID'].unique()

array(['1.0', '8.0', '6.0', '3.0', '9.0', '2.0', '4.0', '5.0', '7.0'], dtype=object)

In [8]:
data['TimeSinceMidnight'].unique()

array([ 59432.,  62745.,  63886., ...,  77531.,   1811.,   7660.])

In [9]:
data.shape, train.shape, test.shape #13710, 9181, 4529

((13693, 15), (9167, 15), (4526, 15))

In [10]:
data.head()

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp,Duration,CumDuration,TimeSinceMidnight,WeekDay,SMA_16,STD_16,Upper_band_16,Lower_band_16,Band_value_16,RSI,Williams,MACD
0,8.0,1.0,2012-07-05 16:30:32,0.0,0.0,59432.0,3.0,8.0,118396.803377,24.0,-8.0,-0.5,49.999512,-100.0,1406.66673
1,8.0,8.0,2012-07-05 17:25:45,3313.0,3313.0,62745.0,3.0,8.0,113922.209586,24.0,-8.0,206.5625,43.746336,-99.27557,1224.258754
2,8.0,6.0,2012-07-05 17:44:46,1141.0,4454.0,63886.0,3.0,8.0,113897.428427,24.0,-8.0,70.8125,50.047826,-99.750506,923.71968
3,9.0,3.0,2010-05-07 21:02:21,0.0,0.0,75741.0,4.0,9.0,21398.527859,27.0,-9.0,-0.5,18.960277,-100.0,605.548176
4,9.0,1.0,2010-05-07 21:02:34,13.0,13.0,75754.0,4.0,9.0,21406.709379,27.0,-9.0,0.222222,49.947617,-99.979861,368.878283


# Parameters

Parameters for preprocessing

In [11]:
groupByCase = data.groupby(['CaseID'])

In [12]:
# define the denominator for normalization
divisor = data['Duration'].mean()
divisor2 = data['CumDuration'].mean()
di1 = data['SMA_16'].mean()
di2 = data['STD_16'].mean()
di3 = data['Band_value_16'].mean()
#di4 = data['Momentum'].mean()
di4 = data['RSI'].mean()
di5 = data['Williams'].mean()
di6 = data['MACD'].mean()

In [13]:
#find len of longest case
maxlen = findLongestLength(groupByCase)
maxlen

15

In [14]:
#define number of features
features = ['number_of_past_activitiy', 'duration', 'cumduration', 'time_from_midnight', 
            'Mon', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun',
           'rolling_mean', 'rolling_std', 'bands', 'rsi', 'william', 'macd']
num_features = len(data['ActivityID'].unique()) + len(features)
num_features

26

In [15]:
unique_chars = data['ActivityID'].unique().tolist()
target_chars = unique_chars + ['EOS']

In [16]:
chartoindice = char_indice_dict(unique_chars)
targetchartoindice = char_indice_dict(target_chars)

In [17]:
targetchartoindice

{'1.0': 0,
 '2.0': 5,
 '3.0': 3,
 '4.0': 6,
 '5.0': 7,
 '6.0': 2,
 '7.0': 8,
 '8.0': 1,
 '9.0': 4,
 'EOS': 9}

# Train

## Input

In [18]:
train_groupByCase = train.groupby(['CaseID'])

In [19]:
sentences, sentences_t, sentences_t2, sentences_t3, sentences_t4, ma_1, ma_2,  ma_3, ma_4, ma_5, ma_6 = getFeature(train_groupByCase)

In [20]:
len(sentences), len(sentences_t), len(sentences_t2), len(sentences_t3), len(sentences_t4)

(9167, 9167, 9167, 9167, 9167)

In [21]:
X = vectorizeInput(sentences, sentences_t, sentences_t2, sentences_t3, sentences_t4, ma_1, ma_2,  ma_3, ma_4, ma_5, ma_6, maxlen, num_features, chartoindice, divisor, divisor2,86400,7, di1, di2, di3, di4, di5, di6)

In [23]:
X.shape

(9167, 15, 26)

## Output

In [24]:
next_chars, next_chars_t, next_chars_t2, next_chars_t3, next_chars_t4 = getOutput(train_groupByCase)

In [25]:
len(next_chars), len(next_chars_t), len(next_chars_t2), len(next_chars_t3), len(next_chars_t4)

(9167, 9167, 9167, 9167, 9167)

### y_a

In [26]:
next_chars[:10]

['8.0', '6.0', 'EOS', '1.0', '8.0', '6.0', 'EOS', '8.0', '6.0', 'EOS']

In [27]:
y_a = one_hot_encode(next_chars, targetchartoindice)

In [28]:
y_a.shape

(9167, 10)

In [29]:
y_a[:10]

array([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]], dtype=float32)

### y_t 

In [30]:
y_t = normalize(next_chars_t, divisor)

In [31]:
y_t

array([ 0.01569243,  0.00540449,  0.        , ...,  1.33094816,
        1.21476356,  0.        ])

In [32]:
y_t.shape

(9167,)

# Test

In [33]:
test_groupByCase = test.groupby(['CaseID'])

In [34]:
df_test = pd.DataFrame(columns=['CaseID', 'ActivityID', 'CompleteTimestamp', 'Duration', 'CumDuration', 'TimeSinceMidnight', 'WeekDay'])
for case, group in test_groupByCase:
    if group.shape[0] > 1:     
        df_test = df_test.append(group)

In [35]:
df_test.shape #No case with one activity

(4526, 15)

In [36]:
test.shape

(4526, 15)

## Input

In [37]:
sentences, sentences_t, sentences_t2, sentences_t3, sentences_t4, ma_1, ma_2,  ma_3, ma_4, ma_5, ma_6 = getFeature(test_groupByCase)

In [38]:
len(sentences), len(sentences_t), len(sentences_t2), len(sentences_t3), len(sentences_t4)

(4526, 4526, 4526, 4526, 4526)

In [39]:
X_test = vectorizeInput(sentences, sentences_t, sentences_t2, sentences_t3, sentences_t4,  ma_1, ma_2,  ma_3, ma_4, ma_5, ma_6, maxlen, num_features, chartoindice, divisor, divisor2, 86400, 7, di1, di2, di3, di4, di5, di6)

In [40]:
X_test.shape

(4526, 15, 26)

## Output

In [41]:
next_chars, next_chars_t, next_chars_t2, next_chars_t3, next_chars_t4 = getOutput(test_groupByCase)

In [42]:
len(next_chars), len(next_chars_t), len(next_chars_t2), len(next_chars_t3), len(next_chars_t4)

(4526, 4526, 4526, 4526, 4526)

### y_a

One-hot encode for y_a

In [43]:
next_chars[:10]

['1.0', '8.0', '9.0', '8.0', '6.0', 'EOS', '8.0', '9.0', '6.0', 'EOS']

In [44]:
y_a_test = one_hot_encode(next_chars, targetchartoindice)

In [45]:
y_a_test.shape

(4526, 10)

In [46]:
y_a[:10]

array([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]], dtype=float32)

### y_t

Scale/Normalize data. This can be done by using [sklearn](http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-scaler)

In [47]:
y_t_test = normalize(next_chars_t, divisor)

In [48]:
y_t_test

array([  1.05385103e-01,   6.63127082e-05,   5.62163142e+00, ...,
         2.96601587e+00,   2.77045022e-02,   0.00000000e+00])

In [49]:
y_t_test.shape

(4526,)

# Save data

In [50]:
import pickle

In [51]:
with open(dirs.inputdir + 'parameters_newfeatures.pkl', 'wb') as f:
    pickle.dump(maxlen, f, protocol=2)
    pickle.dump(num_features, f, protocol=2)
    pickle.dump(chartoindice, f, protocol=2)
    pickle.dump(targetchartoindice, f, protocol=2)
    pickle.dump(divisor, f, protocol=2)
    pickle.dump(divisor2, f, protocol=2)

In [52]:
with open(dirs.inputdir + 'preprocessed_data_newfeatures.pkl', 'wb') as f:
    pickle.dump(X, f, protocol=2)
    pickle.dump(y_a, f, protocol=2)
    pickle.dump(y_t, f, protocol=2)
    pickle.dump(X_test, f, protocol=2)
    pickle.dump(y_a_test, f, protocol=2)
    pickle.dump(y_t_test, f, protocol=2)