In [1]:
import os
import argparse
import pandas as pd
import numpy as np

In [2]:
from dateutil.parser import parse
from datetime import datetime
import time

pd.options.mode.chained_assignment = None #to run loop quicker without warnings

In [3]:
name = 'bpi_12_w_no_repeat'
args = {
    'datadir': '../data/',
    'datafile': name + '.csv',
    'inputdir': '../input/{}/'.format(name),   
}

args = argparse.Namespace(**args)

In [38]:
if not os.path.isdir('../input/'):
    os.makedirs('../input/')
    
if not os.path.isdir(args.inputdir):
    os.makedirs(args.inputdir)

In [5]:
args.datafile

'bpi_12_w_no_repeat.csv'

# Preprocessing data

In [6]:
data = pd.read_csv(args.datadir + args.datafile)
data['CompleteTimestamp'] = pd.to_datetime(data['CompleteTimestamp'], errors='coerce')

In [7]:
data.head()

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp
0,173688,3,2011-10-01 19:45:13
1,173688,5,2011-10-01 20:17:08
2,173688,6,2011-10-13 18:37:37
3,173691,3,2011-10-01 19:43:13
4,173691,5,2011-10-01 22:36:25


In [8]:
data.shape

(29410, 3)

In [9]:
def calculateDuration(df):
    df['Duration'] = (df['CompleteTimestamp'] - df['CompleteTimestamp'].shift(1)).fillna(0)
    return df

def calculateCumDuration(df):
    df['CumDuration'] = (df['CompleteTimestamp'] - df['CompleteTimestamp'].iloc[0]).fillna(0) 
    #change df['CompleteTimestamp'][0] --> df['CompleteTimestamp'].iloc[0]
    return df

def calculateTimeSinceMidNight(x):
    x = str(x)
    x = time.strptime(x, "%Y-%m-%d %H:%M:%S")
    midnight = datetime.fromtimestamp(time.mktime(x)).replace(hour=0, minute=0, second=0, microsecond=0)
    timesincemidnight = datetime.fromtimestamp(time.mktime(x))-midnight
    return timesincemidnight.seconds

def convert2seconds(x):
    x = int(x.total_seconds())
    return x

In [10]:
groupByCase = data.groupby(['CaseID'])

In [11]:
len(groupByCase)

9658

In [12]:
df = pd.DataFrame(columns=['CaseID', 'ActivityID', 'CompleteTimestamp', 'Duration', 'CumDuration', 'TimeSinceMidnight', 'WeekDay'])
#Loop all group and apply above functions
for case, group in groupByCase:
    group = calculateDuration(group)
    group = calculateCumDuration(group)
    group['Duration'] = group['Duration'].apply(convert2seconds)
    group['CumDuration'] = group['CumDuration'].apply(convert2seconds)
    group['TimeSinceMidnight'] = group['CompleteTimestamp'].apply(calculateTimeSinceMidNight)
    group['WeekDay'] = group['CompleteTimestamp'].dt.dayofweek
    
    df = df.append(group)

In [13]:
df.head(10)

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp,Duration,CumDuration,TimeSinceMidnight,WeekDay
0,173688.0,3.0,2011-10-01 19:45:13,0.0,0.0,71113.0,5.0
1,173688.0,5.0,2011-10-01 20:17:08,1915.0,1915.0,73028.0,5.0
2,173688.0,6.0,2011-10-13 18:37:37,1030829.0,1032744.0,67057.0,3.0
3,173691.0,3.0,2011-10-01 19:43:13,0.0,0.0,70993.0,5.0
4,173691.0,5.0,2011-10-01 22:36:25,10392.0,10392.0,81385.0,5.0
5,173691.0,6.0,2011-10-10 19:30:54,766469.0,776861.0,70254.0,0.0
6,173691.0,6.0,2011-10-10 22:17:34,10000.0,786861.0,80254.0,0.0
7,173694.0,3.0,2011-10-01 19:35:59,0.0,0.0,70559.0,5.0
8,173694.0,5.0,2011-10-03 21:44:21,180502.0,180502.0,78261.0,0.0
9,173694.0,6.0,2011-11-05 01:05:01,2776840.0,2957342.0,3901.0,5.0


In [14]:
list(df)

['CaseID',
 'ActivityID',
 'CompleteTimestamp',
 'Duration',
 'CumDuration',
 'TimeSinceMidnight',
 'WeekDay']

In [15]:
for i in list(df):
    if i != 'CompleteTimestamp':
        df[i] = df[i].apply(int)

In [16]:
df.head(10)

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp,Duration,CumDuration,TimeSinceMidnight,WeekDay
0,173688,3,2011-10-01 19:45:13,0,0,71113,5
1,173688,5,2011-10-01 20:17:08,1915,1915,73028,5
2,173688,6,2011-10-13 18:37:37,1030829,1032744,67057,3
3,173691,3,2011-10-01 19:43:13,0,0,70993,5
4,173691,5,2011-10-01 22:36:25,10392,10392,81385,5
5,173691,6,2011-10-10 19:30:54,766469,776861,70254,0
6,173691,6,2011-10-10 22:17:34,10000,786861,80254,0
7,173694,3,2011-10-01 19:35:59,0,0,70559,5
8,173694,5,2011-10-03 21:44:21,180502,180502,78261,0
9,173694,6,2011-11-05 01:05:01,2776840,2957342,3901,5


In [17]:
df.shape

(29410, 7)

In [18]:
df.to_csv(args.inputdir+'full_data.csv', index=False)

In [19]:
df = pd.read_csv(args.inputdir+'full_data.csv')

In [20]:
df.head()

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp,Duration,CumDuration,TimeSinceMidnight,WeekDay
0,173688,3,2011-10-01 19:45:13,0,0,71113,5
1,173688,5,2011-10-01 20:17:08,1915,1915,73028,5
2,173688,6,2011-10-13 18:37:37,1030829,1032744,67057,3
3,173691,3,2011-10-01 19:43:13,0,0,70993,5
4,173691,5,2011-10-01 22:36:25,10392,10392,81385,5


# Split data into train and test

In [21]:
groupByCase = df.groupby(['CaseID'])

In [22]:
num_cases = len(groupByCase)
train_size = int(num_cases*2/3)
test_size = num_cases - train_size

In [23]:
num_cases, train_size, test_size

(9658, 6438, 3220)

In [24]:
df_train = pd.DataFrame(columns=['CaseID', 'ActivityID', 'CompleteTimestamp', 'Duration', 'CumDuration', 'TimeSinceMidnight', 'WeekDay'])
df_test = pd.DataFrame(columns=['CaseID', 'ActivityID', 'CompleteTimestamp', 'Duration', 'CumDuration', 'TimeSinceMidnight', 'WeekDay'])

for i, (case, group) in enumerate(groupByCase):
    if i < train_size:     
        df_train = df_train.append(group)
    else:
        df_test = df_test.append(group)

In [25]:
df.shape, df_train.shape, df_test.shape

((29410, 7), (19460, 7), (9950, 7))

In [26]:
trainGroupby = df_train.groupby(['CaseID'])
testGroupby = df_test.groupby(['CaseID'])

In [27]:
len(trainGroupby), len(testGroupby)

(6438, 3220)

In [28]:
df_train.to_csv(args.inputdir+'train.csv', index=False)
df_test.to_csv(args.inputdir+'test.csv', index=False)

# Test

In [29]:
test = data[:10]
#data['CompleteTimestamp'] = data['CompleteTimestamp'].apply(parse)
test['CompleteTimestamp'] = pd.to_datetime(test['CompleteTimestamp'])

In [30]:
test

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp
0,173688,3,2011-10-01 19:45:13
1,173688,5,2011-10-01 20:17:08
2,173688,6,2011-10-13 18:37:37
3,173691,3,2011-10-01 19:43:13
4,173691,5,2011-10-01 22:36:25
5,173691,6,2011-10-10 19:30:54
6,173691,6,2011-10-10 22:17:34
7,173694,3,2011-10-01 19:35:59
8,173694,5,2011-10-03 21:44:21
9,173694,6,2011-11-05 01:05:01


In [31]:
type(test['CompleteTimestamp'])

pandas.core.series.Series

In [32]:
test = calculateDuration(test)
test = calculateCumDuration(test)

In [33]:
test['Duration'] = test['Duration'].apply(convert2seconds)
test['CumDuration'] = test['CumDuration'].apply(convert2seconds)

In [34]:
test['Timefrommidnight'] = test['CompleteTimestamp'].apply(calculateTimeSinceMidNight)

In [35]:
test['WeekDay'] = test['CompleteTimestamp'].dt.dayofweek

In [36]:
test['CaseID'] = test['CaseID'].apply(float)

In [37]:
test

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp,Duration,CumDuration,Timefrommidnight,WeekDay
0,173688.0,3,2011-10-01 19:45:13,0,0,71113,5
1,173688.0,5,2011-10-01 20:17:08,1915,1915,73028,5
2,173688.0,6,2011-10-13 18:37:37,1030829,1032744,67057,3
3,173691.0,3,2011-10-01 19:43:13,-1032864,-120,70993,5
4,173691.0,5,2011-10-01 22:36:25,10392,10272,81385,5
5,173691.0,6,2011-10-10 19:30:54,766469,776741,70254,0
6,173691.0,6,2011-10-10 22:17:34,10000,786741,80254,0
7,173694.0,3,2011-10-01 19:35:59,-787295,-554,70559,5
8,173694.0,5,2011-10-03 21:44:21,180502,179948,78261,0
9,173694.0,6,2011-11-05 01:05:01,2776840,2956788,3901,5


Another way to get ```Duration```:

```python
import pandas as pd
from dateutil.parser import parse
data = pd.read_csv('../data/helpdesk.csv')
df = data[:10]
for i in range(0, len(df)-1):
    starting = parse(df.loc[i,'CompleteTimestamp'])
    ending = parse(df.loc[i+1,'CompleteTimestamp'])
    df.loc[i+1,'Duration'] = (ending-starting).total_seconds()

df.fillna(0)
```