This script returns a dataframe with all information needed for analysis and later used for preparing input

In [1]:
import os
import argparse
import pandas as pd
import numpy as np
import sys

In [2]:
from dateutil.parser import parse
from datetime import datetime
import time

pd.options.mode.chained_assignment = None #to run loop quicker without warnings

In [3]:
name = 'helpdesk'
args = {
    'datadir': '../data/',
    'datafile': name + '.csv',
    'inputdir': '../input/{}/'.format(name),   
}

args = argparse.Namespace(**args)

In [4]:
if not os.path.isdir('../input/'):
    os.makedirs('../input/')
    
if not os.path.isdir(args.inputdir):
    os.makedirs(args.inputdir)

In [5]:
args.datafile

'helpdesk.csv'

# Preprocessing data

## Get time features

In [6]:
data = pd.read_csv(args.datadir + args.datafile)
data['CompleteTimestamp'] = pd.to_datetime(data['CompleteTimestamp'], errors='coerce')

In [7]:
data.head()

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp
0,2,1,2012-04-03 16:55:38
1,2,8,2012-04-03 16:55:53
2,2,6,2012-04-05 17:15:52
3,3,1,2010-10-29 18:14:06
4,3,8,2010-11-04 01:16:11


In [8]:
data.shape

(13710, 3)

In [9]:
def calculateDuration(df):
    df['Duration'] = (df['CompleteTimestamp'] - df['CompleteTimestamp'].shift(1)).fillna(0)
    return df

def calculateCumDuration(df):
    df['CumDuration'] = (df['CompleteTimestamp'] - df['CompleteTimestamp'].iloc[0]).fillna(0) 
    #change df['CompleteTimestamp'][0] --> df['CompleteTimestamp'].iloc[0]
    return df

def calculateTimeSinceMidNight(x):
    x = str(x)
    x = time.strptime(x, "%Y-%m-%d %H:%M:%S")
    midnight = datetime.fromtimestamp(time.mktime(x)).replace(hour=0, minute=0, second=0, microsecond=0)
    timesincemidnight = datetime.fromtimestamp(time.mktime(x))-midnight
    return timesincemidnight.seconds

def convert2seconds(x):
    x = int(x.total_seconds())
    return x

In [10]:
groupByCase = data.groupby(['CaseID'])

In [11]:
len(groupByCase)

3804

In [12]:
df = pd.DataFrame(columns=['CaseID', 'ActivityID', 'CompleteTimestamp', 'Duration', 'CumDuration', 'TimeSinceMidnight', 'WeekDay'])
#Loop all group and apply above functions
for case, group in groupByCase:
    group = calculateDuration(group)
    group = calculateCumDuration(group)
    group['Duration'] = group['Duration'].apply(convert2seconds)
    group['CumDuration'] = group['CumDuration'].apply(convert2seconds)
    group['TimeSinceMidnight'] = group['CompleteTimestamp'].apply(calculateTimeSinceMidNight)
    group['WeekDay'] = group['CompleteTimestamp'].dt.dayofweek
    
    df = df.append(group)

In [13]:
df.head(10)

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp,Duration,CumDuration,TimeSinceMidnight,WeekDay
0,2.0,1.0,2012-04-03 16:55:38,0.0,0.0,60938.0,1.0
1,2.0,8.0,2012-04-03 16:55:53,15.0,15.0,60953.0,1.0
2,2.0,6.0,2012-04-05 17:15:52,173999.0,174014.0,62152.0,3.0
3,3.0,1.0,2010-10-29 18:14:06,0.0,0.0,65646.0,4.0
4,3.0,8.0,2010-11-04 01:16:11,457325.0,457325.0,4571.0,3.0
5,3.0,6.0,2010-11-04 01:21:17,306.0,457631.0,4877.0,3.0
6,4.0,1.0,2010-12-15 23:31:53,0.0,0.0,84713.0,2.0
7,4.0,8.0,2010-12-16 17:01:07,62954.0,62954.0,61267.0,3.0
8,4.0,6.0,2010-12-16 17:08:19,432.0,63386.0,61699.0,3.0
9,5.0,1.0,2012-04-03 21:08:32,0.0,0.0,76112.0,1.0


In [14]:
list(df)

['CaseID',
 'ActivityID',
 'CompleteTimestamp',
 'Duration',
 'CumDuration',
 'TimeSinceMidnight',
 'WeekDay']

In [15]:
for i in list(df):
    if i != 'CompleteTimestamp':
        df[i] = df[i].apply(int)

In [16]:
#df.to_csv(args.inputdir+'full_data.csv', index=False)

In [17]:
#df = pd.read_csv(args.inputdir+'full_data.csv')

## Add new features

In [18]:
sys.path.insert(0, '../utils/')
from indicators import *

In [19]:
df = df.rename(index=str, columns={"Duration": "Price"})

In [20]:
window = 16

In [21]:
# get features
df = compute_rolling_mean(df, window)
df = compute_rolling_std(df, window)
df = compute_bollinger_bands(df, window)
#df = compute_momentum(df)
df = compute_rsi(df, window)
df = compute_william_percent_r(df, window)
df = compute_macd(df)
#data = compute_daily_return(data)
#data = compute_cumulative_return(data)
fill_missing_values(data)

In [22]:
df = df.iloc[window+1:]

In [23]:
nans = pd.isnull(df).sum()
nans[nans>0]

Series([], dtype: int64)

In [24]:
df.head()

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp,Price,CumDuration,TimeSinceMidnight,WeekDay,SMA_16,STD_16,Upper_band_16,Lower_band_16,Band_value_16,RSI,Williams,MACD
17,8,1,2012-07-05 16:30:32,0,0,59432,3,8,118396.803377,24,-8,-0.5,49.999512,-100.0,1406.66673
18,8,8,2012-07-05 17:25:45,3313,3313,62745,3,8,113922.209586,24,-8,206.562,43.746336,-99.27557,1224.258754
19,8,6,2012-07-05 17:44:46,1141,4454,63886,3,8,113897.428427,24,-8,70.8125,50.047826,-99.750506,923.71968
20,9,3,2010-05-07 21:02:21,0,0,75741,4,9,21398.527859,27,-9,-0.5,18.960277,-100.0,605.548176
21,9,1,2010-05-07 21:02:34,13,13,75754,4,9,21406.709379,27,-9,0.222222,49.947617,-99.979861,368.878283


In [25]:
df = df.rename(index=str, columns={"Price": "Duration"})
df.to_csv(args.inputdir+'full_data_newfeatures.csv', index=False)

In [26]:
df.head()

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp,Duration,CumDuration,TimeSinceMidnight,WeekDay,SMA_16,STD_16,Upper_band_16,Lower_band_16,Band_value_16,RSI,Williams,MACD
17,8,1,2012-07-05 16:30:32,0,0,59432,3,8,118396.803377,24,-8,-0.5,49.999512,-100.0,1406.66673
18,8,8,2012-07-05 17:25:45,3313,3313,62745,3,8,113922.209586,24,-8,206.562,43.746336,-99.27557,1224.258754
19,8,6,2012-07-05 17:44:46,1141,4454,63886,3,8,113897.428427,24,-8,70.8125,50.047826,-99.750506,923.71968
20,9,3,2010-05-07 21:02:21,0,0,75741,4,9,21398.527859,27,-9,-0.5,18.960277,-100.0,605.548176
21,9,1,2010-05-07 21:02:34,13,13,75754,4,9,21406.709379,27,-9,0.222222,49.947617,-99.979861,368.878283


In [27]:
df.shape

(13693, 15)

# Split data into train and test

In [28]:
groupByCase = df.groupby(['CaseID'])

In [29]:
num_cases = len(groupByCase)
train_size = int(num_cases*2/3)
test_size = num_cases - train_size

In [30]:
num_cases, train_size, test_size

(3799, 2532, 1267)

In [31]:
df_train = pd.DataFrame(columns=['CaseID', 'ActivityID', 'CompleteTimestamp', 'Duration', 'CumDuration', 'TimeSinceMidnight', 'WeekDay'])
df_test = pd.DataFrame(columns=['CaseID', 'ActivityID', 'CompleteTimestamp', 'Duration', 'CumDuration', 'TimeSinceMidnight', 'WeekDay'])

for i, (case, group) in enumerate(groupByCase):
    if i < train_size:     
        df_train = df_train.append(group)
    else:
        df_test = df_test.append(group)

In [32]:
df.shape, df_train.shape, df_test.shape

((13693, 15), (9167, 15), (4526, 15))

In [33]:
trainGroupby = df_train.groupby(['CaseID'])
testGroupby = df_test.groupby(['CaseID'])

In [34]:
len(trainGroupby), len(testGroupby)

(2532, 1267)

In [35]:
df_train.to_csv(args.inputdir+'train_newfeatures.csv', index=False)
df_test.to_csv(args.inputdir+'test_newfeatures.csv', index=False)

In [36]:
df_train.head()

Unnamed: 0,ActivityID,Band_value_16,CaseID,CompleteTimestamp,CumDuration,Duration,Lower_band_16,MACD,RSI,SMA_16,STD_16,TimeSinceMidnight,Upper_band_16,WeekDay,Williams
17,1.0,-0.5,8.0,2012-07-05 16:30:32,0.0,0.0,-8,1406.66673,49.999512,8,118396.803377,59432.0,24,3.0,-100.0
18,8.0,206.562,8.0,2012-07-05 17:25:45,3313.0,3313.0,-8,1224.258754,43.746336,8,113922.209586,62745.0,24,3.0,-99.27557
19,6.0,70.8125,8.0,2012-07-05 17:44:46,4454.0,1141.0,-8,923.71968,50.047826,8,113897.428427,63886.0,24,3.0,-99.750506
20,3.0,-0.5,9.0,2010-05-07 21:02:21,0.0,0.0,-9,605.548176,18.960277,9,21398.527859,75741.0,27,4.0,-100.0
21,1.0,0.222222,9.0,2010-05-07 21:02:34,13.0,13.0,-9,368.878283,49.947617,9,21406.709379,75754.0,27,4.0,-99.979861


# Test

In [37]:
test = data[:10]
#data['CompleteTimestamp'] = data['CompleteTimestamp'].apply(parse)
test['CompleteTimestamp'] = pd.to_datetime(test['CompleteTimestamp'])

In [38]:
test

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp
0,2,1,2012-04-03 16:55:38
1,2,8,2012-04-03 16:55:53
2,2,6,2012-04-05 17:15:52
3,3,1,2010-10-29 18:14:06
4,3,8,2010-11-04 01:16:11
5,3,6,2010-11-04 01:21:17
6,4,1,2010-12-15 23:31:53
7,4,8,2010-12-16 17:01:07
8,4,6,2010-12-16 17:08:19
9,5,1,2012-04-03 21:08:32


In [39]:
type(test['CompleteTimestamp'])

pandas.core.series.Series

In [40]:
test = calculateDuration(test)
test = calculateCumDuration(test)

In [41]:
test['Duration'] = test['Duration'].apply(convert2seconds)
test['CumDuration'] = test['CumDuration'].apply(convert2seconds)

In [42]:
test['Timefrommidnight'] = test['CompleteTimestamp'].apply(calculateTimeSinceMidNight)

In [43]:
test['WeekDay'] = test['CompleteTimestamp'].dt.dayofweek

In [44]:
test['CaseID'] = test['CaseID'].apply(float)

In [45]:
test

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp,Duration,CumDuration,Timefrommidnight,WeekDay
0,2.0,1,2012-04-03 16:55:38,0,0,60938,1
1,2.0,8,2012-04-03 16:55:53,15,15,60953,1
2,2.0,6,2012-04-05 17:15:52,173999,174014,62152,3
3,3.0,1,2010-10-29 18:14:06,-45270106,-45096092,65646,4
4,3.0,8,2010-11-04 01:16:11,457325,-44638767,4571,3
5,3.0,6,2010-11-04 01:21:17,306,-44638461,4877,3
6,4.0,1,2010-12-15 23:31:53,3622236,-41016225,84713,2
7,4.0,8,2010-12-16 17:01:07,62954,-40953271,61267,3
8,4.0,6,2010-12-16 17:08:19,432,-40952839,61699,3
9,5.0,1,2012-04-03 21:08:32,40968013,15174,76112,1


Another way to get ```Duration```:

```python
import pandas as pd
from dateutil.parser import parse
data = pd.read_csv('../data/helpdesk.csv')
df = data[:10]
for i in range(0, len(df)-1):
    starting = parse(df.loc[i,'CompleteTimestamp'])
    ending = parse(df.loc[i+1,'CompleteTimestamp'])
    df.loc[i+1,'Duration'] = (ending-starting).total_seconds()

df.fillna(0)
```