# Assumptions for *Clean* version of CEC 2018 that were observed:
- if no start time / end time was entered for a start phase, the scheduled start time for the next phase was assumed
- if there were incorrect entries (ie. open time was before attendant arrival, work stop before work start) hours were incremented until things made sense
- durations were always taken between phases (open less attendant arrival)


In [1]:
import pandas as pd
import numpy as np
import os
import datetime as dt
import strgen
%config IPCompleter.greedy=True

In [2]:
print(os.getcwd())
os.listdir("C:\\Users\\chris\\OneDrive - Irwin's Safety and Industrial Labour Services Ltd\\Irwins\\raw_inputs")

C:\Users\chris\OneDrive - Irwin's Safety and Industrial Labour Services Ltd\Irwins\python_scripts


['AutomatedForm-V6 - SH1 - 03.13.2018.xlsm',
 'CEC-2018-Analytics.xlsm',
 'Enmax Shepard-April-2018.xlsm',
 'Joffre - V6 - March 2018 - 13.03.2018.xlsm',
 'Total Validated Data-2013-2016.csv']

In [3]:
path = "C:\\Users\\chris\\OneDrive - Irwin's Safety and Industrial Labour Services Ltd\\Irwins"
cec = pd.read_excel(path+'\\raw_inputs\\CEC-2018-Analytics.xlsm','DataBase')
cec['Date']

0     2018-05-26
1     2018-05-27
2     2018-06-01
3     2018-06-01
4     2018-06-01
         ...    
108   2018-05-30
109   2018-05-30
110   2018-05-31
111   2018-05-31
112   2018-05-31
Name: Date, Length: 113, dtype: datetime64[ns]

In [4]:
cec['csas_id']=strgen.StringGenerator("[\d]{10}").render_list(len(cec),unique=True)
cec=cec.replace('L','C')

In [5]:
def dateconvert(mydate):
    """ If excel file is incorrectly read as YYYY-MM-DD and should be YYYY-DD-MM, rearranges to correct format in text.
    This then allows the text to be converted back to datetime
    """
    if isinstance(mydate, dt.datetime):
        return dt.datetime.strftime(mydate, '%d/%m/%Y')
    else:
        return mydate       

In [6]:
def timeconvert(x):
    """ Converts datetime.time to datetime.timedelta so that times can be differenced to calculate durations"""
    if isinstance(x,dt.time):
        x = dt.timedelta(hours=x.hour, minutes=x.minute, seconds=x.second, microseconds=x.microsecond)
        return x
    else:
        return pd.NaT

In [7]:
# Create a new column called 'clean_date' that has consistent format for all rows
cec['clean_date']=pd.to_datetime(cec['Date'].apply(dateconvert),dayfirst=True)
cec['clean_date'].value_counts()

2018-06-08    10
2018-06-11     8
2018-06-12     8
2018-06-13     8
2018-06-01     8
2018-06-06     7
2018-06-09     7
2018-06-02     7
2018-05-30     6
2018-06-04     6
2018-06-10     5
2018-06-21     5
2018-06-05     5
2018-06-03     4
2018-06-07     3
2018-05-31     3
2018-05-29     3
2018-06-22     2
2018-06-20     2
2018-05-28     2
2018-05-26     1
2018-06-23     1
2018-05-27     1
2018-06-14     1
Name: clean_date, dtype: int64

In [8]:
starts = [1,2,3,4,5]
phase = [0,1,2,3]
index = ['csas_id','Work Order', 'Code of Practice', 'Work Areas', 'Facility', 'clean_date',
       'Number of Crews', 'Shift']
times_prefix = ['ScheduledST', 'Attendant Arrival',
       'Space Open', 'Work Start', 'Work Stop']
cause_prefix = ['WT/DT Cause Code','Cause Note']
li=[]
# There are 6 start in a given shift, accounting for breaks, etc.  Each CSA could have a max of 6 entries per work order in a shift
for s in starts:
    times = ['{}{}'.format(t, s) for t in times_prefix]
    codes = ['{}{}{}'.format(c,s,p)  for c in cause_prefix for p in phase]
    cols = index + times + codes
    #subset the dataframe to refer to the given set of columns for each 'start'
    df = cec[cols].copy()
    df['start_phase'] = s
    # convert all the time columns to time.delta to allow for delay calculations
    for i in range(8,13):
        df.iloc[:,i] = df.iloc[:,i].map(timeconvert)
        df.iloc[:,i] = np.where(df.iloc[:,i].dt.seconds<21600,df['clean_date']+df.iloc[:,i]+
                                dt.timedelta(days=1),df['clean_date']+df.iloc[:,i])
    df['delay0'] = np.where(((df.iloc[:,9]-df.iloc[:,8])<dt.timedelta(minutes = 0))
                            &(df.iloc[:,8]!=""),0,(df.iloc[:,9]-df.iloc[:,8]))
    df['delay1'] = np.where(((df.iloc[:,10]-df.iloc[:,9])<dt.timedelta(minutes = 0))
                            &(df.iloc[:,9]!=""),0,(df.iloc[:,10]-df.iloc[:,9]))
    df['delay2'] = np.where(((df.iloc[:,11]-df.iloc[:,10])<dt.timedelta(minutes = 0))
                            &(df.iloc[:,10]!=""),0,(df.iloc[:,11]-df.iloc[:,10]))
    df['tool_time'] = np.where(((df.iloc[:,12]-df.iloc[:,11])<dt.timedelta(minutes = 0))
                            &(df.iloc[:,11]!=""),0,(df.iloc[:,12]-df.iloc[:,11]))
    # Dictionary to rename intervals as per Irwin's naming conventions
    timephase_dict = {0:'ArrivLessSched',1:'OpenLessArriv',2:'StartLessOpen',3:'StopLessStart'}
    # Use loop to re-arrange dataframe with fewer columns (ie. have start and phase as a feature column )
    for i in range(8,12):
        colnums = [0,1,2,3,4,5,6,7,21,i,i+1,i+14,i+5,i+9,]
        mydf = df.iloc[:,colnums].copy()
        mydf['time_phase'] = i-8
        mydf['time_phase'] = mydf['time_phase'].map(timephase_dict)
        #rename columns
        mydf.columns = ['csas_id','Work Order', 'Code of Practice', 'Work Areas', 'Facility',
       'clean_date', 'Number of Crews', 'Shift', 'start_phase', 'start_time','stop_time',
       'duration', 'WT/DT Cause Code', 'Note', 'time_phase']
        #re-arrange columns
        mydf = mydf[['csas_id','Work Order', 'Code of Practice', 'Work Areas', 'Facility',
       'clean_date', 'Number of Crews', 'Shift', 'start_phase','time_phase', 'start_time','stop_time',
       'duration', 'WT/DT Cause Code', 'Note' ]]
        li.append(mydf)


In [9]:
def timedelta_to_hours(mytime):
    ''' converts the duration column into hours'''
    return mytime.seconds/3600

In [10]:
def timedelta_to_time(mytime):
    ''' converts start time from timedelta object back into time'''
    if isinstance(mytime,datetime.timedelta):
        hours = mytime.seconds//3600
        minutes = (mytime.seconds%3600)//60
        return datetime.time(hour = hours, minute = minutes)
    else: 
        return pd.NaT

In [11]:
# concatenate all individual dataframes
cec_stack = pd.concat(li,axis=0, ignore_index= True)
# convert duration to a hourly object
cec_stack['duration'] = cec_stack['duration'].map(timedelta_to_hours)
# add man hours which is duration * number in crew
cec_stack['person_hours'] = cec_stack['Number of Crews']*cec_stack['duration']
# create a list of conditions which will allow creation of a time interval that relates
# downtime cause codes back to wait time/ down time/ idle time / tool time
condlist = [(cec_stack['time_phase']=='StopLessStart') & (cec_stack['duration']!=0),
             cec_stack['WT/DT Cause Code'].isin(['C','GT','EF','TB']),
             cec_stack['WT/DT Cause Code'].isin(['PE','IN','CD']),
             cec_stack['WT/DT Cause Code'].isin(['S','NW','I','SB']),
            (pd.isnull(cec_stack['WT/DT Cause Code'])) & (cec_stack['duration']!=0)]
choicelist = ['tool_time','wait_time','down_time','idle_time','nonwork_time']
cec_stack['code_phase'] = np.select(condlist,choicelist)
cec_stack['code_phase'].replace('0',np.nan,inplace=True)
# replace an entry of 'Shift' with day 
cec_stack['Shift'] = cec_stack['Shift'].replace(dt.datetime(2018, 6, 3, 0, 0),'Day')
print(cec_stack['Shift'].unique())
# add columns for company, project and year
cec_stack['company'] = 'Enmax'
cec_stack['project'] = 'Calgary Energy Centre'
cec_stack['year'] = 2018

['Night' 'Day']


In [12]:
cec_stack['code_phase'].value_counts()

nonwork_time    892
tool_time       531
wait_time       271
idle_time       234
down_time        93
Name: code_phase, dtype: int64

In [13]:
cec_stack.to_csv(path+'\\output\\cec_2018.csv',index=False)
cec_stack.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2260 entries, 0 to 2259
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   csas_id           2260 non-null   object        
 1   Work Order        2260 non-null   int64         
 2   Code of Practice  2260 non-null   object        
 3   Work Areas        2240 non-null   object        
 4   Facility          2260 non-null   object        
 5   clean_date        2260 non-null   datetime64[ns]
 6   Number of Crews   2220 non-null   float64       
 7   Shift             2260 non-null   object        
 8   start_phase       2260 non-null   int64         
 9   time_phase        2260 non-null   object        
 10  start_time        1112 non-null   datetime64[ns]
 11  stop_time         1111 non-null   datetime64[ns]
 12  duration          1111 non-null   float64       
 13  WT/DT Cause Code  598 non-null    object        
 14  Note              3 non-

In [14]:
# clean the dataframe so that no blank values for duration
cec_clean=cec_stack.dropna(subset=['duration'])
cec_clean.sort_values('duration')

Unnamed: 0,csas_id,Work Order,Code of Practice,Work Areas,Facility,clean_date,Number of Crews,Shift,start_phase,time_phase,start_time,stop_time,duration,WT/DT Cause Code,Note,person_hours,code_phase,company,project,year
768,9027420501,21150437,Door 41A,ST Hotwell Upper Acess,Enmax CEC,2018-06-20,3.0,Day,2,StartLessOpen,2018-06-20 11:10:00,2018-06-20 11:10:00,0.000000,,,0.000000,,Enmax,Calgary Energy Centre,2018
798,4223716912,21129611,Door 51,CT Generator Vault West,Enmax CEC,2018-06-01,2.0,Day,2,StopLessStart,2018-06-01 13:45:00,2018-06-01 13:45:00,0.000000,,,0.000000,,Enmax,Calgary Energy Centre,2018
805,4484735534,21129601,Door 50,CT Generator Vault East,Enmax CEC,2018-06-02,1.0,Day,2,StopLessStart,2018-06-02 13:38:00,2018-06-02 13:38:00,0.000000,,,0.000000,,Enmax,Calgary Energy Centre,2018
1670,6737706091,21122364,Door 49,Combustor Shell,Enmax CEC,2018-06-14,4.0,Day,4,StartLessOpen,2018-06-14 16:30:00,2018-06-14 16:30:00,0.000000,,,0.000000,,Enmax,Calgary Energy Centre,2018
330,7962689258,21122364,Door 51,CT Generator Vault West,Enmax CEC,2018-05-30,2.0,Day,1,StartLessOpen,2018-05-30 10:20:00,2018-05-30 10:20:00,0.000000,,,0.000000,,Enmax,Calgary Energy Centre,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
880,5856843311,21161934,Door 6,HP Steam Drum,Enmax CEC,2018-06-20,6.0,Day,2,StopLessStart,2018-06-20 11:38:00,2018-06-20 16:42:00,5.066667,,,30.400000,tool_time,Enmax,Calgary Energy Centre,2018
840,2965898771,21097627,Door 6,HP Steam Drum,Enmax CEC,2018-06-08,4.0,Day,2,StopLessStart,2018-06-08 11:49:00,2018-06-08 17:00:00,5.183333,,,20.733333,tool_time,Enmax,Calgary Energy Centre,2018
16,2797539674,21118244,Door 38,Main Condensor,Enmax CEC,2018-06-02,1.0,Day,1,ArrivLessSched,2018-06-02 06:30:00,2018-06-02 11:43:00,5.216667,NW,,5.216667,idle_time,Enmax,Calgary Energy Centre,2018
66,4450523044,21122364,Door 48A,CT Exhaust Duct East,Enmax CEC,2018-06-11,3.0,Night,1,ArrivLessSched,2018-06-11 18:30:00,2018-06-12 02:45:00,8.250000,SB,,24.750000,idle_time,Enmax,Calgary Energy Centre,2018


In [15]:
cec_daily = cec_stack[cec_stack['code_phase']!='0'].pivot_table(values='duration',
                            index=['Work Order','Code of Practice','clean_date','Number of Crews','Shift'],
                            columns=['code_phase'],aggfunc='sum')
cec_daily

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,code_phase,down_time,idle_time,nonwork_time,tool_time,wait_time
Work Order,Code of Practice,clean_date,Number of Crews,Shift,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
21091147,Door 15,2018-06-02,6.0,Day,1.083333,0.516667,0.000000,4.133333,0.183333
21097627,Door 5/6,2018-06-12,5.0,Day,1.366667,2.933333,1.183333,4.316667,0.050000
21097627,Door 6,2018-06-06,2.0,Day,0.000000,0.916667,0.000000,1.400000,0.116667
21097627,Door 6,2018-06-08,4.0,Day,0.550000,,0.000000,5.183333,0.016667
21097627,Door 6,2018-06-09,3.0,Day,0.500000,0.066667,0.000000,2.233333,0.083333
...,...,...,...,...,...,...,...,...,...
21153758,Door 110,2018-06-06,4.0,Day,1.800000,0.050000,1.600000,5.400000,0.000000
21153758,Door 63/103,2018-06-13,7.0,Day,1.000000,0.116667,0.000000,8.966667,0.233333
21161934,Door 6,2018-06-20,6.0,Day,,1.950000,0.100000,5.100000,0.000000
21161936,Door 6,2018-06-13,5.0,Day,1.450000,1.683333,0.000000,5.133333,0.200000


In [16]:
cec_daily['nonwork_time']=cec_daily.iloc[:,[0,1,3]].sum(axis=1)
cec_daily['total_time']=cec_daily.iloc[:,0:4].sum(axis=1)
cec_daily['down_percent'] = cec_daily['down_time']/cec_daily['total_time']
cec_daily['idle_percent'] = cec_daily['idle_time']/cec_daily['total_time']
cec_daily['tool_percent'] = cec_daily['tool_time']/cec_daily['total_time']
cec_daily['wait_percent'] = cec_daily['wait_time']/cec_daily['total_time']
cec_daily['nonwork_percent'] = cec_daily['nonwork_time']/cec_daily['total_time']

In [17]:
cec_daily = cec_daily.reset_index()
cec_daily = cec_daily.fillna(0)

In [18]:
cec_daily.describe()

code_phase,Work Order,Number of Crews,down_time,idle_time,nonwork_time,tool_time,wait_time,total_time,down_percent,idle_percent,tool_percent,wait_percent,nonwork_percent
count,111.0,111.0,111.0,111.0,111.0,111.0,111.0,111.0,111.0,111.0,111.0,111.0,111.0
mean,21125070.0,3.369369,0.869219,1.344444,5.927327,3.713664,0.192042,11.854655,0.070292,0.10062,0.329088,0.031641,0.5
std,16861.9,1.981404,0.887199,1.885406,3.125095,2.460919,0.242866,6.250191,0.076911,0.11625,0.131146,0.073075,2.049885e-17
min,21091150.0,1.0,0.0,0.0,0.15,0.133333,0.0,0.3,0.0,0.0,0.012972,0.0,0.5
25%,21118250.0,2.0,0.0,0.066667,3.108333,1.758333,0.066667,6.216667,0.0,0.008475,0.252733,0.008274,0.5
50%,21122360.0,3.0,0.816667,0.55,6.116667,3.233333,0.15,12.233333,0.051458,0.055,0.352665,0.011236,0.5
75%,21133950.0,4.0,1.166667,1.825,8.641667,5.316667,0.2,17.283333,0.092206,0.162781,0.444536,0.021241,0.5
max,21195480.0,9.0,4.166667,8.666667,10.883333,9.433333,1.933333,21.766667,0.385852,0.440678,0.5,0.47619,0.5


In [19]:
cec_stack.groupby(['clean_date','Work Order','Code of Practice','Shift'])['Number of Crews'].mean()

clean_date  Work Order  Code of Practice  Shift
2018-05-26  21129601    Door 51           Night    2.0
2018-05-27  21129601    Door 50           Night    4.0
2018-05-28  21122364    Door 46           Day      4.0
                        Door 48A          Day      2.0
2018-05-29  21098749    Door 46           Day      8.0
                                                  ... 
2018-06-21  21106118    Door 6            Night    2.0
            21118249    Door 8/15         Day      3.0
            21150431    Door 41A          Day      7.0
2018-06-22  21097627    Door 6            Day      3.0
2018-06-23  21097627    Door 6            Day      3.0
Name: Number of Crews, Length: 109, dtype: float64

In [20]:
cec_totals = cec_stack.groupby(['clean_date','Work Order','Code of Practice','Shift'])['Number of Crews'].mean()
cec_totals = pd.DataFrame(cec_totals.groupby(['clean_date','Shift']).agg(['sum','nunique'])).reset_index()
cec_totals.head()

Unnamed: 0,clean_date,Shift,sum,nunique
0,2018-05-26,Night,2.0,1
1,2018-05-27,Night,4.0,1
2,2018-05-28,Day,6.0,2
3,2018-05-29,Day,12.0,2
4,2018-05-29,Night,4.0,1


In [21]:
cec_daily = cec_daily.merge(cec_totals,how='left',left_on=['clean_date','Shift'],right_on=['clean_date','Shift'])


In [22]:
cec_daily.columns

Index(['Work Order', 'Code of Practice', 'clean_date', 'Number of Crews',
       'Shift', 'down_time', 'idle_time', 'nonwork_time', 'tool_time',
       'wait_time', 'total_time', 'down_percent', 'idle_percent',
       'tool_percent', 'wait_percent', 'nonwork_percent', 'sum', 'nunique'],
      dtype='object')

In [23]:
cec_daily.columns = ['Work Order', 'Code of Practice', 'clean_date', 'Number of Crews',
       'Shift', 'down_time', 'idle_time', 'tool_time', 'wait_time',
       'nonwork_time', 'total_time', 'down_percent', 'idle_percent',
       'tool_percent', 'wait_percent', 'nonwork_percent','total_crew', 'total_workorders']


In [24]:
cec_daily.sort_values(['clean_date','Shift']).to_csv('DailyWO.csv',index=False)

In [25]:
cec_daily.sort_values('total_time')

Unnamed: 0,Work Order,Code of Practice,clean_date,Number of Crews,Shift,down_time,idle_time,tool_time,wait_time,nonwork_time,total_time,down_percent,idle_percent,tool_percent,wait_percent,nonwork_percent,total_crew,total_workorders
13,21098749,Door 41A,2018-06-12,1.0,Night,0.000000,0.016667,0.150000,0.133333,0.100000,0.300000,0.000000,0.055556,0.444444,0.333333,0.5,1.0,1
50,21122364,Door 48A,2018-06-01,1.0,Day,0.000000,0.000000,0.350000,0.350000,0.333333,0.700000,0.000000,0.000000,0.500000,0.476190,0.5,13.0,4
30,21118250,Door 9,2018-06-08,2.0,Day,0.000000,0.200000,0.700000,0.500000,0.050000,1.400000,0.000000,0.142857,0.357143,0.035714,0.5,22.0,5
43,21122364,Door 44G,2018-05-30,4.0,Night,0.000000,0.000000,0.750000,0.750000,0.166667,1.500000,0.000000,0.000000,0.500000,0.111111,0.5,4.0,1
75,21129601,Door 50,2018-06-12,2.0,Day,0.000000,0.033333,0.800000,0.766667,0.133333,1.600000,0.000000,0.020833,0.479167,0.083333,0.5,25.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,21145290,Door 48A,2018-06-09,2.0,Day,1.166667,1.250000,10.116667,7.700000,0.150000,20.233333,0.057661,0.061779,0.380560,0.007414,0.5,19.0,3
12,21098206,Door 103/63,2018-06-11,7.0,Day,0.750000,0.116667,10.300000,9.433333,0.200000,20.600000,0.036408,0.005663,0.457929,0.009709,0.5,26.0,5
24,21118249,Door 8,2018-06-11,3.0,Day,0.033333,8.250000,10.450000,2.166667,0.050000,20.900000,0.001595,0.394737,0.103668,0.002392,0.5,26.0,5
79,21129601,Door 51,2018-06-06,2.0,Day,1.583333,4.233333,10.800000,4.983333,0.200000,21.600000,0.073302,0.195988,0.230710,0.009259,0.5,11.5,3
