In [1]:
# Runs the external `jt` command with the `-t oceans16` option
# (presumably the jupyterthemes CLI selecting a notebook theme — confirm).
# NOTE(review): nothing in the visible cells below reads anything produced by this command.
!jt -t oceans16

### Read in Libraries and Raw Files

In [2]:
import pandas as pd
import numpy as np
import os
import datetime as dt
import strgen
%config IPCompleter.greedy=True

In [3]:
# Show the current working directory, then list the files in the raw_inputs folder.
# NOTE(review): the folder is a hard-coded absolute path on one user's machine, so this
# cell only works there; the same root directory is defined again as `path` in a later cell.
print(os.getcwd())
os.listdir("C:\\Users\\chris\\OneDrive - Irwin's Safety and Industrial Labour Services Ltd\\Irwins\\raw_inputs")

C:\Users\chris\OneDrive - Irwin's Safety and Industrial Labour Services Ltd\Irwins\python_scripts


['AutomatedForm-V6 - SH1 - 03.13.2018.xlsm',
 'CEC-2018-Analytics.xlsm',
 'Enmax Shepard-April-2018.xlsm',
 'Joffre - V6 - March 2018 - 13.03.2018.xlsm',
 'Total Validated Data-2013-2016.csv']

In [4]:
# Root project folder; later cells append '\\raw_inputs\\...' and '\\output\\...' to it.
# NOTE(review): absolute, machine-specific Windows path — the notebook will not run elsewhere as written.
path = "C:\\Users\\chris\\OneDrive - Irwin's Safety and Industrial Labour Services Ltd\\Irwins"

In [5]:
# Load the 'DataBase_Clean' sheet of the Joffre workbook into a DataFrame.
# The displayed 'Date' column comes back as dtype object: most entries are datetimes
# whose month and day look swapped (e.g. 2018-09-03), and at least one entry is left
# as the text '2018-13-03'. The `dateconvert` cell further down repairs this.
joffre_raw = pd.read_excel(path+'\\raw_inputs\\Joffre - V6 - March 2018 - 13.03.2018.xlsm',
                           'DataBase_Clean', parse_dates = True, dayfirst=False)
# Last expression of the cell: display the Date column for inspection.
joffre_raw['Date']

0     2018-09-03 00:00:00
1     2018-09-03 00:00:00
2     2018-09-03 00:00:00
3     2018-09-03 00:00:00
4     2018-09-03 00:00:00
             ...         
59    2018-11-03 00:00:00
60    2018-10-03 00:00:00
61    2018-10-03 00:00:00
62    2018-12-03 00:00:00
63             2018-13-03
Name: Date, Length: 64, dtype: object

In [29]:
# Give every row of the raw frame its own 10-digit string id (one id per row, requested as unique).
# The pattern is a raw string so that `\d` is not treated as an (invalid) Python escape sequence;
# the pattern text passed to StringGenerator is unchanged.
# NOTE(review): the ids are generated afresh on each run, so they are presumably not stable
# across executions — confirm that downstream consumers do not rely on them as persistent keys.
joffre_raw['csas_id'] = strgen.StringGenerator(r"[\d]{10}").render_list(len(joffre_raw), unique=True)

### Define functions that fill in blank values and transform the Excel data into more useful formats

In [7]:
def dateconvert(mydate):
    """Swap the month and day of a date that was read as YYYY-MM-DD but is really YYYY-DD-MM.

    Parameters
    ----------
    mydate : datetime.datetime, str, or missing value
        A datetime (including pandas Timestamp) that was parsed with month and
        day exchanged, or the raw text in 'YYYY-DD-MM' form when the value could
        not be parsed as a date at all (e.g. '2018-13-03').

    Returns
    -------
    datetime.datetime or pandas.NaT
        The corrected date at midnight, or ``pd.NaT`` when the input is missing
        (None, NaN or NaT).

    Raises
    ------
    ValueError
        If the text does not match 'YYYY-DD-MM', or the swapped value is not a
        valid date.
    """
    # A blank cell would otherwise reach strptime and raise TypeError, which
    # aborts the whole column conversion; pass missing values through as NaT.
    if pd.isnull(mydate):
        return pd.NaT
    if isinstance(mydate, dt.datetime):
        # Render as text so the same strptime call below can re-read it with
        # the day and month positions exchanged.
        ret_date = dt.datetime.strftime(mydate, '%Y-%m-%d')
    else:
        ret_date = mydate
    return dt.datetime.strptime(ret_date, '%Y-%d-%m')

In [8]:
def timeconvert(x):
    """Turn a ``datetime.time`` into the equivalent ``datetime.timedelta``.

    A timedelta can be added to a date and subtracted from another timedelta,
    which a plain time-of-day cannot. Anything that is not a ``datetime.time``
    (blank cells, text, full datetimes, ...) yields ``pd.NaT``.
    """
    # Guard clause: only genuine time-of-day values are converted.
    if not isinstance(x, dt.time):
        return pd.NaT
    return dt.timedelta(
        hours=x.hour,
        minutes=x.minute,
        seconds=x.second,
        microseconds=x.microsecond,
    )

In [9]:
# Create new dataframe to allow for cleaning
joffre_clean = joffre_raw.copy()
joffre_clean['clean_date']=pd.to_datetime(joffre_clean['Date'].map(dateconvert))
joffre_clean['clean_date'].value_counts()

2018-03-06    12
2018-03-07    10
2018-03-05     9
2018-03-09     7
2018-03-11     5
2018-03-04     5
2018-03-10     5
2018-03-08     4
2018-03-03     3
2018-03-12     3
2018-03-13     1
Name: clean_date, dtype: int64

### The following four functions populate missing work-start times for "No Work" periods, using either the scheduled start of the next phase or a two-hour window after the space opened.

In [10]:
def fill_nw1(row):
    """Return the 'Work Start1' value for a row, filling it in for "NW (IT)" rows.

    When 'Work Start1' is blank and 'WT/DT Cause Code1' is exactly "NW (IT)",
    the value is taken from, in order of preference:

    1. 'ScheduledST2' (the scheduled start of the following phase), or
    2. 'Space Open1' plus two hours, or
    3. ``pd.NaT`` when neither is available.

    In every other case the existing 'Work Start1' value is returned as is.
    """
    work_start = row['Work Start1']
    needs_fill = pd.isnull(work_start) and row['WT/DT Cause Code1'] == "NW (IT)"
    if not needs_fill:
        return work_start

    next_scheduled = row['ScheduledST2']
    if pd.notnull(next_scheduled):
        return next_scheduled

    space_open = row['Space Open1']
    if pd.isnull(space_open):
        return pd.NaT

    # A bare time cannot be shifted, so anchor it to a dummy date, add the
    # two-hour window and keep only the time-of-day part.
    anchored = dt.datetime.combine(dt.date(1, 1, 1), space_open)
    return (anchored + dt.timedelta(hours=2)).time()
    

In [11]:
def fill_nw2(row):
    """Return the 'Work Start2' value for a row, filling it in for "NW (IT)" rows.

    When 'Work Start2' is blank and 'WT/DT Cause Code2' is exactly "NW (IT)",
    the value is taken from 'ScheduledST3' if present, otherwise from
    'Space Open2' plus two hours, otherwise ``pd.NaT``. In every other case the
    existing 'Work Start2' value is returned as is.
    """
    work_start = row['Work Start2']
    needs_fill = pd.isnull(work_start) and row['WT/DT Cause Code2'] == "NW (IT)"
    if not needs_fill:
        return work_start

    next_scheduled = row['ScheduledST3']
    if pd.notnull(next_scheduled):
        return next_scheduled

    space_open = row['Space Open2']
    if pd.isnull(space_open):
        return pd.NaT

    # Anchor the time-of-day to a dummy date so the two-hour window can be added.
    anchored = dt.datetime.combine(dt.date(1, 1, 1), space_open)
    return (anchored + dt.timedelta(hours=2)).time()
    

In [12]:
def fill_nw3(row):
    """Return the 'Work Start3' value for a row, filling it in for "NW (IT)" rows.

    When 'Work Start3' is blank and 'WT/DT Cause Code3' is exactly "NW (IT)",
    the value is taken from 'ScheduledST4' if present, otherwise from
    'Space Open3' plus two hours, otherwise ``pd.NaT``. In every other case the
    existing 'Work Start3' value is returned as is.
    """
    work_start = row['Work Start3']
    needs_fill = pd.isnull(work_start) and row['WT/DT Cause Code3'] == "NW (IT)"
    if not needs_fill:
        return work_start

    next_scheduled = row['ScheduledST4']
    if pd.notnull(next_scheduled):
        return next_scheduled

    space_open = row['Space Open3']
    if pd.isnull(space_open):
        return pd.NaT

    # Anchor the time-of-day to a dummy date so the two-hour window can be added.
    anchored = dt.datetime.combine(dt.date(1, 1, 1), space_open)
    return (anchored + dt.timedelta(hours=2)).time()
    

In [13]:
def fill_nw4(row):
    """Return the 'Work Start4' value for a row, filling it in for "NW (IT)" rows.

    When 'Work Start4' is blank and 'WT/DT Cause Code4' is exactly "NW (IT)",
    the value is taken from 'ScheduledST5' if present, otherwise from
    'Space Open4' plus two hours, otherwise ``pd.NaT``. In every other case the
    existing 'Work Start4' value is returned as is.
    """
    work_start = row['Work Start4']
    needs_fill = pd.isnull(work_start) and row['WT/DT Cause Code4'] == "NW (IT)"
    if not needs_fill:
        return work_start

    next_scheduled = row['ScheduledST5']
    if pd.notnull(next_scheduled):
        return next_scheduled

    space_open = row['Space Open4']
    if pd.isnull(space_open):
        return pd.NaT

    # Anchor the time-of-day to a dummy date so the two-hour window can be added.
    anchored = dt.datetime.combine(dt.date(1, 1, 1), space_open)
    return (anchored + dt.timedelta(hours=2)).time()
    

In [14]:
# Overwrite each 'Work StartN' column with the row-wise result of the matching fill_nwN
# function: existing values are kept, and blanks on "NW (IT)" rows are filled in.
# NOTE(review): this mutates joffre_clean in place, so the columns differ from the raw
# data after this cell has run.
joffre_clean['Work Start1'] = joffre_clean.apply(fill_nw1,axis=1)
joffre_clean['Work Start2'] = joffre_clean.apply(fill_nw2,axis=1)
joffre_clean['Work Start3'] = joffre_clean.apply(fill_nw3,axis=1)
joffre_clean['Work Start4'] = joffre_clean.apply(fill_nw4,axis=1)

### Next, use the cause codes to determine which time interval should be used to calculate non-work time and tool time

In [15]:
def duration_calc(row):
    '''Pick the time interval that a row's cause code refers to and measure it.

    The 'WT/DT Cause Code' value selects the interval and its labels:

    * EF, C, L          -> ScheduledST to Attendant Arrival ('ArrivLessSched', 'wait_time')
    * GT                -> Attendant Arrival to Space Open  ('OpenLessArriv', 'wait_time')
    * CD, IN, PE, V     -> ScheduledST to Space Open        ('OpenLessSched', 'down_time'),
                           or ScheduledST to Attendant Arrival ('ArrivLessSched', 'down_time')
                           when Space Open is blank
    * SB, S, I, NW      -> Space Open to Work Start         ('StartLessOpen', 'idle_time')

    Any other code leaves both times as NaT and both labels as NaN.

    Returns a tuple (start_time, stop_time, duration, time_phase, code_phase);
    duration is NaT when the start lies after the stop.
    '''
    code = row['WT/DT Cause Code']

    # Defaults for rows whose code matches none of the groups below.
    start_time, stop_time = pd.NaT, pd.NaT
    time_phase, code_phase = np.nan, np.nan

    if code in ('EF', 'C', 'L'):
        start_time, stop_time = row['ScheduledST'], row['Attendant Arrival']
        time_phase, code_phase = 'ArrivLessSched', 'wait_time'
    elif code == 'GT':
        start_time, stop_time = row['Attendant Arrival'], row['Space Open']
        time_phase, code_phase = 'OpenLessArriv', 'wait_time'
    elif code in ('CD', 'IN', 'PE', 'V'):
        code_phase = 'down_time'
        start_time = row['ScheduledST']
        if pd.isnull(row['Space Open']):
            # The space never opened: measure up to the attendant's arrival instead.
            stop_time = row['Attendant Arrival']
            time_phase = 'ArrivLessSched'
        else:
            stop_time = row['Space Open']
            time_phase = 'OpenLessSched'
    elif code in ('SB', 'S', 'I', 'NW'):
        start_time, stop_time = row['Space Open'], row['Work Start']
        time_phase, code_phase = 'StartLessOpen', 'idle_time'

    # A start after the stop is treated as unusable rather than as a negative duration.
    duration = pd.NaT if start_time > stop_time else stop_time - start_time
    return (start_time, stop_time, duration, time_phase, code_phase)

In [16]:
def duration_calc_other(row):
    '''Measure the interval that the row's cause code does NOT already cover.

    The result is always labelled with the generic code_phase 'nonwork_time':

    * EF, C, L, GT, CD, IN, PE, V -> Space Open to Work Start  ('StartLessOpen')
    * SB, S, I, NW                -> ScheduledST to Space Open ('OpenLessSched')

    Any other code leaves both times as NaT and both labels as NaN.

    Returns a tuple (start_time, stop_time, duration, time_phase, code_phase);
    duration is NaT when the start lies after the stop.
    '''
    code = row['WT/DT Cause Code']

    # Defaults for rows whose code matches neither group.
    start_time, stop_time = pd.NaT, pd.NaT
    time_phase, code_phase = np.nan, np.nan

    if code in ('EF', 'C', 'L', 'GT', 'CD', 'IN', 'PE', 'V'):
        start_time, stop_time = row['Space Open'], row['Work Start']
        time_phase, code_phase = 'StartLessOpen', 'nonwork_time'
    elif code in ('SB', 'S', 'I', 'NW'):
        start_time, stop_time = row['ScheduledST'], row['Space Open']
        time_phase, code_phase = 'OpenLessSched', 'nonwork_time'

    # A start after the stop is treated as unusable rather than as a negative duration.
    duration = pd.NaT if start_time > stop_time else stop_time - start_time
    return (start_time, stop_time, duration, time_phase, code_phase)

In [17]:
def duration_calc_blank_it(row):
    '''Measure Space Open to Work Start for rows that carry no cause code.

    When 'WT/DT Cause Code' is blank, the interval is labelled
    'StartLessOpen' / 'nonwork_time'. Rows that do have a code get NaT times
    and NaN labels.

    Returns a tuple (start_time, stop_time, duration, time_phase, code_phase);
    duration is NaT when the start lies after the stop.
    '''
    if pd.isnull(row['WT/DT Cause Code']):
        start_time, stop_time = row['Space Open'], row['Work Start']
        time_phase, code_phase = 'StartLessOpen', 'nonwork_time'
    else:
        # Coded rows are handled by the other duration functions.
        start_time, stop_time = pd.NaT, pd.NaT
        time_phase, code_phase = np.nan, np.nan

    # A start after the stop is treated as unusable rather than as a negative duration.
    duration = pd.NaT if start_time > stop_time else stop_time - start_time
    return (start_time, stop_time, duration, time_phase, code_phase)

In [18]:
def duration_calc_blank_wtdt(row):
    '''Measure ScheduledST to Space Open for rows that carry no cause code.

    When 'WT/DT Cause Code' is blank, the interval is labelled
    'OpenLessSched' / 'nonwork_time'. Rows that do have a code get NaT times
    and NaN labels.

    Returns a tuple (start_time, stop_time, duration, time_phase, code_phase);
    duration is NaT when the start lies after the stop.
    '''
    if pd.isnull(row['WT/DT Cause Code']):
        start_time, stop_time = row['ScheduledST'], row['Space Open']
        time_phase, code_phase = 'OpenLessSched', 'nonwork_time'
    else:
        # Coded rows are handled by the other duration functions.
        start_time, stop_time = pd.NaT, pd.NaT
        time_phase, code_phase = np.nan, np.nan

    # A start after the stop is treated as unusable rather than as a negative duration.
    duration = pd.NaT if start_time > stop_time else stop_time - start_time
    return (start_time, stop_time, duration, time_phase, code_phase)

In [30]:
def non_work_transform(mydf):
    """Build the stacked non-work-time records for one phase frame.

    The cause code column of ``mydf`` is first reduced, in place, to the text
    before its first space (e.g. "NW (IT)" becomes "NW"). Each of the four
    duration functions is then applied row-wise to its own copy of the frame,
    and the resulting start/stop/duration/label columns are attached.

    Only the frame produced by ``duration_calc`` keeps the 'WT/DT Cause Code'
    and 'Note' columns; the other three omit them. All four frames keep every
    input row, including rows the respective function did not match.

    Returns the four frames concatenated in the order duration_calc,
    duration_calc_other, duration_calc_blank_it, duration_calc_blank_wtdt.
    """
    mydf['WT/DT Cause Code'] = mydf['WT/DT Cause Code'].str.split(" ").str[0]

    # Names for the five tuple positions returned by every duration function.
    result_names = ['start_time', 'stop_time', 'duration', 'time_phase', 'code_phase']
    leading_cols = ['csas_id', 'Work Order', 'Code of Practice', 'Work Areas', 'Facility',
                    'clean_date', 'Number of Crews', 'Shift', 'start_phase', 'time_phase',
                    'start_time', 'stop_time', 'duration']
    # (row function, whether the cause code and note columns are kept)
    passes = [
        (duration_calc, True),
        (duration_calc_other, False),
        (duration_calc_blank_it, False),
        (duration_calc_blank_wtdt, False),
    ]

    pieces = []
    for row_func, keep_cause in passes:
        working = mydf.copy()
        expanded = working.apply(row_func, axis=1, result_type='expand')
        for position, col_name in enumerate(result_names):
            working[col_name] = expanded.iloc[:, position]
        cause_cols = ['WT/DT Cause Code', 'Note'] if keep_cause else []
        pieces.append(working[leading_cols + cause_cols + ['code_phase']])
    return pd.concat(pieces)

In [31]:
def tool_time_transform(mydf_2):
    """Build the tool-time records for one phase frame.

    Columns are added to ``mydf_2`` in place: the interval runs from
    'Work Start' to 'Work Stop', its duration is the 'tool_time' column, the
    cause code and note are blanked out, and the labels are set to
    'StopLessStart' / 'tool_time'.

    Returns the frame restricted to the output columns.
    """
    # Columns copied from existing ones.
    for target, source in (('start_time', 'Work Start'),
                           ('stop_time', 'Work Stop'),
                           ('duration', 'tool_time')):
        mydf_2[target] = mydf_2[source]

    # Columns set to a single value for every row.
    for target, constant in (('WT/DT Cause Code', None),
                             ('Note', None),
                             ('time_phase', 'StopLessStart'),
                             ('code_phase', 'tool_time')):
        mydf_2[target] = constant

    output_columns = ['csas_id', 'Work Order', 'Code of Practice', 'Work Areas', 'Facility',
                      'clean_date', 'Number of Crews', 'Shift', 'start_phase', 'time_phase',
                      'start_time', 'stop_time', 'duration', 'WT/DT Cause Code', 'Note',
                      'code_phase']
    return mydf_2[output_columns]

In [32]:
# Reshape the wide cleaned frame (one set of time/cause columns per phase) into a list of
# long-format frames: for each phase number, one tool-time frame and one non-work frame.
rawdf = joffre_clean
li = []
starts = [1,2,3,4]
# NOTE(review): `phase` is not referenced again in the visible cells — candidate for removal.
phase = [0,1,2,3]
# Identifier columns carried through unchanged; they occupy positions 0-7 of each phase frame.
index = ['csas_id','WorkOrder', 'Code of Practice', 'Work Areas', 'Facility','clean_date',
           'Number of Crews', 'Shift']
# Per-phase columns are named <prefix><phase number>, e.g. 'ScheduledST1'.
times_prefix = ['ScheduledST', 'Attendant Arrival',
           'Space Open', 'Work Start', 'Work Stop']
cause_prefix = ['WT/DT Cause Code','Cause Note']
# There are 6 start in a given shift, accounting for breaks, etc.  Each CSA could have a max of 6 entries per work order in a shift
# NOTE(review): the comment above says 6 starts, but `starts` only lists phases 1-4 — confirm which is intended.
for s in starts:
    times = ['{}{}'.format(t, s) for t in times_prefix]
    codes = ['{}{}'.format(c,s)  for c in cause_prefix]
    cols = index + times + codes
    df = rawdf[cols].copy()
    df['start_phase'] = s
    # Positions 8-12 are the five time columns, in the order given by times_prefix.
    for i in range(8,13):
        # Time-of-day -> timedelta (anything that is not a datetime.time becomes NaT).
        df.iloc[:,i] = df.iloc[:,i].map(timeconvert)
        # Attach the date. Times earlier than 06:00 (21600 seconds) are placed on the day
        # after clean_date; all other times stay on clean_date.
        # NOTE(review): presumably this handles shifts that run past midnight — confirm the 06:00 cut-off.
        df.iloc[:,i] = np.where(df.iloc[:,i].dt.seconds<21600,df['clean_date']+df.iloc[:,i]+
                                dt.timedelta(days=1),df['clean_date']+df.iloc[:,i])
    # Gaps between consecutive timestamps, with negative gaps replaced by 0:
    #   delay0    = Attendant Arrival - ScheduledST
    #   delay1    = Space Open - Attendant Arrival
    #   delay2    = Work Start - Space Open
    #   tool_time = Work Stop - Work Start
    # NOTE(review): the `!= ""` comparisons look like they are always True once the columns
    # hold datetimes, which would make them no-ops — confirm before relying on them.
    df['delay0'] = np.where(((df.iloc[:,9]-df.iloc[:,8])<dt.timedelta(minutes = 0))
                            &(df.iloc[:,8]!=""),0,(df.iloc[:,9]-df.iloc[:,8]))
    df['delay1'] = np.where(((df.iloc[:,10]-df.iloc[:,9])<dt.timedelta(minutes = 0))
                            &(df.iloc[:,9]!=""),0,(df.iloc[:,10]-df.iloc[:,9]))
    df['delay2'] = np.where(((df.iloc[:,11]-df.iloc[:,10])<dt.timedelta(minutes = 0))
                            &(df.iloc[:,10]!=""),0,(df.iloc[:,11]-df.iloc[:,10]))
    df['tool_time'] = np.where(((df.iloc[:,12]-df.iloc[:,11])<dt.timedelta(minutes = 0))
                            &(df.iloc[:,11]!=""),0,(df.iloc[:,12]-df.iloc[:,11]))
    # Rename columns so can consistently manipulate
    # (drops the phase suffix; 'WorkOrder' becomes 'Work Order' and 'Cause Note<N>' becomes 'Note')
    df.columns = ['csas_id','Work Order', 'Code of Practice', 'Work Areas', 'Facility', 'clean_date',
       'Number of Crews', 'Shift', 'ScheduledST', 'Attendant Arrival',
       'Space Open', 'Work Start', 'Work Stop', 'WT/DT Cause Code',
       'Note', 'start_phase', 'delay0', 'delay1', 'delay2',
       'tool_time']
    # Both transforms add columns to their argument, so each one gets its own copy.
    df_tool_time = tool_time_transform(df.copy())
    df_non_work = non_work_transform(df.copy())
    li.append(df_tool_time)
    li.append(df_non_work)
#     li.append(df)

In [33]:
def timedelta_to_hours(mytime):
    '''Convert a duration into fractional hours.

    Parameters
    ----------
    mytime : datetime.timedelta, pandas.Timedelta or pandas.NaT
        The duration to convert.

    Returns
    -------
    float
        The full length of the duration in hours; NaN when the input is NaT.

    Notes
    -----
    ``total_seconds()`` is used instead of the ``seconds`` attribute because
    ``seconds`` only holds the part of the duration below one day: a duration of
    26 hours would be reported as 2, and a negative one-hour duration as 23.
    '''
    return mytime.total_seconds() / 3600

In [34]:
# concatenate all individual dataframes
joffre_stack = pd.concat(li).sort_values(['clean_date','Work Order','Work Areas'])
# convert duration to hourly
joffre_stack['duration']= joffre_stack['duration'].map(timedelta_to_hours)
joffre_stack['person_hours']=joffre_stack['Number of Crews']*joffre_stack['duration']
# add columns for company, project and year
joffre_stack['company'] = 'Atco Power'
joffre_stack['project']='Joffre'
joffre_stack['year']= '2018'

In [35]:
# Count rows per code_phase.
# NOTE(review): this is not the last expression in the cell, so the notebook does not
# display the counts; move it to its own cell (or below to_csv) if they are meant to be shown.
joffre_stack['code_phase'].value_counts()
# Write the stacked frame, without the index column, to <path>\output\joffre_2018.csv.
joffre_stack.to_csv(path+'\\output\\joffre_2018.csv',index=False)