### Import Libraries

In [1]:
import pandas as pd

### Create Base File Path and Sheet Names

In [2]:
base_file_path = "20180401_20190622.xlsx"

In [3]:
sheets = []

for i in range(1,33):
    sheet = 'Sheet{}'.format(i)
    sheets.append(sheet)

### Define Functions

In [4]:
def setDateTime(row, sheet):
    sheet_num = int(sheet[5:])
    dow, md  = row['Date'].split(" ", 2)
    month, day = md.split("/", 2)
    
    if sheet_num < 20:
        year = 2018
    elif sheet_num == 20:
        if int(month) == 1:
            year = 2019
        else:
            year = 2018
    elif sheet_num > 20:
        year = 2019

    dt_in = '{}-{}-{} {}'.format(year, month, day, row['Time In'])
    dt_out = '{}-{}-{} {}'.format(year, month, day, row['Time Out'])
    
    ts_date = pd.Timestamp(dt_in).strftime('%Y-%m-%d')
    ts_in = pd.Timestamp(dt_in).strftime('%Y-%m-%d %H:%M:%S')
    ts_out = pd.Timestamp(dt_out).strftime('%Y-%m-%d %H:%M:%S')

    return ts_date, ts_in, ts_out

### Data Wrangling

Import Data

In [27]:
xlsx = pd.ExcelFile('../data/{}'.format(base_file_path))
sheets = xlsx.sheet_names

for sheet in sheets[0:1]:
    print(sheet)
    df = pd.read_excel(xlsx, sheet_name=sheet, skiprows=2)

Sheet1


Handle Missing Data

In [7]:
df_na = df.dropna(axis=0, how='all')

df_na = df_na.loc[df['Transaction'].isna() == False]

df_na[['Date','Shift Total','Daily Total']] = df_na[['Date','Shift Total','Daily Total']].shift(+1)

df_na = df_na.loc[(df_na['Date'].isna() == False) &
                  (df_na['Date'] != 'Date')]

Handle Date and Time

In [8]:
df_dt = df_na.copy()

df_dt[['Date','time_in','time_out']] = df_dt.apply(setDateTime, axis=1, result_type='expand', args=(sheet,))

Handle Transactions

'01 - Regular Base Pay',
'29 - Credit Hours Worked-Regular Time',
'50 - Credit Hours Used',
'61 - Annual Leave Taken',
'62 - Sick Leave Taken',
'66 - Administrative Leave - Agency Directed',
'66 - Other Leave-Federal Holiday',
'66 - Paid Time Off for Holiday',
'66 - Weather and Safety Leave'

In [10]:
df_tr = df_dt.loc[(df_dt['Transaction'] == '01 - Regular Base Pay') |
                  (df_dt['Transaction'] == '29 - Credit Hours Worked-Regular Time')]

In [11]:
df_tr['time_out'] = df_tr.groupby(by=['Date'])['time_out'].transform(max)

In [12]:
df_tr = df_tr.loc[(df_tr['Transaction'] == '01 - Regular Base Pay')]

Finalize Columns

In [14]:
df_in = df_tr[['time_in']].copy()
df_out = df_tr[['time_out']].copy()

In [15]:
df_in.loc[:, 'entry_type'] = 'Arrival'
df_out.loc[:, 'entry_type'] = 'Departure'

In [16]:
df_in.loc[:, 'Description'] = ""
df_out.loc[:, 'Description'] = ""

In [17]:
df_in.rename(columns={'time_in':'arrival_time'}, inplace=True)
df_out.rename(columns={'time_out':'arrival_time'}, inplace=True)

In [21]:
df_final = pd.concat([df_in, df_out], sort=False).reset_index(drop=True)

### Final Program

In [8]:
import pandas as pd

In [9]:
def importData(sheet, file_name):
    file_path = '../data/{}'.format(file_name)
    
    df = pd.read_excel(file_path, sheet_name=sheet, skiprows=2)
    
    return df

In [10]:
def setDateTime(row, sheet):
    sheet_num = int(sheet[5:])
    dow, md  = row['Date'].split(" ", 2)
    month, day = md.split("/", 2)
    
    if sheet_num < 20:
        year = 2018
    elif sheet_num == 20:
        if int(month) == 1:
            year = 2019
        else:
            year = 2018
    elif sheet_num > 20:
        year = 2019

    dt_in = '{}-{}-{} {}'.format(year, month, day, row['Time In'])
    dt_out = '{}-{}-{} {}'.format(year, month, day, row['Time Out'])
    
    ts_date = pd.Timestamp(dt_in).strftime('%Y-%m-%d')
    ts_in = pd.Timestamp(dt_in).strftime('%Y-%m-%d %H:%M:%S')
    ts_out = pd.Timestamp(dt_out).strftime('%Y-%m-%d %H:%M:%S')

    return ts_date, ts_in, ts_out

In [89]:
def wrangleData(df, sheet):
    #Handle missing values.
    df.dropna(axis=0, how='all', inplace=True)
    
    df = df.loc[df['Transaction'].isna() == False]
    
    df[['Date','Shift Total','Daily Total']] = df[['Date','Shift Total','Daily Total']].shift(+1)
    
    df = df.loc[(df['Date'].isna() == False) &
                (df['Date'] != 'Date')]
    
    #Handle datetime.
    df[['Date','time_in','time_out']] = df.apply(setDateTime, axis=1, result_type='expand', args=(sheet,))
    
    #Handle transactions.
    df = df.loc[(df['Transaction'] == '01 - Regular Base Pay') |
                (df['Transaction'] == '29 - Credit Hours Worked-Regular Time')]
    
    df['time_out_max'] = df.groupby(by=['Date'])['time_out'].transform(max)
    
    df = df.loc[(df['Transaction'] == '01 - Regular Base Pay')]
    
    #Finalize columns and format
    
    df_in = df[['time_in']].copy()
    df_in.loc[:, 'entry_type'] = 'Arrival'
    df_in.loc[:, 'description'] = ""
    df_in.rename(columns={'time_in':'arrival_time'}, inplace=True)

    df_out = df[['time_out_max']].copy()
    df_out.loc[:, 'entry_type'] = 'Departure'
    df_out.loc[:, 'description'] = ""
    df_out.rename(columns={'time_out_max':'arrival_time'}, inplace=True)

    df_final = pd.concat([df_in, df_out], sort=False)
    df_final.reset_index(drop=True, inplace=True)
    
    return df_final

In [94]:
def main():
    df_all = pd.DataFrame(columns=['arrival_time','entry_type','description'])
    sheets = []
        
    for i in range(1,33):
        sheet = 'Sheet{}'.format(i)
        sheets.append(sheet)
        
    for sheet in sheets:
        print(sheet)
        df = importData(sheet, "20180401_20190622.xlsx")
        
        df_wng = wrangleData(df, sheet)
        
        df_all = df_all.append(other=df_wng)
        
    return df_all

In [95]:
web_df = main()

Sheet1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


Sheet2
Sheet3
Sheet4
Sheet5
Sheet6
Sheet7
Sheet8
Sheet9
Sheet10
Sheet11
Sheet12
Sheet13
Sheet14
Sheet15
Sheet16
Sheet17
Sheet18
Sheet19
Sheet20
Sheet21
Sheet22
Sheet23
Sheet24
Sheet25
Sheet26
Sheet27
Sheet28
Sheet29
Sheet30
Sheet31
Sheet32
