In [1]:
import pandas as pd
import os
from datetime import datetime as dt


In [2]:
# pandas display options
# NOTE(review): pandas is already imported in the first cell, so this second import is redundant.
import pandas as pd
# Remove the row limit so printed DataFrames/Series are shown in full instead of being truncated.
pd.set_option('display.max_rows', None)

In [3]:
#!py -m pip install openpyxl --upgrade

In [4]:
# Change into the payroll team's documents folder.
# The path is hard-coded to one user's machine: the folder must be synced to this exact
# local location for the cell (and every relative path used later) to work.
os.chdir(r"C:\Users\alex.jefferies\Leighs Construction Limited\People and Culture - 11 Payroll - People Dashboard Data Sources")

In [5]:
#helper functions
# Assuming 'df' is your DataFrame
def missing_info(df:pd.DataFrame):
    """Print, for every column of ``df``, how many values are missing and what share that is.

    The report is a two-column table (``missing_count``, ``missing_ratio``) indexed by
    column name. Nothing is returned; the table is only printed.
    """
    null_counts = df.isna().sum()
    summary = pd.DataFrame({
        'missing_count': null_counts,
        'missing_ratio': null_counts / len(df),
    })
    print(summary)

def time_convert(atime):
    """Turn a POSIX timestamp (seconds since the epoch) into a calendar date in local time."""
    return dt.fromtimestamp(atime).date()

def create_file_records(somepath) -> dict:
    """Collect timestamp metadata for every spreadsheet file directly inside a folder.

    Args:
        somepath: Directory to scan. Sub-directories are not descended into.

    Returns:
        A dict keyed by file name. Each value is a dict with the keys
        'File Name', 'Creation Date', 'Modified Date' and 'Last Access Date';
        the three dates are produced by ``time_convert`` from the ``os.stat`` result.
    """
    wanted_suffixes = (".xlsx", ".csv")
    records = {}

    for name in os.listdir(somepath):
        # Compare on the lower-cased name so ".XLSX" is accepted just like ".CSV" already was,
        # and match the suffix only so names such as "report.csv.bak" are not picked up.
        if not name.lower().endswith(wanted_suffixes):
            continue

        stats = os.stat(os.path.join(somepath, name))

        records[name] = {
            'File Name': name,
            # NOTE: st_ctime is the creation time on Windows but the last metadata
            # change on Unix-like systems.
            'Creation Date': time_convert(stats.st_ctime),
            'Modified Date': time_convert(stats.st_mtime),
            'Last Access Date': time_convert(stats.st_atime),
        }

    return records

def view_records(file_records) -> None:
    """Print each record as ``key: value`` lines, with a blank line after every record."""
    for attributes in file_records.values():
        # Build the whole record as one text block; print() supplies the trailing blank line.
        block = "".join(f"{field}: {value}\n" for field, value in attributes.items())
        print(block)

def get_info(df:pd.DataFrame):
    """Print a quick overview of ``df`` framed by two separator lines.

    The overview consists of the shape, the first rows (``head()``) and the
    ``DataFrame.info()`` summary. Nothing is returned.
    """
    separator = "="*50
    print(separator)
    print(df.shape)
    print(df.head())
    print()
    # info() writes its own report and returns None, so wrapping it in print()
    # used to add a stray "None" line to the output.
    df.info()
    print()
    print(separator)


In [6]:
# Find the most recently created sub-directory of the current working directory
# and make it the working directory for the rest of the notebook.
current_path = os.getcwd()
subdirs = [
    os.path.join(current_path, d)
    for d in os.listdir(current_path)
    if os.path.isdir(os.path.join(current_path, d))
]

# Fail with an explicit message rather than an opaque IndexError when there is nothing to pick.
if not subdirs:
    raise FileNotFoundError(f"No sub-directories found in {current_path!r}")

# Sort the subdirectories by their creation time in descending order.
# NOTE: getctime is the creation time on Windows; on Unix-like systems it is the
# last metadata change time.
sorted_subdirs = sorted(subdirs, key=os.path.getctime, reverse=True)

# Get the latest subdirectory
latest_subdir = sorted_subdirs[0]
os.chdir(latest_subdir)

In [7]:
latest_subdir

'C:\\Users\\alex.jefferies\\Leighs Construction Limited\\People and Culture - 11 Payroll - People Dashboard Data Sources\\240601'

In [8]:
!dir

 Volume in drive C is Windows 
 Volume Serial Number is 9841-01DE

 Directory of C:\Users\alex.jefferies\Leighs Construction Limited\People and Culture - 11 Payroll - People Dashboard Data Sources\240601

05/30/2024  08:28 AM    <DIR>          .
06/04/2024  08:39 AM    <DIR>          ..
05/30/2024  07:59 AM          (21,028) 240601 NorthSalaries EmployeeList.xlsx
05/30/2024  08:00 AM           (2,250) 240601 NORTHSALARIES LEAVEBALANCES.CSV
05/30/2024  08:00 AM          (31,911) 240601 NorthSalaries TerminatedEmployees.xlsx
05/30/2024  08:05 AM          (20,634) 240601 NorthWages EmployeeList.xlsx
05/30/2024  08:07 AM           (2,481) 240601 NORTHWAGES LEAVEBALANCES.CSV
05/30/2024  08:06 AM          (23,689) 240601 NorthWages TerminatedEmployees.xlsx
05/30/2024  08:01 AM          (23,408) 240601 SouthSalaries EmployeeList.xlsx
05/30/2024  08:02 AM          (32,979) 240601 SouthSalaries TerminatedEmployees.xlsx
05/30/2024  08:03 AM          (21,199) 240601 SouthWages EmployeeList.xlsx
0

### Directory sorting and filtering

In [9]:
file_records = create_file_records(latest_subdir)

In [10]:
view_records(file_records)

File Name: 240601 NorthSalaries EmployeeList.xlsx
Creation Date: 2024-05-30
Modified Date: 2024-05-30
Last Access Date: 2024-05-30

File Name: 240601 NORTHSALARIES LEAVEBALANCES.CSV
Creation Date: 2024-05-30
Modified Date: 2024-05-30
Last Access Date: 2024-05-30

File Name: 240601 NorthSalaries TerminatedEmployees.xlsx
Creation Date: 2024-05-30
Modified Date: 2024-05-30
Last Access Date: 2024-05-30

File Name: 240601 NorthWages EmployeeList.xlsx
Creation Date: 2024-05-30
Modified Date: 2024-05-30
Last Access Date: 2024-05-30

File Name: 240601 NORTHWAGES LEAVEBALANCES.CSV
Creation Date: 2024-05-30
Modified Date: 2024-05-30
Last Access Date: 2024-05-30

File Name: 240601 NorthWages TerminatedEmployees.xlsx
Creation Date: 2024-05-30
Modified Date: 2024-05-30
Last Access Date: 2024-05-30

File Name: 240601 SouthSalaries EmployeeList.xlsx
Creation Date: 2024-05-30
Modified Date: 2024-05-30
Last Access Date: 2024-05-30

File Name: 240601 SouthSalaries TerminatedEmployees.xlsx
Creation Date:

In [11]:
# Visual check of the files found in the folder: one name per paragraph.
# There should be 12, not counting "employee_data_tidy".
most_recent_records = file_records.values()
for record in most_recent_records:
    print(record['File Name'], end="\n\n")

240601 NorthSalaries EmployeeList.xlsx

240601 NORTHSALARIES LEAVEBALANCES.CSV

240601 NorthSalaries TerminatedEmployees.xlsx

240601 NorthWages EmployeeList.xlsx

240601 NORTHWAGES LEAVEBALANCES.CSV

240601 NorthWages TerminatedEmployees.xlsx

240601 SouthSalaries EmployeeList.xlsx

240601 SouthSalaries TerminatedEmployees.xlsx

240601 SouthWages EmployeeList.xlsx

240601 SOUTHWAGES LEAVEBALANCES.CSV

240601 SouthWages TerminatedEmployees.xlsx

240601SOUTHSALARIES LEAVEBALANCES.CSV



In [12]:
# Classify every file from keywords found in its lower-cased name.
# The mapping order matters: it fixes the order in which the flags are added to
# (and later printed from) each record.
keyword_flags = {
    "IsSalaries": "salaries",
    "IsWages": "wages",
    "IsNorth": "north",
    "IsSouth": "south",
    "IsTerminated": "terminated",
    "IsLeaveBalance": "balances",
}

for record in most_recent_records:
    lower_name = record["File Name"].lower()
    for flag, keyword in keyword_flags.items():
        # `in` already yields a bool, so no conditional expression is needed.
        record[flag] = keyword in lower_name

divider = "="*50
print(divider)
for item in most_recent_records:
    for k, v in item.items():
        print(f"{k}:{v}")
    print(divider)

File Name:240601 NorthSalaries EmployeeList.xlsx
Creation Date:2024-05-30
Modified Date:2024-05-30
Last Access Date:2024-05-30
IsSalaries:True
IsWages:False
IsNorth:True
IsSouth:False
IsTerminated:False
IsLeaveBalance:False
File Name:240601 NORTHSALARIES LEAVEBALANCES.CSV
Creation Date:2024-05-30
Modified Date:2024-05-30
Last Access Date:2024-05-30
IsSalaries:True
IsWages:False
IsNorth:True
IsSouth:False
IsTerminated:False
IsLeaveBalance:True
File Name:240601 NorthSalaries TerminatedEmployees.xlsx
Creation Date:2024-05-30
Modified Date:2024-05-30
Last Access Date:2024-05-30
IsSalaries:True
IsWages:False
IsNorth:True
IsSouth:False
IsTerminated:True
IsLeaveBalance:False
File Name:240601 NorthWages EmployeeList.xlsx
Creation Date:2024-05-30
Modified Date:2024-05-30
Last Access Date:2024-05-30
IsSalaries:False
IsWages:True
IsNorth:True
IsSouth:False
IsTerminated:False
IsLeaveBalance:False
File Name:240601 NORTHWAGES LEAVEBALANCES.CSV
Creation Date:2024-05-30
Modified Date:2024-05-30
Last A

### Combining and Processing files

In [13]:
def separate_files(records,south_or_north:str="IsSouth",salaried_or_waged:str = "IsWages",leave_filter:str='IsLeaveBalance',terminated_filter='IsTerminated') -> tuple[str,str,str]:
    """Pick the three source files for one region / pay-type combination.

    Args:
        records: Iterable of file-record dicts. Each must contain 'File Name' plus the
            boolean flags named by the other arguments.
        south_or_north: Flag key selecting the region ("IsSouth" or "IsNorth").
        salaried_or_waged: Flag key selecting the pay type ("IsWages" or "IsSalaries").
        leave_filter: Flag key marking a leave-balances file.
        terminated_filter: Flag key marking a terminated-employees file.

    Returns:
        ``(employee_list_file, leave_balances_file, terminated_file)`` as file names.
        If several records fall into the same category, the last one wins.

    Raises:
        ValueError: If any of the three categories has no matching record. Previously
            this surfaced as an UnboundLocalError at the return statement.
    """
    selected = {
        "employee list": None,
        "leave balances": None,
        "terminated employees": None,
    }

    for record in records:
        file = record["File Name"]
        # The tidy export is written by this notebook itself; it is never an input.
        if file == "employee_data_tidy.csv":
            continue
        # Keep only records matching both the requested region and pay type.
        if not (record[south_or_north] and record[salaried_or_waged]):
            continue

        print(record)

        if record[leave_filter]:
            selected["leave balances"] = file
        elif record[terminated_filter]:
            selected["terminated employees"] = file
        else:
            selected["employee list"] = file

    missing = [category for category, found in selected.items() if found is None]
    if missing:
        raise ValueError(
            f"No {', '.join(missing)} file found for {south_or_north} / {salaried_or_waged}"
        )

    return selected["employee list"], selected["leave balances"], selected["terminated employees"]

def add_name_key(df:pd.DataFrame,name_column:str="Name") -> pd.DataFrame:
    """Add a normalised ``Name_Key`` column derived from ``name_column``.

    The key is the name with commas and spaces removed, lower-cased, and with any
    remaining leading/trailing whitespace stripped. ``df`` is modified in place and
    also returned.
    """
    raw_names = df[name_column]
    without_separators = raw_names.str.replace(",", "").str.replace(" ", "")
    df["Name_Key"] = without_separators.str.lower().str.strip()
    return df

def table_to_html(df):
    """Render ``df`` as an HTML table and write it to ``index.html`` in the working directory.

    Any existing ``index.html`` is overwritten. Nothing is returned.
    """
    html = df.to_html()
    # The context manager closes the file even if the write fails. An explicit UTF-8
    # encoding avoids depending on the platform default, which can fail on names
    # containing non-ASCII characters.
    with open("index.html", "w", encoding="utf-8") as text_file:
        text_file.write(html)

def files_to_dataframe(file,leave_balances_file,terminated_file,engine='openpyxl',date_format='%d-%m-%Y') -> pd.DataFrame:
    """Load one group's three source files and combine them into a single DataFrame.

    Args:
        file: Excel workbook of current employees.
        leave_balances_file: CSV of leave balances; must have an "Employee Full Name" column.
        terminated_file: Excel workbook of terminated employees.
        engine: Excel reader engine passed to ``pd.read_excel``.
        date_format: Date format string passed to ``pd.read_excel`` for the parsed date columns.

    Returns:
        Current employees (Status "Active") inner-joined to their leave balances on
        ``Name_Key``, with the terminated employees (Status "Inactive") appended below.

    Side effects:
        Prints progress messages, shapes, column lists and sample rows while loading.
    """
    # Current employees: skip the first row of the sheet and parse the listed date columns.
    # NOTE(review): "Fixed Term Expiry" is parsed as a date for the terminated file below
    # but not here; confirm whether that difference is intentional.
    # NOTE(review): the `date_format` argument of read_excel needs pandas 2.0+; confirm the environment.
    file_df = pd.read_excel(file,skiprows=1,engine=engine,date_format=date_format,parse_dates=["Start Date","Birth Date","Visa Expiry"])
    # trim surrounding whitespace from names
    file_df["Name"] = file_df["Name"].str.lstrip().str.rstrip()
    # diagnostic output
    print(file_df["Fixed Term Expiry"])
    print(file_df.shape)
    # everyone in this file is a current employee
    file_df["Status"] = "Active"
    # add the normalised Name_Key column used for the join below
    add_name_key(file_df, name_column="Name")
    print("main file read successfully!")
    print(file_df.columns)

    # Leave balances: same whitespace trimming and Name_Key derivation, from "Employee Full Name".
    leave_df = pd.read_csv(leave_balances_file)
    print(leave_df.head())
    leave_df["Employee Full Name"] = leave_df["Employee Full Name"].str.lstrip().str.rstrip()
    add_name_key(leave_df, name_column="Employee Full Name")
    print("leave file read successfully!")
    print(leave_df.columns)
    print(leave_df.shape)

    # Terminated employees: read like the main file, with "Finish Date" and "Fixed Term Expiry" also parsed.
    terminated_df = pd.read_excel(terminated_file,skiprows=1,engine=engine,date_format=date_format,parse_dates=["Start Date","Birth Date","Finish Date","Fixed Term Expiry"])
    terminated_df["Status"] = "Inactive"
    terminated_df["Name"] = terminated_df["Name"].str.lstrip().str.rstrip()
    add_name_key(terminated_df, name_column="Name")
    print("terminated file read successfully!")
    print(terminated_df.shape)
    print(terminated_df.columns)
    # Join current employees to their leave balances on Name_Key.
    # NOTE(review): how="inner" drops any current employee whose Name_Key has no match in the
    # leave file, and duplicates rows if a key repeats; confirm this is intended.
    merged_df = file_df.merge(leave_df,left_on="Name_Key",right_on="Name_Key",how="inner")
    # Append terminated staff below the active staff; columns present in only one frame are
    # filled with missing values, and the original row indices are kept (not renumbered).
    concat_df = pd.concat([merged_df,terminated_df],axis=0)
    return concat_df

def add_region(df, modify_to: str = 'North Island'):
    """Set the ``Region`` column of ``df`` to ``modify_to`` for every row.

    The column is created if absent and overwritten otherwise. ``df`` is modified
    in place and also returned.
    """
    region_label = modify_to
    df["Region"] = region_label
    return df

In [14]:

# Build the South Island waged dataset (the separate_files defaults select south + wages).
print("="*50)
print("South Waged")
waged_file,leave_balances_file,terminated_file = separate_files(most_recent_records)
print(waged_file,leave_balances_file,terminated_file,sep="\n")
south_waged = files_to_dataframe(waged_file,leave_balances_file,terminated_file)
# tag every row with its region (south_waged is modified in place)
add_region(south_waged, modify_to="South Island")

get_info(south_waged)


South Waged
{'File Name': '240601 SouthWages EmployeeList.xlsx', 'Creation Date': datetime.date(2024, 5, 30), 'Modified Date': datetime.date(2024, 5, 30), 'Last Access Date': datetime.date(2024, 5, 30), 'IsSalaries': False, 'IsWages': True, 'IsNorth': False, 'IsSouth': True, 'IsTerminated': False, 'IsLeaveBalance': False}
{'File Name': '240601 SOUTHWAGES LEAVEBALANCES.CSV', 'Creation Date': datetime.date(2024, 5, 30), 'Modified Date': datetime.date(2024, 5, 30), 'Last Access Date': datetime.date(2024, 5, 30), 'IsSalaries': False, 'IsWages': True, 'IsNorth': False, 'IsSouth': True, 'IsTerminated': False, 'IsLeaveBalance': True}
{'File Name': '240601 SouthWages TerminatedEmployees.xlsx', 'Creation Date': datetime.date(2024, 5, 30), 'Modified Date': datetime.date(2024, 5, 30), 'Last Access Date': datetime.date(2024, 5, 30), 'IsSalaries': False, 'IsWages': True, 'IsNorth': False, 'IsSouth': True, 'IsTerminated': True, 'IsLeaveBalance': False}
240601 SouthWages EmployeeList.xlsx
240601 SOUT

In [15]:
south_waged.columns

Index(['Name', 'Alpha Code', 'Start Date', 'Department', 'Employment  Status',
       'Salary/ Wage', 'Hours Worked', 'Birth Date', 'Occupation', 'Ethnicity',
       'Visa Type', 'Visa Duration', 'Visa Expiry', 'EstimatedFixedTermExpiry',
       'FixedTermAgreement', 'Fixed Term Expiry', 'Gender', 'Status',
       'Name_Key', 'Employee Full Name', ' Sick/Special Leave Balance',
       ' Holidays Balance', 'Finish Date', 'Region'],
      dtype='object')

In [16]:
# North Island waged staff: select the files flagged as both north and wages.

print("="*50)
print("North Waged")
north_waged_file,leave_balances_file,terminated_file = separate_files(most_recent_records,south_or_north="IsNorth",salaried_or_waged="IsWages")
north_waged = files_to_dataframe(north_waged_file,leave_balances_file,terminated_file)

add_region(north_waged,"North Island")
# Disabled step: adjust_departments(north_waged)

# Final preprocessing (not implemented yet).
# TODO: for exited staff MYOB defaults the Cost Centre to "1 - Leighs Christchurch".
# North Island wages should be changed to Auckland (by default) or New Plymouth.
# Draft, not valid as written: north_waged.loc[north_waged["Finish"] and [north_waged["Cost Centre"]] = "2 - Leighs Auckland"
get_info(north_waged)


North Waged
{'File Name': '240601 NorthWages EmployeeList.xlsx', 'Creation Date': datetime.date(2024, 5, 30), 'Modified Date': datetime.date(2024, 5, 30), 'Last Access Date': datetime.date(2024, 5, 30), 'IsSalaries': False, 'IsWages': True, 'IsNorth': True, 'IsSouth': False, 'IsTerminated': False, 'IsLeaveBalance': False}
{'File Name': '240601 NORTHWAGES LEAVEBALANCES.CSV', 'Creation Date': datetime.date(2024, 5, 30), 'Modified Date': datetime.date(2024, 5, 30), 'Last Access Date': datetime.date(2024, 5, 30), 'IsSalaries': False, 'IsWages': True, 'IsNorth': True, 'IsSouth': False, 'IsTerminated': False, 'IsLeaveBalance': True}
{'File Name': '240601 NorthWages TerminatedEmployees.xlsx', 'Creation Date': datetime.date(2024, 5, 30), 'Modified Date': datetime.date(2024, 5, 30), 'Last Access Date': datetime.date(2024, 5, 30), 'IsSalaries': False, 'IsWages': True, 'IsNorth': True, 'IsSouth': False, 'IsTerminated': True, 'IsLeaveBalance': False}
0           NaT
1    2025-02-28
2           NaT

In [17]:
south_waged.columns

Index(['Name', 'Alpha Code', 'Start Date', 'Department', 'Employment  Status',
       'Salary/ Wage', 'Hours Worked', 'Birth Date', 'Occupation', 'Ethnicity',
       'Visa Type', 'Visa Duration', 'Visa Expiry', 'EstimatedFixedTermExpiry',
       'FixedTermAgreement', 'Fixed Term Expiry', 'Gender', 'Status',
       'Name_Key', 'Employee Full Name', ' Sick/Special Leave Balance',
       ' Holidays Balance', 'Finish Date', 'Region'],
      dtype='object')

In [18]:
#concatenate north and south for waged
waged = pd.concat([south_waged,north_waged])
waged.head()

  waged = pd.concat([south_waged,north_waged])


Unnamed: 0,Name,Alpha Code,Start Date,Department,Employment Status,Salary/ Wage,Hours Worked,Birth Date,Occupation,Ethnicity,...,FixedTermAgreement,Fixed Term Expiry,Gender,Status,Name_Key,Employee Full Name,Sick/Special Leave Balance,Holidays Balance,Finish Date,Region
0,"Abad, Arnold Sanguyo",ABAD,2012-09-27 00:00:00,22 - Projects Operations,Permanent,Wage,40.0,1977-10-08 00:00:00,Leading Hand,...,...,False,NaT,Male,Active,abadarnoldsanguyo,Abad Arnold Sanguyo,9.0,18.12,,South Island
1,"Acantilado, Richel",ACANTILA,2015-01-08 00:00:00,22 - Projects Operations,Permanent,Wage,40.0,1977-01-24 00:00:00,Carpenter,...,...,False,NaT,Male,Active,acantiladorichel,Acantilado Richel,19.0,15.26,,South Island
2,"Adlaon, Arthur Prieto",ADLAO001,2022-10-13 00:00:00,22 - Projects Operations,Permanent,Wage,47.5,1970-03-24 00:00:00,Carpenter,...,...,False,NaT,Male,Active,adlaonarthurprieto,Adlaon Arthur Prieto,10.0,11.9,,South Island
3,"Algar, Roy Anthony",ALGAR,2011-09-12 00:00:00,25 - Projects Management,Permanent,Wage,45.0,1973-12-02 00:00:00,Site Supervisor,...,...,False,NaT,Male,Active,algarroyanthony,Algar Roy Anthony,0.0,5.11,,South Island
4,"Anderson, Liam James",ANDER001,2022-08-29 00:00:00,25 - Projects Management,Permanent,Wage,45.0,1989-07-31 00:00:00,Site Supervisor,NZ European ...,...,False,NaT,Male,Active,andersonliamjames,Anderson Liam James,9.0,14.0,,South Island


In [19]:
#fixed term dates not parsed 
#join for leave balances not working
missing_info(waged)
#missing_info(waged.query("Status == 'Active'"))
#waged.to_csv("waged_check.csv")

                             missing_count  missing_ratio
Name                                     0       0.000000
Alpha Code                               0       0.000000
Start Date                               0       0.000000
Department                               0       0.000000
Employment  Status                       0       0.000000
Salary/ Wage                             0       0.000000
Hours Worked                             0       0.000000
Birth Date                               6       0.009174
Occupation                               0       0.000000
Ethnicity                                0       0.000000
Visa Type                                0       0.000000
Visa Duration                            0       0.000000
Visa Expiry                            603       0.922018
EstimatedFixedTermExpiry               513       0.784404
FixedTermAgreement                     513       0.784404
Fixed Term Expiry                      646       0.987768
Gender        

In [20]:
#note. completely missing leave balances for inactive employees is to be expected 
missing_info(waged.query("Status != 'Active'"))

                             missing_count  missing_ratio
Name                                     0       0.000000
Alpha Code                               0       0.000000
Start Date                               0       0.000000
Department                               0       0.000000
Employment  Status                       0       0.000000
Salary/ Wage                             0       0.000000
Hours Worked                             0       0.000000
Birth Date                               6       0.011696
Occupation                               0       0.000000
Ethnicity                                0       0.000000
Visa Type                                0       0.000000
Visa Duration                            0       0.000000
Visa Expiry                            513       1.000000
EstimatedFixedTermExpiry               513       1.000000
FixedTermAgreement                     513       1.000000
Fixed Term Expiry                      513       1.000000
Gender        

In [21]:
waged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 654 entries, 0 to 107
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Name                         654 non-null    object        
 1   Alpha Code                   654 non-null    object        
 2   Start Date                   654 non-null    object        
 3   Department                   654 non-null    object        
 4   Employment  Status           654 non-null    object        
 5   Salary/ Wage                 654 non-null    object        
 6   Hours Worked                 654 non-null    float64       
 7   Birth Date                   648 non-null    object        
 8   Occupation                   654 non-null    object        
 9   Ethnicity                    654 non-null    object        
 10  Visa Type                    654 non-null    object        
 11  Visa Duration                654 non-null    objec

In [22]:

# Build the South Island salaried dataset from the files flagged as both south and salaries.
print("="*50)
print("South Salaried")
south_salaried_file,leave_balances_file,terminated_file = separate_files(most_recent_records,south_or_north="IsSouth",salaried_or_waged="IsSalaries")
south_salaried = files_to_dataframe(south_salaried_file,leave_balances_file,terminated_file)
# tag every row with its region (south_salaried is modified in place)
add_region(south_salaried,"South Island")

get_info(south_salaried)

South Salaried
{'File Name': '240601 SouthSalaries EmployeeList.xlsx', 'Creation Date': datetime.date(2024, 5, 30), 'Modified Date': datetime.date(2024, 5, 30), 'Last Access Date': datetime.date(2024, 5, 30), 'IsSalaries': True, 'IsWages': False, 'IsNorth': False, 'IsSouth': True, 'IsTerminated': False, 'IsLeaveBalance': False}
{'File Name': '240601 SouthSalaries TerminatedEmployees.xlsx', 'Creation Date': datetime.date(2024, 5, 30), 'Modified Date': datetime.date(2024, 5, 30), 'Last Access Date': datetime.date(2024, 5, 30), 'IsSalaries': True, 'IsWages': False, 'IsNorth': False, 'IsSouth': True, 'IsTerminated': True, 'IsLeaveBalance': False}
{'File Name': '240601SOUTHSALARIES LEAVEBALANCES.CSV', 'Creation Date': datetime.date(2024, 5, 30), 'Modified Date': datetime.date(2024, 5, 30), 'Last Access Date': datetime.date(2024, 5, 30), 'IsSalaries': True, 'IsWages': False, 'IsNorth': False, 'IsSouth': True, 'IsTerminated': False, 'IsLeaveBalance': True}
0           NaT
1           NaT
2   

In [23]:
# Build the North Island salaried dataset from the files flagged as both north and salaries.
print("="*50)
print("North Salaried")
north_salaried_file,leave_balances_file,terminated_file = separate_files(most_recent_records,south_or_north="IsNorth",salaried_or_waged="IsSalaries")
north_salaried = files_to_dataframe(north_salaried_file,leave_balances_file,terminated_file)

# tag every row with its region (north_salaried is modified in place)
add_region(north_salaried,"North Island")
get_info(north_salaried)

North Salaried
{'File Name': '240601 NorthSalaries EmployeeList.xlsx', 'Creation Date': datetime.date(2024, 5, 30), 'Modified Date': datetime.date(2024, 5, 30), 'Last Access Date': datetime.date(2024, 5, 30), 'IsSalaries': True, 'IsWages': False, 'IsNorth': True, 'IsSouth': False, 'IsTerminated': False, 'IsLeaveBalance': False}
{'File Name': '240601 NORTHSALARIES LEAVEBALANCES.CSV', 'Creation Date': datetime.date(2024, 5, 30), 'Modified Date': datetime.date(2024, 5, 30), 'Last Access Date': datetime.date(2024, 5, 30), 'IsSalaries': True, 'IsWages': False, 'IsNorth': True, 'IsSouth': False, 'IsTerminated': False, 'IsLeaveBalance': True}
{'File Name': '240601 NorthSalaries TerminatedEmployees.xlsx', 'Creation Date': datetime.date(2024, 5, 30), 'Modified Date': datetime.date(2024, 5, 30), 'Last Access Date': datetime.date(2024, 5, 30), 'IsSalaries': True, 'IsWages': False, 'IsNorth': True, 'IsSouth': False, 'IsTerminated': True, 'IsLeaveBalance': False}
0           NaT
1    2025-05-30
2  

In [24]:
north_salaried.head()

Unnamed: 0,Name,Alpha Code,Start Date,Department,Employment Status,Salary/ Wage,Hours Worked,Birth Date,Occupation,Ethnicity,...,FixedTermAgreement,Fixed Term Expiry,Gender,Status,Name_Key,Employee Full Name,Sick/Special Leave Balance,Holidays Balance,Finish Date,Region
0,"Alombro, Allan Jesus Benitez",ALOMBRO,2023-01-23 00:00:00,25 - Projects Management,Permanent,Salary,45,1971-02-04 00:00:00,Site Manager,Filipino ...,...,False,NaT,Male,Active,alombroallanjesusbenitez,Alombro Allan Jesus Benitez,9.0,8.82,,North Island
1,"Astashkin, Ivan",ASTASHKI,2024-03-25 00:00:00,25 - Projects Management,Permanent,Salary,45,1990-08-24 00:00:00,Site Supervisor,European ...,...,True,2025-05-30,Male,Active,astashkinivan,Astashkin Ivan,0.0,3.78,,North Island
2,"Baranyai, Damian Miklos",BARANYAI,2021-11-29 00:00:00,26 - Future Leaders Progr,Permanent,Salary,45,1998-01-18 00:00:00,Building Services Coordinator,...,...,False,NaT,Male,Active,baranyaidamianmiklos,Baranyai Damian Miklos,8.33,12.77,,North Island
3,"Barnes, Stuart Ross",BARNES,2023-09-18 00:00:00,25 - Projects Management,Permanent,Salary,45,1985-12-20 00:00:00,Site Manager,NZ European ...,...,True,2025-02-28,Male,Active,barnesstuartross,Barnes Stuart Ross,10.0,6.82,,North Island
4,"Bence, Gary Peter",BENCE,2021-04-19 00:00:00,23 - Projects Commercial,Permanent,Salary,45,1960-10-07 00:00:00,Senior Quantity Surveyor,...,...,True,2025-04-25,Male,Active,bencegarypeter,Bence Gary Peter,20.0,5.41,,North Island


In [25]:
#concatenate north and south for salaried
salaried = pd.concat([south_salaried,north_salaried])
salaried.head()

Unnamed: 0,Name,Alpha Code,Start Date,Department,Employment Status,Salary/ Wage,Hours Worked,Birth Date,Occupation,Ethnicity,...,FixedTermAgreement,Fixed Term Expiry,Gender,Status,Name_Key,Employee Full Name,Sick/Special Leave Balance,Holidays Balance,Finish Date,Region
0,"Aitcheson, Shane Andrew",AITCHESO,2015-01-12 00:00:00,25 - Projects Management,Permanent,Salary,45.0,1985-09-11 00:00:00,Project Manager,...,...,False,NaT,Male,Active,aitchesonshaneandrew,Aitcheson Shane Andrew,17.53,4.7,,South Island
1,"Antrobus, Dean",ANTRO001,2021-01-11 00:00:00,23 - Projects Commercial,Permanent,Salary,45.0,1969-01-29 00:00:00,Senior Quantity Surveyor,...,...,False,NaT,Male,Active,antrobusdean,Antrobus Dean,11.2,7.84,,South Island
2,"Aston, Stephen James",ASTON,2023-10-30 00:00:00,25 - Projects Management,Permanent,Salary,45.0,1980-07-16 00:00:00,Site Manager,British ...,...,False,NaT,Male,Active,astonstephenjames,Aston Stephen James,10.0,11.84,,South Island
3,"Baggstrom, Kimberley Dawn",BAGGSTRO,2023-09-07 00:00:00,13 - Administration,Permanent,Salary,40.0,1990-05-19 00:00:00,Regional Management Team Administra,"NZ Maori, European ...",...,False,NaT,Female,Active,baggstromkimberleydawn,Baggstrom Kimberley Dawn,9.0,3.82,,South Island
4,"Baker, Anthony Ian",BAKER001,2024-01-22 00:00:00,30 - External Consultant,Permanent,Salary,40.0,1983-11-18 00:00:00,Job Pac Consultant,NZ European ...,...,True,2024-05-10,Male,Active,bakeranthonyian,Baker Anthony Ian,0.0,2.23,,South Island


In [26]:
missing_info(salaried)

                             missing_count  missing_ratio
Name                                     0       0.000000
Alpha Code                               0       0.000000
Start Date                               0       0.000000
Department                               0       0.000000
Employment  Status                       0       0.000000
Salary/ Wage                             0       0.000000
Hours Worked                             0       0.000000
Birth Date                               1       0.001866
Occupation                               0       0.000000
Ethnicity                                0       0.000000
Visa Type                                0       0.000000
Visa Duration                            0       0.000000
Visa Expiry                            529       0.986940
EstimatedFixedTermExpiry               381       0.710821
FixedTermAgreement                     381       0.710821
Fixed Term Expiry                      512       0.955224
Gender        

In [28]:
salaried.info()

<class 'pandas.core.frame.DataFrame'>
Index: 536 entries, 0 to 185
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Name                         536 non-null    object        
 1   Alpha Code                   536 non-null    object        
 2   Start Date                   536 non-null    object        
 3   Department                   536 non-null    object        
 4   Employment  Status           536 non-null    object        
 5   Salary/ Wage                 536 non-null    object        
 6   Hours Worked                 536 non-null    float64       
 7   Birth Date                   535 non-null    object        
 8   Occupation                   536 non-null    object        
 9   Ethnicity                    536 non-null    object        
 10  Visa Type                    536 non-null    object        
 11  Visa Duration                536 non-null    objec

### Combining salaried and waged data

In [29]:

# Guard: salaried and waged must have the same columns in the same order before
# they are stacked. Whole lists are compared because zip() stops at the shorter
# input, so extra trailing columns on either frame would otherwise go unnoticed.
salaried_columns = list(salaried.columns)
waged_columns = list(waged.columns)
mismatch = salaried_columns != waged_columns
if mismatch:
    print("There is a mismatch of columns, please reorder appropriately")
    raise ValueError(
        f"Column mismatch between salaried and waged: {salaried_columns} != {waged_columns}"
    )
else:
    print("No Column Mismatch, can combine waged and salaried!")
    

No Column Mismatch, can combine waged and salaried!


In [30]:
# Stack salaried on top of waged, then discard the helper name columns
# ("Employee Full Name" and the "Name_Key" join key).
helper_columns = ["Employee Full Name","Name_Key"]
employee_data = pd.concat([salaried,waged]).drop(columns=helper_columns)

# Stamp every row with the time this cell was run.
employee_data["Data Last Updated At"] = dt.now()

get_info(employee_data)

(1190, 23)
                        Name Alpha Code           Start Date  \
0    Aitcheson, Shane Andrew   AITCHESO  2015-01-12 00:00:00   
1             Antrobus, Dean   ANTRO001  2021-01-11 00:00:00   
2       Aston, Stephen James   ASTON     2023-10-30 00:00:00   
3  Baggstrom, Kimberley Dawn   BAGGSTRO  2023-09-07 00:00:00   
4         Baker, Anthony Ian   BAKER001  2024-01-22 00:00:00   

                 Department Employment  Status Salary/ Wage  Hours Worked  \
0  25 - Projects Management          Permanent     Salary            45.0   
1  23 - Projects Commercial          Permanent     Salary            45.0   
2  25 - Projects Management          Permanent     Salary            45.0   
3       13 - Administration          Permanent     Salary            40.0   
4  30 - External Consultant          Permanent     Salary            40.0   

            Birth Date                           Occupation  \
0  1985-09-11 00:00:00  Project Manager                       
1  1969-01-29 0

In [31]:
missing_info(employee_data)
#fixed term expiry still not parsing
#holiday and sick leave not joining

                             missing_count  missing_ratio
Name                                     0       0.000000
Alpha Code                               0       0.000000
Start Date                               0       0.000000
Department                               0       0.000000
Employment  Status                       0       0.000000
Salary/ Wage                             0       0.000000
Hours Worked                             0       0.000000
Birth Date                               7       0.005882
Occupation                               0       0.000000
Ethnicity                                0       0.000000
Visa Type                                0       0.000000
Visa Duration                            0       0.000000
Visa Expiry                           1132       0.951261
EstimatedFixedTermExpiry               894       0.751261
FixedTermAgreement                     894       0.751261
Fixed Term Expiry                     1158       0.973109
Gender        

### Exporting tidied data to file for use in powerBI

In [32]:
#reset index on table
employee_data.reset_index(drop=True,inplace=True)

In [33]:
employee_data.to_csv('employee_data_tidy.csv')
print("Data Cleaned successfully!")

Data Cleaned successfully!


In [34]:
#change back to data analytics directory. Add a copy of data to make the active version 
os.chdir(r"C:\Users\alex.jefferies\Leighs Construction Limited\Data Analytics - Documents\General\Projects\HR\People Dashboard")
employee_data.to_csv('employee_data_tidy_active.csv')
print("Data Copied successfully!")

Data Copied successfully!


In [35]:
#render table as html
#table_to_html(employee_data)

In [36]:
# Save a month-stamped copy of the data in the "History" subfolder.
# The timestamp is formatted outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
month_stamp = dt.today().strftime("%m%Y")
filename = f"employee_data_{month_stamp}.csv"
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs("History", exist_ok=True)
employee_data.to_csv(f"History/{filename}", index=False)