In [15]:
import pandas as pd
from typing import Union, List, Tuple, Optional
# Show floats with thousands separators and two decimals in every rendered frame.
pd.set_option("display.float_format", "{:,.2f}".format)

# Optimizing Memory Use

In [16]:
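# NOTE: the commented-out lines in this cell are the earlier exploratory approach
# (load first, then convert dtypes with astype). The next cell supersedes them by
# passing the same dtype mapping directly to read_csv.
# sal = pd.read_csv("../data/salary2.csv")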

# sal.info()

# df = sal
# df.info(memory_usage='deep')
# df["Department"] = df["Department"].astype("category")
# df["jobTitle"] = df["jobTitle"].astype("category")
# df["Employee_Name"] = df["Employee_Name"].astype("string")
# df.info(memory_usage='deep')

# optimized_dtypes = {
#     "CalYear": "int16",
#     "Employee_Name": "string",
#     "Department": "category",
#     "jobTitle": "category",
#     "Annual_Rate": "float32",
#     "Regular_Rate": "float32",
#     "Overtime_Rate": "float32",
#     "Incentive_Allowance": "float32",
#     "Other": "float32",
#     "YTD_Total": "float32",
#     "ObjectId": "int32"
# }
# df = df.astype(optimized_dtypes)


In [17]:
# Column name -> dtype mapping handed to read_csv so the frame is parsed
# straight into compact dtypes.
optimized_dtypes = {
    "CalYear": "int16",
    "Employee_Name": "string",
    # Text label columns are stored as categoricals.
    "Department": "category",
    "jobTitle": "category",
    # Every float column shares the same 32-bit dtype, so they are declared together.
    **dict.fromkeys(
        (
            "Annual_Rate",
            "Regular_Rate",
            "Overtime_Rate",
            "Incentive_Allowance",
            "Other",
            "YTD_Total",
        ),
        "float32",
    ),
    "ObjectId": "int32",
}

# Apply the compact dtypes while parsing, so no separate astype pass is needed after loading.
sal = pd.read_csv(
    "../data/salary2.csv",
    dtype=optimized_dtypes,
)


In [18]:
# Preview the first five rows of the loaded frame.
sal.head(n=5)

Unnamed: 0,CalYear,Employee_Name,Department,jobTitle,Annual_Rate,Regular_Rate,Overtime_Rate,Incentive_Allowance,Other,YTD_Total,ObjectId
0,2021,,Belle of Louisville,Hospitality Worker,3120.0,687.0,0.0,0.0,,687.0,1
1,2021,,Parks & Recreation,Park Aide,31200.0,600.0,0.0,0.0,,600.0,2
2,2021,"Martin, David",Library,Library Page L/U,28433.6,28412.56,0.0,1450.0,,30862.81,3
3,2021,"Bratcher, Elaine",Louisville Metro Police,Clerk Typist II-Police,35256.0,35256.01,0.0,1563.12,,36819.13,4
4,2021,"Jackson, Nila",Louisville Metro Police,Traffic Guard II,21418.8,16529.14,952.39,1000.0,,18481.53,5


In [19]:
# value_counts() leaves NaN out by default, so an empty result here means
# the "Other" column holds no non-null values at all.
sal["Other"].value_counts(dropna=True)

Series([], Name: count, dtype: int64)

## Cleaning

In [20]:
# "Other" has no non-null values (its value_counts above is empty), so the
# column is removed outright; filling it first would be wasted work.
# Missing employee names are replaced with the placeholder "Unknown".
sal = (
    sal
    # errors="ignore" keeps this cell re-runnable: once "Other" is gone, a plain
    # drop would raise KeyError on the second execution.
    .drop(columns="Other", errors="ignore")
    .fillna({"Employee_Name": "Unknown"})
)
sal.head()

Unnamed: 0,CalYear,Employee_Name,Department,jobTitle,Annual_Rate,Regular_Rate,Overtime_Rate,Incentive_Allowance,YTD_Total,ObjectId
0,2021,Unknown,Belle of Louisville,Hospitality Worker,3120.0,687.0,0.0,0.0,687.0,1
1,2021,Unknown,Parks & Recreation,Park Aide,31200.0,600.0,0.0,0.0,600.0,2
2,2021,"Martin, David",Library,Library Page L/U,28433.6,28412.56,0.0,1450.0,30862.81,3
3,2021,"Bratcher, Elaine",Louisville Metro Police,Clerk Typist II-Police,35256.0,35256.01,0.0,1563.12,36819.13,4
4,2021,"Jackson, Nila",Louisville Metro Police,Traffic Guard II,21418.8,16529.14,952.39,1000.0,18481.53,5


In [None]:
# Payroll assumptions used by every derived column in this cell.
FULL_TIME_HOURS_PER_YEAR = 2080  # 52 weeks * 40 hours
HOURS_PER_WEEK = 40
MIN_HOURLY_WAGE = 7.25           # hourly floor ("min wage" in the original note)
OVERTIME_MULTIPLIER = 1.5

# Hourly rate implied by the annual rate at a full-time schedule.
hourly_from_annual = sal["Annual_Rate"] / FULL_TIME_HOURS_PER_YEAR

# A row is treated as full time when its implied hourly rate reaches the wage floor.
# The decision is made on this direct comparison rather than on
# `Annual_Rate / Hr_Rate >= 2080`: that float32 round trip can land one ulp below
# 2080 and silently discard genuinely full-time rows. NaN annual rates compare
# False and are filtered out, as before.
is_full_time = hourly_from_annual >= MIN_HOURLY_WAGE

# Hourly rate, raised to the wage floor where the implied rate falls below it.
sal["Hr_Rate"] = hourly_from_annual.mask(
    hourly_from_annual < MIN_HOURLY_WAGE, MIN_HOURLY_WAGE
)

# Overtime hourly rate.
sal["Ot_Rate"] = sal["Hr_Rate"] * OVERTIME_MULTIPLIER

# Annual regular hours implied by the (possibly floored) hourly rate.
sal["Reg_Hours"] = sal["Annual_Rate"] / sal["Hr_Rate"]

# Filter out part-time employees. The explicit copy makes the column assignments
# below operate on an independent frame instead of a slice of the old one
# (avoids SettingWithCopyWarning).
sal = sal.loc[is_full_time].copy()

# Overtime hours: overtime amount divided by the overtime rate.
# NOTE(review): assumes "Overtime_Rate" and "Regular_Rate" hold pay amounts - confirm.
sal["Ot_Hours"] = sal["Overtime_Rate"] / sal["Ot_Rate"]

sal["Total_Hours"] = sal["Reg_Hours"] + sal["Ot_Hours"]

# Weeks worked, assuming each worked week is a full 40-hour week.
sal["Weeks_Worked"] = sal["Regular_Rate"] / (sal["Hr_Rate"] * HOURS_PER_WEEK)

# Rows with a Regular_Rate of 0 have 0 weeks worked, so the per-week columns
# below come out as NaN (0 / 0) for them.
# NOTE(review): because Weeks_Worked is itself derived from Regular_Rate at
# 40 h/week, Reg_Hrs_per_Week reduces to 40 wherever Weeks_Worked is non-zero.
sal["Reg_Hrs_per_Week"] = sal["Regular_Rate"] / (sal["Hr_Rate"] * sal["Weeks_Worked"])

sal["Ot_Hrs_per_Week"] = sal["Overtime_Rate"] / (sal["Ot_Rate"] * sal["Weeks_Worked"])

sal["Avg_Hrs_per_Week"] = sal["Reg_Hrs_per_Week"] + sal["Ot_Hrs_per_Week"]

sal.head(10)

Unnamed: 0,CalYear,Employee_Name,Department,jobTitle,Annual_Rate,Regular_Rate,Overtime_Rate,Incentive_Allowance,YTD_Total,ObjectId,Hr_Rate,Ot_Rate,Reg_Hours,Ot_Hours,Total_Hours,Weeks_Worked,Reg_Hrs_per_Week,Ot_Hrs_per_Week,Avg_Hrs_per_Week
1,2021,Unknown,Parks & Recreation,Park Aide,31200.0,600.0,0.0,0.0,600.0,2,15.0,22.5,2080.0,0.0,2080.0,1.0,40.0,0.0,40.0
2,2021,"Martin, David",Library,Library Page L/U,28433.6,28412.56,0.0,1450.0,30862.81,3,13.67,20.51,2080.0,0.0,2080.0,51.96,40.0,0.0,40.0
3,2021,"Bratcher, Elaine",Louisville Metro Police,Clerk Typist II-Police,35256.0,35256.01,0.0,1563.12,36819.13,4,16.95,25.43,2080.0,0.0,2080.0,52.0,40.0,0.0,40.0
4,2021,"Jackson, Nila",Louisville Metro Police,Traffic Guard II,21418.8,16529.14,952.39,1000.0,18481.53,5,10.3,15.45,2080.0,61.66,2141.66,40.13,40.0,1.54,41.54
5,2021,"Ammon, Darrell",Louisville Metro Police,Criminal Justice Specialist,50107.2,49362.47,0.0,0.0,49362.47,6,24.09,36.14,2080.0,0.0,2080.0,51.23,40.0,0.0,40.0
6,2021,"Guagliardo, Paul",County Attorney,Senior Attorney,98425.6,73865.12,0.0,0.0,89007.52,7,47.32,70.98,2080.0,0.0,2080.0,39.02,40.0,0.0,40.0
7,2021,"Waggoner, David",Louisville Fire,Fire Prevention Inspector I,58963.84,59557.82,0.0,13290.04,72847.86,8,28.35,42.52,2080.0,0.0,2080.0,52.52,40.0,0.0,40.0
8,2021,"Clay, Timothy",Louisville Zoo,Zoo Crew Leader-Seasonal,29120.0,19052.8,160.8,0.0,19213.6,9,14.0,21.0,2080.0,7.66,2087.66,34.02,40.0,0.23,40.23
9,2021,"Lenahan, Larry",Finance,Budget Analyst I,36972.0,25963.56,189.81,0.0,26153.37,10,17.77,26.66,2080.0,7.12,2087.12,36.52,40.0,0.19,40.19
10,2021,"Taylor, Steven",Louisville Zoo,Assistant Director,86041.8,85080.97,0.0,0.0,85742.83,11,41.37,62.05,2080.0,0.0,2080.0,51.42,40.0,0.0,40.0


In [22]:
# Rank rows from the highest to the lowest average weekly hours
# (rows whose average is NaN end up at the bottom).
sal = sal.sort_values("Avg_Hrs_per_Week", ascending=False)
sal.head(n=5)

Unnamed: 0,CalYear,Employee_Name,Department,jobTitle,Annual_Rate,Regular_Rate,Overtime_Rate,Incentive_Allowance,YTD_Total,ObjectId,Hr_Rate,Ot_Rate,Reg_Hours,Ot_Hours,Total_Hours,Weeks_Worked,Reg_Hrs_per_Week,Ot_Hrs_per_Week,Avg_Hrs_per_Week
23524,2025,"Leonard, Dillon Michael",Louisville Fire,Firefighter 56hr,63441.66,12.04,51.01,0.0,63.05,23525,30.5,45.75,2080.0,1.11,2081.11,0.01,40.0,112.98,152.98
34985,2024,"Scharrer, Justin",Emergency Management Services,EMT,60424.0,5513.59,18510.8,0.0,33640.39,34986,29.05,43.57,2080.0,424.8,2504.8,4.74,40.0,89.53,129.53
14300,2023,"Murphy, Paul P.",Louisville Fire,Fire Apparatus Operator 56hr,72077.82,40383.16,97922.36,39150.6,177456.12,14301,34.65,51.98,2080.0,1883.88,3963.88,29.13,40.0,64.66,104.66
38644,2024,"McKeehan, Matthew Douglas",Emergency Management Services,EMT,45926.4,13663.62,31838.34,350.0,45851.96,38645,22.08,33.12,2080.0,961.3,3041.3,15.47,40.0,62.14,102.14
15465,2023,"Herndon, Adam B",Louisville Fire,Fire Company Commander 56hr,82278.14,42926.26,94210.22,29398.8,167021.05,15466,39.56,59.34,2080.0,1587.76,3667.76,27.13,40.0,58.53,98.53


In [23]:
# Tally the distinct regular-hours values that remain after the full-time filter.
sal["Reg_Hours"].value_counts(dropna=True)

Reg_Hours
2,080.00    39282
Name: count, dtype: int64

In [24]:
def year_selector(
    df: pd.DataFrame,
    year: Union[int, List[int], Tuple[int, int]]
) -> pd.DataFrame:
    """
    Filter a DataFrame by one or more calendar years.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame containing a 'CalYear' column.
    year : int | list[int] | tuple[int, int]
        The year or years to filter by.
        - If int (Python or NumPy integer): returns rows for that year only.
        - If list of ints: returns rows matching any year in the list.
        - If tuple of (start, end): returns rows within that inclusive range.

    Returns
    -------
    pd.DataFrame
        A filtered DataFrame containing only rows that match the given year(s).
        The input frame is not modified.

    Raises
    ------
    TypeError
        If `year` is not an integer, a list, or a 2-element tuple.
    KeyError
        If `df` has no 'CalYear' column.

    Examples
    --------
    >>> year_selector(sal, 2021)
    # Returns rows where CalYear == 2021

    >>> year_selector(sal, sal["CalYear"].max())
    # NumPy integers taken from the data itself are accepted too

    >>> year_selector(sal, [2020, 2021])
    # Returns rows where CalYear is either 2020 or 2021

    >>> year_selector(sal, (2019, 2021))
    # Returns rows where CalYear is between 2019 and 2021 inclusive
    """
    # `isinstance(year, int)` alone rejects NumPy integers such as the int16 values
    # stored in CalYear; `is_integer` accepts those as well. The isinstance check
    # is kept so every input that worked before still works.
    if isinstance(year, int) or pd.api.types.is_integer(year):
        return df[df["CalYear"] == year]

    if isinstance(year, list):
        return df[df["CalYear"].isin(year)]

    if isinstance(year, tuple) and len(year) == 2:
        start, end = year
        # Series.between is inclusive on both ends by default.
        return df[df["CalYear"].between(start, end)]

    raise TypeError("`year` must be an int, list of ints, or tuple of (start, end).")

In [25]:
# Keep only the calendar-year 2025 rows.
sal_2025 = year_selector(df=sal, year=2025)
sal_2025

Unnamed: 0,CalYear,Employee_Name,Department,jobTitle,Annual_Rate,Regular_Rate,Overtime_Rate,Incentive_Allowance,YTD_Total,ObjectId,Hr_Rate,Ot_Rate,Reg_Hours,Ot_Hours,Total_Hours,Weeks_Worked,Reg_Hrs_per_Week,Ot_Hrs_per_Week,Avg_Hrs_per_Week
23524,2025,"Leonard, Dillon Michael",Louisville Fire,Firefighter 56hr,63441.66,12.04,51.01,0.00,63.05,23525,30.50,45.75,2080.00,1.11,2081.11,0.01,40.00,112.98,152.98
21009,2025,"Fenwick, Paul E",Louisville Metro Police Department,Police Officer,95555.20,75315.67,134672.06,10093.88,220081.61,21010,45.94,68.91,2080.00,1954.32,4034.32,40.99,40.00,47.68,87.68
21560,2025,"Standard, Royce Leshawn",Department of Corrections,Corrections Officer,69596.80,55491.41,95985.71,0.00,151477.12,21561,33.46,50.19,2080.00,1912.45,3992.45,41.46,40.00,46.13,86.13
22397,2025,"Stimphil, Richardson",Department of Corrections,Corrections Officer,66331.20,50618.52,86647.31,0.00,137265.83,22398,31.89,47.84,2080.00,1811.38,3891.38,39.68,40.00,45.65,85.65
26842,2025,"Mann, Mikayla Faith",Emergency Management Services,EMS EMT-Paramedic I,62046.40,3740.70,6106.35,0.00,9847.05,26843,29.83,44.74,2080.00,136.47,2216.47,3.14,40.00,43.53,83.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27302,2025,"Cheatham, Paulette D",Louisville Zoo,Receptionist ZU,36379.20,0.00,0.00,0.00,0.00,27303,17.49,26.24,2080.00,0.00,2080.00,0.00,,,
27436,2025,"Washington, Kingston",Parks & Recreation,Parks & Rec. Aide,34632.00,0.00,0.00,0.00,0.00,27437,16.65,24.97,2080.00,0.00,2080.00,0.00,,,
27472,2025,"Fields, Christian",Parks & Recreation,Parks & Rec. Aide,34632.00,0.00,0.00,0.00,0.00,27473,16.65,24.97,2080.00,0.00,2080.00,0.00,,,
27550,2025,"Wescott, Brenda",Louisville Metro Police Department,Police Officer,63585.60,0.00,0.00,0.00,0.00,27551,30.57,45.86,2080.00,0.00,2080.00,0.00,,,


In [26]:
def department_filter(
    df: pd.DataFrame,
    department: Union[str, List[str]]
) -> pd.DataFrame:
    """
    Filter a DataFrame by one or more department names.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame containing a 'Department' column. Column labels may
        carry stray leading/trailing whitespace; they are matched after stripping.
        The input frame itself is left untouched.
    department : str | list[str]
        Department name(s) to filter by.
        - If str: returns rows for that department only.
        - If list of str: returns rows matching any department in the list.

    Returns
    -------
    pd.DataFrame
        A filtered DataFrame containing only rows for the given department(s),
        with whitespace-stripped column labels.

    Raises
    ------
    TypeError
        If `department` is neither a string nor a list.
    KeyError
        If no column is named 'Department' after stripping whitespace.

    Examples
    --------
    >>> department_filter(sal, "Louisville Metro Police Department")
    # Returns rows for that department

    >>> department_filter(sal, ["Parks & Recreation", "Library"])
    # Returns rows for both departments
    """
    # Clean the column labels on a shallow copy: it shares the data but owns its
    # axes, so the caller's DataFrame keeps its original labels instead of being
    # silently relabelled as a side effect of filtering.
    frame = df.copy(deep=False)
    frame.columns = frame.columns.str.strip()

    if isinstance(department, str):
        return frame[frame["Department"] == department]
    if isinstance(department, list):
        return frame[frame["Department"].isin(department)]
    raise TypeError("`department` must be a string or list of strings.")


In [27]:
# Narrow the 2025 rows down to a single department.
lmpd25 = department_filter(
    df=sal_2025,
    department="Louisville Metro Police Department",
)
lmpd25

Unnamed: 0,CalYear,Employee_Name,Department,jobTitle,Annual_Rate,Regular_Rate,Overtime_Rate,Incentive_Allowance,YTD_Total,ObjectId,Hr_Rate,Ot_Rate,Reg_Hours,Ot_Hours,Total_Hours,Weeks_Worked,Reg_Hrs_per_Week,Ot_Hrs_per_Week,Avg_Hrs_per_Week
21009,2025,"Fenwick, Paul E",Louisville Metro Police Department,Police Officer,95555.20,75315.67,134672.06,10093.88,220081.61,21010,45.94,68.91,2080.00,1954.32,4034.32,40.99,40.00,47.68,87.68
25482,2025,"Richardson, Joshua Thomas",Louisville Metro Police Department,Police Officer,78790.40,58987.21,76854.67,8280.70,144122.58,25483,37.88,56.82,2080.00,1352.60,3432.60,38.93,40.00,34.74,74.74
23252,2025,"Aviles, Celestino Ardres Ortiz",Louisville Metro Police Department,Police Officer,80246.40,63823.73,82673.52,8323.57,154820.81,23253,38.58,57.87,2080.00,1428.61,3508.61,41.36,40.00,34.54,74.54
21730,2025,"Williams, Derrick D",Louisville Metro Police Department,Police Officer,90584.00,71934.12,87857.76,7954.99,167746.88,21731,43.55,65.32,2080.00,1344.93,3424.93,41.29,40.00,32.57,72.57
21005,2025,"Cadwell, Corey R",Louisville Metro Police Department,Police Sergeant,111550.40,89179.76,100464.80,7711.35,197355.91,21006,53.63,80.44,2080.00,1248.86,3328.86,41.57,40.00,30.04,70.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25294,2025,"Heady, David Ellis",Louisville Metro Police Department,Storage Equipment Operator5day,42452.80,0.00,0.00,0.00,10658.08,25295,20.41,30.61,2080.00,0.00,2080.00,0.00,,,
25301,2025,"Kizer, Larry W",Louisville Metro Police Department,Storage Equipment Operator5day,42452.80,0.00,0.00,0.00,0.00,25302,20.41,30.61,2080.00,0.00,2080.00,0.00,,,
25514,2025,"Helm, Brent Michael",Louisville Metro Police Department,Police Recruit,47673.60,0.00,0.00,0.00,0.00,25515,22.92,34.38,2080.00,0.00,2080.00,0.00,,,
27027,2025,"Price, Nichole",Louisville Metro Police Department,Information Process Technician I,39769.60,0.00,0.00,0.00,0.00,27028,19.12,28.68,2080.00,0.00,2080.00,0.00,,,
