In [82]:
import pandas as pd
from typing import Union, List, Tuple, Optional
# Render every float with thousands separators and two decimal places.
pd.set_option("display.float_format", "{:,.2f}".format)

# Optimizing Memory Use

In [83]:
# sal = pd.read_csv("../data/salary2.csv")

# sal.info()

# df = sal
# df.info(memory_usage='deep')
# df["Department"] = df["Department"].astype("category")
# df["jobTitle"] = df["jobTitle"].astype("category")
# df["Employee_Name"] = df["Employee_Name"].astype("string")
# df.info(memory_usage='deep')

# optimized_dtypes = {
#     "CalYear": "int16",
#     "Employee_Name": "string",
#     "Department": "category",
#     "jobTitle": "category",
#     "Annual_Rate": "float32",
#     "Regular_Rate": "float32",
#     "Overtime_Rate": "float32",
#     "Incentive_Allowance": "float32",
#     "Other": "float32",
#     "YTD_Total": "float32",
#     "ObjectId": "int32"
# }
# df = df.astype(optimized_dtypes)


In [84]:
# Column -> dtype map handed to read_csv so the compact dtypes are applied
# while parsing instead of converting a fully loaded frame afterwards.
float32_columns = [
    "Annual_Rate",
    "Regular_Rate",
    "Overtime_Rate",
    "Incentive_Allowance",
    "Other",
    "YTD_Total",
]

optimized_dtypes = {
    "CalYear": "int16",
    "Employee_Name": "string",
    "Department": "category",
    "jobTitle": "category",
    **dict.fromkeys(float32_columns, "float32"),
    "ObjectId": "int32",
}

sal = pd.read_csv("../data/salary2.csv", dtype=optimized_dtypes)


In [85]:
# Preview the first five rows of the loaded frame.
sal.head(5)

Unnamed: 0,CalYear,Employee_Name,Department,jobTitle,Annual_Rate,Regular_Rate,Overtime_Rate,Incentive_Allowance,Other,YTD_Total,ObjectId
0,2021,,Belle of Louisville,Hospitality Worker,3120.0,687.0,0.0,0.0,,687.0,1
1,2021,,Parks & Recreation,Park Aide,31200.0,600.0,0.0,0.0,,600.0,2
2,2021,"Martin, David",Library,Library Page L/U,28433.6,28412.56,0.0,1450.0,,30862.81,3
3,2021,"Bratcher, Elaine",Louisville Metro Police,Clerk Typist II-Police,35256.0,35256.01,0.0,1563.12,,36819.13,4
4,2021,"Jackson, Nila",Louisville Metro Police,Traffic Guard II,21418.8,16529.14,952.39,1000.0,,18481.53,5


In [86]:
# Frequency table for "Other". NaN is excluded (the default), so a column
# holding only missing values produces an empty result.
sal["Other"].value_counts(dropna=True)

Series([], Name: count, dtype: int64)

## Cleaning

In [87]:
# "Other" is dropped outright, so there is no point filling it first.
# errors="ignore" keeps this cell re-runnable: `sal` is overwritten here, and
# a second execution would otherwise raise KeyError because the column is
# already gone. Missing employee names become the placeholder "Unknown".
sal = (
    sal
    .drop(columns="Other", errors="ignore")
    .fillna({"Employee_Name": "Unknown"})
)
sal.head()

Unnamed: 0,CalYear,Employee_Name,Department,jobTitle,Annual_Rate,Regular_Rate,Overtime_Rate,Incentive_Allowance,YTD_Total,ObjectId
0,2021,Unknown,Belle of Louisville,Hospitality Worker,3120.0,687.0,0.0,0.0,687.0,1
1,2021,Unknown,Parks & Recreation,Park Aide,31200.0,600.0,0.0,0.0,600.0,2
2,2021,"Martin, David",Library,Library Page L/U,28433.6,28412.56,0.0,1450.0,30862.81,3
3,2021,"Bratcher, Elaine",Louisville Metro Police,Clerk Typist II-Police,35256.0,35256.01,0.0,1563.12,36819.13,4
4,2021,"Jackson, Nila",Louisville Metro Police,Traffic Guard II,21418.8,16529.14,952.39,1000.0,18481.53,5


In [88]:
# # hr rate and converts to min wage if less than 7.25
# sal["Hr_Rate"] = sal["Annual_Rate"] / 2080 
# sal["Hr_Rate"] = sal["Hr_Rate"].mask(sal["Hr_Rate"] < 7.25, 7.25)

# # cal Overtime rate 
# sal["Ot_Rate"] = sal["Hr_Rate"] * 1.5 

# # finding and filtering out part time employees
# sal['Reg_Hours'] = sal["Annual_Rate"] / sal["Hr_Rate"]
# sal = sal[sal['Reg_Hours'] >= 2080]

# # calculating regular hours
# sal['Rg_Hours'] = sal["Regular_Rate"] / sal["Hr_Rate"] 

# # calculating overtime hours
# sal['Ot_Hours'] = sal["Overtime_Rate"] / sal["Ot_Rate"] 

# sal['Total_Hours'] = sal['Reg_Hours'] + sal['Ot_Hours']

# sal["Weeks_Worked"] = sal["Regular_Rate"] / (sal["Hr_Rate"] * 40) 

# sal["Reg_Hrs_per_Week"] = sal["Regular_Rate"] / (sal["Hr_Rate"] * sal["Weeks_Worked"])

# sal["Ot_Hrs_per_Week"] = sal["Overtime_Rate"] / (sal["Ot_Rate"] * sal["Weeks_Worked"])

# sal["Avg_Hrs_per_Week"] = sal["Reg_Hrs_per_Week"] + sal["Ot_Hrs_per_Week"]

# sal.head(10)

<style>
div.math { text-align: left !important; }
</style>
# Employee Pay and Hours Calculations

This section documents the formulas used to calculate hourly pay rates, overtime, total hours, and weekly averages for full-time employees.

---

### 1. Hourly Rate and Minimum Wage Adjustment

Hourly rate assumes 2,080 hours per year (40 hours × 52 weeks):

$$
Hr\_Rate = \frac{Annual\_Rate}{2080}
$$

If the calculated hourly rate is below \$7.25 (U.S. federal minimum wage), it’s adjusted upward:

$$
Hr\_Rate =
\begin{cases}
7.25, & Hr\_Rate < 7.25 \\
Hr\_Rate, & \text{otherwise}
\end{cases}
$$

---

### 2. Overtime Rate

Overtime is paid at 1.5× the regular hourly rate:

$$
Ot\_Rate = 1.5 \times Hr\_Rate
$$

---

### 3. Regular Hours and Full-Time Filter

Estimate total regular annual hours:

$$
Reg\_Hours = \frac{Annual\_Rate}{Hr\_Rate}
$$

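Keep only employees treated as full-time. Because $Hr\_Rate$ is floored at \$7.25, $Reg\_Hours$ equals 2,080 unless the floor was applied, so this condition removes exactly the rows whose annual rate is below $7.25 \times 2080$: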

$$
Reg\_Hours \ge 2080
$$

---

### 4. Regular and Overtime Hours Worked

Compute regular and overtime hours based on pay:

$$
Rg\_Hours = \frac{Regular\_Rate}{Hr\_Rate}
$$

$$
Ot\_Hours = \frac{Overtime\_Rate}{Ot\_Rate}
$$

---

### 5. Total Hours Worked

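Overtime hours are added to the annualized $Reg\_Hours$ from section 3 (2,080 for every retained employee), not to the $Rg\_Hours$ actually paid in section 4:

$$
Total\_Hours = Reg\_Hours + Ot\_Hours
$$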

---

### 6. Weeks Worked

Estimate number of weeks worked:

$$
Weeks\_Worked = \frac{Regular\_Rate}{Hr\_Rate \times 40}
$$

---

### 7. Weekly Breakdown of Regular and Overtime Hours

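Substituting the definition of $Weeks\_Worked$ shows that the regular figure reduces to 40 whenever $Weeks\_Worked > 0$; both ratios are undefined (NaN or infinite) for employees with no regular pay:

$$
Reg\_Hrs\_per\_Week = \frac{Regular\_Rate}{Hr\_Rate \times Weeks\_Worked}
$$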

$$
Ot\_Hrs\_per\_Week = \frac{Overtime\_Rate}{Ot\_Rate \times Weeks\_Worked}
$$

---

### 8. Average Hours per Week

$$
Avg\_Hrs\_per\_Week = Reg\_Hrs\_per\_Week + Ot\_Hrs\_per\_Week
$$




In [89]:
# 1️ Calculate Hourly Rate and apply minimum wage rule
# Assumptions shared by every derived column in this cell.
HOURS_PER_WEEK = 40
HOURS_PER_YEAR = 2080          # 40 hours x 52 weeks
MIN_HOURLY_WAGE = 7.25         # U.S. federal minimum wage
OVERTIME_MULTIPLIER = 1.5

# 1. Hourly rate, floored at the minimum wage.
raw_hr_rate = sal["Annual_Rate"] / HOURS_PER_YEAR
sal["Hr_Rate"] = raw_hr_rate.mask(raw_hr_rate < MIN_HOURLY_WAGE, MIN_HOURLY_WAGE)

# 2. Overtime rate.
sal["Ot_Rate"] = sal["Hr_Rate"] * OVERTIME_MULTIPLIER

# 3. Annualized regular hours implied by the (possibly floored) hourly rate.
sal["Reg_Hours"] = sal["Annual_Rate"] / sal["Hr_Rate"]

# Keep only full-time employees. Reg_Hours reaches 2080 exactly when the
# minimum-wage floor was NOT applied, so test that condition directly instead
# of comparing the float32 round trip Annual_Rate / (Annual_Rate / 2080)
# against 2080, which can land one ulp short and silently drop a row.
# Rows with a missing Annual_Rate compare False and are dropped, as before.
is_full_time = raw_hr_rate >= MIN_HOURLY_WAGE

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the unfiltered data (SettingWithCopyWarning).
sal = sal[is_full_time].copy()

# 4. Regular hours actually paid (Regular_Rate holds regular pay).
sal["Rg_Hours"] = sal["Regular_Rate"] / sal["Hr_Rate"]

# 5. Overtime hours actually paid.
sal["Ot_Hours"] = sal["Overtime_Rate"] / sal["Ot_Rate"]

# 6. Total hours.
# NOTE(review): this adds overtime to the annualized Reg_Hours (2080), not to
# Rg_Hours. It matches the documented formula, so it is left unchanged --
# confirm that is the intended definition of "total hours worked".
sal["Total_Hours"] = sal["Reg_Hours"] + sal["Ot_Hours"]

# 7. Estimated number of weeks worked, assuming 40 regular hours per week.
sal["Weeks_Worked"] = sal["Regular_Rate"] / (sal["Hr_Rate"] * HOURS_PER_WEEK)

# 8. Regular hours per week. Weeks_Worked is 0 for employees with no regular
#    pay, which makes this and the next ratio NaN or infinite.
sal["Reg_Hrs_per_Week"] = sal["Regular_Rate"] / (sal["Hr_Rate"] * sal["Weeks_Worked"])

# 9. Overtime hours per week.
sal["Ot_Hrs_per_Week"] = sal["Overtime_Rate"] / (sal["Ot_Rate"] * sal["Weeks_Worked"])

# 10. Average hours per week.
sal["Avg_Hrs_per_Week"] = sal["Reg_Hrs_per_Week"] + sal["Ot_Hrs_per_Week"]

sal.head()

Unnamed: 0,CalYear,Employee_Name,Department,jobTitle,Annual_Rate,Regular_Rate,Overtime_Rate,Incentive_Allowance,YTD_Total,ObjectId,Hr_Rate,Ot_Rate,Reg_Hours,Rg_Hours,Ot_Hours,Total_Hours,Weeks_Worked,Reg_Hrs_per_Week,Ot_Hrs_per_Week,Avg_Hrs_per_Week
1,2021,Unknown,Parks & Recreation,Park Aide,31200.0,600.0,0.0,0.0,600.0,2,15.0,22.5,2080.0,40.0,0.0,2080.0,1.0,40.0,0.0,40.0
2,2021,"Martin, David",Library,Library Page L/U,28433.6,28412.56,0.0,1450.0,30862.81,3,13.67,20.51,2080.0,2078.46,0.0,2080.0,51.96,40.0,0.0,40.0
3,2021,"Bratcher, Elaine",Louisville Metro Police,Clerk Typist II-Police,35256.0,35256.01,0.0,1563.12,36819.13,4,16.95,25.43,2080.0,2080.0,0.0,2080.0,52.0,40.0,0.0,40.0
4,2021,"Jackson, Nila",Louisville Metro Police,Traffic Guard II,21418.8,16529.14,952.39,1000.0,18481.53,5,10.3,15.45,2080.0,1605.16,61.66,2141.66,40.13,40.0,1.54,41.54
5,2021,"Ammon, Darrell",Louisville Metro Police,Criminal Justice Specialist,50107.2,49362.47,0.0,0.0,49362.47,6,24.09,36.14,2080.0,2049.09,0.0,2080.0,51.23,40.0,0.0,40.0


In [90]:
# Order employees from the highest to the lowest estimated weekly hours.
sal = sal.sort_values("Avg_Hrs_per_Week", ascending=False)
sal.head(5)

Unnamed: 0,CalYear,Employee_Name,Department,jobTitle,Annual_Rate,Regular_Rate,Overtime_Rate,Incentive_Allowance,YTD_Total,ObjectId,Hr_Rate,Ot_Rate,Reg_Hours,Rg_Hours,Ot_Hours,Total_Hours,Weeks_Worked,Reg_Hrs_per_Week,Ot_Hrs_per_Week,Avg_Hrs_per_Week
9328,2025,"Leonard, Dillon Michael",Louisville Fire,Firefighter 56hr,63441.66,12.04,51.01,0.0,63.05,9329,30.5,45.75,2080.0,0.39,1.11,2081.11,0.01,40.0,112.98,152.98
35070,2024,"Scharrer, Justin",Emergency Management Services,EMT,60424.0,5513.59,18510.8,0.0,33640.39,35071,29.05,43.57,2080.0,189.8,424.8,2504.8,4.74,40.0,89.53,129.53
27246,2023,"Murphy, Paul P.",Louisville Fire,Fire Apparatus Operator 56hr,72077.82,40383.16,97922.36,39150.6,177456.12,27247,34.65,51.98,2080.0,1165.37,1883.88,3963.88,29.13,40.0,64.66,104.66
38660,2024,"McKeehan, Matthew Douglas",Emergency Management Services,EMT,45926.4,13663.62,31838.34,350.0,45851.96,38661,22.08,33.12,2080.0,618.82,961.3,3041.3,15.47,40.0,62.14,102.14
29109,2023,"Herndon, Adam B",Louisville Fire,Fire Company Commander 56hr,82278.14,42926.26,94210.22,29398.8,167021.05,29110,39.56,59.34,2080.0,1085.18,1587.76,3667.76,27.13,40.0,58.53,98.53


In [91]:
# Distribution of annualized regular hours among the rows that survived the
# full-time filter (NaN excluded, which is the default).
sal["Reg_Hours"].value_counts(dropna=True)

Reg_Hours
2,080.00    39282
Name: count, dtype: int64

In [92]:
def year_selector(
    df: pd.DataFrame,
    year: Union[int, List[int], Tuple[int, int]]
) -> pd.DataFrame:
    """
    Filter a DataFrame by one or more calendar years.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame containing a 'CalYear' column.
    year : int | list[int] | tuple[int, int]
        The year or years to filter by.
        - If int (Python or NumPy integer scalar): returns rows for that year only.
        - If list of ints: returns rows matching any year in the list.
        - If tuple of (start, end): returns rows within that inclusive range.

    Returns
    -------
    pd.DataFrame
        A filtered DataFrame containing only rows that match the given year(s).
        The input frame is not modified.

    Raises
    ------
    TypeError
        If `year` is not an integer, a list, or a 2-tuple.

    Examples
    --------
    >>> year_selector(sal, 2021)
    # Returns rows where CalYear == 2021

    >>> year_selector(sal, [2020, 2021])
    # Returns rows where CalYear is either 2020 or 2021

    >>> year_selector(sal, (2019, 2021))
    # Returns rows where CalYear is between 2019 and 2021 inclusive
    """
    # NumPy integer scalars (e.g. a value pulled out of the CalYear column)
    # are not instances of `int`, so they are accepted explicitly as well.
    if isinstance(year, int) or pd.api.types.is_integer(year):
        return df[df["CalYear"] == year]

    if isinstance(year, list):
        return df[df["CalYear"].isin(year)]

    if isinstance(year, tuple) and len(year) == 2:
        start, end = year
        return df[(df["CalYear"] >= start) & (df["CalYear"] <= end)]

    raise TypeError("`year` must be an int, list of ints, or tuple of (start, end).")

In [93]:
# Restrict the analysis to calendar year 2025.
sal_2025 = year_selector(df=sal, year=2025)
sal_2025

Unnamed: 0,CalYear,Employee_Name,Department,jobTitle,Annual_Rate,Regular_Rate,Overtime_Rate,Incentive_Allowance,YTD_Total,ObjectId,Hr_Rate,Ot_Rate,Reg_Hours,Rg_Hours,Ot_Hours,Total_Hours,Weeks_Worked,Reg_Hrs_per_Week,Ot_Hrs_per_Week,Avg_Hrs_per_Week
9328,2025,"Leonard, Dillon Michael",Louisville Fire,Firefighter 56hr,63441.66,12.04,51.01,0.00,63.05,9329,30.50,45.75,2080.00,0.39,1.11,2081.11,0.01,40.00,112.98,152.98
7293,2025,"Fenwick, Paul E",Louisville Metro Police Department,Police Officer,95555.20,75315.67,134672.06,10093.88,220081.61,7294,45.94,68.91,2080.00,1639.44,1954.32,4034.32,40.99,40.00,47.68,87.68
7810,2025,"Standard, Royce Leshawn",Department of Corrections,Corrections Officer,69596.80,55491.41,95985.71,0.00,151477.12,7811,33.46,50.19,2080.00,1658.44,1912.45,3992.45,41.46,40.00,46.13,86.13
8618,2025,"Stimphil, Richardson",Department of Corrections,Corrections Officer,66331.20,50618.52,86647.31,0.00,137265.83,8619,31.89,47.84,2080.00,1587.28,1811.38,3891.38,39.68,40.00,45.65,85.65
13223,2025,"Mann, Mikayla Faith",Emergency Management Services,EMS EMT-Paramedic I,62046.40,3740.70,6106.35,0.00,9847.05,13224,29.83,44.74,2080.00,125.40,136.47,2216.47,3.14,40.00,43.53,83.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,2025,"Cole, Misty",Records Compliance,Open Records Supervisor,59217.60,0.00,0.00,0.00,0.00,13576,28.47,42.71,2080.00,0.00,0.00,2080.00,0.00,,,
13591,2025,"Raines, Clinton Lee",Emergency Management Services,EMT,45011.20,0.00,0.00,0.00,0.00,13592,21.64,32.46,2080.00,0.00,0.00,2080.00,0.00,,,
13681,2025,"Washington, Kingston",Parks & Recreation,Parks & Rec. Aide,34632.00,0.00,0.00,0.00,0.00,13682,16.65,24.97,2080.00,0.00,0.00,2080.00,0.00,,,
13757,2025,"Wescott, Brenda",Louisville Metro Police Department,Police Officer,63585.60,0.00,0.00,0.00,0.00,13758,30.57,45.86,2080.00,0.00,0.00,2080.00,0.00,,,


In [94]:
def department_filter(
    df: pd.DataFrame,
    department: Union[str, List[str]]
) -> pd.DataFrame:
    """
    Filter a DataFrame by one or more department names.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame containing a 'Department' column. Leading and
        trailing whitespace in column names is tolerated. The frame itself
        is never modified.
    department : str | list[str]
        Department name(s) to filter by.
        - If str: returns rows for that department only.
        - If list of str: returns rows matching any department in the list.

    Returns
    -------
    pd.DataFrame
        A filtered DataFrame containing only rows for the given department(s),
        with whitespace-stripped column names.

    Raises
    ------
    TypeError
        If `department` is neither a string nor a list.

    Examples
    --------
    >>> department_filter(sal, "Louisville Metro Police Department")
    # Returns rows for that department

    >>> department_filter(sal, ["Parks & Recreation", "Library"])
    # Returns rows for both departments
    """
    # Validate first so a bad argument fails before any work is done.
    if not isinstance(department, (str, list)):
        raise TypeError("`department` must be a string or list of strings.")

    # Strip hidden tabs/whitespace from the headers on a renamed frame rather
    # than assigning to `df.columns`, which would silently rewrite the
    # caller's DataFrame. Non-string labels are passed through untouched.
    cleaned = df.rename(
        columns=lambda label: label.strip() if isinstance(label, str) else label
    )

    if isinstance(department, str):
        return cleaned[cleaned["Department"] == department]
    return cleaned[cleaned["Department"].isin(department)]


In [95]:
# 2025 records for the Louisville Metro Police Department.
lmpd25 = department_filter(
    df=sal_2025,
    department="Louisville Metro Police Department",
)
lmpd25.head(n=20)

Unnamed: 0,CalYear,Employee_Name,Department,jobTitle,Annual_Rate,Regular_Rate,Overtime_Rate,Incentive_Allowance,YTD_Total,ObjectId,Hr_Rate,Ot_Rate,Reg_Hours,Rg_Hours,Ot_Hours,Total_Hours,Weeks_Worked,Reg_Hrs_per_Week,Ot_Hrs_per_Week,Avg_Hrs_per_Week
7293,2025,"Fenwick, Paul E",Louisville Metro Police Department,Police Officer,95555.2,75315.67,134672.06,10093.88,220081.61,7294,45.94,68.91,2080.0,1639.44,1954.32,4034.32,40.99,40.0,47.68,87.68
11250,2025,"Richardson, Joshua Thomas",Louisville Metro Police Department,Police Officer,78790.4,58987.21,76854.67,8280.7,144122.58,11251,37.88,56.82,2080.0,1557.21,1352.6,3432.6,38.93,40.0,34.74,74.74
9911,2025,"Aviles, Celestino Ardres Ortiz",Louisville Metro Police Department,Police Officer,80246.4,63823.73,82673.52,8323.57,154820.81,9912,38.58,57.87,2080.0,1654.32,1428.61,3508.61,41.36,40.0,34.54,74.54
7973,2025,"Williams, Derrick D",Louisville Metro Police Department,Police Officer,90584.0,71934.12,87857.76,7954.99,167746.88,7974,43.55,65.32,2080.0,1651.76,1344.93,3424.93,41.29,40.0,32.57,72.57
7283,2025,"Cadwell, Corey R",Louisville Metro Police Department,Police Sergeant,111550.4,89179.76,100464.8,7711.35,197355.91,7284,53.63,80.44,2080.0,1662.87,1248.86,3328.86,41.57,40.0,30.04,70.04
6584,2025,"Clarkson, Joseph Brian",Louisville Metro Police Department,Police Officer,95555.2,77131.71,85587.38,7621.58,170340.67,6585,45.94,68.91,2080.0,1678.97,1242.02,3322.02,41.97,40.0,29.59,69.59
10365,2025,"Valdivia, Jacob Ryan",Louisville Metro Police Department,Police Officer,80246.4,66486.56,69795.54,7524.42,143806.52,10366,38.58,57.87,2080.0,1723.34,1206.07,3286.07,43.08,40.0,27.99,67.99
12090,2025,"Stotts, Nathan",Louisville Metro Police Department,Police Officer,69680.0,55436.41,57742.67,11373.23,124552.31,12091,33.5,50.25,2080.0,1654.82,1149.11,3229.11,41.37,40.0,27.78,67.78
8396,2025,"Steller, Bradley C",Louisville Metro Police Department,Police Officer,89024.0,69676.29,72336.24,7346.47,149359.0,8397,42.8,64.2,2080.0,1627.95,1126.73,3206.73,40.7,40.0,27.68,67.68
7271,2025,"Kisling, Dean Alan",Louisville Metro Police Department,Police Officer,95555.2,58799.4,60336.2,5598.77,128447.71,7272,45.94,68.91,2080.0,1279.92,875.58,2955.58,32.0,40.0,27.36,67.36
