In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.graph_objects as go

In [2]:
data = pd.read_csv("../data2/salary.csv")

# Cleaning 

This code cleans data by: 
- removing unnecessary columns
- calculating total expected salary
- filling missing values with zero
- standardizing department names 
- computes hourly rates 
- overtime rates
- total weekly hours worked 
- converts the current years data to real time budget tracking. 

In [3]:
# drops cols we dont need 
cols_to_drop = ['jobTitle', 'Other', 'ObjectId']
data = data.drop(columns=cols_to_drop)
# calculates total expected salary 
data['Salary_Total'] = data['Annual_Rate'] + data['Incentive_Allowance']
# Filling missing data with 0 
data = data.fillna(0)
# merging the departments together 
data['Department'] = data['Department'].replace('Louisville Metro Police', 'Louisville Metro Police Department')
data['Department'] = data['Department'].replace('Department of Corrections', 'Metro Corrections')
# calculates total weekly hrs worked 
data['Hr_Rate'] = data['Regular_Rate'] / 2080
data['Ot_Rate'] = data['Hr_Rate'] * 1.5
data['Hr_Worked'] = data['Overtime_Rate'] / data['Ot_Rate'] / 52 + 40
data.replace([np.inf, -np.inf], 40, inplace=True)

'''fix this soon
add update_check.py to this file to have the dynamic var

date_div = "November 27, 2024"
'''

# Define the date
date_div = "November 27, 2024"

# Convert the string to a datetime object
date_object = datetime.strptime(date_div, "%B %d, %Y")

# Get the week number of the c_year
week = date_object.isocalendar()[1]

# Convert the string to a datetime object
date_object = datetime.strptime(date_div, "%B %d, %Y")

# Get the c_year
c_year = date_object.year

# Check if there are any rows where the CalYear is equal to the specified c_year
if (data['CalYear'] == c_year).any():
    # Convert Annual_Rate to weekly rate by dividing by 52
    data.loc[data['CalYear'] == c_year, 'Annual_Rate'] = data.loc[data['CalYear'] == c_year, 'Annual_Rate'] / 52

    # Scale the weekly rate for the specific week
    data.loc[data['CalYear'] == c_year, 'Annual_Rate'] = data.loc[data['CalYear'] == c_year, 'Annual_Rate'] * week


In [4]:
data.to_csv("../data2/salary.csv", index=False)

In [5]:
# # Check if there are any rows where the CalYear is equal to the specified year
# if (data['CalYear'] == year).any():
#     # Convert Annual_Rate to weekly rate by dividing by 52
#     data.loc[data['CalYear'] == year, 'Annual_Rate'] = data.loc[data['CalYear'] == year, 'Annual_Rate'] / 52

#     # Scale the weekly rate for the specific week
#     data.loc[data['CalYear'] == year, 'Annual_Rate'] = data.loc[data['CalYear'] == year, 'Annual_Rate'] * week


<pre>
+---------------------------------------------------------------------------------------------------------+
|+-------------------------------------------------------------------------------------------------------+|
||[1. logo] 2. Louisville Metro Government Salary tracker   [3. Department drop down][ 4. year drop down]|| 
|+-------------------------------------------------------------------------------------------------------+|
|+--------+     6. salary spend 8. actual salary spend +-----+     +-------------------------------------+|
||5. plot |     7. salary data  9. salary data         |12. %|     |  11. horizontal bar plot department || 
||        |                                            +-----+     |                                     || 
|+--------+                                                        |                                     || 
|                                                                  |                                     || 
|+---------------------------------------------------------------+ |                                     || 
|| 10. Top employees salary deviation data frame                 | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
|+---------------------------------------------------------------+ +-------------------------------------+|
+---------------------------------------------------------------------------------------------------------+
</pre>


# code for 5, 7, 9, and 12 

- 5 Will be a gauge plot?


In [6]:
# gauge plot
# 2019-2024
gauge = data.groupby(['CalYear'])[['YTD_Total', 'Salary_Total']].sum().reset_index()
gauge.head(10)

Unnamed: 0,CalYear,YTD_Total,Salary_Total
0,2020,333171000.0,339664300.0
1,2021,330743600.0,351258900.0
2,2022,390135700.0,370298000.0
3,2023,385879200.0,427230500.0
4,2024,423327900.0,458171700.0
5,2025,48794120.0,379064600.0


In [11]:
def plot_info(year, gauge) -> None:
    # Filter the data for the given year
    year_filter = gauge[gauge['CalYear'] == year]
    
    # Extract actual and expected values
    actual = year_filter['YTD_Total'].iloc[0]
    expected = year_filter['Salary_Total'].iloc[0]

    # Create the gauge plot
    steps = [
        {'range': [0, expected], 'color': '#004080'}  # Dark blue for expected salary
    ]
    
    # Add yellow step if actual exceeds expected
    if actual > expected:
        steps.append({'range': [expected, actual], 'color': 'yellow'})  # Yellow for actual salary
    
    fig = go.Figure(go.Indicator(
        mode="gauge+number+delta",
        value=actual,
        delta={
            'reference': expected,
            'increasing': {'color': "red"},
            'decreasing': {'color': "green"}
        },
        gauge={
            'axis': {'range': [0, expected * 1.1]},  
            'bar': {'color': 'rgba(0,0,0,0)'},
            'steps': steps,
            'threshold': {
                'line': {'color': "red", 'width': 4},  
                'thickness': .95,
                'value': actual  # Place the red line at the actual
            }
        },
        title={'text': f"{year} Salary Spend vs. Budget", 'font': {'size': 20}},
    ))

    fig.update_layout(
        title_font={'size': 20},  # Set font size for the overall layout title
    )

    # Show the figure
    fig.show()


# Example usage:
plot_info(2020, gauge)


In [9]:
# Example usage:
plot_info(2019, gauge)

IndexError: single positional indexer is out-of-bounds

# code for 7, 9, and 12 
- 7 will be the calculation of total salary spend up till today or historical total
- 9 actual salary spend of total salary spend up till today or historical total
- 12 will be the percent difference in the salary spend 

<pre>
+---------------------------------------------------------------------------------------------------------+
|+-------------------------------------------------------------------------------------------------------+|
||[1. logo] 2. Louisville Metro Government Salary tracker   [3. Department drop down][ 4. year drop down]|| 
|+-------------------------------------------------------------------------------------------------------+|
|+--------+     6. salary spend 8. actual salary spend +-----+     +-------------------------------------+|
||5. plot |     7. salary data  9. salary data         |12. %|     |  11. horizontal bar plot department || 
||        |                                            +-----+     |                                     || 
|+--------+                                                        |                                     || 
|                                                                  |                                     || 
|+---------------------------------------------------------------+ |                                     || 
|| 10. Top employees salary deviation data frame                 | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
|+---------------------------------------------------------------+ +-------------------------------------+|
+---------------------------------------------------------------------------------------------------------+
</pre>


- 7 will be the calculation of total salary spend up till today or historical total
- 9 actual salary spend of total salary spend up till today or historical total
- 12 will be the percent difference in the salary spend 

In [None]:
def calculate_total_spend(year, data) -> str:
    # Filter the data for the specified year
    year_filter = data[data['CalYear'] == year]
    
    # Calculate the sum of 'YTD_Total' for the filtered data
    total_spend = year_filter['YTD_Total'].sum().round(2)

    # Formats thousands separators
    total_spend = "{:,.2f}".format(total_spend)
    
    # Return the total spend as a float
    return str(total_spend)

In [None]:
def calculate_total_budget(year, data) -> str:
    # Filter the data for the specified year
    year_filter = data[data['CalYear'] == year]
    
    # Calculate the sum of 'Salary_Total' for the filtered data
    total_spend = year_filter['Salary_Total'].sum().round(2)

    # Formats thousands separators
    total_spend = "{:,.2f}".format(total_spend)
    
    # Return the total spend as a float
    return str(total_spend)

In [None]:
def calculate_budget_difference(year, data) -> str:
    # Filter the data for the specified year
    year_filter = data[data['CalYear'] == year]
    
    if year_filter.empty:
        return "No data for the specified year."
    
    # Extract actual and expected values
    actual_spend = year_filter['YTD_Total'].sum()
    budgeted_salary = year_filter['Salary_Total'].sum()

    # Calculate the percentage difference
    difference = ((actual_spend - budgeted_salary) / budgeted_salary) * 100
    
    # Format the difference with a '+' or '-' and thousands separators
    return f"{difference:+,.2f}%"



In [None]:
# Call the function and print the result
total_sal_spend = calculate_total_spend(2024, data)

print(f'Total Salary Spend: {total_sal_spend}')

Total Salary Spend: 373,062,743.42


In [None]:
# Call the function and print the result
total_sal_budget = calculate_total_budget(2024, data)

print(f'Total Salary Budgeted: {total_sal_budget}')

Total Salary Budgeted: 450,699,319.02


In [None]:
# Call the function and print the result
budget_difference = calculate_budget_difference(2024, data)
print(budget_difference)

-17.23%


# Code for 10 

<pre>
+---------------------------------------------------------------------------------------------------------+
|+-------------------------------------------------------------------------------------------------------+|
||[1. logo] 2. Louisville Metro Government Salary tracker   [3. Department drop down][ 4. year drop down]|| 
|+-------------------------------------------------------------------------------------------------------+|
|+--------+     6. salary spend 8. actual salary spend +-----+     +-------------------------------------+|
||5. plot |     7. salary data  9. salary data         |12. %|     |  11. horizontal bar plot department || 
||        |                                            +-----+     |                                     || 
|+--------+                                                        |                                     || 
|                                                                  |                                     || 
|+---------------------------------------------------------------+ |                                     || 
|| 10. Top employees salary deviation data frame                 | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
|+---------------------------------------------------------------+ +-------------------------------------+|
+---------------------------------------------------------------------------------------------------------+
</pre>


In [None]:
# we had to avoid division by zero and only calculate Discrepancy_Percent for employees with Salary_Total >= 20k


def top_emp_dev(year, data):
    # Filter the data for the given year
    top_employee = data[data['CalYear'] == year].copy()

    # Calculate the discrepancy and discrepancy ratio
    top_employee['Deviation'] = top_employee['YTD_Total'] - top_employee['Salary_Total']

    # Avoid division by zero and handle Salary_Total < 20k
    top_employee.loc[:, 'Discrepancy_Percent'] = top_employee.apply(
        lambda row: (row['Deviation'] / row['Salary_Total']) * 100 if row['Salary_Total'] >= 20000 and row['Salary_Total'] != 0 else None, axis=1
    )

    # Round the Discrepancy_Percent to 2 decimal places
    top_employee['Discrepancy_Percent'] = top_employee['Discrepancy_Percent'].round(2)

    # Sort the DataFrame by 'Discrepancy_Percent' in descending order
    top_employee = top_employee.sort_values(by='Discrepancy_Percent', ascending=False)

    # Keep only the specified columns
    top_employee = top_employee[['CalYear', 'Employee_Name', 'Department', 
                                 'YTD_Total', 'Salary_Total', 'Deviation', 
                                 'Discrepancy_Percent']]
    
    # Reset index and drop the old index column
    top_employee.reset_index(drop=True, inplace=True)

    return top_employee.head(10)


In [None]:
# Example usage:
top_employee_result = top_emp_dev(2024, data)
top_employee_result

Unnamed: 0,CalYear,Employee_Name,Department,YTD_Total,Salary_Total,Deviation,Discrepancy_Percent
0,2024,"Standard, Royce Leshawn",Metro Corrections,180770.98,66934.4,113836.58,170.07
1,2024,"Stimphil, Richardson",Metro Corrections,153587.46,60299.2,93288.26,154.71
2,2024,"Jones, Dontorya J",Emergency Management Services,144154.28,62583.6,81570.68,130.34
3,2024,"Jenkins, Todd Barry",Louisville Metro Police Department,251840.31,114357.73,137482.58,120.22
4,2024,"Roberts, Bryan Keith",Metro Corrections,125239.59,58635.2,66604.39,113.59
5,2024,"Whidby, Jacob T",Metro Corrections,156558.72,74256.0,82302.72,110.84
6,2024,"Nicolas-Bates, Marylea Diedra",Metro Corrections,106942.63,50960.0,55982.63,109.86
7,2024,"Starcher, Candice Lynn",Louisville Metro Police Department,96518.09,46248.36,50269.73,108.7
8,2024,"Ashby, Stephanie Renee",ES & MetroSafe,137706.0,66218.31,71487.69,107.96
9,2024,"Salman, Saja Naji",Metro Corrections,101862.7,50835.2,51027.5,100.38


In [None]:
# # pre function code

# top_employee = data

# # Calculate the discrepancy and discrepancy ratio
# top_employee['Deviation'] = top_employee['YTD_Total'] - top_employee['Salary_Total']

# # Avoid division by zero by setting Discrepancy_Percent to None or 0 if Salary_Total is 0
# top_employee['Discrepancy_Percent'] = top_employee.apply(
#     lambda row: (row['Deviation'] / row['Salary_Total']) * 100 if row['Salary_Total'] != 0 else None, axis=1
# )

# # Round the Discrepancy_Percent to 2 decimal places
# top_employee['Discrepancy_Percent'] = top_employee['Discrepancy_Percent'].round(2)

# # Sort the DataFrame by 'Discrepancy_Percent' in descending order
# top_employee = top_employee.sort_values(by='Discrepancy_Percent', ascending=False)

# # Keep only the specified columns
# top_employee = top_employee[['CalYear', 'Employee_Name', 'Department', 
#                              'YTD_Total', 'Salary_Total', 'Deviation', 
#                              'Discrepancy_Percent']]

# top_employee.head(10)

In [None]:
# Discrepancy_Percent has several inf values so we worked around it as the data seems to be missing. 

# top_employee = data

# # Calculate the discrepancy and discrepancy ratio
# top_employee['Deviation'] = top_employee['YTD_Total'] - top_employee['Salary_Total']
# top_employee['Discrepancy_Percent'] = (top_employee['Deviation'] / top_employee['Salary_Total']) * 100

# # Round the Discrepancy_Percent to 2 decimal places
# top_employee['Discrepancy_Percent'] = top_employee['Discrepancy_Percent'].round(2)

# # Sort the DataFrame by 'Deviation' in descending order
# top_employee = top_employee.sort_values(by='Discrepancy_Percent', ascending=False)

# # Keep only the specified columns
# top_employee = top_employee[['CalYear', 'Employee_Name', 'Department', 
#                              'YTD_Total', 'Salary_Total', 'Deviation', 
#                              'Discrepancy_Percent']]

# top_employee.head(10)


# code for 11

<pre>
+---------------------------------------------------------------------------------------------------------+
|+-------------------------------------------------------------------------------------------------------+|
||[1. logo] 2. Louisville Metro Government Salary tracker   [3. Department drop down][ 4. year drop down]|| 
|+-------------------------------------------------------------------------------------------------------+|
|+--------+     6. salary spend 8. actual salary spend +-----+     +-------------------------------------+|
||5. plot |     7. salary data  9. salary data         |12. %|     |  11. horizontal bar plot department || 
||        |                                            +-----+     |                                     || 
|+--------+                                                        |                                     || 
|                                                                  |                                     || 
|+---------------------------------------------------------------+ |                                     || 
|| 10. Top employees salary deviation data frame                 | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
|+---------------------------------------------------------------+ +-------------------------------------+|
+---------------------------------------------------------------------------------------------------------+
</pre>


In [None]:
def department_discrepancy(year, data):
    # Group by 'CalYear' and 'Department', summing 'YTD_Total' and 'Salary_Total'
    department = data.groupby(['CalYear', 'Department'])[['YTD_Total', 'Salary_Total']].sum().reset_index()

    # Filter the data by the given year
    department = department[department['CalYear'] == year]

    # Calculate the percentage difference: ((YTD_Total - Salary_Total) / Salary_Total) * 100
    department['Discrepancy_Percent'] = ((department['YTD_Total'] - department['Salary_Total']) / department['Salary_Total']) * 100

    # Round the Discrepancy_Percent to 2 decimal places
    department['Discrepancy_Percent'] = department['Discrepancy_Percent'].round(2)

    # Sort the DataFrame by 'Discrepancy_Percent' in ascending order
    department = department.sort_values(by='Discrepancy_Percent', ascending=False)

    # Format 'YTD_Total' and 'Salary_Total' with thousands separators
    department['YTD_Total'] = department['YTD_Total'].apply(lambda x: f"{x:,.2f}")
    department['Salary_Total'] = department['Salary_Total'].apply(lambda x: f"{x:,.2f}")

    # Rename columns for final output
    department = department.rename(columns={
        'YTD_Total': 'Total Salary Spend',
        'Salary_Total': 'Salary Budget',
        'Discrepancy_Percent': 'Discrepancy Percent'
    })

    # Return the result
    return department


In [None]:
# Example usage:
department_result = department_discrepancy(2024, data)
department_result

Unnamed: 0,CalYear,Department,Total Salary Spend,Salary Budget,Discrepancy Percent
224,2024,Louisville Metro Police Department,123082990.73,124125361.75,-0.84
228,2024,Metro Corrections,28968309.48,30877424.89,-6.18
213,2024,Emergency Management Services,11182592.57,11960136.14,-6.5
211,2024,ES & MetroSafe,11493629.77,12602656.88,-8.8
222,2024,Louisville Fire,42079902.81,46479751.63,-9.47
250,2024,Youth Transitional Services,1464017.29,1663070.33,-11.97
209,2024,Criminal Justice Commission,308243.95,354505.99,-13.05
238,2024,Office of Philanthropy,319561.43,367525.45,-13.05
243,2024,Parking Authority of River City - PARC,1897249.91,2247607.26,-15.59
247,2024,Records Compliance,918726.62,1127444.44,-18.51


In [None]:
# Group by 'CalYear' and 'Department', summing 'YTD_Total' and 'Salary_Total'
department = data.groupby(['CalYear', 'Department'])[['YTD_Total', 'Salary_Total']].sum().reset_index()

# Calculate the percentage difference: ((YTD_Total - Salary_Total) / Salary_Total) * 100
department['Discrepancy_Percent'] = ((department['YTD_Total'] - department['Salary_Total']) / department['Salary_Total']) * 100

# Round the Discrepancy_Percent to 2 decimal places
department['Discrepancy_Percent'] = department['Discrepancy_Percent'].round(2)

# Sort the DataFrame by 'Discrepancy_Percent' in ascending order
department = department.sort_values(by='Discrepancy_Percent', ascending=False)

# Display the result
department


Unnamed: 0,CalYear,Department,YTD_Total,Salary_Total,Discrepancy_Percent
121,2022,Emergency Management Services,12794627.89,9202487.66,39.03
134,2022,Metro Corrections,31309541.30,25990024.03,20.47
45,2020,Emergency Mgt Agency/MetroSafe,11929775.61,9909600.92,20.39
55,2020,Louisville Metro EMS,12927855.91,10843419.00,19.22
119,2022,ES & MetroSafe,13317575.13,11293792.49,17.92
...,...,...,...,...,...
100,2021,Office of Inspector General,13984.74,170801.80,-91.81
242,2024,Other Statutory Obligations,131007.15,1871474.40,-93.00
218,2024,Insurance & Risk Management,0.00,151840.00,-100.00
210,2024,Develop Louisville,0.00,115814.40,-100.00


# Whole department views

In [None]:
department = data.groupby(['CalYear', 'Department'])[['YTD_Total', 'Salary_Total']].sum().reset_index()

In [None]:
department.head()

Unnamed: 0,CalYear,Department,YTD_Total,Salary_Total
0,2019,Air Pollution Control,3161000.01,3357059.36
1,2019,Belle of Louisville,1288272.26,2255142.24
2,2019,Commonwealth Attorney,1076553.31,1254064.5
3,2019,Coroner,883494.8,1006419.2
4,2019,County Attorney,5253504.81,6583707.28


# Individual Department Views

<pre>
+---------------------------------------------------------------------------------------------------------+
|+-------------------------------------------------------------------------------------------------------+|
||[1. logo] 2. Louisville Metro Government Salary tracker   [3. Department drop down][ 4. year drop down]|| 
|+-------------------------------------------------------------------------------------------------------+|
|+--------+     6. salary spend 8. actual salary spend +-----+     +-------------------------------------+|
||5. plot |     7. salary data  9. salary data         |12. %|     |  11. horizontal bar plot department || 
||        |                                            +-----+     |                                     || 
|+--------+                                                        |                                     || 
|                                                                  |                                     || 
|+---------------------------------------------------------------+ |                                     || 
|| 10. Top employees salary deviation data frame                 | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
||                                                               | |                                     || 
|+---------------------------------------------------------------+ +-------------------------------------+|
+---------------------------------------------------------------------------------------------------------+
</pre>


In [None]:
lmpd = department[department['Department'] == 'Louisville Metro Police Department'].copy()
lmpd = lmpd[lmpd['CalYear'] != 2024].copy()
lmpd

Unnamed: 0,CalYear,Department,YTD_Total,Salary_Total
19,2019,Louisville Metro Police Department,102216000.0,98625180.0
56,2020,Louisville Metro Police Department,106152700.0,104060600.0
93,2021,Louisville Metro Police Department,103384100.0,102093700.0
130,2022,Louisville Metro Police Department,122408500.0,105190200.0
175,2023,Louisville Metro Police Department,122305300.0,118276600.0


In [None]:
lmpd.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 19 to 175
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   CalYear       5 non-null      int64  
 1   Department    5 non-null      object 
 2   YTD_Total     5 non-null      float64
 3   Salary_Total  5 non-null      float64
dtypes: float64(2), int64(1), object(1)
memory usage: 200.0+ bytes


In [None]:
lmpd['Discrepancy'] = lmpd['YTD_Total'] - lmpd['Salary_Total']
lmpd['Discrepancy_Ratio'] = lmpd['Discrepancy'] / lmpd['Salary_Total']

lmpd

Unnamed: 0,CalYear,Department,YTD_Total,Salary_Total,Discrepancy,Discrepancy_Ratio
19,2019,Louisville Metro Police Department,102216000.0,98625180.0,3590832.44,0.036409
56,2020,Louisville Metro Police Department,106152700.0,104060600.0,2092070.71,0.020104
93,2021,Louisville Metro Police Department,103384100.0,102093700.0,1290365.61,0.012639
130,2022,Louisville Metro Police Department,122408500.0,105190200.0,17218319.86,0.163687
175,2023,Louisville Metro Police Department,122305300.0,118276600.0,4028742.06,0.034062


In business, a discrepancy ratio between expected salary spend (Salary_Total) and actual salary spend (YTD_Total) typically falls within 0 to 5% as an acceptable range. A discrepancy within this range usually indicates minor variations in salary spending, such as unexpected overtime, temporary staffing, or adjustments to salary levels, which are often manageable within the overall budget. A discrepancy ratio of more than 10% would generally raise concerns, signaling potential overspending on salaries, and would require a deeper investigation into the causes, such as unplanned hiring, excessive overtime, or compensation adjustments.

For the Louisville Metro Police Department, the discrepancy ratios from 2019 to 2023 show a range from 0.01 to 0.16. The ratios in 2019, 2020, 2021, and 2023 are all within a 0.01 to 0.04 range, which suggests a relatively small and manageable variance in salary spending. However, the discrepancy ratio for 2022 is significantly higher at 0.16, indicating a 16% overspend on salaries compared to the planned budget for that year. This could point to specific factors in 2022 that led to a sharp increase in salary expenses, such as a one-time hiring initiative, increased overtime costs, or other unforeseen budgetary pressures.

# LMPD indv level

In [None]:
indv = pd.read_csv('data/SalaryData.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data/SalaryData.csv'

In [None]:
cols_to_drop = ['Regular_Rate', 'Overtime_Rate', 'Other', 'ObjectId']
indv = indv.drop(columns=cols_to_drop)
indv['Salary_Total'] = indv['Annual_Rate'] + indv['Incentive_Allowance']
final_drop = ['Annual_Rate', 'Incentive_Allowance']
indv = indv.drop(columns=final_drop)
indv = indv.fillna(0)
indv['Department'] = indv['Department'].replace('Louisville Metro Police', 'Louisville Metro Police Department')
indv = indv[indv['CalYear'] != 2024].copy()

In [None]:
indv = indv[indv['Department'] == 'Louisville Metro Police Department'].copy()
# indv.head(2)

In [None]:
indv['Discrepancy'] = indv['YTD_Total'] - indv['Salary_Total']
indv['Discrepancy_Ratio'] = indv['Discrepancy'] / indv['Salary_Total']


In [None]:
indv.head(2)

Top 10 Employees with the Highest Discrepancies

In [None]:
def top_10_discrepancy(indv, enter_year):
    # Filter the DataFrame for the given year
    indv_year = indv[indv['CalYear'] == enter_year]
    
    # Sort by 'Discrepancy' in descending order to get the top 10 employees
    top_10_employees = indv_year.sort_values(by='Discrepancy_Ratio', ascending=False).head(10)
    
    # Return the top 10 employees with their discrepancies
    return top_10_employees[['CalYear', 'Employee_Name', 'jobTitle', 'Discrepancy', 'Discrepancy_Ratio', 'YTD_Total', 'Salary_Total']]

In [None]:
# Example usage:
top_emp_dis_2019 = top_10_discrepancy(indv, 2021)
top_emp_dis_2019

In [None]:
top_emp_dis_2020 = top_10_discrepancy(indv, 2020)
top_emp_dis_2020

In [None]:
top_emp_dis_2022 = top_10_discrepancy(indv, 2022)
top_emp_dis_2022

Job Title with the Largest Individual Discrepancy by Year:

In [None]:
# Group by CalYear and jobTitle to find the row with the largest discrepancy per year
largest_discrepancy_by_year = indv.loc[indv.groupby('CalYear')['Discrepancy'].idxmax()]

# Display the job title and the corresponding discrepancy
largest_discrepancy_by_year[['CalYear', 'jobTitle', 'Discrepancy']]

top 10 job titles by avg discrepancy by year

In [None]:
def top_10_job_titles_by_avg_discrepancy(indv, enter_year):
    # Filter the DataFrame for the given year
    indv_year = indv[indv['CalYear'] == enter_year]
    
    # Group by 'jobTitle' and calculate the average 'Discrepancy' for each job title
    avg_discrepancy_by_job = indv_year.groupby('jobTitle')['Discrepancy'].mean()
    
    # Sort by the average 'Discrepancy' in descending order to get the top 10 job titles
    top_10_job_titles = avg_discrepancy_by_job.sort_values(ascending=False).head(10)
    
    # Return the relevant columns (jobTitle and average Discrepancy)
    return top_10_job_titles.reset_index()[['jobTitle', 'Discrepancy']]

In [None]:
# Example usage:
top_10_job_titles_by_avg_discrepancy_2023 = top_10_job_titles_by_avg_discrepancy(indv, 2023)
top_10_job_titles_by_avg_discrepancy_2023

Average Discrepancy per Job Title:

In [None]:
def avg_discrepancy_by_job(indv):
    # Group by 'jobTitle' and calculate the average discrepancy (YTD_Total - Salary_Total)
    avg_discrepancy_per_job = indv.groupby('jobTitle')['Discrepancy'].mean()
    
    # Sort the result by average discrepancy in descending order
    avg_discrepancy_per_job_sorted = avg_discrepancy_per_job.sort_values(ascending=False)
    
    # Return the sorted average discrepancies
    return avg_discrepancy_per_job_sorted

In [None]:
# Example usage:
avg_discrepancy = avg_discrepancy_by_job(indv)
avg_discrepancy