In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

In [None]:
# Read the salary records from disk into a DataFrame, then preview the
# first few rows as the cell's output.
salary_csv_path = '../final/salary.csv'
salary = pd.read_csv(salary_csv_path)
salary.head()

In [None]:
# Plot mean salary (with standard-deviation error bars) against the number of
# years spent in the current position, drawing one line per position below.
positions = ['POLICE OFFICER', 'POLICE OFFICER (ASSIGNED AS DETECTIVE)', 'SERGEANT', 'LIEUTENANT', 'COMMANDER']

MAX_YEARS_IN_POSN = 30  # records with a longer tenure than this are dropped
MIN_GROUP_SIZE = 3      # a tenure group is only plotted if it has at least this many salary records

fig, ax = plt.subplots(figsize=(7, 5))
for posn in positions:
    # restrict to the particular position; .copy() gives an independent frame
    # so the derived columns below are not written onto a slice of `salary`
    rows = salary[salary.position_description == posn].copy()

    # compute the number of years spent in the current position for each record
    rows['posn_start_year'] = pd.to_datetime(rows['present_posn_start_date']).dt.year
    rows['years_in_posn'] = rows['year'] - rows['posn_start_year']

    # remove very old entries
    rows = rows[rows.years_in_posn <= MAX_YEARS_IN_POSN]

    # aggregate salary statistics per tenure value
    agg = rows.groupby('years_in_posn').salary.agg(['mean', 'std', 'count']).reset_index()

    # only plot groups that are large enough
    agg = agg[agg['count'] >= MIN_GROUP_SIZE]

    # x must be the actual tenure column: after reset_index() the frame's index
    # is just a positional counter and diverges from years_in_posn whenever
    # tenure values have gaps or do not start at 0
    ax.errorbar(
        agg['years_in_posn'],
        agg['mean'],
        yerr=agg['std'],
        label=posn if '(' not in posn else 'DETECTIVE',
    )

# axis labels are loop-invariant, so set them once
ax.set_xlabel('Years in Position')
ax.set_ylabel('Salary (USD)')
ax.legend()
fig.tight_layout()
fig.savefig('salary.pdf')
plt.show()