In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

In [None]:
# Load the officer roster; later cells read its `uid`, `race` and `gender`
# columns. The path is relative to the notebook's working directory.
roster = pd.read_csv('../final/roster.csv')
# Preview the first rows as the cell output.
roster.head()

In [None]:
# Load the awards table; later cells read its `uid` and `award_type` columns.
# The path is relative to the notebook's working directory.
awards  = pd.read_csv('../final/awards.csv')
# Preview the first rows as the cell output.
awards.head()

# officer date: the start date on which the officer became an officer, according to this dataset (ignored for now)
# salary per year, per position (median + bars)
# salary per year, per seniority (median within each bin)

# award histograms by type, gender, position, age, etc.

In [None]:
# List the distinct values found in the `award_type` column.
awards['award_type'].unique()

In [None]:
# Attach each officer's demographic traits (from `roster`) to the award rows.
traits = ['race', 'gender']

# One row per uid. Keeping the *last* occurrence mirrors a dict that is filled
# row by row, where a later roster row overwrites an earlier one with the
# same uid.
roster_traits = roster.drop_duplicates('uid', keep='last').set_index('uid')[traits]

# uid -> {'race': ..., 'gender': ...}; kept as a plain dict for any other cell
# that looks traits up by uid.
trait_map = roster_traits.to_dict('index')

# An award whose uid is not in the roster is an error: raise KeyError with a
# message that says which uids are missing instead of failing on a bare key.
unknown_uids = awards.loc[~awards.uid.isin(roster_traits.index), 'uid'].unique()
if len(unknown_uids) > 0:
    raise KeyError(
        '{} award uid(s) not found in roster, e.g. {}'.format(
            len(unknown_uids), unknown_uids[:5].tolist()
        )
    )

# Vectorized lookup: Series.map with a uid-indexed Series replaces the
# per-row Python lambda.
for tr in traits:
    awards[tr] = awards.uid.map(roster_traits[tr])

# Show a preview rather than dumping the whole table.
awards.head()

In [None]:
# Officers per (race, gender) group, taken from the count of `uid` values.
num_offs = (
    roster
    .groupby(['race', 'gender'])
    .agg(['count'])
    .reset_index()
)

# Copy the uid count into a top-level `count` column so it can be selected
# next to the group keys.
num_offs['count'] = num_offs[('uid', 'count')]

num_offs = (
    num_offs[['race', 'gender', 'count']]
    .rename(columns={'count': 'officers'})
    # Leave out the rows whose gender is recorded as 'X'.
    .loc[lambda counts: counts.gender != 'X']
    # Renumber the rows; the previous row labels are kept as an extra column.
    .reset_index()
)
num_offs

In [None]:
# Award rows per (race, gender) group, taken from the count of `uid` values.
# Relies on `awards` already carrying the `race` and `gender` columns.
num_awds = (
    awards
    .groupby(['race', 'gender'])
    .agg(['count'])
    .reset_index()
)

# Copy the uid count into a top-level `count` column so it can be selected
# next to the group keys.
num_awds['count'] = num_awds[('uid', 'count')]

num_awds = (
    num_awds[['race', 'gender', 'count']]
    .rename(columns={'count': 'awards'})
)
num_awds

In [None]:
def _flatten_columns(frame):
    """Return a copy of `frame` with single-level columns.

    The grouped count tables carry two-level column labels such as
    ('race', ''); merging on plain column names needs a flat index, so only
    the first level is kept. Frames that are already flat are just copied.
    """
    flat = frame.copy()
    if isinstance(flat.columns, pd.MultiIndex):
        flat.columns = flat.columns.get_level_values(0)
    return flat


# Combine officer and award counts per (race, gender) group.
#
# The tables are joined on the group keys rather than glued side by side:
# a column-wise concat pairs rows by their row label, so any difference in
# rows or ordering between the two tables (for example the gender 'X' rows
# that are filtered out of `num_offs` only) would attach award counts to the
# wrong group.
group_keys = ['race', 'gender']
cc = _flatten_columns(num_offs).merge(
    _flatten_columns(num_awds)[group_keys + ['awards']],
    on=group_keys,
    how='left',  # keep exactly the officer groups present in `num_offs`
)

# A group with officers but no award rows has zero awards, not a missing value.
cc['awards'] = cc['awards'].fillna(0)

# Awards per officer within each group.
cc['frac'] = cc['awards'] / cc['officers']
cc

In [None]:
import seaborn as sns

# Grouped bar chart of `frac` per race category, one bar per gender category.
sns.set_style('whitegrid')
ax = sns.barplot(data=cc, x='race', y='frac', hue='gender')

# Axis and legend titles.
ax.set_xlabel('CPD Race Category')
ax.set_ylabel('Award Requests per Officer')
ax.get_legend().set_title('CPD Gender Category')

# Slant the category labels, anchored at their right end.
plt.xticks(rotation=45, ha="right", rotation_mode="anchor")

# Fit everything inside the figure, then write it to disk.
plt.tight_layout()
plt.savefig('awards.pdf')