In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns

In [None]:
# read the officer roster; later cells take each officer's race and gender from it
roster_path = '../final/roster.csv'
roster = pd.read_csv(roster_path)
roster.head()

In [None]:
# read the complaints table (one of its columns, complaint_date, is used for the time plots)
complaints_path = '../final/complaints.csv'
complaints = pd.read_csv(complaints_path)
complaints.head()

In [None]:
# load the complaints-officers table (its uid column is matched against the roster below)
complaints_offs = pd.read_csv('../final/complaints_officers.csv')
# preview only the first rows, like the other load cells, instead of echoing the whole frame
complaints_offs.head()

In [None]:
# parse the complaint dates once and reuse the result for every time breakdown
complaint_dates = pd.to_datetime(complaints['complaint_date'].values)
# unparseable / missing dates become NaT; they cannot be assigned to a year, month or weekday
valid_dates = complaint_dates[~complaint_dates.isna()]
if len(valid_dates) == 0:
    raise ValueError('complaints.complaint_date contains no valid dates')

complaint_years = np.asarray(valid_dates.year, dtype=int)
# (first year, last year) with at least one complaint
year_range = int(complaint_years.min()), int(complaint_years.max())
length = 1+year_range[1]-year_range[0]
# one bin per calendar year in year_range (years shifted so the first year lands in bin 0)
events_per_year = np.bincount(complaint_years - year_range[0], minlength=length)
# months are numbered 1..12: force 13 bins and drop the unused bin 0 so the result always has 12 entries
events_per_month = np.bincount(np.asarray(valid_dates.month, dtype=int), minlength=13)[1:]
# weekdays are numbered 0 (Monday) .. 6 (Sunday): always 7 entries, even if the last weekdays have no complaints
events_per_day = np.bincount(np.asarray(valid_dates.weekday, dtype=int), minlength=7)
# hours keep every row; rows without a valid date are counted in an extra bin 24
hours = pd.to_datetime(complaints['complaint_date'].values).hour
hours = np.nan_to_num(hours, nan=24)
events_per_hour = np.bincount(hours.astype(int))

In [None]:
def _stem_panel(position, xs, counts, xlabel, ticklabels=None, ylabel=None):
    # Draw one stem-style panel (a marker on top of a vertical line per x position)
    # on subplot `position` of the current figure and return its axes.
    panel = plt.subplot(position)
    plt.scatter(xs, counts)
    plt.vlines(x = xs, ymin = np.zeros(len(counts)), ymax = counts)
    if ticklabels is None:
        # keep the automatic tick positions, only enlarge the labels
        plt.xticks(fontsize = 18)
    else:
        plt.xticks(xs, ticklabels, fontsize = 18)
    plt.yticks(fontsize = 18)
    if ylabel is not None:
        plt.ylabel(ylabel, fontsize = 18)
    plt.setp(panel.xaxis.get_majorticklabels(), rotation=45, ha="right", rotation_mode="anchor")
    plt.xlabel(xlabel, fontsize = 18)
    return panel


# one wide figure with three side-by-side panels: weekday, month and year counts
plt.figure(figsize = (20,5))

dayticks = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
monthticks = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# left: complaints per weekday (the only panel that carries the y-axis label)
ax = _stem_panel(131, np.arange(len(events_per_day)), events_per_day, 'Weekdays',
                 ticklabels = dayticks, ylabel = '# Complaints')

# middle: complaints per month
ax = _stem_panel(132, np.arange(len(events_per_month)), events_per_month, 'Months',
                 ticklabels = monthticks)

# right: complaints per calendar year, positioned at the actual year values
ax = _stem_panel(133, np.arange(year_range[0], year_range[1]+1, dtype = int), events_per_year, 'Years')

plt.tight_layout()
plt.savefig('../doc/figs/complaints_times.pdf', dpi=1000, bbox_inches = 'tight')
plt.show()

In [None]:
# short display labels for the race categories, keyed by the raw category string
races_short = dict([
    ('WHITE', 'White'),
    ('BLACK', 'Black'),
    ('BLACK HISPANIC', 'Bl. Hisp.'),
    ('WHITE HISPANIC', 'Wh. Hisp.'),
    ('ASIAN/PACIFIC ISLANDER', 'Asian/P.I.'),
    ('AMER IND/ALASKAN NATIVE', 'Indig.'),
    ('Other', 'Other'),
])

In [None]:
# build a map of UID -> {race, gender} from the roster
traits = ['race', 'gender']

# one row of traits per uid; if a uid appears more than once the last roster row wins,
# which is what filling a dict row by row would do
roster_traits = roster.drop_duplicates(subset='uid', keep='last').set_index('uid')[traits]
trait_map = roster_traits.to_dict(orient='index')

# every officer named in a complaint must be in the roster; fail loudly with the offending uids
missing_uids = complaints_offs.loc[~complaints_offs.uid.isin(roster_traits.index), 'uid'].unique()
if len(missing_uids) > 0:
    raise KeyError('{} uid(s) in complaints_offs are missing from the roster, e.g. {}'.format(
        len(missing_uids), list(missing_uids[:5])))

# for each trait, add a column to the complaints_offs data by looking the uid up in the roster
for tr in traits:
    complaints_offs[tr] = complaints_offs.uid.map(roster_traits[tr])

# in the below plots, Asian/Pacific Islander, Indigenous, and Black Hispanic categories are too small
# to get reliable complaints-per-officer ratios. group them into "Other"
small_race_categories = ['ASIAN/PACIFIC ISLANDER', 'AMER IND/ALASKAN NATIVE', 'BLACK HISPANIC']
# .where keeps values for which the condition holds and replaces the rest (missing races stay missing)
complaints_offs['race'] = complaints_offs['race'].where(~complaints_offs['race'].isin(small_race_categories), 'Other')
roster['race'] = roster['race'].where(~roster['race'].isin(small_race_categories), 'Other')

In [None]:
# count the roster rows with a uid in each (race, gender) category
# NOTE(review): this counts rows, so it is the number of unique officers only if roster uids are unique -- confirm
num_offs = (
    roster
    .groupby(['race', 'gender'])['uid']
    .count()
    .reset_index(name='officers')
    # remove the 'X' gender (by visual inspection, these should be missing data -- not nonbinary genders)
    .loc[lambda df: df['gender'] != 'X']
    # replace races with shortnames
    .assign(race=lambda df: df['race'].map(races_short))
    # renumber rows 0..n-1; the pre-filter row positions are kept in an 'index' column
    .reset_index()
)
num_offs

In [None]:
# count the complaint rows with a uid in each (race, gender) category
num_compls = (
    complaints_offs
    .groupby(['race', 'gender'])['uid']
    .count()
    .reset_index(name='complaints')
    # drop the 'X' gender here too, so the categories match the officer counts it is compared against
    .loc[lambda df: df['gender'] != 'X']
    # replace races with shortnames
    .assign(race=lambda df: df['race'].map(races_short))
    # renumber rows 0..n-1; the pre-filter row positions are kept in an 'index' column
    .reset_index()
)
num_compls

In [None]:
# join officer counts and complaint counts on their (race, gender) keys.
# matching on the keys (instead of pasting the frames side by side) keeps every complaint count
# attached to the right category even when a category is missing from one of the two frames
group_keys = ['race', 'gender']
cc = num_offs.merge(
    # the leftover 'index' column of num_compls is bookkeeping only; drop it so it cannot clash
    num_compls.drop(columns='index', errors='ignore'),
    on=group_keys,
    how='left',
)
# a category with officers but no recorded complaints has zero complaints, not a missing value
cc['complaints'] = cc['complaints'].fillna(0).astype(int)
# compute the number of complaints per officer in each category
cc['frac'] = cc['complaints']/cc['officers']
cc

In [None]:
# bar chart of complaints per officer: one group of bars per race category, one bar per gender
sns.set_style('whitegrid')
ax = sns.barplot(data = cc, x = 'race', y = 'frac', hue = 'gender')
# slant the category labels so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
ax.set_ylabel('Complaints per Officer')
ax.set_xlabel('CPD Race Category')
ax.get_legend().set_title('CPD Gender Category')
fig = ax.get_figure()
fig.tight_layout()
fig.savefig('../doc/figs/complaints.pdf')