In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

def flat_list(list_of_lists):
    """Concatenate the items of every sub-iterable into one flat list.

    Only a single level of nesting is removed and the original order is kept.
    """
    flattened = []
    for sublist in list_of_lists:
        flattened.extend(sublist)
    return flattened

We first load the "history.csv" file.

In [None]:
# Load the raw history table; this cell only relies on its `uid` and `end_date` columns.
history = pd.read_csv('../final/history.csv')
print(history.head())

# Latest end date in the table. `.max()` skips missing dates (NaT), whereas the
# builtin `max()` returns NaT whenever the first element happens to be NaT.
end_date = pd.to_datetime(history['end_date']).max()  # largest end date

# `nunique()` ignores missing uids instead of counting every NaN as a distinct value.
print('Number of distinct uids in database:', history['uid'].nunique())

Create history profiles: for each uid, map every unit in the officer's history to its (start date, end date) pair.

In [None]:
# `history_profiles` maps uid -> {unit number: (v[2], v[3])}, built from the rows of
# `history` (presumably v[2], v[3] are the start and end dates — confirm against
# the column order of history.csv).
HISTORY_PROFILES_PATH = '../../derived_data/history_profiles.npy'

# The dictionary is loaded from a cached file. The recipe that produces it is kept
# below (commented out) as provenance; it writes to the same path that is loaded.
#
# history_profiles = {}
# for uid in tqdm_notebook(set(history['uid'])):
#     history_profiles[uid] = {}
#     vals = history[history['uid'] == uid].values
#     for v in vals:
#         if np.isnan(v[1]):
#             pass  # rows without a unit number are skipped
#         else:
#             history_profiles[uid][int(v[1])] = v[2], v[3]
# np.save(HISTORY_PROFILES_PATH, history_profiles)

# SECURITY: allow_pickle=True unpickles arbitrary Python objects, so only load a
# file generated locally by the recipe above, never one from an untrusted source.
# `.item()` unwraps the 0-d object array that np.save creates around a dict.
history_profiles = np.load(HISTORY_PROFILES_PATH, allow_pickle=True).item()

In [None]:
# Distribution of the number of units per officer.
units_per_officer = [len(units) for units in history_profiles.values()]

# One bin per integer count, centred on that integer, so each bar sits exactly on
# the value it represents and the tick labels equal the data values.
min_units = min(units_per_officer, default=0)
max_units = max(units_per_officer, default=0)
bin_edges = np.arange(min_units - 0.5, max_units + 1.5)

plt.hist(units_per_officer, bins = bin_edges)
plt.xlabel('# units', fontsize = 14)
plt.ylabel('# officers', fontsize = 14)
# Label every second count to keep the axis readable.
plt.xticks(np.arange(min_units, max_units + 1, 2), fontsize = 14)
plt.yticks(fontsize = 14)
plt.tight_layout()
plt.show()

In [None]:
# Disabled exploratory cell, kept for reference: scatter plot of how many officers
# have an entry for each unit in `history_profiles`.
# NOTE(review): despite its name, `time_in_unit` counts officers per unit, not time.
# NOTE(review): `np.argmax` returns a position in the dict's iteration order, not a
# unit number, so `max_unit` may not be the intended cut-off; `max_times` is never
# used. Verify both before re-enabling this cell.
# units_numbers = set(flat_list([list(history_profiles[k].keys()) for k in history_profiles]))
# time_in_unit = {unit : 0 for unit in units_numbers}
# plt.figure(figsize = (20,3))
# for uid in history_profiles:
#     for unit in history_profiles[uid]:
#         time_in_unit[unit] += 1
# max_unit, max_times = np.argmax([time_in_unit[u] for u in time_in_unit]), max([time_in_unit[u] for u in time_in_unit])
# plt.scatter(np.arange(max_unit), [time_in_unit[v] for v in np.sort(list(units_numbers))[:max_unit]])
# # plt.xticks(np.arange(len(units_numbers)), np.sort(list(units_numbers)))
# plt.show()

In [None]:
# Potentially interesting follow-up questions (TODO):
#   - How much time does an officer spend in a unit?
#   - How does the distribution of officers across units change over time?

In [None]:
# Officer roster. Later cells read its appointment_date, resignation_date,
# birthyear, gender, race and status columns.
roster_csv = '../final/roster.csv'
roster = pd.read_csv(roster_csv)
roster.head()

In [None]:
# Parse the roster's appointment and resignation dates once for the whole cell.
appointment_dates = pd.to_datetime(roster['appointment_date'].values)
resignation_dates = pd.to_datetime(roster['resignation_date'].values)

# Three-panel figure: the two histograms are drawn here (slots 131 and 132);
# the third slot is filled further down in this cell, so no plt.show() yet.
plt.figure(figsize = (21,5))
histogram_panels = (
    (131, appointment_dates, '# Appointed Officers'),
    (132, resignation_dates, '# Resigned Officers'),
)
for panel_position, panel_dates, panel_ylabel in histogram_panels:
    plt.subplot(panel_position)
    plt.hist(panel_dates, bins = 20)
    plt.xlabel('Year', fontsize = 22)
    plt.xticks(fontsize = 16)
    plt.yticks(fontsize = 16)
    plt.ylabel(panel_ylabel, fontsize = 20)

# total number of active officers

def convert_to_year(range_time, low, high):
    """Return the calendar year found at a fractional position between two dates.

    Parameters
    ----------
    range_time : float
        Position along the interval: 0 maps to ``low`` and 1 maps to ``high``.
    low, high : datetime-like
        End points of the interval; ``high - low`` must yield a duration that
        can be scaled by ``range_time`` and added back to ``low``.

    Returns
    -------
    int
        The ``.year`` attribute of the interpolated date.
    """
    return (low + range_time * (high - low)).year

# Express every date as a fraction of the hiring window: 0 is the first
# appointment in the roster and 1 is the last one.
# `.min()` / `.max()` skip missing dates (NaT); the builtin min()/max() return NaT
# whenever the first element is NaT, which would turn every fraction into NaN.
first_appointment_date, last_appointment_date = appointment_dates.min(), appointment_dates.max()
delta_time = last_appointment_date - first_appointment_date
actives = (appointment_dates - first_appointment_date)/delta_time
inactives = (resignation_dates - first_appointment_date)/delta_time
# Officers without a resignation date are treated as leaving at the end of the window.
inactives = np.nan_to_num(inactives, nan=1)

# Sample the window at 50 evenly spaced instants and count the officers appointed
# strictly before, and resigned strictly after, each instant.
range_time = np.linspace(0,1,50)
years = [convert_to_year(r, first_appointment_date, last_appointment_date) for r in range_time]
how_many_active = [np.sum((actives<r) & (inactives>r)) for r in range_time]

plt.subplot(133)
# Both end points are dropped: nobody is appointed strictly before instant 0, and
# the nan=1 fill makes still-serving officers look resigned at instant 1.
plt.plot(years[1:-1], how_many_active[1:-1], lw = 3)
plt.ylabel('# Active officers', fontsize = 20)
plt.xlabel('Year', fontsize = 22)
plt.xticks(fontsize = 16)
plt.yticks(fontsize = 16)
plt.tight_layout()
plt.savefig('../doc/figs/history.pdf', dpi=1000, bbox_inches = 'tight')
plt.show()

In [None]:
# Count officers per fixed-width birth-year window, for the whole roster and for
# the officers still active on the cut-off date.
width_year_windows = 5
active_cutoff = '2014-01-01'

yearz = roster['birthyear'].dropna().sort_values()
min_year = yearz.min()
max_year = yearz.max()

# Left edge of every window. The `+ 1` keeps the window that contains max_year
# even when (max_year - min_year) is an exact multiple of the width; without it
# `bins` ends up one entry shorter than the counts.
bins = np.arange(min_year, max_year + 1, width_year_windows, dtype = int)
bins_text = [str(left)+'-'+str(left+width_year_windows-1)[-2:] for left in bins]

# np.bincount only accepts integers (birthyear is float whenever the column has
# missing values), and minlength pins every count array to len(bins) so the two
# series can be plotted against the same x positions.
binned = np.bincount(((yearz - min_year)//width_year_windows).astype(int), minlength = len(bins))

# Officers with no resignation date are given the cut-off date, then everyone who
# resigned before the cut-off is removed. Dates are compared as timestamps rather
# than as raw strings so the filter does not depend on the text format.
act_roster = roster.fillna({'resignation_date': active_cutoff})
still_active = pd.to_datetime(act_roster['resignation_date']) >= pd.Timestamp(active_cutoff)
act_roster = act_roster[still_active]
act_yearz = act_roster['birthyear'].dropna().sort_values()
act_binned = np.bincount(((act_yearz - min_year)//width_year_windows).astype(int), minlength = len(bins))

In [None]:
# Frequency tables over the full roster, one per demographic / status column.
gender_count = roster['gender'].value_counts()
race_count = roster['race'].value_counts()
birthyear_count = roster['birthyear'].sort_values().value_counts()
status_count = roster['status'].value_counts()

# Display the gender and race tables as the cell output.
gender_count, race_count

In [None]:
# Same frequency tables, restricted to the officers kept in `act_roster`.
act_gender_count = act_roster['gender'].value_counts()
act_race_count = act_roster['race'].value_counts()
act_birthyear_count = act_roster['birthyear'].sort_values().value_counts()
act_status_count = act_roster['status'].value_counts()

# Display the gender and race tables as the cell output.
act_gender_count, act_race_count

In [None]:
# Stem-style plot of officers per birth-year window: the full roster in blue and
# the active subset in red, nudged one year left / right so the stems do not overlap.
plt.figure(figsize = (12,3))
ax = plt.subplot(111)

stem_base = np.zeros(len(bins))
stem_series = (
    (bins-1, binned, {'color': 'blue'}),
    (bins+1, act_binned, {'color': 'r', 'label': 'Active after Jan 1st 2014'}),
)
for stem_x, stem_counts, marker_style in stem_series:
    ax.scatter(stem_x, stem_counts, **marker_style)
    ax.vlines(x = stem_x, ymin = stem_base, ymax = stem_counts, color = marker_style['color'])

# Ticks sit on the window's left edge and show the window's year range.
plt.xticks(bins, bins_text)
plt.setp(ax.xaxis.get_majorticklabels(),rotation=45, ha="right", rotation_mode="anchor")
ax.set_ylabel('# Officers', fontsize = 15)
ax.set_xlabel('Birthyear', fontsize = 15)
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
ax.legend(fontsize = 12)
plt.savefig('../doc/figs/history_by.pdf', dpi=1000, bbox_inches = 'tight')
plt.show()

In [None]:
# Appointment date distribution, with one row per officer.
# `profiles` is not defined anywhere else in this notebook, so on a fresh kernel
# this cell raised NameError; it is derived here from the roster loaded earlier.
# NOTE(review): assumes `roster` is the intended source and has a `uid` column — confirm.
print('Number of distinct uids in database:', roster['uid'].nunique())

# For every uid keep the row that sorts first by descending appointment_date
# (a string sort, so this is the latest appointment only for ISO-formatted dates),
# then restore the original row order.
profiles = roster.sort_values('appointment_date', ascending=False).drop_duplicates('uid').sort_index()
appointment_dates = pd.to_datetime(profiles['appointment_date'].values)
resignation_dates = pd.to_datetime(profiles['resignation_date'].values)

In [None]:
# Side-by-side histograms (default binning) of appointment and resignation dates.
plt.figure(figsize = (20,4))
for panel_position, panel_dates, panel_ylabel in (
    (121, appointment_dates, '# Appointed Officers'),
    (122, resignation_dates, '# Resigned Officers'),
):
    plt.subplot(panel_position)
    plt.hist(panel_dates)
    plt.xlabel('Years', fontsize = 18)
    plt.xticks(fontsize = 18)
    plt.yticks(fontsize = 18)
    plt.ylabel(panel_ylabel, fontsize = 18)
plt.show()

In [None]:
# total number of active officers

def convert_to_year(range_time, low, high):
    """Map a fraction of the interval [low, high] to the calendar year it falls in.

    ``range_time`` is the fractional position (0 gives ``low``, 1 gives ``high``);
    ``low`` and ``high`` are datetime-like values whose difference can be scaled
    by a float. Returns the ``.year`` of the interpolated date.

    NOTE(review): an identical ``convert_to_year`` is defined in an earlier cell
    of this notebook; this copy shadows it and could be deleted.
    """
    span = high - low
    return (low + range_time * span).year

# Express every date as a fraction of the hiring window: 0 is the first
# appointment and 1 is the last one.
# `.min()` / `.max()` skip missing dates (NaT); the builtin min()/max() return NaT
# whenever the first element is NaT, which would turn every fraction into NaN.
first_appointment_date, last_appointment_date = appointment_dates.min(), appointment_dates.max()
delta_time = last_appointment_date - first_appointment_date
actives = (appointment_dates - first_appointment_date)/delta_time
inactives = (resignation_dates - first_appointment_date)/delta_time
# Officers without a resignation date are treated as leaving at the end of the window.
inactives = np.nan_to_num(inactives, nan=1)

# Sample the window at 50 evenly spaced instants and count the officers appointed
# strictly before, and resigned strictly after, each instant.
range_time = np.linspace(0,1,50)
years = [convert_to_year(r, first_appointment_date, last_appointment_date) for r in range_time]
how_many_active = [np.sum((actives<r) & (inactives>r)) for r in range_time]

plt.figure()
plt.subplot(111)
# Both end points are dropped: nobody is appointed strictly before instant 0, and
# the nan=1 fill makes still-serving officers look resigned at instant 1.
plt.plot(years[1:-1], how_many_active[1:-1], lw = 3)
plt.ylabel('# Active officers', fontsize = 14)
plt.xlabel('Year', fontsize = 14)
plt.xticks(fontsize = 14)
plt.yticks(fontsize = 14)
plt.show()