In [None]:
# Load IPython's autoreload extension; it is switched on further down with `%autoreload 2`.
%load_ext autoreload

In [None]:
# CONFIG (leave only one uncommented)
# Variant: VehicleStyle and Vehicle Year
config = dict(
    # Columns whose combined values are used as the groupby key for one driver.
    grouping_keys=['SubjectFirstName', 'SubjectLastName', 'VehicleStyle', 'VehicleYear'],
    # Suffix spliced into generated CSV file names and plot titles.
    descript="_Style_Year",
    # Directory the notebook changes into before reading or writing any file.
    base_path='replace-with-path-to-this-directory',
    # File that receives the Hispanic_White-only rows.
    hispanic_white_drivers_only_csv_name="az_hispanic_white_drivers_Style_Year.csv",
    # Not read by any cell visible in this notebook.
    standardize_format=True,
)

In [None]:
# Set base path directory
base_path = config['base_path']

# IPython substitutes the value of the Python variable for $base_path before running the cd magic.
# Every relative file name used later in the notebook is resolved against this directory.
%cd $base_path

# Shell out to print the current working directory.
!pwd

In [None]:
%autoreload 2
import pandas as pd
import math
import statistics
import numpy as np
from matplotlib import pyplot as plt
from collections import Counter
from IPython.display import display

from policing_data_expl import *

# Verify Raw Data and Clean Data Match

In [None]:
# Location of the raw AZ CSV (placeholder; relative to the working directory).
filepath = 'path-to-AZ-data.csv'

# Read every grouping-key column as str so pandas does not infer a numeric dtype for it.
dtypes_dict = dict.fromkeys(config['grouping_keys'], str)

# standardize_cols comes from the project helper module; what it renames is not visible here.
az_data = standardize_cols(
    'AZ',
    pd.read_csv(filepath, dtype=dtypes_dict),
)

print(f'Rows: {len(az_data)}')

In [None]:
# 'completeness' (how many non-nan values there are) per column
for col_name in az_data.columns:
    print(col_name)
    completeness = get_percent_complete_column(az_data, col_name)
    # Two-space prefix keeps the value visually nested under its column name.
    print('  ', completeness)

In [None]:
# Project helper applied to the grouping-key columns only
# (presumably reports how many rows have all of those keys populated — TODO confirm in policing_data_expl).
calc_complete_cols(az_data, config['grouping_keys'])

# Construct Filtered Dataset

In [None]:
# Group the standardized stops by the configured driver keys. group_df_by is also handed a CSV
# file name (presumably where the rows plus a driver id are written — confirm in policing_data_expl).
grouped_az = group_df_by(
    az_data,
    config['grouping_keys'],
    csv_filename=f"az_raw_with_driver_id{config['descript']}.csv",
)

def az_cond(name, entries):
    """
    Decide whether one driver's group of stops is kept.

    A driver is kept when the group has at least 2 and at most 10 entries
    (one entry = one stop). No check on names or any other field is made here.

    Parameters
    ----------
    name : unused; accepted so the function fits a two-argument callback
        (presumably the group key passed by check_cond — confirm there).
    entries : sized collection holding this driver's stops; only its length is read.

    Returns
    -------
    bool
        True when 2 <= len(entries) <= 10, otherwise False.
    """
    stop_count = len(entries)
    return 2 <= stop_count <= 10
        
# Run az_cond over the grouped data; check_cond is also given the output CSV name
# (presumably the file the kept rows are written to — confirm in policing_data_expl).
csv_name = f"az_grouped{config['descript']}.csv"
check_cond(grouped_az, az_cond, csv_name)

In [None]:
# Reload the CSV named in the previous cell (presumably written there by check_cond).
csv_name = f"az_grouped{config['descript']}.csv"
azgrouped_csv = pd.read_csv(csv_name)

# Cast every grouping key to str so the group keys are plain string tuples.
azgrouped_csv = azgrouped_csv.astype(dict.fromkeys(config['grouping_keys'], str))

# One group per distinct combination of grouping-key values.
az_grouped = azgrouped_csv.groupby(config['grouping_keys'])

print("#rows of individuals stopped more than once:", len(azgrouped_csv))
print("#individuals stopped more than once:", len(az_grouped))

calc_mean_med_max_stops(az_grouped)
plot_stop_freq_histogram(az_grouped)

In [None]:
# Both helpers receive the per-driver groupby object built above.
# (Presumably the first summarizes, and the second lists, drivers recorded with more than one race
# — TODO confirm in policing_data_expl.)
calc_racial_ambig(az_grouped)

enumerate_racial_ambig(az_grouped)

In [None]:
# Preview the first 50 rows of the reloaded, filtered stops.
# NOTE(review): the grouping keys include SubjectFirstName / SubjectLastName, so this output
# displays personal data — consider clearing it before sharing the notebook.
azgrouped_csv.head(50)

# Calculate Stats for Racially Ambiguous Subset

In [None]:
# Per-driver lookup built from the groupby object (keyed by the tuple of grouping-key values,
# as the lookup below relies on).
person_race_dict = generate_person_race_dict(az_grouped)

# One tuple of grouping-key values per row, so each row can be looked up in person_race_dict.
tuple_lst = list(map(tuple, azgrouped_csv[config['grouping_keys']].values.tolist()))
race_str_col = [person_race_dict[person_key] for person_key in tuple_lst]

# Work on a copy so azgrouped_csv itself never gains the extra column;
# the new column is called race_str and is placed at position 2.
azgrouped_with_race_str = azgrouped_csv.copy()
azgrouped_with_race_str.insert(
    loc=2,
    column="race_str",
    value=race_str_col,
    allow_duplicates=False,
)

In [None]:
# Compute the per-state statistics from the filtered stops and their row-aligned race strings,
# then plot them under a title that carries the config suffix.
stats_dict_lst = get_state_stats(
    azgrouped_csv,
    race_str_col,
    config['grouping_keys'],
)

plot_state_stats(stats_dict_lst, f"AZ - {config['descript']}")

In [None]:
# remove drivers with more than 10 stops
az_with_drivers_less_than_10_stops = (
    azgrouped_csv
    .groupby(config['grouping_keys'])
    .filter(lambda driver_stops: len(driver_stops) <= 10)
    .reset_index()  # no drop=True: the previous index is kept as an 'index' column
)

# One tuple of grouping-key values per remaining row, used as the key into person_race_dict.
tuple_lst = list(
    map(tuple, az_with_drivers_less_than_10_stops[config['grouping_keys']].values.tolist())
)
race_str_col = [person_race_dict[person_key] for person_key in tuple_lst]

# NOTE: tuple_lst, race_str_col and stats_dict_lst from the earlier cells are rebound here.
stats_dict_lst = get_state_stats(
    az_with_drivers_less_than_10_stops,
    race_str_col,
    config['grouping_keys'],
)

plot_state_stats(stats_dict_lst, f"AZ - {config['descript']}")

In [None]:
# Print whatever ttest_unpaired returns for the frame that carries the race_str column.
# NOTE(review): a later cell rebinds azgrouped_with_race_str to a CSV reload, so re-running this
# cell out of order feeds it different data.
print(ttest_unpaired(azgrouped_with_race_str))

In [None]:
# Print whatever ttest_paired returns; unlike the unpaired call it is given the per-driver
# groupby object rather than the flat frame.
print(ttest_paired(az_grouped))

# white-Hispanic Drivers and Regressions

In [None]:
# Boolean mask: True for rows whose race_str is exactly "Hispanic_White".
race_str_cond = azgrouped_with_race_str['race_str'].isin(["Hispanic_White"])
hispanic_white_drivers = azgrouped_with_race_str.loc[race_str_cond]

# Report the subset size and target file name, then export the subset.
print(len(hispanic_white_drivers))
print(config['hispanic_white_drivers_only_csv_name'])
write_to_csv(
    hispanic_white_drivers,
    config['hispanic_white_drivers_only_csv_name'],
)

In [None]:
# Reload the Hispanic_White-only rows through the same config entry the export cell writes to,
# so the file written and the file read cannot drift apart when the config is changed.
# NOTE: this rebinds azgrouped_with_race_str; from here on it holds only the rows stored in that CSV.
azgrouped_with_race_str = pd.read_csv(config['hispanic_white_drivers_only_csv_name'])

# Number of rows per officer_id in the reloaded subset.
print(azgrouped_with_race_str['officer_id'].value_counts())

In [None]:
# Re-group the reloaded subset by driver. The grouping keys are not re-cast to str here, so they
# keep whatever dtypes read_csv inferred in the previous cell.
az_hispanic_white_grouped = azgrouped_with_race_str.groupby(config['grouping_keys'])
# Row count of the subset, then the number of distinct driver groups in it.
print(f'entries: {len(azgrouped_with_race_str)}')
print(f'individuals: {len(az_hispanic_white_grouped)}')
calc_mean_med_max_stops(az_hispanic_white_grouped)

In [None]:
# Fit the same search_conducted model six times, each with a different control set.
# The generator is consumed in order, so regress is called in the listed sequence;
# each dict holds only the keyword arguments that differ between fits.
res1, res2, res3, res4, res5, res6 = (
    regress(azgrouped_with_race_str, dep_var='search_conducted', **model_kwargs)
    for model_kwargs in (
        dict(cols=[], controls=[], model_name='No controls'),
        dict(cols=['stop_duration'], controls=['C(stop_duration)'], model_name='Control for stop duration'),
        # NOTE(review): cols lists stop_duration but controls does not — confirm this is intended.
        dict(cols=['stop_duration', 'hour_of_day'], controls=['hour_of_day'], model_name='Control for hour of day (linear)'),
        dict(
            cols=['hour_of_day'],
            controls=['hour_of_day', 'I(hour_of_day**2)', 'I(hour_of_day**3)', 'I(hour_of_day**4)'],
            model_name='Control for hour of day (quartic)',
        ),
        dict(cols=['county_fips'], controls=['C(county_fips)'], model_name='Control for county'),
        dict(
            cols=['officer_id'],
            controls=['C(officer_id)'],
            model_name='Control for officer id - drop absorbed',
            useFixedEffects=True,
            drop_absorbed=True,
        ),
    )
)

# Compare the 'Hispanic' coefficient across the six fits in one plot.
make_sensitivity_dot_plot(
    [res1, res2, res3, res4, res5, res6],
    coef_to_plot='Hispanic',
    title='Hispanic search rate sensitivity to controls',
)

# Years of the Data in the Analysis

In [None]:
# Fresh read of the grouped/filtered CSV (same file name the filtering cell uses), with default dtypes.
az_complete = pd.read_csv(f"az_grouped{config['descript']}.csv")

In [None]:
# Rows per year, taking the first four characters of DateOfStop as the year
# (assumes the date strings start with a four-digit year — TODO confirm the format).
# The .str accessor passes missing dates through as NaN, which value_counts omits by default,
# instead of raising the way a Python-level x[:4] does on a float NaN.
az_complete['DateOfStop'].str[:4].value_counts().sort_index()

In [None]:
# Read back the Hispanic_White-only rows from the file named in the config.
az_hispanic_white = pd.read_csv(config['hispanic_white_drivers_only_csv_name'])

In [None]:
# Rows per year for the Hispanic_White subset, taking the first four characters of DateOfStop
# as the year (assumes the date strings start with a four-digit year — TODO confirm the format).
# The .str accessor tolerates missing dates: they stay NaN and value_counts omits them by default,
# whereas a Python-level x[:4] raises on a float NaN.
az_hispanic_white['DateOfStop'].str[:4].value_counts().sort_index()

In [None]:
# Unmodified read of the raw AZ file (no dtype overrides, no column standardization).
# Reuses `filepath` from the data-loading cell so the raw-data location is edited in one place only.
az_raw = pd.read_csv(filepath)

In [None]:
# Rows per year in the raw file, taking the first four characters of DateOfStop as the year
# (assumes the date strings start with a four-digit year — TODO confirm the format).
# The .str accessor tolerates missing dates: they stay NaN and value_counts omits them by default,
# whereas a Python-level x[:4] raises on a float NaN.
az_raw['DateOfStop'].str[:4].value_counts().sort_index()

# Comparing a Couple Subsets of the Population

In [None]:
# Same three-way comparison (all stops, multiply-stopped drivers, reloaded Hispanic_White subset),
# drawn once per outcome column.
for outcome_col in ('search_conducted', 'is_arrested'):
    plot_search_rates_comparison(
        'Arizona',
        outcome_col,
        az_data,
        azgrouped_csv,
        azgrouped_with_race_str,
    )

## All Drivers, white or Hispanic

In [None]:
# "All drivers" section: plot from the full standardized dataset only.
# NOTE(review): called as (dataframe, column), matching the other two plot_top_5_col_values
# sections of this notebook; the extra azgrouped_csv positional argument was dropped.
# Confirm against the helper's signature in policing_data_expl.
for col_name in ('violation', 'county_name'):
    plot_top_5_col_values(az_data, col_name)

## Multiply Stopped Drivers, white or Hispanic

In [None]:
# Top-5 value plots for the multiply-stopped drivers, one per column of interest.
for col_name in ('violation', 'county_name'):
    plot_top_5_col_values(azgrouped_csv, col_name)

## Multiply Stopped Drivers with white/Hispanic Racial Ambiguity

In [None]:
# Top-5 value plots for the frame currently bound to azgrouped_with_race_str
# (the Hispanic_White CSV reload once the regression section has run), one per column of interest.
for col_name in ('violation', 'county_name'):
    plot_top_5_col_values(azgrouped_with_race_str, col_name)