In [None]:
%load_ext autoreload

In [None]:
# CONFIG
# Name of the column that holds the driver's race; it is passed as driver_race_col to
# the helper calls throughout this notebook.
# standardize_cols replaces driver_race with capitalized driver_race_raw, so it is ok to use 'driver_race'.
# NOTE(review): the alternative that was commented out next to this line was also
# 'driver_race' ("don't use - this is not the officer's perception"); presumably one of
# the two was meant to be 'driver_race_raw' - confirm which column is intended.
dr_race_col = 'driver_race'

# When True, the driver_race_raw values of the grouped CSV are capitalized right after
# that CSV is read back in (needed when the values in the file are lowercase).
capitalize_dr_race_col = False

# Input file: don't use tx_processed. tx_processed is after processing the raw data,
# which doesn't have the driver_race or driver_race_raw columns yet.
# Use tx_processed_driver_race_cols or the TX-clean.csv from the raw_openpolicing_data folder.
config = dict(  # 2016-2017 data only
    # columns that together identify one driver
    grouping_keys=[
        'HA_N_FIRST_DRVR',
        'HA_N_LAST_DRVR',
        'HA_A_ADDRESS_DRVR',
        'HA_A_CITY_DRVR',
        'HA_A_STATE_DRVR',
        'HA_A_ZIP_DRVR',
    ],
    # label used in output file names and plot titles
    descript=dr_race_col,
    base_path='replace-with-path-to-this-directory',
    starting_file_name='path-to-TX-data.csv',
    grouped_csv_name='tx_processed_grouped_driver_race_raw.csv',
    hispanic_white_drivers_only_csv_name=f"tx_processed_hispanic_white_drivers_{dr_race_col}.csv",
    # when True, only rows dated 2016 or 2017 are kept after loading
    only_after_2016=True,
)

In [None]:
# Set base path directory
# The file names in config carry no directory of their own, so they are presumably
# resolved relative to this directory - verify before running.
base_path = config['base_path']

# IPython magic: change the kernel's working directory ($base_path expands the Python variable)
%cd $base_path

# Shell escape: print the working directory to confirm the change took effect
!pwd

In [None]:
%autoreload 2
import pandas as pd
import math
import statistics
import numpy as np
from matplotlib import pyplot as plt
from collections import Counter
from IPython.display import display

from policing_data_expl import *

# Verify Raw Data and Clean Data Match

In [None]:
# Load the TX stops file and standardize its columns. The grouping-key columns are
# read as strings instead of letting read_csv infer a type for them.
filepath = config['starting_file_name']
dtypes_dict = {k: str for k in config['grouping_keys']}
tx_data = standardize_cols('TX', pd.read_csv(filepath, dtype=dtypes_dict))

if config['only_after_2016']:
    # Only take rows that happened in the year 2016 or 2017. The year is the first four
    # characters of the date's string form. A single mask is built; the complementary
    # "everything else" mask and frame were never used, so they are no longer computed.
    stop_year = tx_data['date'].apply(lambda x: str(x)[:4])
    tx_data = tx_data[stop_year.isin(['2016', '2017'])]

print(f'Rows: {len(tx_data)}')

In [None]:
# Show the column names available after standardization
list(tx_data.columns)

In [None]:
# # checking race discordance between what's written and the corrected versions
# tx_data_copy = tx_data.copy()
# tx_data_copy['year_month'] = tx_data_copy['date'].apply(lambda x: str(x)[:7])
# for group, item in tx_data_copy.groupby('year_month'):
#     print(group, (item['driver_race'] != item['driver_race_raw']).sum() / len(item))

In [None]:
# 'completeness' (how many non-nan values there are) for every column of tx_data,
# printed as the column name followed by an indented value
for col_name in tx_data.columns:
    print(col_name)
    pct_complete = get_percent_complete_column(tx_data, col_name)
    print('  ', pct_complete)

In [None]:
# Column names once more, for reference next to the completeness numbers
list(tx_data.columns)

In [None]:
# Run calc_complete_cols on tx_data for the driver-identifying grouping keys and the
# configured race column.
# NOTE(review): presumably reports how many rows have all of these columns filled in -
# confirm against policing_data_expl.
calc_complete_cols(tx_data, config['grouping_keys'], driver_race_col=dr_race_col)

# Construct Filtered Dataset

In [None]:
# Group the stops by the driver-identifying keys. The CSV name handed to group_df_by
# carries the config's descript label.
driver_id_csv = f"tx_raw_with_driver_id_{config['descript']}.csv"
grouped_tx = group_df_by(
    tx_data,
    config['grouping_keys'],
    driver_race_col=dr_race_col,
    csv_filename=driver_id_csv,
)

def tx_cond(name, entries, min_stops=2, max_stops=10):
    """
    Decide whether one driver's group of stops is kept.

    Only keep drivers
    - with at least `min_stops` entries (=at least 2 stops by default) and no more than
      `max_stops` entries (10 by default)
    - assume non-null HA_N_FIRST_DRVR, HA_N_LAST_DRVR, HA_A_ADDRESS_DRVR, HA_A_CITY_DRVR,
      HA_A_STATE_DRVR, HA_A_ZIP_DRVR (=valid unique identifying features); this is an
      assumption about the input and is not checked here

    Parameters
    ----------
    name : the group key identifying the driver. It is not inspected, so a key of any
        length is accepted (the function no longer fails when the number of grouping
        keys is not exactly six).
    entries : sized collection holding the driver's stops; only len(entries) is used.
    min_stops : inclusive lower bound on the number of stops.
    max_stops : inclusive upper bound on the number of stops.

    Returns
    -------
    bool : True when min_stops <= len(entries) <= max_stops.
    """
    return min_stops <= len(entries) <= max_stops
        
# Hand the grouped drivers, the tx_cond predicate and the grouped-CSV name to check_cond.
# NOTE(review): check_cond presumably writes the rows of drivers that satisfy tx_cond to
# csv_name, since the next cell reads that file back - confirm against policing_data_expl.
csv_name = config['grouped_csv_name']
check_cond(grouped_tx, tx_cond, csv_name)

In [None]:
# Read back the grouped CSV. The grouping keys are read as strings, the same way the
# starting file was loaded, so identifier columns are not re-inferred as numbers
# (which would e.g. turn '07001' into 7001 and could merge distinct drivers).
csv_name = config['grouped_csv_name']
key_dtypes = {k: str for k in config['grouping_keys']}
txgrouped_csv = pd.read_csv(csv_name, dtype=key_dtypes)

if capitalize_dr_race_col:
    # with the processed csv, the driver_race_raw are lowercased, so capitalize them.
    # Only real strings are capitalized: missing values stay missing instead of being
    # turned into the literal string 'Nan' (which is what str(x).capitalize() does to NaN).
    txgrouped_csv['driver_race_raw'] = txgrouped_csv['driver_race_raw'].apply(
        lambda x: x.capitalize() if isinstance(x, str) else x
    )

# one group per driver
tx_grouped = txgrouped_csv.groupby(config['grouping_keys'])

print("#rows of individuals stopped more than once:", len(txgrouped_csv))
print("#individuals stopped more than once:", len(tx_grouped))

calc_mean_med_max_stops(tx_grouped)

In [None]:
# Because Texas is only looking at 2016 and 2017 data, confirm that here
# Rows per year in the grouped data (year = first four characters of the date's string form)
grouped_stop_years = txgrouped_csv['date'].apply(lambda stop_date: str(stop_date)[:4])
grouped_stop_years.value_counts()

In [None]:
# Run the two racial-ambiguity helpers on the per-driver groups, reading race from
# dr_race_col. Neither return value is captured here.
# NOTE(review): presumably both report drivers whose recorded race differs across
# their stops - confirm against policing_data_expl.
calc_racial_ambig(tx_grouped, driver_race_col=dr_race_col)

enumerate_racial_ambig(tx_grouped, driver_race_col=dr_race_col)

In [None]:
# Preview the first 50 rows of the grouped data.
# NOTE(review): the frame includes the driver-identifying grouping-key columns
# (HA_N_FIRST_DRVR, HA_A_ADDRESS_DRVR, ...), so this output likely contains personal
# data - clear it before sharing or committing the notebook.
txgrouped_csv.head(50)

# Calculate Stats for Racially Ambiguous Subset

In [None]:
# Per-driver lookup; it is indexed below with the tuple of a row's grouping-key values
person_race_dict = generate_person_race_dict(tx_grouped, driver_race_col=dr_race_col)

# one key tuple per row of txgrouped_csv, in row order, so each row can be looked up
# in person_race_dict
key_rows = txgrouped_csv[config['grouping_keys']].values.tolist()
tuple_lst = list(map(tuple, key_rows))
race_str_col = [person_race_dict[driver_key] for driver_key in tuple_lst]

# add the looked-up values as a new column called race_str (third column of a copy,
# leaving txgrouped_csv itself unchanged)
txgrouped_with_race_str = txgrouped_csv.copy()
txgrouped_with_race_str.insert(loc=2, column="race_str", value=race_str_col, allow_duplicates=False)

In [None]:
# Compute the state-level stats for all grouped rows (using the per-row race_str values
# built in the previous cell) and plot them under a title that carries the descript label.
stats_dict_lst = get_state_stats(txgrouped_csv, race_str_col, config['grouping_keys'], driver_race_col=dr_race_col)

plot_state_stats(stats_dict_lst, 'TX - ' + config['descript'])

In [None]:
# remove drivers with more than 10 stops
tx_with_drivers_less_than_10_stops = txgrouped_csv.groupby(config['grouping_keys']).filter(lambda x: len(x) <= 10).reset_index()

# make the grouping_keys into a tuple so it can be used as a key per person in person_race_dict
tuple_lst = [tuple(keys) for keys in tx_with_drivers_less_than_10_stops[config['grouping_keys']].values.tolist()]
race_str_col = [person_race_dict[(keys)] for keys in tuple_lst]

# call this new column race_str
# azgrouped_with_race_str = azgrouped_csv.copy()
# azgrouped_with_race_str.insert(2, "race_str", race_str_col, False)

stats_dict_lst = get_state_stats(tx_with_drivers_less_than_10_stops, race_str_col, config['grouping_keys'], dr_race_col)

plot_state_stats(stats_dict_lst, 'TX - ' + config['descript'])

In [None]:
# Print the result of ttest_unpaired on the rows that carry the race_str column.
# NOTE: a later cell rebinds txgrouped_with_race_str to the white/Hispanic subset read
# back from CSV, so re-running this cell after that one uses different data.
print(ttest_unpaired(txgrouped_with_race_str, driver_race_col=dr_race_col))

In [None]:
# Print the result of ttest_paired, which is given the per-driver groups
print(ttest_paired(tx_grouped, driver_race_col=dr_race_col))

# white-Hispanic Drivers and Regressions

In [None]:
# Keep the rows whose race_str is "Hispanic_White" and whose search_conducted is not null
is_hispanic_white = txgrouped_with_race_str['race_str'].isin(["Hispanic_White"])
has_search_outcome = txgrouped_with_race_str['search_conducted'].notnull()
hispanic_white_drivers = txgrouped_with_race_str.loc[has_search_outcome & is_hispanic_white]

# report the row count and the target file name, then write the subset out
hispanic_white_csv = config['hispanic_white_drivers_only_csv_name']
print(len(hispanic_white_drivers))
print(hispanic_white_csv)
write_to_csv(hispanic_white_drivers, hispanic_white_csv)

In [None]:
# Reload the white/Hispanic subset from its CSV.
# NOTE: this rebinds txgrouped_with_race_str. From here on it holds only the subset
# stored in that file, not the full frame with the race_str column built earlier.
# The grouping keys are read as strings, the same way the starting file was loaded, so
# identifier columns are not re-inferred as numbers before the per-driver groupby below.
key_dtypes = {k: str for k in config['grouping_keys']}
txgrouped_with_race_str = pd.read_csv(config['hispanic_white_drivers_only_csv_name'], dtype=key_dtypes)

# rows per officer in the subset
txgrouped_with_race_str['officer_id'].value_counts()

In [None]:
# Size of the reloaded subset: number of rows and number of distinct drivers
# (one group per combination of grouping keys), followed by calc_mean_med_max_stops
tx_hispanic_white_grouped = txgrouped_with_race_str.groupby(config['grouping_keys'])
n_entries = len(txgrouped_with_race_str)
n_individuals = len(tx_hispanic_white_grouped)
print(f'entries: {n_entries}')
print(f'individuals: {n_individuals}')
calc_mean_med_max_stops(tx_hispanic_white_grouped)

In [None]:
# Note: TX only has search data from 2016 and no arrest data
# Rows per year in the subset, then rows per year among rows with a non-null
# search_conducted. Both tables go through display() explicitly: a cell only renders
# its last expression on its own, so the first count was silently discarded before.
stop_year = txgrouped_with_race_str['date'].apply(lambda x: str(x)[:4])
display(stop_year.value_counts())

has_search_outcome = txgrouped_with_race_str['search_conducted'].notnull()
display(stop_year[has_search_outcome].value_counts())

In [None]:
# Five regressions of search_conducted that differ only in their controls; the
# arguments common to all of them are collected once.
shared_regress_kwargs = dict(
    dep_var='search_conducted',
    useFixedEffects=True,
    stop_date_col='date',
    driver_race_col=dr_race_col,
    stop_time_col='time',
)

res1 = regress(
    txgrouped_with_race_str,
    cols=[],
    controls=[],
    model_name='No controls',
    **shared_regress_kwargs,
)
res2 = regress(
    txgrouped_with_race_str,
    cols=['hour_of_day'],
    controls=['hour_of_day'],
    model_name='Control for hour of day (linear)',
    **shared_regress_kwargs,
)
res3 = regress(
    txgrouped_with_race_str,
    cols=['hour_of_day'],
    controls=['hour_of_day', 'I(hour_of_day**2)', 'I(hour_of_day**3)', 'I(hour_of_day**4)'],
    model_name='Control for hour of day (quartic)',
    **shared_regress_kwargs,
)
res4 = regress(
    txgrouped_with_race_str,
    cols=['county_fips'],
    controls=['C(county_fips)'],
    model_name='Control for county - drop absorbed',
    drop_absorbed=True,
    **shared_regress_kwargs,
)
res5 = regress(
    txgrouped_with_race_str,
    cols=['officer_id'],
    controls=['C(officer_id)'],
    model_name='Control for officer id - drop absorbed',
    drop_absorbed=True,
    **shared_regress_kwargs,
)

# plot the 'Hispanic' coefficient of every model side by side
all_models = [res1, res2, res3, res4, res5]
make_sensitivity_dot_plot(all_models, coef_to_plot = 'Hispanic', title='Sensitivity to controls in Texas')

In [None]:
# regressions that don't yield much
# res1 = regress(txgrouped_with_race_str, dep_var='contraband_found', cols=[], controls=[], model_name='No controls (contraband found rate)', useFixedEffects=True, stop_date_col='date', driver_race_col='driver_race_raw')
# res2 = regress(non_null_outcome, cols=[], controls=[], dep_var='citation_given', model_name='no controls (citation rate)', useFixedEffects=True, stop_date_col='date', driver_race_col='driver_race_raw')

# Years of Data in the Analysis

In [None]:
# Re-read the grouped CSV (the same file loaded earlier as txgrouped_csv) under its own name
tx_complete = pd.read_csv(config['grouped_csv_name'])

In [None]:
# Rows per year in the grouped data, ordered by year
complete_stop_years = tx_complete['date'].apply(lambda stop_date: str(stop_date)[:4])
complete_stop_years.value_counts().sort_index()

In [None]:
# Re-read the white/Hispanic subset CSV under its own name
tx_hispanic_white = pd.read_csv(config['hispanic_white_drivers_only_csv_name'])

In [None]:
# Rows per year in the white/Hispanic subset, ordered by year
subset_stop_years = tx_hispanic_white['date'].apply(lambda stop_date: str(stop_date)[:4])
subset_stop_years.value_counts().sort_index()

# Comparing a Couple Subsets of the Population

In [None]:
# Compare search_conducted across three frames: all loaded stops (tx_data), the grouped
# data (txgrouped_csv), and txgrouped_with_race_str.
# NOTE: at this point txgrouped_with_race_str is the white/Hispanic subset that was
# reloaded from CSV in the regression section, not the full frame with race_str.
plot_search_rates_comparison('TX', 'search_conducted', tx_data, txgrouped_csv, txgrouped_with_race_str, driver_race_col=dr_race_col)

## All Drivers, white or Hispanic

In [None]:
# Top-5 plots for the violation and county_name columns of all loaded stops
for top5_col in ('violation', 'county_name'):
    plot_top_5_col_values(tx_data, top5_col, driver_race_col=dr_race_col)

## Multiply Stopped Drivers, white or Hispanic

In [None]:
# Top-5 plots for the violation and county_name columns of the grouped data
for top5_col in ('violation', 'county_name'):
    plot_top_5_col_values(txgrouped_csv, top5_col, driver_race_col=dr_race_col)

## Multiply Stopped Drivers with white/Hispanic Racial Ambiguity

In [None]:
# Top-5 plots for the violation and county_name columns of txgrouped_with_race_str
# (the white/Hispanic subset at this point in the notebook)
for top5_col in ('violation', 'county_name'):
    plot_top_5_col_values(txgrouped_with_race_str, top5_col, driver_race_col=dr_race_col)