In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from copy import copy

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
init_notebook_mode(connected=True)

from sklearn.linear_model import LinearRegression

In [None]:
# Directory (relative to the notebook) that holds the input files.
folder = 'data/'
# File that the loading cells below read with pd.read_csv.
state = 'fl_statewide_2020_04_01.zip'
# Second input file; it is not referenced by the cells shown in this notebook section.
state2 = 'ca_long_beach_2020_04_01.csv'

# Preprocessing whole dataset (do not execute if working on full data)

In [None]:
# Read the complete file; the next cell samples its reduced subset from this frame.
df_full = pd.read_csv(folder + state)
print(df_full.columns)

Reducing the amount of data for experimentation

In [None]:
# Write a 1% random subset next to the full file so experiments run quickly.
cut_file_path = folder + state.split('.')[0] + "_cut.csv"
total_size = df_full.shape[0]
# Fixed seed: re-running the cell must reproduce the same subset on disk.
df = df_full.sample(n=total_size // 100, random_state=42)
print("Full dataset of size {} was reduced to subset of {} elements.".format(total_size, df.shape[0]))
df.to_csv(cut_file_path, index = False)

# Load the data (currently loading all data)

In [None]:
# Load the data and print, for every non-float column, either its distinct
# values or a note that there are more than 20 of them.
df = pd.read_csv(folder + state)
print("Dataset is composed of {} stops. Columns are: \n".format(df.shape[0]))
for col in df.columns:
    # float64 columns are left out of the overview
    if df.dtypes[col] == np.float64:
        continue
    val = df[col].unique()
    if len(val) <= 20:
        # short column names get extra tabs so the values line up
        padded_name = col if len(col)>15 else col + "\t\t"
        print('{} \t\t: values are: {}'.format(padded_name, val))
    else:
        print('{} \t\t: too much different values'.format(col))
# shorter column name, used by the experience analysis further down
df = df.rename(columns={'officer_years_of_service': 'officer_yos'})

# Preprocess to compare search rates

Calculate search rates for each officer race and each subject race across each county

# General functions

In [None]:
# Subject races that the plotting helpers compare against 'white'.
minority_race = ['black', 'hispanic']

def remove_general_unused_columns(data):
    """Return `data` without the columns that the analyses in this notebook never read.

    The input frame is left untouched; `DataFrame.drop` builds a new frame.
    Every listed column must exist in `data`, otherwise pandas raises KeyError.
    """
    to_drop = [
        # identifiers and misc.
        'officer_id_hash', 'vehicle_registration_state', 'type',
        # reasons / free text
        'reason_for_stop', 'reason_for_search', 'notes', 'violation', 'search_basis',
        # raw source columns
        'raw_EnforcementAction', 'raw_SearchType', 'raw_Ethnicity', 'raw_row_number_new',
        'raw_Race', 'raw_row_number_old', 'raw_row_number',
        # geography
        'location', 'department_name', 'unit',
    ]
    return data.drop(columns=to_drop)

def print_search_rate(data):
    """Print the number of stops in `data` and the share of them that led to a search.

    `data` must have a 'search_conducted' column; rows whose value equals True
    are counted as searches.  An empty frame reports 0% instead of dividing by zero.
    """
    number_stops = data['search_conducted'].shape[0]
    number_search_conducted = int((data['search_conducted'] == True).sum())
    # guard the percentage: an empty selection must not raise ZeroDivisionError
    if number_stops:
        percentage = 100 * float(number_search_conducted)/number_stops
    else:
        percentage = 0.0
    print('Data contains {} stops and {} of them ({}%) resulted in searches.'.format(number_stops, number_search_conducted, percentage))

def preprocess_for_grouping(data):
    """Return a copy of `data` with the two helper columns used by the groupby cells.

    * 'search_rate': 'search_conducted' cast to float, so that its mean per
      group is the search rate;
    * 'count': constant 1, so that counting it gives the group size.

    The input frame is not modified (the helper columns used to leak into it).
    """
    return data.assign(
        search_rate=data['search_conducted'].astype(float),
        count=1,  # to count occurrences
    )

def separate_data(data, categories):
    """Split `data` by index label: returns {category: data.loc[category]} for each category."""
    return {category: data.loc[category] for category in categories}


In [None]:
from scipy.stats import linregress

def plot_race(data, min_race, categories, threshold, what_to_plot, ax_limits=None, filename='test.html'):
    """Compare a minority group's rate with the white rate, county by county.

    One panel is drawn per entry of `categories`, twice: on a static matplotlib
    figure and on an interactive plotly figure that is written to an HTML file
    and shown.  Each point is a county: x is the white value of `what_to_plot`,
    y the `min_race` value (both multiplied by 100); the marker size grows with
    the number of `min_race` rows of that county.

    Parameters
    ----------
    data : mapping
        `data[category]` is a frame whose first index level is the subject race
        and which provides 'county_name', 'count' and `what_to_plot`.
    min_race : str
        Subject race compared against 'white'.
    categories : sequence of str
        Officer categories, one panel each.
    threshold : number
        Counties with fewer than `threshold` `min_race` rows are left out.
    what_to_plot : str
        Column to plot ('search_rate' or 'stop_rate' in this notebook).
    ax_limits : dict, optional
        {'x': ..., 'y': ...} upper bounds of the plotly axes; both default to 1.
    filename : str
        Base name of the HTML file written for the plotly figure.
    """
    n_panels = len(categories)
    title = min_race.title() + " people search rates among officers"

    # static matplotlib figure; squeeze=False keeps indexing valid for any panel count
    mpl_fig, axes = plt.subplots(1, n_panels, squeeze=False)
    ax_arr = axes[0]
    figsize = (14,5)
    mpl_fig.set_size_inches(*figsize)
    mpl_fig.suptitle(title)

    # interactive plotly figure with the same layout
    fig = make_subplots(rows=1, cols=n_panels)
    for i, off_feat in enumerate(categories):
        # NOTE(review): every category name is prepended in turn, so the single file
        # written after the loop carries all of them in its name — confirm this is wanted.
        filename = off_feat + '_' + filename

        data_ = data[off_feat]
        df_white_sub = data_.loc['white'].reset_index()
        df_minority = data_.loc[min_race].reset_index()
        df_merged = pd.merge(df_white_sub, df_minority, on='county_name', suffixes=['_white', '_minority']) # merge both

        # remove where there are too little datapoints
        cond_minority = df_merged['count_minority'] >= threshold

        x = df_merged[what_to_plot + '_white'][cond_minority]*100
        y = df_merged[what_to_plot + '_minority'][cond_minority]*100
        s = df_merged['count_minority'][cond_minority]/100

        ax_arr[i].scatter(x, y, s=s, c="None", edgecolors='black', linewidth=0.4)
        ax_arr[i].set_xlabel("White " + what_to_plot + " (%)")
        ax_arr[i].set_ylabel(min_race.title() + " "+ what_to_plot +" (%)")

        # Weighted regression: each county is repeated once per 100 minority rows.
        # Repeat counts must be integers, hence the explicit truncation.
        repeats = s.astype(int)
        x_weighted, y_weighted = x.repeat(repeats), y.repeat(repeats)
        a, b, r, _p_value, _std_err = linregress(x_weighted, y_weighted)
        regression_label = '{:.1f}*x + {:.1f}, r={:.2f}'.format(a,b,r)
        sns.regplot(x=x_weighted, y=y_weighted, ax=ax_arr[i], label=regression_label, scatter=False, truncate=False).legend(loc="best")

        ax_arr[i].set_title(off_feat.title() +' officers')

        # The y = x reference line is only drawn for search rates.
        identity = None
        if what_to_plot == 'search_rate':
            max_ = df_merged[what_to_plot + '_minority'][cond_minority].max()
            identity = np.arange(0, max_ * 100, max_)
            ax_arr[i].plot(identity, identity, c='black', linestyle=(0,(5,5)), linewidth=1)

        if ax_limits is None:
            max_x = 1
            max_y = 1
            reg_line = np.array(((0,b), (1, a + b)))
        else:
            max_x = ax_limits['x']
            max_y = ax_limits['y']
            reg_line = np.array(((0, b), (max_x, a*max_x + b)))

        color = px.colors.qualitative.Plotly[i]
        fig.add_trace(go.Scatter(
            x=x,
            y=y,
            mode='markers',
            marker=dict(size=np.sqrt(s), opacity=0.5, color=color),
            text='size:' + s.astype(str),
            name=off_feat.title() +' officers',
            ),
            row=1, col=i+1
            )

        # Same guard as above: without it any other `what_to_plot` hit an undefined name here.
        if identity is not None and len(identity):
            fig.add_shape(type="line",
                x0=identity[0], y0=identity[0], x1=identity[-1], y1=identity[-1],
                line=dict(
                    color="Grey",
                    width=1,
                    dash="dot",
                ), row=1, col=i+1,
            )

        fig.add_trace(go.Scatter(
            x=reg_line[:,0], y=reg_line[:,1],
            mode='lines',
            name=regression_label,
            line=dict(
                color=color,
                width=2,
                dash="solid")
            ),
            row=1, col=i+1
        )

        fig.update_xaxes(title_text="White " + what_to_plot + " (%)", range=[0, max_x], row=1, col=i+1)
        fig.update_yaxes(title_text=min_race.title() + " "+ what_to_plot +" (%)", range=[0, max_y], row=1, col=i+1)

    fig.update_layout(autosize=True, width=figsize[0]*100*0.7, height=figsize[1]*100*0.8, title_text=title)
    fig.write_html(filename)
    fig.show()


def plot_search_rates(data, categories, threshold=500, what_to_plot='search_rate', ax_limits=None, filename='test.html'):
    """Call `plot_race` once for every race in `minority_race`.

    `ax_limits`, when given, is a sequence with one {'x': ..., 'y': ...} dict
    per minority race.  Each race's output file is named '<race>_<filename>';
    the prefix is built from the original `filename` every time, so prefixes of
    earlier races no longer pile up in later file names.
    """
    for i, race in enumerate(minority_race):
        race_filename = race + '_' + filename
        race_limits = ax_limits[i] if ax_limits else None
        plot_race(data, race, categories, threshold, what_to_plot, race_limits, filename=race_filename)


def plot_one_search_rate(data, categories, threshold=500, what_to_plot='search_rate', ax_limits=None):
    """Call `plot_race` for the first race of `minority_race` only (nothing happens if the list is empty)."""
    if not minority_race:
        return
    first_limits = ax_limits[0] if ax_limits else None
    plot_race(data, minority_race[0], categories, threshold, what_to_plot, first_limits)


# # separate data
# df_race_sep = separate_data(df_race_mixed, officer_race)
# df_race_sep[officer_race[0]].head(2)

# # plot it 
# plot_search_rates(df_race_sep, officer_race, 100, what_to_plot='stop_rate')

# Race of officer

In [None]:
# Officer races compared in this section; they are passed as the plot categories below.
officer_race = ['white', 'black', 'hispanic']

In [None]:
# Columns that must be present for the officer-race analysis.
important_features = ['subject_race', 'officer_race', 'search_conducted']

# Work on a copy of the raw data: drop the unused columns, then the rows
# that miss one of the required values.
df_race = remove_general_unused_columns(copy(df)).dropna(subset=important_features)
print('Cleaned subset is composed of {} datapoints'.format(df_race.shape[0]))

# overview of all the data / global search rate
print_search_rate(df_race)

# helper columns for the aggregation
df_race = preprocess_for_grouping(df_race)

# search rate and number of stops per (officer race, subject race, county)
df_race_all = (
    df_race
    .groupby(['officer_race','subject_race','county_name'])
    .agg({'search_rate': 'mean', 'count':'count'})
)

In [None]:
# separate data
# One frame per officer race, keyed by race.
df_race_sep = separate_data(df_race_all, officer_race)
# Not the last expression of the cell, so this preview is not displayed.
df_race_sep[officer_race[0]].head(2)

# plot it
# One axis-limit dict per entry of `minority_race`, in the same order.
ax_limits = [
    {'x':2, 'y':4},
    {'x':2, 'y':4},
]
# 3000 is the minimum number of minority stops a county needs to be plotted.
plot_search_rates(df_race_sep, officer_race, 3000, filename='search_rate.html', ax_limits=ax_limits )

# Experience of officer

In [None]:
# Labels of the three experience groups, in increasing order of years of service.
officer_experience_level = ['young', 'experienced', 'old']
# Upper bounds (exclusive) of the first two groups, compared against 'officer_yos'.
thresholds_experience = [2, 9]

Preprocess the data to have what is needed

In [None]:
# Columns that must be present for the experience analysis.
important_features = ['subject_race', 'officer_yos', 'search_conducted']

# Copy of the raw data without the unused columns and without incomplete rows.
df_yos = remove_general_unused_columns(copy(df)).dropna(subset=important_features)
print('Cleaned subset is composed of {} datapoints'.format(df_yos.shape[0]))

# overview of all the data / global search rate
print_search_rate(df_yos)

# helper columns for the aggregation
df_yos = preprocess_for_grouping(df_yos)

# Experience group of each stop's officer (stored under the name 'age_category').
df_yos['age_category'] = df_yos['officer_yos'].apply(
    lambda x: officer_experience_level[0] if x < thresholds_experience[0]
    else officer_experience_level[1] if x < thresholds_experience[1]
    else officer_experience_level[2]
)
df_yos_all = (
    df_yos
    .groupby(['age_category','subject_race','county_name'])
    .agg({'search_rate': 'mean', 'count':'count'})
)

In [None]:
# separate data
# One frame per experience group, keyed by group label.
df_yos_sep = separate_data(df_yos_all, officer_experience_level)
# Not the last expression of the cell, so this preview is not displayed.
df_yos_sep[officer_experience_level[0]].head(2)

# plot it
# 5000 is the minimum number of minority stops a county needs to be plotted;
# no filename is given, so the default 'test.html' base name is used.
plot_search_rates(df_yos_sep, officer_experience_level, 5000)

# Age of the officer

In [None]:
# Labels of the three age groups, in increasing order of age.
officer_age_level = ['young', 'experienced', 'old']
# Upper bounds (exclusive) of the first two groups, compared against 'officer_age'.
thresholds_age = [30, 50]

In [None]:
# Columns that must be present for the officer-age analysis.
important_features = ['subject_race', 'officer_age', 'search_conducted']

# Copy of the raw data without the unused columns and without incomplete rows.
df_age = remove_general_unused_columns(copy(df)).dropna(subset=important_features)
print('Cleaned subset is composed of {} datapoints'.format(df_age.shape[0]))

# overview of all the data / global search rate
print_search_rate(df_age)

# helper columns for the aggregation
df_age = preprocess_for_grouping(df_age)

# Age group of each stop's officer.
df_age['age_category'] = df_age['officer_age'].apply(
    lambda x: officer_age_level[0] if x < thresholds_age[0]
    else officer_age_level[1] if x < thresholds_age[1]
    else officer_age_level[2]
)
df_age_all = (
    df_age
    .groupby(['age_category','subject_race','county_name'])
    .agg({'search_rate': 'mean', 'count':'count'})
)

In [None]:
# separate data
# One frame per age group, keyed by group label.
df_age_sep = separate_data(df_age_all, officer_age_level)
# Not the last expression of the cell, so this preview is not displayed.
df_age_sep[officer_age_level[0]].head(2)

# plot it
# 1000 = minimum number of minority stops per county; one axis-limit dict per minority race.
plot_search_rates(df_age_sep, officer_age_level, 1000, ax_limits=[{'x': 1.5, 'y':3},{'x': 2, 'y':3}])

# Race of the officer (with stop_rate)

In [None]:
# Same list as in the "Race of officer" section, repeated so this section can be read on its own.
officer_race = ['white', 'black', 'hispanic']

In [None]:
# make sure that required data are present (drop if nan values in those columns)
# NOTE: this cell has no `df_race = ...` of its own; it reuses the frame built in the
# "Race of officer" section (including its 'count' column).
# make sure that required data are present (drop if nan values in those columns)
important_features = ['subject_race', 'officer_race', 'search_conducted']
df_race.dropna(inplace=True, subset=important_features)
print('Cleaned subset is composed of {} datapoints'.format(df_race.shape[0]))

# overview of all the data / global search rate
print_search_rate(df_race)


# number of stops per (officer race, county, subject race) and per (officer race, county)
# NOTE: `df_race_all` is rebound here and no longer holds the search rates computed earlier.
df_race_all_s = df_race.groupby(['officer_race','county_name', 'subject_race']).agg({'count':'count'})
df_race_all = df_race.groupby(['officer_race','county_name']).agg({'count':'count'})

df_race_mixed = copy(df_race_all_s)
# NOTE(review): assigns a series with a 2-level index to a frame with a 3-level index;
# confirm that pandas aligns it as intended and that 'tot' is not left as NaN.
df_race_mixed['tot'] = df_race_all['count']
# share of each subject race among the stops of one officer race in one county
df_race_mixed['stop_rate'] = df_race_mixed['count'] / df_race_mixed['tot']

# put the index levels in the order the plotting helpers expect
df_race_mixed = df_race_mixed.reorder_levels(['officer_race','subject_race','county_name'])
# df_race_mixed.head()

In [None]:
# separate data
# One frame per officer race, keyed by race.
df_race_sep = separate_data(df_race_mixed, officer_race)
# Not the last expression of the cell, so this preview is not displayed.
df_race_sep[officer_race[0]].head(2)

# plot it
# 100 = minimum number of minority stops per county; plots 'stop_rate' instead of the default 'search_rate'.
plot_search_rates(df_race_sep, officer_race, 100, what_to_plot='stop_rate')

# Logistic regression of characteristics of officers

In [None]:
# Same list as in the earlier officer-race sections, repeated for this section.
officer_race = ['white', 'black', 'hispanic']

In [None]:
# Start again from a copy of the raw data (unused columns are kept this time).
df_race = copy(df)

# make sure that required data are present (drop if nan values in those columns)
important_features = ['subject_race', 'officer_race', 'search_conducted']
df_race.dropna(inplace=True, subset=important_features)
print('Cleaned subset is composed of {} datapoints'.format(df_race.shape[0]))

# overview of all the data / global search rate
print_search_rate(df_race)
# adds the 'search_rate' and 'count' helper columns
df_race = preprocess_for_grouping(df_race)

# number of stops per (officer race, county, subject race) and per (officer race, county)
df_race_all_s = df_race.groupby(['officer_race','county_name', 'subject_race']).agg({'count':'count'})
df_race_all = df_race.groupby(['officer_race','county_name']).agg({'count':'count'})

df_race_mixed = copy(df_race_all_s)
# NOTE(review): assigns a series with a 2-level index to a frame with a 3-level index;
# confirm that pandas aligns it as intended and that 'tot' is not left as NaN.
df_race_mixed['tot'] = df_race_all['count']
# share of each subject race among the stops of one officer race in one county
df_race_mixed['stop_rate'] = df_race_mixed['count'] / df_race_mixed['tot']


# put the index levels in the order the plotting helpers expect
df_race_mixed = df_race_mixed.reorder_levels(['officer_race','subject_race','county_name'])

df_race_mixed.head()

In [None]:

df = separate_data( df_race_mixed, ['white'])['white']
df.reset_index(level=[0,1], inplace=True)
df.head()

In [None]:
print(df[df.subject_race == 'black']['count'].sum())
print(df['count'].sum())

In [None]:
from scipy.stats import linregress

def plot_race_stop(data, min_race, categories, threshold):
    """Draw, per officer category, a histogram of the county-level stop rate of `min_race`.

    `data[category]` is a frame whose first index level is the subject race and
    which provides 'county_name', 'count' and 'stop_rate'.  Counties with fewer
    than `threshold` `min_race` stops are left out; the remaining ones are
    weighted by their number of `min_race` stops.  The total number of
    `min_race` stops (before thresholding) is printed for every category.
    """
    what_to_plot = 'stop_rate'

    # three side-by-side panels, one per officer category
    fig, ax_arr = plt.subplots(1, 3)
    fig.set_size_inches(17,5)
    fig.suptitle(min_race.title() + " people "+ what_to_plot+" among officers")

    for i, off_feat in enumerate(categories):
        panel = ax_arr[i]
        per_subject = data[off_feat]

        # white and minority rows side by side, one row per county
        white_rows = per_subject.loc['white'].reset_index()
        minority_rows = per_subject.loc[min_race].reset_index()
        df_merged = pd.merge(white_rows, minority_rows, on='county_name', suffixes=['_white', '_minority'])

        # keep only counties with enough minority stops
        enough_stops = df_merged['count_minority'] >= threshold
        rates = df_merged[what_to_plot + '_minority'][enough_stops]*100
        weights = df_merged['count_minority'][enough_stops]

        total = float(df_merged['count_minority'].sum())
        print(total)

        # each county's rate is repeated once per minority stop
        sns.histplot(rates.repeat(weights), ax=panel, bins=8, kde=True)

        panel.set_xlabel(min_race.title() + what_to_plot + " (%)")
        panel.set_ylabel("Number of " + min_race +" stops")
        panel.set_title(off_feat.title() +' officers')


def plot_stop_rates(data, categories, threshold= 0):
    """Call `plot_race_stop` for every race in `minority_race`, then for 'white'."""
    races_to_plot = minority_race + ['white']
    for subject in races_to_plot:
        plot_race_stop(data, subject, categories, threshold)


# separate data
# One frame per officer race, keyed by race; consumed by the plotting cell below.
df_race_sep = separate_data(df_race_mixed, officer_race)
df_race_sep[officer_race[0]].head(2)


In [None]:

# plot it (threshold 0: no county is filtered out)
plot_stop_rates(df_race_sep, officer_race, 0)

# Veil of darkness

In [None]:
# Officer column that must be present for a stop to be kept.
CHOSEN_CHARACTERISTIC = 'officer_race'

# Work on a copy of the raw data.
df_veil = copy(df)

# remove unused columns
# NOTE: `columns_time` is assigned but not used in this cell.
columns_time = ['date','time']
df_veil = remove_general_unused_columns(df_veil)

# make sure that required data are present (drop if nan values in those columns)
important_features = ['subject_race', CHOSEN_CHARACTERISTIC , 'search_conducted']
df_veil.dropna(inplace=True, subset=important_features)
print('Cleaned subset is composed of {} datapoints'.format(df_veil.shape[0]))

# parse the date strings so they can be compared with datetime bounds
df_veil['date'] = pd.to_datetime(df_veil['date'])
df_veil.head()

In [None]:
import datetime

# Keep only the stops dated between 1 and 30 October of the listed years.
years = [2010, 2011, 2012, 2013, 2014, 2015]
starts = [datetime.datetime.strptime(f"01-10-{y}", "%d-%m-%Y") for y in years]
ends = [datetime.datetime.strptime(f"30-10-{y}", "%d-%m-%Y") for y in years]

# True when the date falls inside one of the yearly windows (bounds included)
df_veil['month'] = df_veil['date'].apply(
    lambda x: any(s <= x <= e for s, e in zip(starts, ends))
)
df_veil = df_veil[df_veil['month']]
df_veil.head()

In [None]:
df_veiled = copy(df_veil)
# Hour prefixes ('17:' ... '20:') of the evening window that is kept.
hours = [ f'{x}:' for x in range(17, 21)]

# Vectorised prefix test; a missing time simply yields False instead of raising.
df_veiled['time_bool'] = df_veiled['time'].str[0:3].isin(hours)
# Explicit copy: columns are added to this frame here and in the next cell,
# which must not happen on a slice of `df_veiled`.
df_veiled_both = df_veiled[df_veiled['time_bool']].copy()
print(df_veiled_both.shape)

# 10-minute period label: hour plus the tens digit of the minutes, e.g. '18:3'
df_veiled_both['time_period'] = df_veiled_both['time'].str[0:4]
df_veiled_both.head()

In [None]:
# df_veiled_both['count'] = 1

# df_veil_of_darkness = df_veiled_both.groupby(['officer_race','time_period','subject_race']).agg({'count':'count'})
# df_veil_of_darkness_all = df_veiled_both.groupby(['officer_race','time_period']).agg({'count':'count'})

# df_veil_of_darkness['tot'] = df_veil_of_darkness_all['count']
# df_veil_of_darkness['black_stop_rate'] = df_veil_of_darkness['count'] / df_veil_of_darkness['tot']


# # df_race_mixed = df_race_mixed.reorder_levels(['officer_race','subject_race','county_name'])

# df_veil_of_darkness = df_veil_of_darkness.reset_index()

# df_veil_of_darkness.head()


# # df_veil_white = separate_data(df_veil_of_darkness, officer_race)
# # df_veil_white['white'].head(10)

# helper column so that counting it gives the number of stops per group
df_veiled_both['count'] = 1

# number of stops per (period, subject race) and per period, as flat frames
df_veil_of_darkness = (
    df_veiled_both
    .groupby(['time_period','subject_race'])
    .agg({'count':'count'})
    .reset_index()
)
df_veil_of_darkness_all = (
    df_veiled_both
    .groupby('time_period')
    .agg({'count':'count'})
    .reset_index()
)

# after the merge, count_x is the per-race count and count_y the per-period total
df_veil_of_darkness = df_veil_of_darkness.merge(df_veil_of_darkness_all, on=['time_period'])
df_veil_of_darkness['black_stop_rate'] = df_veil_of_darkness['count_x'] / df_veil_of_darkness['count_y']

# only the rows of black drivers are kept, so the rate is the black share of stops
df_veil_of_darkness = df_veil_of_darkness[df_veil_of_darkness['subject_race'] == 'black']

df_veil_of_darkness.head()

# df_veil_white = separate_data(df_veil_of_darkness, officer_race)
# df_veil_white['white'].head(10)


In [None]:
# Flat copy of the per-period rates (the previous index is kept as a column).
df_veil_study = df_veil_of_darkness.reset_index()
# df_veil_study = df_veil_white['white'].reset_index()
# String comparison of the period labels against '20:1'.
is_before = df_veil_study['time_period'] < '20:1'
df_veil_study['before_sunset'] = is_before
df_veil_study.head()

In [None]:
# Ordered labels of the 10-minute periods from 17:00 to 20:59 ('17:0' ... '20:5');
# the position in this list is used as the numeric x value for the regressions.
periods = [f"{i}:{int(j/10)}" for i in range(17, 21) for j in range(0, 60, 10)]
print(periods)

In [None]:
from matplotlib.pyplot import figure
figure(figsize=(10,5))

# Explicit copy: two columns are added below, which must not be done on a
# slice of `df_veil_study`.
data_black = df_veil_study[df_veil_study['subject_race'] == 'black'].copy()

# share of black drivers per 10-minute period, with a dashed marker at '19:1'
sns.lineplot(x='time_period', y='black_stop_rate', data=data_black)
plt.axvline('19:1', c='black', linestyle='dashed')

data_black['before_sunset'] = data_black['time_period'].apply(lambda x : x < '19:1')

# numeric position of each period label, needed by the regressions
data_black['time_period_v'] = data_black['time_period'].apply(lambda x : periods.index(x))

data_black_before = data_black[data_black['before_sunset']]
data_black_after = data_black[~data_black['before_sunset']]

# one regression per side of the marker
sns.regplot(x='time_period_v', y='black_stop_rate', data=data_black_before)
sns.regplot(x='time_period_v', y='black_stop_rate', data=data_black_after)
# sns.regplot(x=data_black_after['time_period'], y=data_black_after['black_stop_rate'])

In [None]:
import datetime
from matplotlib.pyplot import figure

def plot_veil_of_darkness(data, period=('01-12', '20-12'), sunset_time='17:3', hour_range=(15, 20),
                          years=(2010, 2011, 2012, 2013, 2014, 2015)):
    """Return, per 10-minute period, the share of stopped drivers who are black.

    Parameters
    ----------
    data : DataFrame
        Raw stops with at least 'date', 'time', 'subject_race',
        'search_conducted' and the CHOSEN_CHARACTERISTIC column.
    period : pair of str
        First and last day ('dd-mm', both included) of the yearly window.
    sunset_time : str
        Currently unused by the computation; kept for the call signature.
    hour_range : pair of int
        Hours kept: hour_range[0] <= hour < hour_range[1].
    years : sequence of int
        Years in which the window is applied.

    Returns
    -------
    DataFrame
        One row per time period for black drivers, with 'count_x' (black
        stops), 'count_y' (all stops) and 'black_stop_rate'.
    """
    # remove unused columns from a private copy
    df_veil = remove_general_unused_columns(data.copy())

    # make sure that required data are present (the date and time are needed too)
    important_features = ['subject_race', CHOSEN_CHARACTERISTIC, 'search_conducted', 'date', 'time']
    df_veil = df_veil.dropna(subset=important_features)
    print('Cleaned subset is composed of {} datapoints'.format(df_veil.shape[0]))

    df_veil['date'] = pd.to_datetime(df_veil['date'])

    # select the stops that fall inside the window of any of the years
    in_period = pd.Series(False, index=df_veil.index)
    for y in years:
        start = pd.to_datetime(f"{period[0]}-{y}", format="%d-%m-%Y")
        end = pd.to_datetime(f"{period[1]}-{y}", format="%d-%m-%Y")
        in_period |= df_veil['date'].between(start, end)
    df_veil = df_veil[in_period]

    # select hour range (on the frame filtered above, not on a frame from another cell)
    hours = [f'{x}:' for x in range(hour_range[0], hour_range[1])]
    in_hours = df_veil['time'].str[0:3].isin(hours)
    df_veiled_both = df_veil[in_hours].copy()
    # 10-minute period label: hour plus the tens digit of the minutes
    df_veiled_both['time_period'] = df_veiled_both['time'].str[0:4]

    # group: stops per (period, race) and per period
    df_veiled_both['count'] = 1
    per_period_and_race = (
        df_veiled_both.groupby(['time_period','subject_race']).agg({'count':'count'}).reset_index()
    )
    per_period = df_veiled_both.groupby('time_period').agg({'count':'count'}).reset_index()
    df_veil_of_darkness = per_period_and_race.merge(per_period, on=['time_period'])
    df_veil_of_darkness['black_stop_rate'] = df_veil_of_darkness['count_x'] / df_veil_of_darkness['count_y']

    # select only the rows of black drivers
    data_black = df_veil_of_darkness[df_veil_of_darkness['subject_race'] == 'black'].reset_index()
    return data_black


# Parameters of the summer study used by the (commented-out) call below.
period= ['05-06', '28-07']
sunset_time='20:1'
hour_range=[18,22]



# df_black = plot_veil_of_darkness(df, period = period, sunset_time = sunset_time, hour_range = hour_range)

In [None]:
# !pip uninstall astral
# !pip install astral==1.10.1
# !pip install astral
# !pip install tqdm

In [None]:
import datetime
from matplotlib.pyplot import figure
from tqdm.notebook import tqdm, tqdm_notebook
from datetime import datetime
# Registers tqdm with pandas; the `progress_apply` calls in the following cells rely on it.
tqdm.pandas()

In [None]:

# s = sun(cityobserver, date=datetime.date())

# def plot_veil_of_darkness(data, period= ['01-12', '20-12'], sunset_time='17:3', hour_range=[15,20]):
CHOSEN_CHARACTERISTIC = 'officer_race'


df_veil = copy(df)
# columns that carry the moment of the stop
columns_time = ['date','time']
# outcome / subject columns that the veil-of-darkness study does not read
columns_to_drop = ['subject_sex', 'subject_age', 'arrest_made', 'citation_issued', 'warning_issued', 'outcome', 'frisk_performed', 'search_conducted']
df_veil = remove_general_unused_columns(df_veil).drop(columns=columns_to_drop)

# make sure that required data are present (drop if nan values in those columns);
# the date and time are required too, otherwise the timestamp cannot be built
important_features = ['subject_race', CHOSEN_CHARACTERISTIC ]
df_veil = df_veil.dropna(subset=important_features + columns_time)
print('Cleaned subset is composed of {} datapoints'.format(df_veil.shape[0]))

# full timestamp of the stop, parsed in one vectorised call
df_veil['date'] = pd.to_datetime(df_veil['date'] + ' ' + df_veil['time'], format='%Y-%m-%d %H:%M:%S')
df_veil = df_veil.drop(columns=['time'])
df_veil.head()

In [None]:
import astral.sun as astralsun
from astral import LocationInfo
from astral.sun import sun
from datetime import datetime

# LocationInfo("Orlando") alone only sets the name and keeps astral's default
# (Greenwich) coordinates and timezone, so the location is spelled out in full.
# A single city is used as the reference for the whole state.
city = LocationInfo("Orlando", "USA", "America/New_York", 28.5383, -81.3792)


def _local_dusk(day):
    """Dusk of `day` at `city`, as a naive datetime in the city's local time."""
    return sun(city.observer, date=day, tzinfo=city.tzinfo)['dusk'].replace(tzinfo=None)


# Dusk only depends on the calendar day: compute it once per distinct day
# instead of once per stop, then map it back onto the stops.
stop_days = df_veil['date'].dt.date
unique_days = pd.Series(stop_days.unique())
dusk_by_day = dict(zip(unique_days, unique_days.progress_apply(_local_dusk)))

df_veil['dusk_time'] = pd.to_datetime(stop_days.map(dusk_by_day))
# positive when the stop happened after dusk
df_veil['time_relative'] = df_veil['date'] - df_veil['dusk_time']
df_veil.head()

In [None]:
from datetime import timedelta

WIDTH = 20 # number of period of 15 minutes
EXTEND= 2 # extend to take more data before the dusk time than after

# Ascending upper bounds of the 15-minute bins, relative to dusk.
categories = [timedelta(minutes=15*i) for i in range(-WIDTH-EXTEND, WIDTH+1)]
def get_category(timed):
    """Return the 15-minute bin index of `timed` (a timedelta relative to dusk).

    Bin k covers [15*k, 15*(k+1)) minutes.  -1 is returned when `timed` lies
    before the first bound or at/after the last one; note that the bin just
    before dusk also has index -1.
    """
    first_above = next(
        (position for position, bound in enumerate(categories) if timed < bound),
        None,
    )
    if first_above is None or first_above == 0:
        return -1
    return first_above - WIDTH - EXTEND - 1

# Add a column with the time category of each stop, in 15-minute bins relative to dusk:
# category k covers [15*k, 15*(k+1)) minutes, so 0 is the first 15 minutes AFTER dusk and
# -2 is between 30 and 15 minutes BEFORE dusk. get_category returns -1 for out-of-range
# stops; those rows are removed below. The bin from 15 to 0 minutes before dusk also
# maps to -1, so it is removed together with them.
df_veil['time_cat'] = df_veil['time_relative'].progress_apply(lambda x : get_category(x))
df_veil_cleaned = df_veil[df_veil['time_cat'] != -1]
df_veil_cleaned.head()

In [None]:
# df_for_vod = copy(df_veil_cleaned)

def get_time_category(time):
    """Round `time` to the nearest quarter of an hour and return it as 'H:MM'.

    `time` is any object with integer `hour` and `minute` attributes.  Minutes
    0-7 round down to ':00', 8-22 to ':15', 23-37 to ':30', 38-52 to ':45' and
    53-59 up to the next hour.  The hour wraps around midnight, so 23:55 gives
    '0:00' rather than the invalid '24:00'.
    """
    quarter = (time.minute + 7) // 15
    hour = time.hour
    if quarter == 4:
        # rounded up to the next full hour
        hour, quarter = (hour + 1) % 24, 0
    return f"{hour}:{quarter * 15:02d}"

# # Add a column with their dusk time moment
# df_for_vod['dusk_exact_time'] = df_for_vod['dusk_time'].progress_apply(lambda x : x.time())
# df_for_vod['dusk_category'] = df_for_vod['dusk_exact_time'].progress_apply(lambda x : get_time_category(x))
# df_for_vod.head()

In [None]:
# Save the stops with their dusk information so the computation above need not be repeated.
# The index is written as well (no index=False).
df_veil_cleaned.to_csv('data_with_dusk.csv')
df_veil_cleaned.head()

In [None]:
from datetime import datetime

# Reload the file written above; the 'date' column comes back as text.
df_for_vod = pd.read_csv('data_with_dusk.csv')
# Parse the text back into datetimes; anything after a '.' is dropped before parsing.
df_for_vod['date_obj'] = df_for_vod['date'].progress_apply(lambda x : datetime.strptime(x.split('.')[0], '%Y-%m-%d %H:%M:%S'))

In [None]:
# Quarter-hour label of the stop time; plot_1sub_Aoff compares it with its `dusk_time` argument.
df_for_vod['date_category'] = df_for_vod['date_obj'].progress_apply(lambda x : get_time_category(x))

print(df_for_vod.shape)
df_for_vod.head()

In [None]:
# Copy of the prepared frame; not referenced again in the cells shown here.
df_new_version = copy(df_for_vod)

In [None]:
def group_data(df_to_group):
    """Count stops per (time_cat, subject_race) and compute each race's share per time_cat.

    Returns a flat frame with the columns 'time_cat', 'subject_race',
    'count_x' (stops of that race in the bin), 'count_y' (all stops in the
    bin) and 'race_stop_rate' (= count_x / count_y).

    The input frame is not modified: the helper 'count' column is added to a
    local copy, which matters because callers pass filtered slices.
    """
    counted = df_to_group.assign(count=1)

    # group the categories
    per_bin_and_race = counted.groupby(['time_cat','subject_race']).agg({'count':'count'}).reset_index()
    per_bin = counted.groupby('time_cat').agg({'count':'count'}).reset_index()

    # merge both: count_x comes from the per-race counts, count_y from the per-bin totals
    df_vod = per_bin_and_race.merge(per_bin, on=['time_cat'])

    # compute the stop_rate
    df_vod['race_stop_rate'] = df_vod['count_x'] / df_vod['count_y']
    return df_vod

In [None]:
def plot_vod_1sub_1off(data_o, time_range=[-8,7], plot=True, dusk_time=None, off_cat='all', filename='test.html', factor_size=None):
    """Compute (and optionally plot) the weighted mean stop rate before and after dusk.

    `data_o` needs the columns 'time_cat', 'race_stop_rate' and 'count_y' (as
    produced by `group_data`, filtered to one subject race).  Only the bins in
    range(time_range[0], time_range[1]) are used and bin -1 is always removed.
    On a copy, 'time_cat' is multiplied by 10 and 'race_stop_rate' by 100.

    When `plot` is true, a matplotlib scatter plot and a plotly figure are
    drawn; the plotly figure is written to `filename` and shown.  `dusk_time`
    and `off_cat` only appear in the title; `factor_size` optionally scales
    the plotly marker sizes.

    Returns the pair (before_average, after_average): the means of
    'race_stop_rate' weighted by 'count_y' on each side of dusk, or NaN when
    the weights of a side sum to 0.
    """
    data = copy(data_o)
    data = data[data['time_cat'].isin(range(time_range[0],time_range[1]))]
    # range(-1, 0) only contains -1: drop that bin
    data = data[~data['time_cat'].isin(range(-1,0))]

    # NOTE(review): the bins are built from 15-minute steps elsewhere in the notebook, but are
    # scaled by 10 here while the axis is labelled in minutes — confirm the intended factor.
    data['time_cat'] = data['time_cat'] * 10
    data['race_stop_rate'] = data['race_stop_rate'] * 100

    # two different regressions
    data_before = data[data['time_cat'] < -1]
    data_after = data[data['time_cat'] > -1]

    # calculate means
    if data_before['count_y'].sum() == 0:
        before_average = np.nan
    else:
        before_average = np.average(data_before['race_stop_rate'], weights=data_before['count_y'])
    if data_after['count_y'].sum() == 0:
        after_average = np.nan
    else:
        after_average = np.average(data_after['race_stop_rate'], weights=data_after['count_y'])

    # plot
    if plot:

        # plot points
        figure(figsize=(10,5))
        ax = sns.scatterplot(x='time_cat', y='race_stop_rate', size='count_y', sizes=(20,200) ,data=data, color='black')
        # dashed vertical line at x = 0
        plt.axvline(0, c='black', linestyle='dashed')

        # plot averages
        plt.hlines(before_average, -80, -20)
        plt.hlines(after_average, 0, 60)

        # set axes
        ax.set_xlabel('Time since dusk (min)')
        ax.set_ylabel('Stopped drivers who are black (%)')

        if dusk_time :
            title = f"Stops at {dusk_time} by {off_cat} officers"
        else:
            title = f"All stops by {off_cat} officers"
        ax.set_title(title)


        # plotly version
        fig = make_subplots(rows=1, cols=1)
        min_x, max_x = -90, 70
        # y range: data range plus a 10% margin on each side
        min_y, max_y = data['race_stop_rate'].min(), data['race_stop_rate'].max()
        offset = (max_y - min_y)*0.1
        min_y, max_y = min_y-offset, max_y+offset
        # `figsize`, `m` and `M` below are assigned but not used by the active code
        figsize = (10,5)
        
        
        color = px.colors.qualitative.Plotly[0]
        m = data['count_y'].min()
        M = data['count_y'].max()
        # marker sizes proportional to the number of stops; the largest bin gets 500 / 20 = 25
        sizes = data['count_y'] / data['count_y'].max() * 500
        sizes /= 20
        if factor_size is not None:
            sizes *=  factor_size

        fig.add_trace(go.Scatter(
            x=data['time_cat'],
            y=data['race_stop_rate'], 
            mode='markers',
            marker=dict(size=sizes, opacity=0.9, color=color),
            text='count: ' + data['count_y'].astype(str),
            name='Number of stops',
            ), 
            row=1, col=1
            )

        # dotted vertical line at x = 0
        fig.add_shape(type="line",
            x0=0, y0= min_y, x1=0, y1=max_y,
            line=dict(
                color="Grey",
                width=1,
                dash="dot",
            ), row=1, col=1,
        )

        # horizontal segment at the weighted mean, left of x = 0
        lin = 15
        xs = np.linspace(min_x, 0, lin)
        ys = np.linspace(before_average, before_average, lin)
        fig.add_trace(go.Scatter(
            x=xs, y=ys,
            mode='lines',
            name='Weighted average',
            text='Weighted mean: ' + str(before_average),
            opacity=0.6,
            line=dict(
                color='Black',
                width=3,
                dash="solid")
            ),
            row=1, col=1
        )

        # horizontal segment at the weighted mean, right of x = 0
        xs = np.linspace(0, max_x, lin)
        ys = np.linspace(after_average, after_average, lin)
        fig.add_trace(go.Scatter(
            x=xs, y=ys,
            mode='lines',
            name='Weighted average',
            text='Weighted mean: ' + str(after_average),
            opacity=0.6,
            line=dict(
                color='Black',
                width=3,
                dash="solid")
            ),
            row=1, col=1
        )

        fig.update_xaxes(title_text='Time since dusk (min)', range=[min_x, max_x], row=1, col=1)
        fig.update_yaxes(title_text='Stopped drivers who are black (%)', range=[min_y, max_y], row=1, col=1)
        print(min_y, max_y)

        # fig.update_layout(autosize=True,width= widfigsize[0]*100*0.7, height=figsize[1]*100*0.8, title_text=title)
        fig.update_layout(autosize=True, title_text=title)
        fig.write_html(filename)
        fig.show()
    return before_average, after_average

# NOTE(review): `plot_1sub_Aoff` is only defined in the next cell, and neither `df_to_exploit`
# nor `struct` is assigned in the cells shown here (the `df_to_exploit` assignment further
# down is commented out) — confirm that this call can run on a fresh kernel.
plot_1sub_Aoff(df_to_exploit, plot=True,  off_cat_name=struct['name'], off_cat_values=struct['values'], sub_race= 'black', dusk_time='19:15', county_name=None)

In [None]:
def plot_1sub_Aoff(df__, off_cat_name, off_cat_values, sub_race, \
                            county_name=None, dusk_time=None, plot=True, filename='test.html'):
    """Run `plot_vod_1sub_1off` for one subject race, once per officer category.

    `off_cat_values` is either a list of values of the `off_cat_name` column, or
    a dict {upper bound: label}; in the latter case a '<off_cat_name>_cat'
    column is derived with `get_off_cat` and the labels become the categories.
    When `off_cat_name` is 'all', the officers are not split.  The stops can be
    restricted to one county and/or one 'date_category' value (`dusk_time`).

    Returns a list of dicts with the keys 'cat_off', 'dusk', 'before' and
    'after', or None when no stop is left after filtering (or, in the 'all'
    case, when there is no row for `sub_race`).
    """
    stops = copy(df__)

    # thresholds given as a dict: derive the labelled category column
    if type(off_cat_values) is dict:
        thresholds = off_cat_values
        labels = [thresholds[bound] for bound in thresholds]
        derived_column = off_cat_name+'_cat'
        stops[derived_column] = stops[off_cat_name].apply(lambda x : get_off_cat(x, thresholds))
        off_cat_name, off_cat_values = derived_column, labels

    # select only the data from one county
    if county_name:
        stops = stops[stops['county_name'] == county_name]

    # select data from only one dusk time range
    if dusk_time:
        stops = stops[stops['date_category'] == dusk_time]

    if len(stops) == 0:
        return None

    list_hours = []

    # all officers together
    if off_cat_name == 'all':
        rates = group_data(stops)
        rates = rates[rates['subject_race'] == sub_race]
        if len(rates) == 0:
            return None
        before, after = plot_vod_1sub_1off(rates, plot=plot, dusk_time= dusk_time, filename=filename)
        list_hours.append({'cat_off': 'all' ,'dusk': dusk_time, 'before':before, 'after': after})
        return list_hours

    # one graph per officer category
    for cat in off_cat_values:
        rates = group_data(stops[stops[off_cat_name] == cat])
        rates = rates[rates['subject_race'] == sub_race]

        # NOTE(review): an empty category stops the loop, so the categories after it are
        # not processed either — confirm that skipping only this one was not intended.
        if len(rates) == 0:
            break

        before, after = plot_vod_1sub_1off(rates, plot=plot, dusk_time = dusk_time, off_cat=cat, filename= cat + '_' + filename)
        list_hours.append({'cat_off': cat ,'dusk': dusk_time, 'before':before, 'after': after})

    return list_hours

In [None]:
# df_to_exploit = copy(df_for_vod)
dusk_times = [f"{h}:{m}" for h in range(17,23) for m in ['00','15','30','45']]

def get_off_cat(v, dic):
    """Return the label of the first bucket whose upper threshold is strictly above ``v``.

    Args:
        v: numeric value to classify.
        dic: dict mapping an exclusive upper threshold to a bucket label.

    Returns:
        The label of the smallest threshold ``t`` with ``v < t``, or -1 when no threshold is
        above ``v`` (this includes NaN, for which every comparison is False).
    """
    # Visit thresholds in ascending order so the result does not depend on dict insertion order.
    for threshold in sorted(dic):
        if v < threshold:
            return dic[threshold]
    return -1

# print(df_to_exploit.shape)
# df_to_exploit.head()

In [None]:
def precalculate_each_cat(df__, cat_name, dic_cat, subject_race):
    """Collect the before/after values of every dusk time in ``dusk_times`` for each officer category.

    Args:
        df__: stops DataFrame (copied, not modified).
        cat_name: officer column to split on.
        dic_cat: list of category values, or a dict ``{upper threshold: label}`` that is first
            converted into a '<cat_name>_cat' column with get_off_cat.
        subject_race: subject race passed on to plot_1sub_Aoff.

    Returns:
        A tuple ``(DataFrame of the rows returned by plot_1sub_Aoff, list of category labels)``.
    """
    df_to_exploit = copy(df__)

    # Defaults for the "plain list of values" case; overridden when thresholds are given.
    important_column = cat_name
    officer_cat_values = dic_cat
    if type(dic_cat) is dict:
        officer_cat_values = list(dic_cat.values())
        important_column = cat_name + '_cat'
        df_to_exploit[important_column] = df_to_exploit[cat_name].apply(
            lambda value: get_off_cat(value, dic_cat)
        )

    stop_rates_each_period = []
    for dusk in tqdm(dusk_times):
        period_rows = plot_1sub_Aoff(
            df_to_exploit,
            plot=False,
            off_cat_name=important_column,
            off_cat_values=officer_cat_values,
            sub_race=subject_race,
            dusk_time=dusk,
        )
        # plot_1sub_Aoff returns None when there is no data for this dusk time.
        if period_rows is None:
            continue
        stop_rates_each_period.extend(period_rows)

    return pd.DataFrame(stop_rates_each_period), officer_cat_values



In [None]:
# Subject race whose stops are analysed.
subject_race = 'black'
# Alternative (disabled): officer years of service, bucketed by exclusive upper threshold.
# cat_name = 'officer_yos'
# dic_cat = { 2: 'recruit', 6:'new_officer', 10:'experienced_officer', 100: 'oldcop'}
# Active choice: split officers by race, given as an explicit list of values.
cat_name = 'officer_race'
dic_cat = ['white', 'black', 'hispanic']
# Alternative (disabled): officer age, bucketed by exclusive upper threshold.
# cat_name = 'officer_age'
# dic_cat = { 25: 'teenager', 42:'young_adult', 48:'adult', 55:'old_adult', 100: 'pre-retirement'}#,  100: 'retired'}


# One row per (dusk time, officer category) with data; `cats` is the list of category labels.
values, cats = precalculate_each_cat(df_to_exploit, cat_name=cat_name, dic_cat=dic_cat, subject_race=subject_race)
values.head()
# print_drop_Aoff(values, dic_cat)

In [None]:
# Preview of the per-dusk-time table (same preview as at the end of the previous cell).
values.head()

In [None]:
def value_of_hour(x):
    """Convert an "H:MM" string into a decimal number of hours (e.g. "17:30" -> 17.5)."""
    pieces = x.split(':')
    whole_hours = int(pieces[0])
    fraction_of_hour = float(pieces[1]) / 60
    return whole_hours + fraction_of_hour


def print_drop_Aoff(df__, off_cat, both_lines=False, filename='test.html'):
    """Plot the stop rate before vs. after dusk for each dusk time and officer category.

    Every curve is drawn twice: on the current matplotlib figure and on a plotly figure that is
    written to ``filename`` and then shown.

    Args:
        df__: DataFrame with the columns 'dusk' ("H:MM" strings), 'cat_off', 'before' and
            'after'. It is not modified.
        off_cat: category labels to plot, matched against 'cat_off'. When empty or None, the
            'before'/'after' values are averaged per dusk time and one 'all officers' curve is
            drawn.
        both_lines: if True draw 'before' (dashed) and 'after' (solid) separately for each
            category; otherwise draw the difference ``before - after``.
        filename: path of the HTML file written by plotly.
    """
    # Work on a copy: the helper column and the NaN filtering must not leak into the caller's frame.
    data = df__.copy()
    data['dusk_i'] = data['dusk'].apply(value_of_hour)
    data = data.dropna()

    print(data.head(1))

    fig = make_subplots(rows=1, cols=1)

    # The y-range starts at [0, 0], so the zero reference line is always visible.
    y_range = [0.0, 0.0]

    def extend_y_range(serie):
        """Grow the shared y-range so it contains ``serie`` plus a 10% margin."""
        low, high = serie.min(), serie.max()
        margin = (high - low) * 0.1
        # min()/max() keep the current bound when the candidate is NaN (empty category).
        y_range[0] = min(y_range[0], low - margin)
        y_range[1] = max(y_range[1], high + margin)

    def draw_line(x, y, label, dashed=False):
        """Draw one curve on both the matplotlib axes and the plotly figure."""
        plt.plot(x, y, label=label, **({'linestyle': 'dashed'} if dashed else {}))
        fig.add_trace(
            go.Scatter(
                x=x, y=y,
                mode='lines',
                name=label,
                opacity=0.8,
                line=dict(width=2, dash="dash" if dashed else "solid"),
            ),
            row=1, col=1
        )

    if off_cat:
        for cat in off_cat:
            rows = data[data['cat_off'] == cat]
            if both_lines:
                extend_y_range(pd.concat([rows['before'], rows['after']]))
                draw_line(rows['dusk_i'], rows['before'], f"{cat} officers before dusk", dashed=True)
                draw_line(rows['dusk_i'], rows['after'], f"{cat} officers after dusk")
            else:
                drop = rows['before'] - rows['after']
                extend_y_range(drop)
                draw_line(rows['dusk_i'], drop, f"{cat} officers")
    else:
        # Average over every category for each dusk time; grouping on 'dusk_i' keeps the x values.
        pooled = data.groupby('dusk_i', as_index=False).agg({'before': 'mean', 'after': 'mean'})
        drop = pooled['before'] - pooled['after']
        extend_y_range(drop)
        draw_line(pooled['dusk_i'], drop, 'all officers')

    fig.add_shape(type="line",
        x0=22, y0=0, x1=17, y1=0,
        line=dict(
            color="Grey",
            width=2,
            dash="dot",
        ), row=1, col=1,
    )

    # x-range over all plotted rows (not only the last category of the loop).
    min_x, max_x = data['dusk_i'].min(), data['dusk_i'].max()
    min_y, max_y = y_range
    fig.update_xaxes(title_text='Time period (h)', range=[min_x-0.2, max_x+0.2], row=1, col=1)
    fig.update_yaxes(title_text='Stop rate drop of black people at each time period (%)', range=[min_y, max_y], row=1, col=1)
    print(min_y, max_y)

    fig.update_layout(autosize=True, title_text='Stop rate drop of black people at each time period')
    fig.write_html(filename)
    fig.show()

    plt.hlines(0, 17, 22, linestyle='dashed', color='black')
    plt.ylabel('Stop rate drop of black people at each time period (%)')
    plt.xlabel('Time period (h)')
    plt.title('Stop rate drop of black people at each time period')
    plt.legend()

## TO BE PRESENTED

First plot: on average over the year, for all officers, to check that the VOD approach works

In [None]:
# Copy `df_new_version` into `df_to_exploit`, the frame used by the following cells.
# NOTE(review): `df_new_version` is not defined in the visible part of the notebook — confirm.
df_to_exploit = copy(df_new_version)

# With off_cat_name='all', plot_1sub_Aoff pools every officer and ignores `off_cat_values`.
plot_1sub_Aoff(df_new_version, plot=True,  off_cat_name='all', \
                 off_cat_values=['white', 'black', 'hispanic'], sub_race= 'black', dusk_time='18:00', county_name=None, filename='basic_1800.html')

Now let's try to get rid of the hour-schedule bias

In [None]:
# Same plot (all officers pooled, black drivers) for several dusk times, one figure per dusk time.
# A loop replaces four copy-pasted calls; the returned lists are not needed here, so nothing is
# echoed as cell output.
for dusk in ['18:00', '19:15', '19:45', '20:30']:
    plot_1sub_Aoff(df_to_exploit, plot=True, off_cat_name='all',
                   off_cat_values=[], sub_race='black', dusk_time=dusk, county_name=None)

Not always consistent: let's show the difference per dusk time

In [None]:
# NOTE(review): in the visible code `s` only exists as a local variable of value_of_hour, so this
# cell presumably raises NameError on a fresh kernel — looks like leftover debugging; confirm and remove.
int(s[1])

In [None]:
# Same computation with every officer pooled into the single category 'all'
# (at most one row per dusk time). Overwrites `values` and `cats`.
values, cats = precalculate_each_cat(df_to_exploit, cat_name='all', dic_cat=['all'], subject_race=subject_race)

In [None]:
# Plot the difference `before - after` per dusk time and write the plotly figure to the HTML file.
print_drop_Aoff(values, cats, filename='stop_rate_drop_at_eat.html')

Let's try to find out if there are differences between officer races. First define the several categories

In [None]:
# Each struct names an officer column ('name') and its categories ('values'):
#   - a list holds the raw column values to match exactly;
#   - a dict maps an exclusive upper threshold to a bucket label (see get_off_cat).
officer_race_struct =  {
    'name': 'officer_race',
    'values': ['white', 'black', 'hispanic']
}
# Years of service, bucketed by upper threshold.
officer_yos_struct =  {
    'name': 'officer_yos',
    'values': {2: '0 - 2', 6:' 2 - 6', 10:' 6 - 10', 100: ' > 10'}
}
# Officer age, bucketed by upper threshold.
officer_age_struct =  {
    'name': 'officer_age',
    'values': { 25: '< 25', 35:'25 - 35', 45:'35 - 45', 55:'45 - 55', 100: '> 55'}
}
officer_sex_struct =  {
    'name': 'officer_sex',
    'values': ['male', 'female']
}
# Categorisation and subject race used by the cells below.
struct = officer_yos_struct 
subject_race='black'

In [None]:
# plot_1sub_Aoff(df_to_exploit, plot=True,  off_cat_name=struct['name'], off_cat_values=struct['values'], sub_race= 'black', dusk_time='19:15', county_name=None, filename='1915_white.html')

In [None]:
# Before/after values per dusk time for each category of the selected `struct`.
# Overwrites `values` and `cats`.
values, cats = precalculate_each_cat(df_to_exploit, cat_name=struct['name'], dic_cat=struct['values'], subject_race=subject_race)

In [None]:
# One curve per category showing `before - after` (third argument is both_lines=False);
# the plotly figure is written to the given path under 'html/'.
print_drop_Aoff(values, cats, False, filename='html/officer_yos_struct.html')

In [None]:
# Open a wide matplotlib figure for the curves drawn by print_drop_Aoff. The pyplot namespace is
# used because a bare `figure` name is not imported by the notebook's visible import cell.
plt.figure(figsize=(15,5))
# both_lines=True draws the 'before' (dashed) and 'after' (solid) curves of every category.
print_drop_Aoff(values, cats, both_lines = True)

In [None]:

# NOTE(review): `data`, `df_yos`, `officer_experience_level` and `thresholds_experience` are not
# defined in the visible part of the notebook; this cell looks like a leftover fragment — confirm
# before running it.
# preprocess for grouping
data['search_rate'] = data['search_conducted']
data = data.astype({'search_rate': float})
data['count'] = 1 # to count occurrences

# add specific column for this type of analysis: three experience levels, split at
# thresholds_experience[0] and thresholds_experience[1]
# (presumably df_yos is derived from `data`, since it needs 'search_rate' and 'count' — verify)
df_yos['age_category'] = df_yos['officer_yos'].apply(lambda x : officer_experience_level[0] if x < thresholds_experience[0] else (officer_experience_level[1] if x < thresholds_experience[1] else officer_experience_level[2]))
# mean search rate and number of rows per (experience level, subject race, county)
df_yos_all = df_yos.groupby(['age_category','subject_race','county_name']).agg({'search_rate': 'mean', 'count':'count'})

# Logistic regression of characteristics of officers

In [None]:
# Work on a copy so the loaded dataset `df` stays untouched.
df_log_raw = copy(df)

# remove unused columns
df_log_raw = remove_general_unused_columns(df_log_raw)
# also drop the stop-outcome columns listed here
df_log_raw.drop(columns=['arrest_made', 'citation_issued','warning_issued', 'outcome', 'frisk_performed'], inplace=True)
# rows without the regression target cannot be used
df_log_raw.dropna(inplace=True, subset=['search_conducted'])

# make sure that required data are present: missing values in these numeric columns are
# replaced by the column mean (no row is dropped at this step)
important_features = ['subject_age', 'officer_yos', 'officer_age']
for feat in important_features:
    m = df_log_raw[feat].mean()
    df_log_raw[feat] = df_log_raw[feat].fillna((m))
print('Cleaned subset is composed of {} datapoints'.format(df_log_raw.shape[0]))

# overview of all the data / global search rate
print_search_rate(df_log_raw)

df_log_raw.head()

In [None]:
# preprocess for regression: replace the categorical columns by 0/1 indicator columns
df_log = copy(df_log_raw)

# NOTE(review): for a missing value (NaN) the `== value` comparison is False on every row, so the
# 'subject_nan' / 'officer_nan' indicators contain only zeros, and the race and sex loops write to
# the same '*_nan' column name. Behaviour kept as is; confirm whether missing values need their
# own indicators.
sub_races = list(df_log['subject_race'].unique())
off_races = list(df_log['officer_race'].unique())
sub_genders = list(df_log['subject_sex'].unique())
off_genders = list(df_log['officer_sex'].unique())

for description, source_col, prefix, categories in [
    ('sub races', 'subject_race', 'subject_', sub_races),
    ('off races', 'officer_race', 'officer_', off_races),
    ('sub genders', 'subject_sex', 'subject_', sub_genders),
    ('off genders', 'officer_sex', 'officer_', off_genders),
]:
    print('Processing ' + description)
    for category in categories:
        suffix = category if isinstance(category, str) else 'nan'
        df_log[prefix + suffix] = (df_log[source_col] == category).astype(int)

# Numeric columns are cut into half-open bins [lower, upper). Explicit comparisons are used because
# `Series.between(..., inclusive=[True, False])` is not a valid argument: depending on the pandas
# version it either includes both bounds (boundary values fall into two bins) or raises.
print('Processing off yos')
cat_exp = [0, 2, 6, 10, 15, 100]
for lower, upper in zip(cat_exp[:-1], cat_exp[1:]):
    in_bin = (df_log['officer_yos'] >= lower) & (df_log['officer_yos'] < upper)
    df_log['officer_exp_' + str(lower)] = in_bin.astype(int)

print('Processing off age')
cat_age = [15, 25, 35, 45, 55, 70, 100]
for lower, upper in zip(cat_age[:-1], cat_age[1:]):
    in_bin = (df_log['officer_age'] >= lower) & (df_log['officer_age'] < upper)
    df_log['officer_age_' + str(lower)] = in_bin.astype(int)


print('Processing search')
df_log['search_conducted'] = df_log['search_conducted'].astype(int)

print('Removing old columns')
df_log = df_log.drop(columns=['subject_race', 'officer_race', 'subject_sex', 'officer_sex', 'officer_yos', 'officer_age'])

df_log.head()

In [None]:
# Keep only the county with the most rows (first one encountered in case of a tie).
counties = df_log['county_name'].unique()
rows_per_county = {
    county: df_log[df_log['county_name'] == county].shape[0]
    for county in counties
}

maxi, maxi_c = 0, 0
for county, n_rows in rows_per_county.items():
    if n_rows > maxi:
        maxi, maxi_c = n_rows, county

print(maxi_c, maxi )

# The county column is constant after the filter, so it is removed from the features.
in_largest_county = df_log['county_name'] == maxi_c
df_log = df_log[in_largest_county].drop(columns='county_name')

In [None]:
# Preview the rows kept for the selected county.
df_log.head()

In [None]:
# Target vector: the search_conducted flag (converted to int in the encoding cell).
y = df_log['search_conducted'].to_numpy()
# Feature matrix: every remaining column of df_log, in column order.
x = df_log.drop(columns='search_conducted').to_numpy()
# Quick check: first feature row and its label.
print(x[0], y[0])

In [None]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression of search_conducted on the encoded features (fixed random_state).
clf = LogisticRegression(random_state=0)
clf.fit(x, y)
# First row of the coefficient matrix; read back below in the column order of `x`.
# NOTE(review): the features are not rescaled in the visible code, so coefficient magnitudes of
# the numeric columns and of the 0/1 indicators are presumably not directly comparable — confirm.
params = clf.coef_[0]

In [None]:
# List every feature next to its fitted coefficient (same column order as the matrix `x`).
print("Parameters of logistic regression are:")
feature_columns = df_log.drop(columns='search_conducted').columns
for col, coefficient in zip(feature_columns, params):
    print(" - {} : {:.3f} ".format(col, coefficient))

In [None]:
from xgboost import XGBClassifier, plot_importance, DMatrix

# Fit an XGBoost classifier with its default parameters on the same features and labels.
model = XGBClassifier()
model.fit(x, y)

In [None]:
# plot_importance sorts its bars by importance and leaves out features the model never used, so
# the bars are not in column order. For a model fitted on a NumPy array each tick is labelled
# 'f<column index>'; translate every tick individually instead of overwriting them with the
# first N column names.
feature_names = df_log.drop(columns='search_conducted').columns
importance_ax = plot_importance(model)
importance_ax.set_yticklabels([
    feature_names[int(tick.get_text()[1:])]
    if tick.get_text().startswith('f') and tick.get_text()[1:].isdigit()
    else tick.get_text()
    for tick in importance_ax.get_yticklabels()
])

We see that experienced officers do not search people often. This may be because they hold a higher rank and therefore do not conduct the searches themselves.

# Old stuff

In [None]:
def plot_officer_race(data, race):
    """Scatter minority search rates against the white search rate for one officer race.

    Two panels are drawn, each with one point per 'county_name': black vs. white and hispanic vs.
    white search rate, in percent.

    Args:
        data: container indexed by officer race. ``data[race]`` must support ``.loc`` with the
            subject races 'white', 'black' and 'hispanic' and yield frames that expose
            'county_name' (after ``reset_index``) and 'search_rate'.
            NOTE(review): presumably a grouped search-rate table — confirm against the caller.
        race: officer race to plot; also used in the figure title.
    """
    data_ = data[race]
    # separate data per subject race; reset_index() returns new frames, the input is not mutated
    df_white_sub = data_.loc['white'].reset_index()
    df_black_sub = data_.loc['black'].reset_index()
    df_hispanic = data_.loc['hispanic'].reset_index()

    # First merge leaves the white columns unsuffixed; the second merge then suffixes them with
    # '_white' and the hispanic ones with '_hispanic'.
    df_merged = pd.merge(df_white_sub, df_black_sub, on='county_name', suffixes=['', '_black'])
    df_merged = pd.merge(df_merged, df_hispanic, on='county_name', suffixes=['_white', '_hispanic'])

    fig, ax_arr = plt.subplots(1, 2) # 2 graphs
    fig.set_size_inches(9,5) # fig size

    fig.suptitle("Minorities search rates of {} officers".format(race))

    ax_arr[0].scatter(df_merged['search_rate_white']*100, df_merged['search_rate_black']*100)
    ax_arr[0].set_xlabel("White search rate (%)")
    ax_arr[0].set_ylabel("Black search rate (%)")

    ax_arr[1].scatter(df_merged['search_rate_white']*100, df_merged['search_rate_hispanic']*100)
    ax_arr[1].set_xlabel("White search rate (%)")
    # This panel shows the hispanic search rate (the label used to say "Black").
    ax_arr[1].set_ylabel("Hispanic search rate (%)")
    

# Draw the two scatter panels for every officer race.
# NOTE(review): `officer_race` and `df_officers` are not defined in the visible part of the
# notebook — confirm they exist before this cell runs.
for race in officer_race:
    plot_officer_race(df_officers, race)

In [None]:
def group_data(df_to_group):
    """Compute, for each time category, the share of stops belonging to each subject race.

    Args:
        df_to_group: DataFrame with one row per stop and the columns 'time_cat' and
            'subject_race'. It is not modified.

    Returns:
        DataFrame with one row per ('time_cat', 'subject_race') and the columns 'count_x' (stops
        of that race in the time category), 'count_y' (all stops in the time category) and
        'race_stop_rate' (= count_x / count_y).
    """
    # assign() returns a new frame, so the caller's DataFrame (often a filtered slice) is untouched
    counted = df_to_group.assign(count=1)

    # group the categories
    per_time_and_race = (
        counted.groupby(['time_cat', 'subject_race']).agg({'count': 'count'}).reset_index()
    )
    per_time = counted.groupby('time_cat').agg({'count': 'count'}).reset_index()

    # merge both: 'count' of the left frame becomes 'count_x', of the right frame 'count_y'
    df_vod = per_time_and_race.merge(per_time, on=['time_cat'])

    # compute the stop_rate
    df_vod['race_stop_rate'] = df_vod['count_x'] / df_vod['count_y']
    return df_vod

In [None]:
def plot_vod_1sub_1off(data_o, time_range=[-8,7], plot=True, dusk_time=None, off_cat='all', filename=None):
    """Compare one subject race's share of stops before and after dusk.

    Args:
        data_o: DataFrame with the columns 'time_cat' (integer time bucket relative to dusk; it
            is multiplied by 10 to obtain the minutes shown on the x axis), 'race_stop_rate'
            (a fraction) and, when plotting, 'count_y' (used as marker size). Not modified.
        time_range: ``[first, end)`` bucket indices kept, following ``range`` semantics.
        plot: if True, draw the scatter plot with the two averages.
        dusk_time: dusk time shown in the title, or None for "All dusk times".
        off_cat: officer category shown in the title.
        filename: accepted so that callers passing ``filename=`` (as plot_1sub_Aoff does) do not
            fail with a TypeError; this matplotlib implementation does not write any file.

    Returns:
        Tuple ``(before_average, after_average)``: mean 'race_stop_rate' in percent over the
        buckets before dusk and over the buckets from dusk onwards.
    """
    first_bucket, end_bucket = time_range[0], time_range[1]

    buckets = data_o['time_cat']
    in_window = buckets.isin(range(first_bucket, end_bucket))
    # bucket -1 (the one just before dusk) is left out of both averages
    at_dusk = buckets.isin(range(-1, 0))
    data = data_o[in_window & ~at_dusk].copy()

    data['time_cat'] = data['time_cat'] * 10
    data['race_stop_rate'] = data['race_stop_rate'] * 100

    # two different regressions; bucket -1 was removed, so 0 separates "before" from "after"
    data_before = data[data['time_cat'] < 0]
    data_after = data[data['time_cat'] >= 0]

    # calculate means
    before_average = data_before['race_stop_rate'].mean()
    after_average = data_after['race_stop_rate'].mean()

    # plot
    if plot:
        # plot points
        fig, ax = plt.subplots(figsize=(10, 5))
        sns.scatterplot(x='time_cat', y='race_stop_rate', size='count_y', sizes=(20,200),
                        data=data, color='black', ax=ax)
        ax.axvline(0, c='black', linestyle='dashed')

        # plot averages over the kept buckets (-80..-20 and 0..60 for the default time_range)
        ax.hlines(before_average, first_bucket * 10, -20)
        ax.hlines(after_average, 0, (end_bucket - 1) * 10)

        # set axes
        ax.set_xlabel('Time since dusk (min)')
        ax.set_ylabel('Percentage of stopped drivers who are black')

        if dusk_time :
            ax.set_title(f"Dusk time at {dusk_time} : stops by {off_cat} officers")
        else:
            ax.set_title(f"All dusk times : stops by {off_cat} officers")

    return before_average, after_average