In [1]:
import numpy as np 
import pandas as pd
import glob
import os
from tqdm import tqdm
import re

In [2]:
def calculate_stops_by_race(df):
    stops_by_race = df.groupby('subject_race').size().reset_index(name='stops_count')
    stops_by_race = stops_by_race.set_index('subject_race')
    return stops_by_race

In [3]:
def extract_filename(filenames):
    """
    Extracts the filename without extension from a list of filenames with paths.
    
    Args:
        filenames (list): A list of filenames with paths.
        
    Returns:
        list: A list of filenames without extensions.
    """
    filenames_without_ext = []
    for filename in filenames:
        # Split the filename into the path and the filename with extension
        path, filename_with_ext = os.path.split(filename)
        
        # Split the filename with extension into the filename and the extension
        filename, ext = os.path.splitext(filename_with_ext)
        
        filenames_without_ext.append(filename)
    
    return filenames_without_ext

In [4]:
def reformat_filename(filename):
    # Extract state name until first underscore
    state_match = re.match(r'^([^_]+)', filename)
    state = state_match.group(1)
    
    # Extract city name until underscore followed by number
    city_match = re.match(r'^[a-zA-Z]+_(.+?)_\d{4}_\d{2}_\d{2}', filename)
    city = city_match.group(1)
    
    state_names = {
        "al": "Alabama", "ak": "Alaska", "az": "Arizona", "ar": "Arkansas",
        "ca": "California", "co": "Colorado", "ct": "Connecticut", "de": "Delaware",
        "fl": "Florida", "ga": "Georgia", "hi": "Hawaii", "id": "Idaho",
        "il": "Illinois", "in": "Indiana", "ia": "Iowa", "ks": "Kansas",
        "ky": "Kentucky", "la": "Louisiana", "me": "Maine", "md": "Maryland",
        "ma": "Massachusetts", "mi": "Michigan", "mn": "Minnesota", "ms": "Mississippi",
        "mo": "Missouri", "mt": "Montana", "ne": "Nebraska", "nv": "Nevada",
        "nh": "New Hampshire", "nj": "New Jersey", "nm": "New Mexico", "ny": "New York",
        "nc": "North Carolina", "nd": "North Dakota", "oh": "Ohio", "ok": "Oklahoma",
        "or": "Oregon", "pa": "Pennsylvania", "ri": "Rhode Island", "sc": "South Carolina",
        "sd": "South Dakota", "tn": "Tennessee", "tx": "Texas", "ut": "Utah",
        "vt": "Vermont", "va": "Virginia", "wa": "Washington", "wv": "West Virginia",
        "wi": "Wisconsin", "wy": "Wyoming"
    }
    
    state_name = state_names.get(state.lower(), state)
    city_name = city.replace("_", " ").capitalize()
    
    return f"{city_name}, {state_name}"

In [5]:
path = r'./data_h1/'
all_files = glob.glob(path + "/*.csv")
filenames = extract_filename(all_files)

print(filenames)

for i in range(len(filenames)):
    filenames[i] = reformat_filename(filenames[i])

stops_by_race_list = []
all_races = set()

for filename in tqdm(all_files, desc="Processing csv", unit="csv"):
    print(f"Loading {filename}")
    df = pd.read_csv(filename, index_col=None, header=0)
    print(f"Loaded {filename}")
    stops_by_race_list.append(calculate_stops_by_race(df))
    all_races.update(df['subject_race'].unique())

['nc_greensboro_2020_04_01', 'ct_hartford_2020_04_01', 'ar_little_rock_2020_04_01', 'ca_bakersfield_2020_04_01', 'nc_fayetteville_2020_04_01', 'nc_raleigh_2020_04_01', 'ky_louisville_2023_01_26', 'tx_plano_2020_04_01']


Processing csv:   0%|          | 0/8 [00:00<?, ?csv/s]

Loading ./data_h1/nc_greensboro_2020_04_01.csv


Processing csv:  12%|█▎        | 1/8 [00:01<00:12,  1.76s/csv]

Loaded ./data_h1/nc_greensboro_2020_04_01.csv
Loading ./data_h1/ct_hartford_2020_04_01.csv
Loaded ./data_h1/ct_hartford_2020_04_01.csv
Loading ./data_h1/ar_little_rock_2020_04_01.csv
Loaded ./data_h1/ar_little_rock_2020_04_01.csv
Loading ./data_h1/ca_bakersfield_2020_04_01.csv


Processing csv:  50%|█████     | 4/8 [00:02<00:01,  2.25csv/s]

Loaded ./data_h1/ca_bakersfield_2020_04_01.csv
Loading ./data_h1/nc_fayetteville_2020_04_01.csv


Processing csv:  62%|██████▎   | 5/8 [00:03<00:02,  1.48csv/s]

Loaded ./data_h1/nc_fayetteville_2020_04_01.csv
Loading ./data_h1/nc_raleigh_2020_04_01.csv


Processing csv:  75%|███████▌  | 6/8 [00:05<00:02,  1.06s/csv]

Loaded ./data_h1/nc_raleigh_2020_04_01.csv
Loading ./data_h1/ky_louisville_2023_01_26.csv


Processing csv:  88%|████████▊ | 7/8 [00:05<00:00,  1.13csv/s]

Loaded ./data_h1/ky_louisville_2023_01_26.csv
Loading ./data_h1/tx_plano_2020_04_01.csv


  df = pd.read_csv(filename, index_col=None, header=0)
Processing csv: 100%|██████████| 8/8 [00:06<00:00,  1.19csv/s]

Loaded ./data_h1/tx_plano_2020_04_01.csv





In [6]:
print(all_races)

{'black', 'white', 'other', nan, 'unknown', 'asian/pacific islander', 'hispanic'}


In [7]:
results = {race: [] for race in all_races}

for i in range(len(stops_by_race_list)):
    for race in all_races:
        if race in stops_by_race_list[i].index:
            results[race].append(stops_by_race_list[i].loc[race, 'stops_count'])
        else:
            results[race].append(0)

results_df = pd.DataFrame(results)
results_df.index = [filename for filename in filenames]

results_df.head(n=len(filenames))

Unnamed: 0,black,white,other,NaN,unknown,asian/pacific islander,hispanic
"Greensboro, North Carolina",299893,253574,2344,0,5886,10401,27929
"Hartford, Connecticut",7104,6057,29,0,0,176,5073
"Little rock, Arkansas",7044,6028,9,0,376,177,0
"Bakersfield, California",19534,97505,291,0,10666,3931,56921
"Fayetteville, North Carolina",274013,170523,5000,0,3448,5711,28303
"Raleigh, North Carolina",383628,361319,555,0,8089,15349,87460
"Louisville, Kentucky",45947,91470,781,0,2,1621,6656
"Plano, Texas",35078,150321,2964,0,646,25743,34291


In [8]:
results_df = results_df.loc[:, results_df.columns.notnull()]
results_df

Unnamed: 0,black,white,other,unknown,asian/pacific islander,hispanic
"Greensboro, North Carolina",299893,253574,2344,5886,10401,27929
"Hartford, Connecticut",7104,6057,29,0,176,5073
"Little rock, Arkansas",7044,6028,9,376,177,0
"Bakersfield, California",19534,97505,291,10666,3931,56921
"Fayetteville, North Carolina",274013,170523,5000,3448,5711,28303
"Raleigh, North Carolina",383628,361319,555,8089,15349,87460
"Louisville, Kentucky",45947,91470,781,2,1621,6656
"Plano, Texas",35078,150321,2964,646,25743,34291


In [9]:
results_df.rename_axis('regions', inplace=True)
results_df.to_csv('h1_stop.csv')

In [10]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import plotly.express as px

chi2_stat, p_val, _, _ = chi2_contingency(results_df)

# Print test results
print("Chi-square statistic:", chi2_stat)
print("P-value:", p_val)

# Determine if the result is statistically significant (e.g., using a significance level of 0.05)
alpha = 0.05
if p_val < alpha:
    print("There is evidence of racial disparity in police stops.")
else:
    print("There is no evidence of racial disparity in police stops.")

# Plot bar graphs using Plotly
fig = px.bar(results_df, x=results_df.index, y=results_df.columns, barmode='group', title='Police Stops by Race and Region')
fig.show()


Chi-square statistic: 409219.36407983885
P-value: 0.0
There is evidence of racial disparity in police stops.
