In [1]:
import numpy as np 
import pandas as pd
import glob
import os
from tqdm import tqdm
import regex as re

In [2]:
def calculate_searches_by_gender(df):
    searched_df = df[df['search_conducted'] == True]
    searches_by_race = searched_df.groupby('subject_sex').size().reset_index(name='searches_count')
    searches_by_race = searches_by_race.set_index('subject_sex')
    return searches_by_race

def calculate_arrest_count(df):
    searched_df = df[df['arrest_made'] == True]
    searches_by_race = searched_df.groupby('subject_sex').size().reset_index(name='arrests_count')
    searches_by_race = searches_by_race.set_index('subject_sex')
    return searches_by_race

def calculate_citations(df):
    searched_df = df[df['citation_issued'] == True]
    searches_by_race = searched_df.groupby('subject_sex').size().reset_index(name='citations_count')
    searches_by_race = searches_by_race.set_index('subject_sex')
    return searches_by_race

def calculate_warnings(df):
    searched_df = df[df['warning_issued'] == True]
    searches_by_race = searched_df.groupby('subject_sex').size().reset_index(name='warnings_count')
    searches_by_race = searches_by_race.set_index('subject_sex')
    return searches_by_race

In [3]:
def calculate_stops_by_race(df):
    stops_by_race = df.groupby('subject_sex').size().reset_index(name='stops_count')
    stops_by_race = stops_by_race.set_index('subject_sex')
    return stops_by_race

In [4]:
def extract_filename(filenames):
    """
    Extracts the filename without extension from a list of filenames with paths.
    
    Args:
        filenames (list): A list of filenames with paths.
        
    Returns:
        list: A list of filenames without extensions.
    """
    filenames_without_ext = []
    for filename in filenames:
        path, filename_with_ext = os.path.split(filename)
        filename, ext = os.path.splitext(filename_with_ext)
        filenames_without_ext.append(filename)
    
    return filenames_without_ext

In [5]:
def reformat_filename(filename):
    # Extract state name until first underscore
    state_match = re.match(r'^([^_]+)', filename)
    state = state_match.group(1)
    
    # Extract city name until underscore followed by number
    city_match = re.match(r'^[a-zA-Z]+_(.+?)_\d{4}_\d{2}_\d{2}', filename)
    city = city_match.group(1)
    
    # Dictionary mapping state abbreviations to full state names
    state_names = {
        "al": "Alabama", "ak": "Alaska", "az": "Arizona", "ar": "Arkansas",
        "ca": "California", "co": "Colorado", "ct": "Connecticut", "de": "Delaware",
        "fl": "Florida", "ga": "Georgia", "hi": "Hawaii", "id": "Idaho",
        "il": "Illinois", "in": "Indiana", "ia": "Iowa", "ks": "Kansas",
        "ky": "Kentucky", "la": "Louisiana", "me": "Maine", "md": "Maryland",
        "ma": "Massachusetts", "mi": "Michigan", "mn": "Minnesota", "ms": "Mississippi",
        "mo": "Missouri", "mt": "Montana", "ne": "Nebraska", "nv": "Nevada",
        "nh": "New Hampshire", "nj": "New Jersey", "nm": "New Mexico", "ny": "New York",
        "nc": "North Carolina", "nd": "North Dakota", "oh": "Ohio", "ok": "Oklahoma",
        "or": "Oregon", "pa": "Pennsylvania", "ri": "Rhode Island", "sc": "South Carolina",
        "sd": "South Dakota", "tn": "Tennessee", "tx": "Texas", "ut": "Utah",
        "vt": "Vermont", "va": "Virginia", "wa": "Washington", "wv": "West Virginia",
        "wi": "Wisconsin", "wy": "Wyoming"
    }
    
    state_name = state_names.get(state.lower(), state)
    city_name = city.replace("_", " ").capitalize()
    
    return f"{city_name}, {state_name}"

In [6]:
path = r'./Statewise/'
all_files = glob.glob(path + "/*.csv")
filenames = extract_filename(all_files)

print(filenames)

stops_by_race_list = []
search_conducted_by_race_list = []
arrests_made = []
citation_issued = []
warnings_issued = []
all_races = set()

for filename in tqdm(all_files, desc="Processing csv", unit="csv"):
    print(f"Loading {filename}")
    df = pd.read_csv(filename, index_col=None, header=0)
    print(f"Loaded {filename}")
    stops_by_race_list.append(calculate_stops_by_race(df))
    all_races.update(df['subject_sex'].unique())
    search_conducted_by_race_list.append(calculate_searches_by_gender(df))
    arrests_made.append(calculate_arrest_count(df))
    citation_issued.append(calculate_citations(df))
    warnings_issued.append(calculate_warnings(df))

['CT', 'RI', 'CA', 'AZ', 'WI', 'NC', 'LA', 'MT', 'MD', 'VT', 'TN', 'MA']


Processing csv:   0%|          | 0/12 [00:00<?, ?csv/s]

Loading ./Statewise/CT.csv


  df = pd.read_csv(filename, index_col=None, header=0)


Loaded ./Statewise/CT.csv


Processing csv:   8%|▊         | 1/12 [00:03<00:41,  3.75s/csv]

Loading ./Statewise/RI.csv


  df = pd.read_csv(filename, index_col=None, header=0)


Loaded ./Statewise/RI.csv


Processing csv:  17%|█▋        | 2/12 [00:05<00:24,  2.40s/csv]

Loading ./Statewise/CA.csv


  df = pd.read_csv(filename, index_col=None, header=0)


Loaded ./Statewise/CA.csv


Processing csv:  25%|██▌       | 3/12 [00:07<00:21,  2.36s/csv]

Loading ./Statewise/AZ.csv


  df = pd.read_csv(filename, index_col=None, header=0)


Loaded ./Statewise/AZ.csv


Processing csv:  33%|███▎      | 4/12 [00:18<00:46,  5.82s/csv]

Loading ./Statewise/WI.csv


  df = pd.read_csv(filename, index_col=None, header=0)


Loaded ./Statewise/WI.csv


Processing csv:  42%|████▏     | 5/12 [00:24<00:39,  5.70s/csv]

Loading ./Statewise/NC.csv


Processing csv:  50%|█████     | 6/12 [00:25<00:26,  4.38s/csv]

Loaded ./Statewise/NC.csv
Loading ./Statewise/LA.csv


  df = pd.read_csv(filename, index_col=None, header=0)
Processing csv:  58%|█████▊    | 7/12 [00:27<00:17,  3.51s/csv]

Loaded ./Statewise/LA.csv
Loading ./Statewise/MT.csv


  df = pd.read_csv(filename, index_col=None, header=0)


Loaded ./Statewise/MT.csv


Processing csv:  67%|██████▋   | 8/12 [00:31<00:14,  3.55s/csv]

Loading ./Statewise/MD.csv


  df = pd.read_csv(filename, index_col=None, header=0)


Loaded ./Statewise/MD.csv


Processing csv:  75%|███████▌  | 9/12 [00:42<00:17,  5.80s/csv]

Loading ./Statewise/VT.csv


Processing csv:  83%|████████▎ | 10/12 [00:43<00:08,  4.39s/csv]

Loaded ./Statewise/VT.csv
Loading ./Statewise/TN.csv


  df = pd.read_csv(filename, index_col=None, header=0)


Loaded ./Statewise/TN.csv


Processing csv:  92%|█████████▏| 11/12 [00:58<00:07,  7.71s/csv]

Loading ./Statewise/MA.csv


  df = pd.read_csv(filename, index_col=None, header=0)


Loaded ./Statewise/MA.csv


Processing csv: 100%|██████████| 12/12 [01:06<00:00,  5.52s/csv]


In [9]:
all_races = set({'male', 'female'})

In [10]:
print(len(filenames))
print(len(arrests_made))
print(len(citation_issued))
print(len(warnings_issued))

12
12
12
12


In [11]:
results_race = {race: [] for race in all_races}
results_searches = {race: [] for race in all_races}
arrest_searches = {race: [] for race in all_races}
citations = {race: [] for race in all_races}
warnings = {race: [] for race in all_races}

for i in range(len(stops_by_race_list)):
    for race in all_races:
        if race in stops_by_race_list[i].index:
            results_race[race].append(stops_by_race_list[i].loc[race, 'stops_count'])
        else:
            results_race[race].append(0)
        
        if race in search_conducted_by_race_list[i].index:
            results_searches[race].append(search_conducted_by_race_list[i].loc[race, 'searches_count'])
        else:
            results_searches[race].append(0)
            
        if race in arrests_made[i].index:
            print(arrests_made[i])
            arrest_searches[race].append(arrests_made[i].loc[race, 'arrests_count'])
        else:
            arrest_searches[race].append(0)
            
        if race in citation_issued[i].index:
            citations[race].append(citation_issued[i].loc[race, 'citations_count'])
        else:
            citations[race].append(0)
            
        if race in warnings_issued[i].index:
            warnings[race].append(warnings_issued[i].loc[race, 'warnings_count'])
        else:
            warnings[race].append(0)



results_df_race = pd.DataFrame(results_race)
results_df_searches = pd.DataFrame(results_searches)
results_df_arrests = pd.DataFrame(arrest_searches)
results_df_citations = pd.DataFrame(citations)
results_df_warnings = pd.DataFrame(warnings)

results_df_race.index = [filename for filename in filenames]
results_df_searches.index = [filename for filename in filenames]
results_df_arrests.index = [filename for filename in filenames]
results_df_citations.index = [filename for filename in filenames]
results_df_warnings.index = [filename for filename in filenames]

             arrests_count
subject_sex               
female                5941
male                 19089
             arrests_count
subject_sex               
female                5941
male                 19089
             arrests_count
subject_sex               
female                3343
male                 13260
             arrests_count
subject_sex               
female                3343
male                 13260
             arrests_count
subject_sex               
female                2516
male                  9409
             arrests_count
subject_sex               
female                2516
male                  9409
             arrests_count
subject_sex               
female              158579
male                345115
             arrests_count
subject_sex               
female              158579
male                345115
             arrests_count
subject_sex               
female                4342
male                 14596
             arrests_count
s

In [12]:
results_df_race = results_df_race.loc[:, results_df_race.columns.notnull()]
results_df_searches = results_df_searches.loc[:, results_df_searches.columns.notnull()]
results_df_arrests = results_df_arrests.loc[:, results_df_arrests.columns.notnull()]
results_df_citations = results_df_citations.loc[:, results_df_citations.columns.notnull()]
results_df_warnings = results_df_warnings.loc[:, results_df_warnings.columns.notnull()]

In [13]:
results_df_race.rename_axis('regions', inplace=True)
results_df_searches.rename_axis('regions', inplace=True)
results_df_arrests.rename_axis('regions', inplace=True)
results_df_citations.rename_axis('regions', inplace=True)
results_df_warnings.rename_axis('regions', inplace=True)

In [110]:
# export all to csv
results_df_race.to_csv('results_race_h3.csv')
results_df_searches.to_csv('results_searches_h3.csv')
results_df_arrests.to_csv('results_arrests_h3.csv')
results_df_citations.to_csv('results_citations_h3.csv')
results_df_warnings.to_csv('results_warnings_h3.csv')

In [111]:
results_df_searches

Unnamed: 0_level_0,other,black,white,asian/pacific islander,unknown,hispanic
regions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CT,105,11807,23868,383,0,9833
RI,15,4310,9968,280,0,3189
CA,3746,23622,11707,2861,0,11445
AZ,17025,21402,85432,2311,1162,83475
WI,412,2720,9693,257,0,1080
NC,96,20654,8901,258,199,1828
LA,29,57940,15296,205,188,1668
MT,635,151,2554,52,3,226
MD,1498,44441,48900,1291,91,8704
VT,11,273,2741,27,1,99


In [14]:
results_df_searches = results_df_searches.copy()
for col in results_df_race.columns[0:]:
    results_df_searches[col] = results_df_searches[col] / results_df_race[col].replace(0, 1)  # Avoid division by zero

results_df_arrests = results_df_arrests.copy()
for col in results_df_race.columns[0:]:
    results_df_arrests[col] = results_df_arrests[col] / results_df_race[col].replace(0, 1)  # Avoid division by zero
    
results_df_citations = results_df_citations.copy()
for col in results_df_race.columns[0:]:
    results_df_citations[col] = results_df_citations[col] / results_df_race[col].replace(0, 1)  # Avoid division by zero
    
results_df_warnings = results_df_warnings.copy()
for col in results_df_race.columns[0:]:
    results_df_warnings[col] = results_df_warnings[col] / results_df_race[col].replace(0, 1)  # Avoid division by zero

In [113]:
df_new = results_df_searches.stack().reset_index()
df_new.columns = ['index', 'Subject_Sex', 'Search_Rate']
df_new = df_new[['Search_Rate', 'Subject_Sex']]
df_new.rename_axis('regions', inplace=True)
df_new.to_csv('sex_results_searches_h3_a.csv')

df_new = results_df_warnings.stack().reset_index()
df_new.columns = ['index', 'Subject_Sex', 'Warning_Rate']
df_new = df_new[['Warning_Rate', 'Subject_Sex']]
df_new.rename_axis('regions', inplace=True)
df_new.to_csv('sex_results_warning_h3_a.csv')

df_new = results_df_arrests.stack().reset_index()
df_new.columns = ['index', 'Subject_Sex', 'Arrest_Rate']
df_new = df_new[['Arrest_Rate', 'Subject_Sex']]
df_new.rename_axis('regions', inplace=True)
df_new.to_csv('sex_results_arrest_h3_a.csv')

df_new = results_df_citations.stack().reset_index()
df_new.columns = ['index', 'Subject_Sex', 'Citation_Rate']
df_new = df_new[['Citation_Rate', 'Subject_Sex']]
df_new.rename_axis('regions', inplace=True)
df_new.to_csv('sex_results_citations_h3_a.csv')