In [1]:
import os
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import chi2_contingency

In [2]:
# import os
# import pandas as pd
# from data_builder import DataBuilder
# from api_key import api_key

# def process_data():
#     citation_file = 'data/citation.csv'
#     employee_file = 'data/employee.csv'
#     census_file = 'data/census.csv'

#     citation_df, employee_df, census_df = None, None, None

#     if os.path.exists(citation_file) and os.path.exists(employee_file) and os.path.exists(census_file):
#         print("Files already exist. Loading from CSV.")
        
#         citation_df = pd.read_csv(citation_file)
#         employee_df = pd.read_csv(employee_file)
#         census_df = pd.read_csv(census_file)
#     else:
#         print("Files do not exist. Fetching data from API.")
        
#         builder = DataBuilder(api_key)
#         builder.run_all()

#         citation_df = pd.read_csv(citation_file)
#         employee_df = pd.read_csv(employee_file)
#         census_df = pd.read_csv(census_file)

#     return citation_df, employee_df, census_df

# if __name__ == '__main__':
#     citation_df, employee_df, census_df = process_data()




In [3]:
# Paths of the three CSV inputs, relative to the notebook's working directory.
citation_file, employee_file, census_file = (
    'data/citation.csv',
    'data/employee.csv',
    'data/census.csv',
)

# Read each input into its own DataFrame (citations, employees, census), in that order.
citation_df, employee_df, census_df = (
    pd.read_csv(csv_path)
    for csv_path in (citation_file, employee_file, census_file)
)


In [4]:
# Citation columns removed before the analysis below.
citation_cols_to_drop = [
    'CITATION_CONTROL_NUMBER',
    'ACTIVITY_RESULTS',
    'ACTIVITY_DATE',
    'ACTIVITY_TIME',
    'ACTIVITY_LOCATION',
    'ACTIVITY_DIVISION',
    'ACTIVITY_BEAT',
    'NUMBER_OF_PASSENGERS',
    'WAS_VEHCILE_SEARCHED',
    'REASON_FOR_SEARCH',
    'ObjectId',
]

# drop() returns a new frame, so the frame read from CSV is not modified in place;
# errors='ignore' skips any listed column that is not present.
citation_df = citation_df.drop(columns=citation_cols_to_drop, errors='ignore')
citation_df.head()

Unnamed: 0,TYPE_OF_STOP,OFFICER_GENDER,OFFICER_RACE,OFFICER_AGE_RANGE,DRIVER_GENDER,DRIVER_RACE,DRIVER_AGE_RANGE
0,COMPLAINT/CRIMINAL VIOLATION,M,WHITE,31 - 40,M,WHITE,31 - 40
1,COMPLAINT/CRIMINAL VIOLATION,M,WHITE,41 - 50,M,BLACK,20 - 25
2,COMPLAINT/CRIMINAL VIOLATION,F,BLACK,31 - 40,M,WHITE,20 - 25
3,COMPLAINT/CRIMINAL VIOLATION,M,WHITE,31 - 40,F,WHITE,OVER 60
4,COMPLAINT/CRIMINAL VIOLATION,M,WHITE,41 - 50,M,BLACK,20 - 25


In [5]:
# Employee columns removed before the analysis below.
employee_cols_to_drop = [
    'AOC_CODE',
    'RANK_TITLE',
    'OFFICER_DIVISION',
    'OFFICER_ASSIGNMENT',
    'OFFICER_YEARS_SWORN',
]

# One-letter race codes in the employee data -> the spelled-out labels used by the citation data.
race_mapping = dict(
    W='WHITE',
    B='BLACK',
    H='HISPANIC',
    A='ASIAN',
    U='UNKNOWN',
)

# Drop the unused columns, then replace OFFICER_RACE with its spelled-out label.
# Codes are stripped of surrounding whitespace first; any code missing from
# race_mapping becomes NaN (pandas Series.map behaviour with a dict).
employee_df = (
    employee_df
    .drop(columns=employee_cols_to_drop, errors='ignore')
    .assign(OFFICER_RACE=lambda frame: frame['OFFICER_RACE'].str.strip().map(race_mapping))
)
employee_df.head()

Unnamed: 0,OFFICER_SEX,OFFICER_RACE,OFFICER_AGE_RANGE,OFFICER_AGE,ObjectId
0,F,WHITE,40 - 49,48,1
1,M,WHITE,40 - 49,46,2
2,F,WHITE,40 - 49,45,3
3,F,WHITE,40 - 49,47,4
4,M,WHITE,50 - 59,52,5


In [6]:
# Rebind census_df to a copy of itself, then display the frame as the cell output.
census_df = census_df.copy()
census_df

Unnamed: 0,Name,Total_population,Total_male_population,Total_female_population,Total_population_for_race,White_alone,Black,Native_American,Asian,Hawaiian_Pacific_Islander,Other_race_alone,Two_or_more,State code,County code
0,"Jefferson County, Kentucky",768419,371306,397113,768419,539721,168526,1070,22880,499,9085,26638,21,111


In [7]:
# Census count columns to express as a share of the total population.
race_percentage_cols = [
    'Total_population', 'Total_male_population', 'Total_female_population',
    'Total_population_for_race', 'White_alone', 'Black', 'Native_American',
    'Asian', 'Hawaiian_Pacific_Islander', 'Other_race_alone', 'Two_or_more'
]

# Divide every count by the Total_population of the SAME row (axis=0 aligns on the
# row index), scale to percent, and tag each column name with " (%)".
# The previous per-element lambda divided each scalar by the whole
# Total_population column, which only produced a usable result because the
# census frame holds a single row.
louisville_census = (
    census_df[race_percentage_cols]
    .div(census_df["Total_population"], axis=0)
    .mul(100)
    .add_suffix(" (%)")
)

louisville_census

Unnamed: 0,Total_population (%),Total_male_population (%),Total_female_population (%),Total_population_for_race (%),White_alone (%),Black (%),Native_American (%),Asian (%),Hawaiian_Pacific_Islander (%),Other_race_alone (%),Two_or_more (%)
0,100.0,48.320773,51.679227,100.0,70.237852,21.931524,0.139247,2.977542,0.064939,1.182298,3.466598


In [8]:
# louisville_census['Total_male_population (%)', 'Total_female_population (%)'] 
# employee_df['OFFICER_SEX']
# citation_df['DRIVER_GENDER']

In [9]:
# Gender split of the census population, read by position from the first census row.
# These are already percentages; the two sources below are raw counts. A pie chart
# normalises its values, so the three panels remain comparable.
male_percentage = louisville_census['Total_male_population (%)'].iloc[0]
female_percentage = louisville_census['Total_female_population (%)'].iloc[0]

# Gender split of the police force (.get returns 0 when a code is absent).
employee_gender_counts = employee_df['OFFICER_SEX'].value_counts()
employee_male = employee_gender_counts.get('M', 0)
employee_female = employee_gender_counts.get('F', 0)

# Gender split of the drivers appearing in the citation data.
citation_gender_counts = citation_df['DRIVER_GENDER'].value_counts()
citation_male = citation_gender_counts.get('M', 0)
citation_female = citation_gender_counts.get('F', 0)

# One panel per data source; the same list supplies the subplot titles so the
# titles and the data order cannot drift apart. This chart compares gender, so
# the third panel is titled for drivers, not for driver race.
datasets = ['Louisville Population', 'Police Force', 'Drivers From Citations']
data_values = [
    [male_percentage, female_percentage],
    [employee_male, employee_female],
    [citation_male, citation_female]
]

# One row of pie-typed subplots, one column per data source.
fig = make_subplots(rows=1, cols=len(datasets),
                    subplot_titles=tuple(datasets),
                    specs=[[{'type': 'pie'} for _ in datasets]])

# Custom colors for male and female
colors = ['#1f77b4', '#ff7f0e']  # Blue for Male, Orange for Female

# Add each pie chart to its subplot
for i, values in enumerate(data_values):
    fig.add_trace(go.Pie(labels=['Male', 'Female'], values=values,
                         marker=dict(colors=colors)),
                  row=1, col=i + 1)  # `i + 1` since rows and columns are 1-indexed


# Update layout
fig.update_layout(title_text='Gender Comparison', template='plotly_white')

# Show the figure
fig.show()


In [10]:
# Cross-tabulate the citations: one row per officer race, one column per driver race.
contingency_table = pd.crosstab(
    index=citation_df['OFFICER_RACE'],
    columns=citation_df['DRIVER_RACE'],
)

# Chi-squared test of independence on the table; the result unpacks as
# (statistic, p-value, degrees of freedom, expected frequencies).
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Report the statistic and p-value.
print(
    "Chi-squared Test:",
    f"Chi2 Statistic: {chi2}, P-value: {p}",
    sep="\n",
)


Chi-squared Test:
Chi2 Statistic: 122.9167565750705, P-value: 8.200484026464087e-17


### Overview of the Analysis
- In this analysis, we explored the relationship between the race of law enforcement officers and the race of the drivers they stop. Our goal was to see if there’s any indication of bias in traffic stops based on the racial identity of the officers. To do this, we used a chi-squared test for independence, which helps us understand whether there’s a meaningful connection between these two groups.

### Results of the Chi-Squared Test
- **Chi-Squared Statistic:** We calculated a chi-squared statistic of 122.92. The statistic measures how far the actual number of stops for each combination of officer race and driver race departs from what we would expect to see if there were no connection between the officer's race and the driver's race. Its size has to be judged against the table's degrees of freedom, which is what the p-value below does; taken together, they suggest that the patterns we observe in the data are unlikely to be just a coincidence.

- **P-Value:** The p-value we found was about 8.20e-17, which is extremely low. This tells us that the result is statistically significant since it’s much lower than the usual thresholds (like 0.05 or 0.01). A low p-value means we have strong evidence against the idea that there’s no connection between the officer's race and the driver's race.

### Interpretation of Findings
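- The results show a statistically significant association between the race of the officer and the race of the driver being stopped. This means that the racial mix of drivers stopped differs depending on the officer's race, which is consistent with there being some bias in how traffic stops are carried out. However, the test by itself shows neither the size of the difference nor its cause: factors not examined here, such as where and when officers of each race patrol, could produce the same pattern.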

### Implications
- These findings are important for understanding how race plays a role in law enforcement. They suggest that different racial groups might be treated differently by officers during traffic stops. It's crucial to address these biases to ensure fairness and equality in policing.

### Conclusion
- The strong evidence from the chi-squared statistic and p-value emphasizes the importance of further examining law enforcement practices. Police leaders and community advocacy groups should take these findings into account when reviewing policies and training programs designed to reduce racial bias in policing.

In [11]:
# Counts of citations for every (officer race, driver race) pair.
contingency_table = pd.crosstab(
    index=citation_df['OFFICER_RACE'],
    columns=citation_df['DRIVER_RACE'],
)

# Bar colour per driver race; races without an entry fall back to gray below.
colors = dict(
    WHITE='#f0d8c4',     # Light skin tone
    BLACK='#5d3a2a',     # Dark skin tone
    HISPANIC='#c9b89b',  # Medium skin tone
    ASIAN='#e5c9b1',     # Light tan skin tone
)

fig = go.Figure()

# One bar series per driver race; within each series there is one bar per officer race.
for driver_race, stop_counts in contingency_table.items():
    bar_trace = go.Bar(
        x=contingency_table.index,
        y=stop_counts,
        name=driver_race,
        marker=dict(color=colors.get(driver_race, '#808080')),
        hoverinfo='y+name',  # hover shows the count and the driver race
    )
    fig.add_trace(bar_trace)

# Stack the series so each officer-race bar shows its driver-race composition.
layout_options = dict(
    title='Officers Race vs. Drivers Race',
    xaxis_title='Officer Race',
    yaxis_title='Count',
    barmode='stack',
    legend_title='Driver Race',
    template='plotly_white',
)
fig.update_layout(**layout_options)

fig.show()


In [12]:
def _radar_figure(values, categories, trace_name, title):
    """Build a filled radar (polar) chart of `values` over `categories`.

    The radial axis runs from 0 to one above the largest plotted value, so the
    scale always reflects exactly what is drawn.

    Parameters:
        values: sequence of numbers, one per category.
        categories: sequence of axis labels, same length as `values`.
        trace_name: legend name of the trace.
        title: figure title.

    Returns:
        The plotly Figure (not yet shown).
    """
    radar = go.Figure()
    radar.add_trace(go.Scatterpolar(
        r=values,
        theta=categories,
        fill='toself',
        name=trace_name
    ))
    radar.update_layout(
        polar=dict(
            radialaxis=dict(visible=True, range=[0, max(values) + 1])
        ),
        title=title
    )
    return radar


# Races of interest; any race missing from the data is plotted as 0, and races
# in the data that are not listed here are left out of the charts.
driver_races = ['WHITE', 'BLACK', 'HISPANIC', 'ASIAN', 'UNKNOWN']

# Number of citations per driver race, in the order of driver_races.
driver_race_counts = (
    citation_df.groupby('DRIVER_RACE').size()
    .reindex(driver_races, fill_value=0)
)

# Radar chart for overall driver citations
overall_radar = _radar_figure(
    driver_race_counts.values,
    driver_race_counts.index,
    'All Drivers',
    "Overall Driver Citations by Race",
)
overall_radar.show()

# Citation counts with one row per officer race and one column per driver race.
officer_driver_counts = citation_df.groupby(['OFFICER_RACE', 'DRIVER_RACE']).size().unstack(fill_value=0)

# One radar chart per officer race
for officer_race in officer_driver_counts.index:
    # Citation counts for this officer's race
    counts = officer_driver_counts.loc[officer_race]

    # Restrict to the races of interest before plotting, so the axis range is
    # derived from the plotted values rather than from the full row.
    plotted_counts = counts.reindex(driver_races, fill_value=0)

    fig = _radar_figure(
        plotted_counts.values,
        driver_races,
        f'Officer Race: {officer_race}',
        f'Citations by Driver Race for Officer Race: {officer_race}',
    )
    fig.show()


In [13]:
# Census race columns shown on the chart.
races = ['White_alone', 'Black', 'Asian', 'Other_race_alone']

# Values are taken from the first census row.
total_population = census_df['Total_population'].iloc[0]
race_populations = census_df[races].iloc[0]
county_name = census_df['Name'].iloc[0]

# Share of the total population held by each race, in percent.
normalized_populations = race_populations.div(total_population).mul(100)

# Single filled radar trace, one spoke per race column.
population_trace = go.Scatterpolar(
    r=normalized_populations.values,
    theta=races,
    fill='toself',
    name=county_name,
)
fig = go.Figure(population_trace)

# Leave 10 points of headroom above the largest share on the radial axis.
radial_axis = dict(visible=True, range=[0, max(normalized_populations) + 10])
fig.update_layout(
    polar=dict(radialaxis=radial_axis),
    title=f"Racial Population Distribution for {county_name}",
)

fig.show()

In [14]:
# Placeholder citation categories (no real counts are used in this comparison).
# The placeholder lives in its own frame so that the citation_df loaded from CSV
# is not overwritten; otherwise every cell that reads citation_df would see
# three dummy rows after this cell has run.
citation_data = {
    'DRIVER_RACE': ['WHITE', 'BLACK', 'ASIAN']
}
sample_citation_df = pd.DataFrame(citation_data)

# Census race categories, taken from the first census row.
census_races = ['White_alone', 'Black', 'Asian']
census_populations = census_df[census_races].iloc[0]

# Each race's population as a percentage of the total population.
total_population = census_df['Total_population'].values[0]
normalized_population = (census_populations / total_population) * 100

# Dummy citation distribution: every race in the placeholder gets an equal share of 100%.
unique_citations = sample_citation_df['DRIVER_RACE'].nunique()
normalized_citations = pd.Series(
    [100 / unique_citations] * unique_citations,
    index=sample_citation_df['DRIVER_RACE'].unique()
)

# Citation race label -> census column name.
# NOTE: this rebinds race_mapping, which an earlier cell uses for officer race codes.
race_mapping = {
    'WHITE': 'White_alone',
    'BLACK': 'Black',
    'ASIAN': 'Asian'
}

# Rename to census labels, then reindex so the values are in the same order as
# census_races (rename alone keeps the original order, and the values are plotted
# positionally against census_races below).
mapped_citations = normalized_citations.rename(index=race_mapping).reindex(census_races)

# Plot radar chart
fig = go.Figure()

# Census population (percentages)
fig.add_trace(go.Scatterpolar(
    r=normalized_population.values,
    theta=census_races,
    fill='toself',
    name='Census Population',
    line=dict(color='blue')
))

# Citation placeholder (equal share for each race)
fig.add_trace(go.Scatterpolar(
    r=mapped_citations.values,
    theta=census_races,
    fill='toself',
    name='Citations (Equal Distribution)',
    line=dict(color='red')
))

# Both traces are percentages, so the radial axis spans 0-100.
fig.update_layout(
    polar=dict(
        radialaxis=dict(visible=True, range=[0, 100]),
    ),
    title="Comparison of Census Population and Citations (Equal Distribution) by Race",
    showlegend=True
)

# Show the plot
fig.show()