In [None]:
# Imports, setup
import pandas as pd
import altair as alt
import vega_datasets
# Notebook-wide Altair configuration: turn off the row limit on chart data
# and use the 'fivethirtyeight' theme for every chart below.
alt.data_transformers.disable_max_rows()
alt.themes.enable('fivethirtyeight')

# Raw inputs, read from paths relative to the notebook
hatecrimes = pd.read_csv("../data/hate_crime.csv")
population = pd.read_csv("../data/population.csv")

# Basic population clean-up: discard rows with missing values, then give the
# two columns used later ("Label (Grouping)", "Total") short names
population = population.dropna().rename(
    columns={"Label (Grouping)": "state", "Total": "population"}
)

In [None]:
# GRAPH 1 -- LINE CHART (ALL TIME)
# No critiques on this one
# One point per `data_year`; the y value is the number of incident rows in
# that year. The count is typed quantitative (:Q) so it is plotted on a
# continuous numeric axis instead of as discrete category labels.
alt.Chart(hatecrimes, title='Hate Crimes Over Time').mark_line().encode(
    alt.X('data_year:O').title("Year"),
    alt.Y('count():Q').title("Hate Crime Incidents"),
    color=alt.value("#ad2e03")
).display()

In [None]:
# GRAPH 2 -- LINE CHART (BY BIAS)
# Main critiques: sort legend so it appears in the order on the chart, and expand y axis to better see variation

# The ten most frequent bias descriptions, then only the incidents carrying one of them
top10biases = (
    hatecrimes.groupby(['bias_desc'])
    .size()
    .sort_values(ascending=False)
    .head(10)
    .index
    .tolist()
)
hatecrimestop10 = hatecrimes.loc[hatecrimes['bias_desc'].isin(top10biases)]

# Manual legend sorting (hand-maintained list; it is not derived from top10biases)
sorted_legend_list = ['Anti-Black or African American', 'Anti-Jewish', 'Anti-Gay (Male)', 'Anti-White', 'Anti-Hispanic or Latino', 'Anti-Lesbian, Gay, Bisexual, or Transgender (Mixed Group)',
                      'Anti-Asian', 'Anti-Other Race/Ethnicity/Ancestry', 'Anti-Multiple Races, Group', 'Anti-Lesbian (Female)']

# Line chart: one line per bias. The per-year count is typed quantitative (:Q)
# so the y axis is a continuous numeric scale rather than discrete categories.
alt.Chart(hatecrimestop10, title="Hate Crimes by Bias Over Time").mark_line().encode(
    alt.X('data_year:O').title("Year"),
    alt.Y('count():Q').title("Incidents"),
    alt.Color('bias_desc:N', legend=alt.Legend(labelLimit=300), sort=sorted_legend_list)
        .title("Crime Bias")
        .scale(scheme='category10')
).properties(height=500).display()

In [None]:
# GRAPH 3 -- HEATMAP (SEASONALITY)
# Main critiques: "gray out" ones with no data, "highlight" seasons with lots of hate crimes
# NOTE: Decided these suggestions distract from the data

# 2001 is left out of the heatmap, as the surge after 9/11 doesn't contribute to seasonality
not_2001 = hatecrimes['data_year'] != 2001
seasonality_title = alt.Title(
    "Seasonality of Hate Crimes",
    subtitle="Excluding data from the year 2001",
)

# Grid of day-of-month (x) by month (y); colour is the number of incidents in each cell
alt.Chart(hatecrimes.loc[not_2001], title=seasonality_title).mark_rect().encode(
    x=alt.X('date(incident_date):O').title("Date"),
    y=alt.Y('month(incident_date):O').title("Month"),
    color=alt.Color('count()').title("Incidents").scale(scheme='orangered'),
).display()

In [None]:
# State-level inputs for the map
# The state name / ID table comes from the vega_datasets sample data
from vega_datasets import data
states = alt.topo_feature(data.us_10m.url, feature='states')
state_ids = data.population_engineers_hurricanes(usecols=['state', 'id'])

# Attach an ID to each incident via its state name, count incidents per ID,
# then merge the ID table back in to recover the state name
hatecrimes_w_ids = hatecrimes.merge(state_ids, left_on='state_name', right_on='state')
state_counts = (
    hatecrimes_w_ids
    .groupby(['id'], as_index=False)
    .size()
    .merge(state_ids)
)

# Join population on state name and add the per-capita column
# (incident count divided by population)
state_counts_w_pop = state_counts.merge(population, on="state").assign(
    percap=lambda counts: counts["size"] / counts["population"]
)

In [None]:
# GRAPH 4 -- US MAP
# Main critiques: normalize for population using per capita
states = alt.topo_feature(data.us_10m.url, 'states')

# Every column of the per-state table is looked up onto the shapes via the 'id' key
percap_lookup = alt.LookupData(state_counts_w_pop, 'id', list(state_counts_w_pop.columns))
map_title = alt.Title("Hate Crimes Across the U.S.", subtitle="Using 2020 population")

# Left as the cell's final expression so the notebook renders it
(
    alt.Chart(states, title=map_title)
    .mark_geoshape()
    .encode(alt.Color('percap:Q').scale(scheme='browns').title("Hate Crimes Per Capita"))
    .transform_lookup(lookup='id', from_=percap_lookup)
    .project(type='albersUsa')
    .properties(width=500, height=300)
)

In [None]:
# GRAPH 5 -- BAR CHART (OFFENDER RACE)
# Main critique: clarify "multiple races", deal with not specified/unknown, possibly change to percentages

# Removing 'Not Specified', changing "Multiple" to "Multiple Races".
# .assign() builds a new frame, so the relabel is never written into a slice
# of `hatecrimes` (which is what raises pandas' SettingWithCopyWarning).
hatecrimes_race = (
    hatecrimes
    .loc[hatecrimes['offender_race'] != "Not Specified"]
    .assign(offender_race=lambda df: df['offender_race'].replace("Multiple", "Multiple Races"))
)

# Chart: `total` is the row count of the filtered data and every row gets
# pct = 1/total, so sum(pct) per race is that race's share of the incidents.
alt.Chart(
    hatecrimes_race,
    title=alt.Title("Hate Crimes by Offender Race", subtitle="Among hate crimes where race was specified"),
).transform_joinaggregate(
    total='count(*)'
).transform_calculate(
    pct='1/datum.total'
).mark_bar().encode(
    alt.Y('offender_race:O', axis=alt.Axis(labelLimit=200)).title("Offender Race").sort('-x'),
    # x domain is fixed at 0-42.5% and tick labels are formatted as percentages
    alt.X('sum(pct):Q').title("Hate Crime Incidents").scale(domain=(0,0.425)).axis(format='%'),
    color=alt.value("#ad2e03")
).display()

# Help converting to percentages here https://stackoverflow.com/questions/56358977/how-to-show-a-histogram-of-percentages-instead-of-counts-using-altair

In [None]:
# Data prep for the "recent years" line chart
# Keep 2019-2023 only, rank the biases inside that window, keep the ten most common
years = list(range(2019, 2024))
hatecrimes1924 = hatecrimes.loc[hatecrimes['data_year'].isin(years)]
recent_bias_counts = hatecrimes1924.groupby(['bias_desc']).size()
topbiases_recent = list(recent_bias_counts.sort_values(ascending=False).head(10).index)
hatecrimestop10_rec = hatecrimes1924.loc[hatecrimes1924['bias_desc'].isin(topbiases_recent)]

# Manual legend sorting (hand-maintained list; it is not derived from topbiases_recent)
sorted_legend_rec = ['Anti-Black or African American', 'Anti-Jewish', 'Anti-Gay (Male)', 'Anti-White', 'Anti-Hispanic or Latino', 'Anti-Lesbian, Gay, Bisexual, or Transgender (Mixed Group)',
                     'Anti-Asian', 'Anti-Other Race/Ethnicity/Ancestry', 'Anti-Multiple Races, Group', 'Anti-Transgender']

In [None]:
# GRAPH 6 -- LINE CHART FOR 2019-2023 BY BIAS
# Main critique: legend sorting, vertical lines/dots on important events
# Credit to Dorka for helping me apply the lines/dots :) <3

# Main line chart: incident count per year-month, one line per bias
main_c = alt.Chart(hatecrimestop10_rec, title="Top Hate Crimes From 2019-2023").mark_line().encode(
    alt.X('yearmonth(incident_date):T').title('Month'),
    alt.Y('count()').title('Hate Crime Incidents'),
    alt.Color('bias_desc:N', legend=alt.Legend(labelLimit=300), sort=sorted_legend_rec).title("Bias")
).properties(height=500, width=1500)

# Annotated events, one entry each, so the text/line/dot layers are built from
# a single table instead of three copy-pasted blocks:
#   date   -- day of the event; positioned on the x axis by its year-month
#   label  -- text drawn for the event
#   offset -- dx/dy keyword arguments forwarded to mark_text for that label
#   dot_y  -- y value at which the event's dot is drawn
events = [
    {'date': '2020-05-25', 'label': "Murder of George Floyd",
     'offset': {'dx': -75, 'dy': 75}, 'dot_y': 250},
    {'date': '2021-03-16', 'label': "2021 Atlanta Spa Shootings",
     'offset': {'dx': -75, 'dy': 150}, 'dot_y': 125},
    {'date': '2023-10-07', 'label': "Hamas Attacks Israel",
     'offset': {'dx': -60}, 'dot_y': 325},
]

# Every annotation layer uses the same x encoding as the main chart
EVENT_X = 'yearmonth(incident_date):T'


def _event_text(event):
    """Return a text layer showing one event's label at its year-month."""
    frame = pd.DataFrame({'incident_date': [event['date']], 'label': [event['label']]})
    return alt.Chart(frame).mark_text(**event['offset']).encode(
        alt.X(EVENT_X),
        text='label:N')


def _event_rule(event):
    """Return a gray rule layer at one event's year-month."""
    frame = pd.DataFrame({'incident_date': [event['date']]})
    return alt.Chart(frame).mark_rule(color='#808080').encode(x=EVENT_X)


def _event_dot(event):
    """Return a black point layer at one event's year-month and `dot_y`."""
    frame = pd.DataFrame({'incident_date': [event['date']], 'y': [event['dot_y']]})
    return alt.Chart(frame).mark_point(color='#000000').encode(x=EVENT_X, y='y')


# Groups of text, lines and dots (one layer per event in each group)
text = alt.layer(*[_event_text(event) for event in events])
lines = alt.layer(*[_event_rule(event) for event in events])
dots = alt.layer(*[_event_dot(event) for event in events])

# Layering into a single chart
main_c + text + lines + dots

In [None]:
# GRAPH 7 - SCATTERPLOT?
# Main critique: consider 'zooming in', deal with outliers, make sure axis is same on both sides
# Keep only crimes with at least one individual ("people") victim, not just property crime
hatecrimes_people = hatecrimes.loc[hatecrimes['total_individual_victims'].gt(0)]

# Both axes use the same fixed domain; the marks are created with clip=True,
# which is how the chart clips off outliers beyond that range
axis_domain = (0,51)
scatter_title = alt.Title(
    'Number of Offenders vs. Number of Victims',
    subtitle='Among Crimes with <50 Victims and Offenders',
)
alt.Chart(hatecrimes_people, title=scatter_title).mark_circle(clip=True).encode(
    x=alt.X('total_offender_count').title("Total Offenders").scale(domain=axis_domain),
    y=alt.Y('total_individual_victims').title("Total Victims").scale(domain=axis_domain),
    color=alt.value("#ad2e03"),
).display()

In [None]:
# GRAPH 8 -- BAR CHART (LOCATIONS)
# Main critique: 'unknowns' here not relevant to story, add percentages like other bar graph
# Drop the 'Other/Unknown' location, rank what is left, keep the ten most common
hatecrimes_known = hatecrimes.loc[hatecrimes['location_name'] != "Other/Unknown"]
location_counts = hatecrimes_known.groupby(['location_name']).size()
top_locs = list(location_counts.sort_values(ascending=False).head(10).index)
# "Other/Unknown" can never be in top_locs, so filtering the known-location
# frame selects the same rows as filtering the full table
hatecrimestop10_loc = hatecrimes_known.loc[hatecrimes_known['location_name'].isin(top_locs)]

# Chart, using same percent method as previously cited: `total` is the row
# count of the top-10 data and each row gets pct = 1/total, so sum(pct) per
# location is its share of the incidents at these ten locations
location_shares = (
    alt.Chart(hatecrimestop10_loc, title="Top Locations of Hate Crimes")
    .transform_joinaggregate(total='count(*)')
    .transform_calculate(pct='1/datum.total')
)
location_shares.mark_bar().encode(
    y=alt.Y('location_name:O', axis=alt.Axis(labelLimit=200)).title("Location of Incident").sort('-x'),
    x=alt.X('sum(pct):Q').title("Hate Crime Incidents").scale(domain=(0,0.425)).axis(format='%'),
    color=alt.value("#ad2e03"),
).display()


In [None]:
# GRAPH 9 -- BAR CHART (OFFENSES)
# While this was not critiqued by peers, it is similar to my other bar charts
# I chose not to use percentages on this one to focus on the magnitude of
# different types of crimes

# Handling 'joint offenses' by focusing on the primary one (first charge before ;)
# NOTE: this overwrites `offense_name` on the shared `hatecrimes` frame, so
# anything run after this cell sees only the primary offense.
hatecrimes['offense_name'] = hatecrimes['offense_name'].str.split(';', n=1).str[0]

# Ten most frequent primary offenses
topcrimes = (
    hatecrimes.groupby(['offense_name'])
    .size()
    .sort_values(ascending=False)
    .head(10)
    .index
    .tolist()
)

# Bar length is the incident count, typed quantitative (:Q) so bars are drawn
# on a continuous numeric axis and the '-x' sort orders offenses by that count.
alt.Chart(hatecrimes.loc[hatecrimes['offense_name'].isin(topcrimes)], title="Top 10 Charges in Hate Crimes").mark_bar().encode(
    alt.Y('offense_name:O', axis=alt.Axis(labelLimit=200)).title("Offense").sort('-x'),
    alt.X('count():Q').title("Hate Crime Incidents"),
    color=alt.value("#ad2e03")
).display()