# Police Shootings in America Analysis
---
This notebook is meant to analyze The Washington Post's fatal police shootings dataset. The dataset is hosted on Github [here](https://github.com/washingtonpost/data-police-shootings) and covers fatal police shootings in America starting from 2015 and is continuously updated. Go [here](https://www.washingtonpost.com/national/how-the-washington-post-is-examining-police-shootings-in-the-united-states/2016/07/07/d9c52238-43ad-11e6-8856-f26de2537a9d_story.html?utm_term=.24bb56ea1c26) to read more about the Washington Post's methodology. This notebook is using the dataset available as of July 12, 2018.

This notebook focuses mostly on visualizations, after the data has been prepared. The notebook `DataPreparation.ipynb` should be run first, unless you are just viewing. It takes a first look at the data and geocodes the cities/states into latitude/longitude coordinates, and creates some new features that are used for some visualizations below.

In [1]:
import pandas as pd
import numpy as np
import plotly
import plotly.graph_objs as go
from IPython.display import display
import utils
import ipywidgets as widgets

# Switch plotly into offline notebook mode before any figure is drawn
plotly.offline.init_notebook_mode(connected=True)

# Report the library versions this notebook is running against
for label, module in (('Pandas', pd), ('Numpy', np), ('Plotly', plotly)):
    print(label + ' version:', module.__version__)  # Plotly was 3.5.0 when authored

Pandas version: 0.20.3
Numpy version: 1.14.2
Plotly version: 3.5.0


In [2]:
# Load data from pickle file
# NOTE(review): load_data() lives in the project-local `utils` module, which is not shown
# here; presumably it returns the prepared DataFrame written by DataPreparation.ipynb
# (with the geocoded lat/lon and derived columns used below) -- confirm against utils.py.
data = utils.load_data()

## Geographic Map of Shootings
---
Plot a map of the US and overlay where shootings occur. Areas with more shootings show up as darker dots. There are 1819 unique cities in the dataset and 51 unique states (the District of Columbia, DC, is counted as a state). There are buttons to switch between viewing shootings by state and by year. Click on the state names to the right of the plot to toggle traces, or use the slider at the bottom of the plot to change the year.

In [3]:
# Plot geographic scatter map

def _make_city_trace(data_sub, name, rgb, visible=None):
    """Build one Scattergeo marker trace for a subset of the shootings data.

    Parameters
    ----------
    data_sub : DataFrame
        Rows to plot. Reads the 'city', 'state', 'num_shootings',
        'common_armed', 'lon' and 'lat' columns.
    name : str
        Trace name (shown in the legend).
    rgb : str
        Colour string used for both the marker fill and its outline.
    visible : bool or None
        Passed straight to the trace; None leaves plotly's default.

    Returns
    -------
    go.Scattergeo
    """
    # Hover text: "<city>, <state>", shooting count and most common weapon
    text = data_sub['city'] + ', ' + data_sub['state'] + '<br>Num Shootings: ' + \
        data_sub['num_shootings'].astype(str) + '<br>Most common weapon: ' + data_sub['common_armed'].astype(str)
    return go.Scattergeo(locationmode = 'USA-states',
                         visible = visible,
                         lon = data_sub['lon'],
                         lat = data_sub['lat'],
                         text = text,
                         mode = 'markers',
                         marker = go.scattergeo.Marker(
                             size = 8,
                             color = rgb,
                             # Semi-transparent markers: overlapping points look darker
                             opacity = 0.4,
                             line = go.scattergeo.marker.Line(
                                 width=0.5,
                                 color=rgb
                             )
                         ),
                         name = name
                        )

# Create traces for each state (ordered from most to fewest records, as value_counts returns them)
cities = [_make_city_trace(data.loc[data['state'] == state], state, 'rgb(255, 102, 102)')
          for state in data['state'].value_counts().index]

# Create one trace per year, in chronological order; hidden until the "Years" mode is selected
cities_year = [_make_city_trace(data.loc[data['date'].dt.year == year], str(year), 'rgb(0, 153, 51)', visible=False)
               for year in sorted(data['date'].dt.year.value_counts().index)]

# The earlier plotly-native controls (layout sliders + updatemenus buttons) were removed:
# they were kept only as an unused string literal and hard-coded five year traces.
# The ipywidgets slider and toggle created further down replace them.

# Create layout
layout = go.Layout(title = 'US police shootings (2015-Present)<br>(Click legend to toggle traces)',
                   font = dict(family = "Overpass"),
                   geo = go.layout.Geo(scope = 'usa',
                                       projection = go.layout.geo.Projection(type='albers usa'),
                                       showland=True,
                                       showlakes = True,
                                       landcolor = 'rgb(217, 217, 217)',
                                       subunitwidth=1,
                                       countrywidth=1,
                                       subunitcolor="rgb(255, 255, 255)",
                                       countrycolor="rgb(255, 255, 255)"
                                      ),
                  )

# State traces first, then year traces; the visibility lists below rely on this order
trace_data = cities + cities_year

# A FigureWidget (not a plain Figure) so the widget callbacks can update the plot in place
fig = go.FigureWidget(data=trace_data, layout=layout)

# One slider option per year trace, taken from the traces themselves so the slider
# can never offer a year that has no trace (or miss one that does)
year_labels = [trace.name for trace in cities_year]

slider = widgets.SelectionSlider(options=year_labels,
                                 index=0,
                                 description='Year:',
                                 disabled=True,
                                 orientation='horizontal',
                                 continuous_update=False,
                                 readout=True,
                                 layout=dict(visibility='hidden'))

def set_year(*_):
    """Show only the year trace selected on the slider and hide every state trace."""
    year_visibility = ['legendonly'] * len(cities_year)
    year_visibility[slider.index] = True
    fig.plotly_restyle({'visible': ([False] * len(cities)) + year_visibility})

# React only to a new selection, not to unrelated trait changes such as `disabled`
slider.observe(set_year, names='value')

# Create mode toggle
def set_mode(*_):
    """Switch the map between the per-state view and the per-year view."""
    if mode_toggle.value == 'States':
        # Per-state view: hide the slider, show all state traces and the legend
        slider.disabled = True
        slider.layout.visibility = 'hidden'
        fig.plotly_update({'visible': ([True] * len(cities)) + ([False] * len(cities_year)), 'showlegend': True},
                          {'title': 'US police shootings (2015-Present)<br>(Click legend to toggle traces)'})
    else:
        # Per-year view: reveal the slider and show the currently selected year
        slider.disabled = False
        slider.layout.visibility = 'visible'
        fig.plotly_update({'showlegend': False},
                          {'title': 'US police shootings (2015-Present)<br>(Drag slider to change year)'})
        set_year()

mode_toggle = widgets.ToggleButtons(
    options=['States', 'Years'],
    disabled=False)


# React only when the selected mode actually changes
mode_toggle.observe(set_mode, names='value')

# Create widget layout (last expression, so the notebook displays it)
widgets.VBox([fig, mode_toggle, slider])

VBox(children=(FigureWidget({
    'data': [{'lat': array([37.7792808, 37.9577016, 33.954737 , ..., 32.7425516,…

## Shootings by State
---
How many fatal shootings are there per state?

This bar plot shows the total number of shootings that occur by state. The state with the highest number of shootings is on the left side of the graph. California has the highest number of shootings by far, 610 shootings compared to 340 shootings in Texas.

In [4]:
# Plot # of shootings per state
# value_counts() already holds the total per state, sorted from most to fewest,
# so there is no need to re-filter the frame once per state.
state_counts = data['state'].value_counts()
x = state_counts.index.tolist()
y = state_counts.values.tolist()

trace_data = [go.Bar(x=x,
                     y=y,
                     textposition = 'auto',
                     marker=dict(
                         color='rgb(172,83,231)',
                         line=dict(
                             color='rgb(124,26,229)',
                             width=1.5),
                     ),
                     opacity=0.6
                    )]

layout = go.Layout(font = dict(family = "Overpass"),
                   title = "Number of shootings per state",
                   xaxis = dict(tickangle=-45,
                              title = "State"),
                   yaxis = dict(title = "Number of shootings")
)

fig = go.FigureWidget(data=trace_data, layout=layout)
plotly.offline.iplot(fig, validate=False, show_link=False, image='png', filename='num-shootings-per-state')

## Signs of Mental Illness & Common Weapon
---
Is someone who shows signs of mental illness more likely to have a weapon? What kind of weapon?

The dataset includes a feature that tells whether or not a person showed signs of mental illness. The next plot looks at the distribution of people showing signs of mental illness and the most common weapon ('armed' column) for people who did show signs of mental illness. 

Gun is the most common, which is also the most common weapon for the entire dataset. Same for the second most common item in plot which is knife, but the third one is different from the whole dataset: toy weapon. Toy weapon can include airsoft guns, BB guns, and replica guns and are difficult to discern from the [real thing](https://www.washingtonpost.com/investigations/in-two-years-police-killed-86-people-brandishing-guns-that-look-real--but-arent/2016/12/18/ec005c3a-b025-11e6-be1c-8cec35b1ad25_story.html?utm_term=.0cf19c6648fc).

In [5]:
# Split the records by the signs_of_mental_illness flag. The explicit == True / == False
# comparisons are kept so rows where the flag is missing land in neither group.
ppl_with_mi = data.loc[data['signs_of_mental_illness'] == True]
ppl_with_no_mi = data.loc[data['signs_of_mental_illness'] == False]

# Weapon counts for people who showed signs of mental illness, most common first.
# value_counts() already provides the totals, so no per-weapon re-scan is needed.
weapon_counts = ppl_with_mi['armed'].value_counts()
x = weapon_counts.index
y = weapon_counts.values.tolist()

# Create trace for 'armed' distribution
trace_data_mi = go.Bar(x = x,
                       y = y,
                       marker = dict(
                           color = 'rgb(158,202,225)',
                           line = dict(
                               color = 'rgb(8,48,107)',
                               width = 1.5),
                       ),
                       opacity = 0.6,
                       name = 'Weapon distribution for people showing signs of mental illness'
                      )

# Create trace for mental illness distribution (drawn on the inset axes x2/y2)
trace_total_mi = go.Bar(x = ['Signs of mental illness', 'No signs of mental illness'],
                        y = [len(ppl_with_mi), len(ppl_with_no_mi)],
                        text = [len(ppl_with_mi), len(ppl_with_no_mi)],
                        textposition = 'auto',
                        marker = dict(
                            color = 'rgb(33, 181, 179)',
                            line = dict(
                                color = 'rgb(8,48,107)',
                                width = 1.5)
                        ),
                        opacity = 0.7,
                        xaxis='x2',
                        yaxis='y2',
                        name = 'Distribution of people showing signs of mental illness'
                       )

# Create layout; xaxis2/yaxis2 confine the second bar chart to a sub-region of the figure
layout = go.Layout(font = dict(family = "Overpass"),
                   title = "Common weapon for people showing signs of mental illness",
                   legend=dict(x=.6, y=0.2),
                   xaxis=dict(tickangle=-45),
                   yaxis = dict(title = "Number of shootings"),
                   margin = dict(b = 90),
                   xaxis2=dict(
                       domain=[0.6, 0.95],
                       anchor='y2'
                   ),
                   yaxis2=dict(
                       domain=[0.6, 0.95],
                       anchor='x2',
                   )
                  )

fig = go.FigureWidget(data=[trace_data_mi, trace_total_mi], layout=layout)
plotly.offline.iplot(fig , validate=False, show_link=False, image='png', filename='mental-illness-armed')

## Feature Distributions
---

It is often useful to look at how many times a certain value appears in a column, especially if the column has categorical values.

The next few bar plots show various distributions for some of the columns in the dataset.
* **Gender distribution**: Look at how many M/F and missing values are in the entire dataset. "Missing Data" means the gender information is missing from the dataset. In the dataset this is represented as NaN (not a number).
* **Race distribution**: Look at how many fatal shootings for each race occur in the dataset.
* **Manner of death distribution**: See how many different manners of death there are in the entire dataset.
* **Flee distribution**: Data is recorded whether the person shot was trying to flee from the police.

In [6]:
# Gender distribution
# Count every gender value; NaN is kept so missing records get a bar of their own
gender_counts = data['gender'].value_counts(dropna=False)
y = gender_counts.values
x = gender_counts.index.fillna('Missing Data')

# Bar trace, each bar labelled with its own count
trace_data = go.Bar(
    x=x,
    y=y,
    text=y,
    textposition='auto',
    opacity=0.6,
    marker={
        'color': 'rgb(158,202,225)',
        'line': {'color': 'rgb(8,48,107)', 'width': 1.5},
    },
)

# Figure layout: title and axis labels
layout = go.Layout(
    title="Gender distribution",
    font={'family': "Overpass"},
    xaxis={'title': "Gender"},
    yaxis={'title': "Number of shootings"},
)
fig = go.FigureWidget(layout=layout, data=[trace_data])
plotly.offline.iplot(fig, filename='gender-distribution', image='png', show_link=False, validate=False)

In [7]:
# Race distribution
# Replace abbreviations with full names. easier to read on plot
# Assign the result back instead of calling replace(inplace=True) on the column selection:
# the in-place form is not guaranteed to write through to `data` (it does not under pandas
# Copy-on-Write). Later cells read these full names from data['race'], and re-running this
# cell is harmless because the full names no longer match any abbreviation.
RACE_NAMES = {'W': 'White',
              'B': 'Black',
              'A': 'Asian',
              'N': 'Native American',
              'H': 'Hispanic',
              'O': 'Other'}
data['race'] = data['race'].replace(RACE_NAMES)

# Count every race value; NaN is kept so missing records get a bar of their own
race_counts = data['race'].value_counts(dropna=False)
x = race_counts.index.fillna('Missing Data')
y = race_counts.values

# Create trace
trace_data = go.Bar(x=x,
                    y=y,
                    text=y,
                    textposition = 'auto',
                    marker=dict(
                        color='rgb(158,202,225)',
                        line=dict(
                            color='rgb(8,48,107)',
                            width=1.5),
                    ),
                    opacity=0.6
                   )

# Create layout
layout = go.Layout(font = dict(family = "Overpass"),
                   title = "Race distribution",
                   xaxis = dict(title = "Race"),
                   yaxis = dict(title = "Number of shootings")
                  )
fig = go.FigureWidget(data=[trace_data], layout=layout)
plotly.offline.iplot(fig , validate=False, show_link=False, image='png', filename='race-distribution')

In [8]:
# Manner of death distribution
# Count every manner-of-death value; NaN is kept so missing records get their own bar
manner_counts = data['manner_of_death'].value_counts(dropna=False)
y = manner_counts.values
x = manner_counts.index.fillna('Missing Data')

# Bar trace, each bar labelled with its own count
trace_data = go.Bar(
    x=x,
    y=y,
    text=y,
    textposition='auto',
    opacity=0.6,
    marker={
        'color': 'rgb(241,140,39)',
        'line': {'color': 'rgb(241,125,8)', 'width': 1.5},
    },
)

# Figure layout: title and axis labels
layout = go.Layout(
    title="Manner of death distribution",
    font={'family': "Overpass"},
    xaxis={'title': "Manner of death"},
    yaxis={'title': "Number of shootings"},
)
fig = go.FigureWidget(layout=layout, data=[trace_data])
plotly.offline.iplot(fig, filename='manner-death-distribution', image='png', show_link=False, validate=False)

In [9]:
# Flee distribution
# Count every flee value; NaN is kept so missing records get their own bar
flee_counts = data['flee'].value_counts(dropna=False)
y = flee_counts.values
x = flee_counts.index.fillna('Missing Data')

# Bar trace, each bar labelled with its own count
trace_data = go.Bar(
    x=x,
    y=y,
    text=y,
    textposition='auto',
    opacity=0.6,
    marker={
        'color': 'rgb(8,240,215)',
        'line': {'color': 'rgb(9,220,198)', 'width': 1.5},
    },
)

# Figure layout: title and axis labels
layout = go.Layout(
    title="Flee distribution",
    font={'family': "Overpass"},
    xaxis={'title': "Flee"},
    yaxis={'title': "Number of shootings"},
)
fig = go.FigureWidget(layout=layout, data=[trace_data])
plotly.offline.iplot(fig, filename='flee-distribution', image='png', show_link=False, validate=False)

## Age vs. Race
---

Look at each race in the dataset and plot the age of each person. Here we can easily see the oldest and youngest person killed and the density of the ages too (darker dots means more people in that age range).

In [10]:
# Working frame for this plot only: keep every record whose age AND race are both known.
# Only these two columns are plotted, so a missing value in any other column must not
# remove a person from the chart. dropna() returns a new frame, so `data` is untouched.
temp = data.dropna(subset=['age', 'race'])
trace1 = go.Scatter(x = temp['age'].astype(int),
                    y = temp['race'],
                    mode = 'markers',
                    # Semi-transparent markers: ages shared by many people look darker
                    marker = dict(color = "blue",
                                  opacity=0.3,
                                  size = 12),
                    name = "Age vs. Race",
                   )

layout = go.Layout(font = dict(family = "Overpass"),
                   title = "Age vs. Race",
                   hovermode = 'closest',
                   # Wider left margin so the race labels on the y axis are not clipped
                   margin = dict(l = 100),
                   xaxis = dict(title = 'Age')
                  )

fig = go.FigureWidget(data=[trace1], layout=layout)
plotly.offline.iplot(fig, show_link=False, image='png', filename='age-vs-race')

## Cities with Bodycams
---

Where in America are bodycams being used?

This bar plot looks at cities where a bodycam was being used. The number of shootings is counted per city, and the top 5 cities are plotted with the city name on the x axis and the state shown inside each bar.

Las Vegas has 17 shootings where police officer(s) had a body camera while Los Angeles came in second with 12.

In [11]:
# Rank (city, state) pairs by the number of shootings recorded with a body camera
# and keep the five largest
top_cities = (
    data.loc[data['body_camera'] == True]
    .groupby(["city", "state"])
    .size()
    .sort_values(ascending=False)
    .iloc[:5]
)
y = top_cities.values
x = top_cities.index.get_level_values(level='city')

# One bar per city; the state abbreviation is printed on the bar
trace_data = [
    go.Bar(
        x=x,
        y=y,
        text=top_cities.index.get_level_values(level='state'),
        textposition='auto',
        opacity=0.6,
        marker={
            'color': 'rgb(147,216,68)',
            'line': {'color': 'rgb(112,180,33)', 'width': 1.5},
        },
    )
]

# Figure layout: title, slanted city labels, axis titles
layout = go.Layout(
    title="Cities with most bodycam shootings",
    font={'family': "Overpass"},
    xaxis={'title': "Cities", 'tickangle': -45},
    yaxis={'title': "Number of shootings"},
)

fig = go.FigureWidget(layout=layout, data=trace_data)
plotly.offline.iplot(fig, filename='cities-bodycam', image='png', show_link=False, validate=False)

## Race over Time
---
How many fatal shootings occur over time by race?

Most of the data points contain information about the person's race. The plot below shows the number of shootings that happen in a given month for each race, not including missing data. In general, Native American, Asian, and Other have a low number of shootings. There is also a drop in the number of shootings for all races in December of 2017.

In [12]:
# Race over time
trace_data_total = []

# Month and year of every record, named so the grouped index levels are unambiguous
shooting_month = data['date'].dt.month.rename('month')
shooting_year = data['date'].dt.year.rename('year')

# Span covered by the dataset: January of the first year through the last month
# present in the most recent year (ex. June, 2018). Derived from the data so the
# cell keeps working whichever years the dataset happens to contain.
start_year = int(shooting_year.min())
end_year = int(shooting_year.max())
end_month = int(shooting_month[shooting_year == end_year].max())

# Every (month, year) pair in that span, in chronological order
month_year_index = pd.MultiIndex.from_tuples(
    [(month, year)
     for year in range(start_year, end_year + 1)
     for month in range(1, 13)
     if year < end_year or month <= end_month],
    names=['month', 'year'])

# Shared x axis labels, formatted "month/year"
x = ['{}/{}'.format(month, year) for month, year in month_year_index]

# Seeded generator so the line colours are the same on every run
rng = np.random.RandomState(0)

for race in data['race'].value_counts().index:
    # Shootings per (month, year) for this race; months with no records become 0
    race_counts = (data.loc[data['race'] == race]
                   .groupby([shooting_month, shooting_year])
                   .size()
                   .reindex(month_year_index, fill_value=0))
    y = race_counts.values
    trace_data = go.Scatter(x = x,
                            y = y,
                            name = race,
                            mode = "lines+markers",
                            line = dict(
                                color = 'rgb({},{},{})'.format(*rng.randint(1, 256, size=3)),
                                width = 4)
                           )
    trace_data_total.append(trace_data)

layout = go.Layout(font = dict(family = "Overpass"),
                   title = "Race over time",
                   showlegend = True,
                   xaxis = dict(title = "Time",
                                tickangle = -45),
                   yaxis = dict(title = "Number of shootings")
                  )

fig = go.FigureWidget(data=trace_data_total, layout=layout)
plotly.offline.iplot(fig , validate=False, show_link=False, image='png', filename='race-time')