In [1]:
import datetime as dt
import pandas as pd
import numpy as np

The data was scraped from the Gun Violence Archive by github user jamesqo. It contains data for all recorded gun violence incidents in the US between January 2013 and March 2018, with the notable exception of the 2017 Vegas mass shooting.
Gun Violence Archive: https://www.gunviolencearchive.org/
Github Repository: https://github.com/jamesqo/gun-violence-data

In [2]:
gun_violence_filepath = "gun_violence_data.csv"
df = pd.read_csv(gun_violence_filepath)
df.head()

Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district
0,461105,2013-01-01,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,http://www.gunviolencearchive.org/incident/461105,http://www.post-gazette.com/local/south/2013/0...,False,...,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||3::Male||4::Female,0::Julian Sims,,0::Arrested||1::Injured||2::Injured||3::Injure...,0::Victim||1::Victim||2::Victim||3::Victim||4:...,http://pittsburgh.cbslocal.com/2013/01/01/4-pe...,,
1,460726,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,http://www.gunviolencearchive.org/incident/460726,http://www.dailybulletin.com/article/zz/201301...,False,...,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male,0::Bernard Gillis,,0::Killed||1::Injured||2::Injured||3::Injured,0::Victim||1::Victim||2::Victim||3::Victim||4:...,http://losangeles.cbslocal.com/2013/01/01/man-...,62.0,35.0
2,478855,2013-01-01,Ohio,Lorain,1776 East 28th Street,1,3,http://www.gunviolencearchive.org/incident/478855,http://chronicle.northcoastnow.com/2013/02/14/...,False,...,0::25||1::31||2::33||3::34||4::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||2::Male||3::Male||4::Male,0::Damien Bell||1::Desmen Noble||2::Herman Sea...,,"0::Injured, Unharmed, Arrested||1::Unharmed, A...",0::Subject-Suspect||1::Subject-Suspect||2::Vic...,http://www.morningjournal.com/general-news/201...,56.0,13.0
3,478925,2013-01-05,Colorado,Aurora,16000 block of East Ithaca Place,4,0,http://www.gunviolencearchive.org/incident/478925,http://www.dailydemocrat.com/20130106/aurora-s...,False,...,0::29||1::33||2::56||3::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Female||1::Male||2::Male||3::Male,0::Stacie Philbrook||1::Christopher Ratliffe||...,,0::Killed||1::Killed||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,http://denver.cbslocal.com/2013/01/06/officer-...,40.0,28.0
4,478959,2013-01-07,North Carolina,Greensboro,307 Mourning Dove Terrace,2,2,http://www.gunviolencearchive.org/incident/478959,http://www.journalnow.com/news/local/article_d...,False,...,0::18||1::46||2::14||3::47,0::Adult 18+||1::Adult 18+||2::Teen 12-17||3::...,0::Female||1::Male||2::Male||3::Female,0::Danielle Imani Jameison||1::Maurice Eugene ...,3::Family,0::Injured||1::Injured||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,http://myfox8.com/2013/01/08/update-mother-sho...,62.0,27.0


## Getting Rid of Useless Columns

In [3]:
print(df.columns)

Index(['incident_id', 'date', 'state', 'city_or_county', 'address', 'n_killed',
       'n_injured', 'incident_url', 'source_url',
       'incident_url_fields_missing', 'congressional_district', 'gun_stolen',
       'gun_type', 'incident_characteristics', 'latitude',
       'location_description', 'longitude', 'n_guns_involved', 'notes',
       'participant_age', 'participant_age_group', 'participant_gender',
       'participant_name', 'participant_relationship', 'participant_status',
       'participant_type', 'sources', 'state_house_district',
       'state_senate_district'],
      dtype='object')


In [4]:
# remove unnecessary columns to free up space/make df cleaner
columns = ["location_description", "notes", "incident_id", "incident_url", "source_url", "incident_url_fields_missing", "participant_name", "sources"]
df.drop(columns=columns, inplace=True)
df.head()

Unnamed: 0,date,state,city_or_county,address,n_killed,n_injured,congressional_district,gun_stolen,gun_type,incident_characteristics,...,longitude,n_guns_involved,participant_age,participant_age_group,participant_gender,participant_relationship,participant_status,participant_type,state_house_district,state_senate_district
0,2013-01-01,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,14.0,,,Shot - Wounded/Injured||Mass Shooting (4+ vict...,...,-79.8559,,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||3::Male||4::Female,,0::Arrested||1::Injured||2::Injured||3::Injure...,0::Victim||1::Victim||2::Victim||3::Victim||4:...,,
1,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,43.0,,,"Shot - Wounded/Injured||Shot - Dead (murder, a...",...,-118.333,,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male,,0::Killed||1::Injured||2::Injured||3::Injured,0::Victim||1::Victim||2::Victim||3::Victim||4:...,62.0,35.0
2,2013-01-01,Ohio,Lorain,1776 East 28th Street,1,3,9.0,0::Unknown||1::Unknown,0::Unknown||1::Unknown,"Shot - Wounded/Injured||Shot - Dead (murder, a...",...,-82.1377,2.0,0::25||1::31||2::33||3::34||4::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||2::Male||3::Male||4::Male,,"0::Injured, Unharmed, Arrested||1::Unharmed, A...",0::Subject-Suspect||1::Subject-Suspect||2::Vic...,56.0,13.0
3,2013-01-05,Colorado,Aurora,16000 block of East Ithaca Place,4,0,6.0,,,"Shot - Dead (murder, accidental, suicide)||Off...",...,-104.802,,0::29||1::33||2::56||3::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Female||1::Male||2::Male||3::Male,,0::Killed||1::Killed||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,40.0,28.0
4,2013-01-07,North Carolina,Greensboro,307 Mourning Dove Terrace,2,2,6.0,0::Unknown||1::Unknown,0::Handgun||1::Handgun,"Shot - Wounded/Injured||Shot - Dead (murder, a...",...,-79.9569,2.0,0::18||1::46||2::14||3::47,0::Adult 18+||1::Adult 18+||2::Teen 12-17||3::...,0::Female||1::Male||2::Male||3::Female,3::Family,0::Injured||1::Injured||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,62.0,27.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239677 entries, 0 to 239676
Data columns (total 21 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   date                      239677 non-null  object 
 1   state                     239677 non-null  object 
 2   city_or_county            239677 non-null  object 
 3   address                   223180 non-null  object 
 4   n_killed                  239677 non-null  int64  
 5   n_injured                 239677 non-null  int64  
 6   congressional_district    227733 non-null  float64
 7   gun_stolen                140179 non-null  object 
 8   gun_type                  140226 non-null  object 
 9   incident_characteristics  239351 non-null  object 
 10  latitude                  231754 non-null  float64
 11  longitude                 231754 non-null  float64
 12  n_guns_involved           140226 non-null  float64
 13  participant_age           147379 non-null  o

## Extracting Useful Data as Booleans

The original dataframe has most of its data stored as strings that make it really hard to filter the data using pandas' built in methods. Therefore I had to do the costly process of going through every row in the "incident characteristics" column and extract useful information to store in a new column as booleans.

In [6]:
df.incident_characteristics[1]

'Shot - Wounded/Injured||Shot - Dead (murder, accidental, suicide)||Mass Shooting (4+ victims injured or killed excluding the subject/suspect/perpetrator, one location)||Gang involvement'

In [7]:
"Gang involvement" in df.incident_characteristics[1]

True

In [8]:
# parsing data for keywords

suicide = [False] * len(df)
mass_shooting = [False] * len(df)
gang = [False] * len(df)
wounded = [False] * len(df)
dead = [False] * len(df)

for i in range(len(df.incident_characteristics)):
    if pd.isna(df.incident_characteristics.iloc[i]) == False:
        if "mass shooting" in df.incident_characteristics[i].lower():
            mass_shooting[i] = True
        if "suicide" in df.incident_characteristics[i].lower():
            suicide[i] = True
        if "gang involvement" in df.incident_characteristics[i].lower():
            gang[i] = True
        if "dead" in df.incident_characteristics[i].lower():
            dead[i] = True
        if "wounded" in df.incident_characteristics[i].lower():
            wounded[i] = True
            
df["suicide"] = suicide
df["mass shooting"] = mass_shooting
df["gang"] = gang
df["wounded"] = wounded
df["dead"] = dead
df.head()

Unnamed: 0,date,state,city_or_county,address,n_killed,n_injured,congressional_district,gun_stolen,gun_type,incident_characteristics,...,participant_relationship,participant_status,participant_type,state_house_district,state_senate_district,suicide,mass shooting,gang,wounded,dead
0,2013-01-01,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,14.0,,,Shot - Wounded/Injured||Mass Shooting (4+ vict...,...,,0::Arrested||1::Injured||2::Injured||3::Injure...,0::Victim||1::Victim||2::Victim||3::Victim||4:...,,,False,True,False,True,False
1,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,43.0,,,"Shot - Wounded/Injured||Shot - Dead (murder, a...",...,,0::Killed||1::Injured||2::Injured||3::Injured,0::Victim||1::Victim||2::Victim||3::Victim||4:...,62.0,35.0,True,True,True,True,True
2,2013-01-01,Ohio,Lorain,1776 East 28th Street,1,3,9.0,0::Unknown||1::Unknown,0::Unknown||1::Unknown,"Shot - Wounded/Injured||Shot - Dead (murder, a...",...,,"0::Injured, Unharmed, Arrested||1::Unharmed, A...",0::Subject-Suspect||1::Subject-Suspect||2::Vic...,56.0,13.0,True,False,False,True,True
3,2013-01-05,Colorado,Aurora,16000 block of East Ithaca Place,4,0,6.0,,,"Shot - Dead (murder, accidental, suicide)||Off...",...,,0::Killed||1::Killed||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,40.0,28.0,True,False,False,False,True
4,2013-01-07,North Carolina,Greensboro,307 Mourning Dove Terrace,2,2,6.0,0::Unknown||1::Unknown,0::Handgun||1::Handgun,"Shot - Wounded/Injured||Shot - Dead (murder, a...",...,3::Family,0::Injured||1::Injured||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,62.0,27.0,True,False,False,True,True


In [9]:
df[df["mass shooting"]==True].head()

Unnamed: 0,date,state,city_or_county,address,n_killed,n_injured,congressional_district,gun_stolen,gun_type,incident_characteristics,...,participant_relationship,participant_status,participant_type,state_house_district,state_senate_district,suicide,mass shooting,gang,wounded,dead
0,2013-01-01,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,14.0,,,Shot - Wounded/Injured||Mass Shooting (4+ vict...,...,,0::Arrested||1::Injured||2::Injured||3::Injure...,0::Victim||1::Victim||2::Victim||3::Victim||4:...,,,False,True,False,True,False
1,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,43.0,,,"Shot - Wounded/Injured||Shot - Dead (murder, a...",...,,0::Killed||1::Injured||2::Injured||3::Injured,0::Victim||1::Victim||2::Victim||3::Victim||4:...,62.0,35.0,True,True,True,True,True
5,2013-01-07,Oklahoma,Tulsa,6000 block of South Owasso,4,0,1.0,,,"Shot - Dead (murder, accidental, suicide)||Hom...",...,,0::Killed||1::Killed||2::Killed||3::Killed||4:...,0::Victim||1::Victim||2::Victim||3::Victim||4:...,72.0,11.0,True,True,False,False,True
6,2013-01-19,New Mexico,Albuquerque,2806 Long Lane,5,0,1.0,0::Unknown||1::Unknown,0::22 LR||1::223 Rem [AR-15],"Shot - Dead (murder, accidental, suicide)||Mas...",...,5::Family,0::Killed||1::Killed||2::Killed||3::Killed||4:...,0::Victim||1::Victim||2::Victim||3::Victim||4:...,10.0,14.0,True,True,False,False,True
7,2013-01-21,Louisiana,New Orleans,LaSalle Street and Martin Luther King Jr. Boul...,0,5,2.0,,,Shot - Wounded/Injured||Drive-by (car to stree...,...,,0::Injured||1::Injured||2::Injured||3::Injured...,0::Victim||1::Victim||2::Victim||3::Victim||4:...,93.0,5.0,False,True,False,True,False


In [10]:
df.to_pickle("gunviolence.pickle")

## Changing Index to Datetime

In [11]:
# # change index to datetime index
# newDate = pd.to_datetime(df.date)
# df.drop(columns=["date"], inplace=True)
# df.set_index(newDate, inplace=True)
# df["2015"].head()

In [12]:
# df = df[df.latitude != np.nan]
# df = df[df.longitude != np.nan]

In [13]:
# df.to_pickle("gunviolence.pickle")