In [13]:
import datetime as dt
import pandas as pd
import numpy as np

The data was scraped from the Gun Violence Archive by GitHub user jamesqo. It contains all recorded gun violence incidents in the US between January 2013 and March 2018, with the notable exception of the 2017 Las Vegas mass shooting. The 2013 portion of the scrape is not reliable, so only incidents from 2014 onward are kept in this notebook.

Gun Violence Archive: https://www.gunviolencearchive.org/

Github Repository: https://github.com/jamesqo/gun-violence-data

In [14]:
# Read the scraped incidents and tag each one with its calendar year.
gun_violence_filepath = "gunViolenceData.csv"
df = pd.read_csv(gun_violence_filepath)
df["year"] = pd.DatetimeIndex(df["date"]).year

# Keep only rows outside the District of Columbia that fall after 2013.
# reset_index() renumbers the rows from 0 and keeps the previous row labels
# in a column named "index".
df = (
    df[(df["state"] != "District of Columbia") & (df["year"] > 2013)]
    .reset_index()
)
df.head()

Unnamed: 0,index,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,...,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district,year
0,278,95289,2014-01-01,Michigan,Muskegon,300 block of Monroe Avenue,0,0,http://www.gunviolencearchive.org/incident/95289,http://www.mlive.com/news/muskegon/index.ssf/2...,...,0::Adult 18+,0::Female,,,0::Unharmed,0::Victim,http://www.mlive.com/news/muskegon/index.ssf/2...,92.0,34.0,2014
1,279,92401,2014-01-01,New Jersey,Newark,Central Avenue,0,0,http://www.gunviolencearchive.org/incident/92401,http://www.nj.com/essex/index.ssf/2014/01/newa...,...,,,,,,,http://www.nj.com/essex/index.ssf/2014/01/newa...,29.0,29.0,2014
2,280,92383,2014-01-01,New York,Queens,113th Avenue,1,0,http://www.gunviolencearchive.org/incident/92383,http://www.timesledger.com/stories/2014/2/firs...,...,0::Adult 18+||1::Adult 18+,0::Male||1::Male,0::Julio Mora||1::Sheldon Smith,,0::Killed||1::Unharmed,0::Victim||1::Subject-Suspect,http://www.timesledger.com/stories/2014/2/firs...,33.0,14.0,2014
3,281,92142,2014-01-01,New York,Brooklyn,St. Johns Place,0,1,http://www.gunviolencearchive.org/incident/92142,http://www.nydailynews.com/new-york/nyc-crime/...,...,0::Adult 18+||1::Adult 18+,0::Male||1::Male,,,0::Injured,0::Victim||1::Subject-Suspect,http://www.nydailynews.com/new-york/nyc-crime/...,43.0,20.0,2014
4,282,95261,2014-01-01,Missouri,Springfield,Beverly Hills and Temple,0,1,http://www.gunviolencearchive.org/incident/95261,http://www.ozarksfirst.com/story/deputies-6-ye...,...,0::Child 0-11||1::Teen 12-17,0::Female,,,0::Injured||1::Unharmed,0::Victim||1::Subject-Suspect,http://www.ozarksfirst.com/story/deputies-6-ye...,131.0,30.0,2014


In [15]:
# Summarise the filtered frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236210 entries, 0 to 236209
Data columns (total 31 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   index                        236210 non-null  int64  
 1   incident_id                  236210 non-null  int64  
 2   date                         236210 non-null  object 
 3   state                        236210 non-null  object 
 4   city_or_county               236210 non-null  object 
 5   address                      219870 non-null  object 
 6   n_killed                     236210 non-null  int64  
 7   n_injured                    236210 non-null  int64  
 8   incident_url                 236210 non-null  object 
 9   source_url                   235753 non-null  object 
 10  incident_url_fields_missing  236210 non-null  bool   
 11  congressional_district       224812 non-null  float64
 12  gun_stolen                   138348 non-null  object 
 13 

## Extracting Useful Data as Booleans

The original dataframe stores most of its data as strings, which makes it hard to filter with pandas' built-in methods. I therefore had to go through every row of the `incident_characteristics` column (a costly step on a dataset of this size), extract the useful information, and store it in new boolean columns.

In [16]:
# Look at the raw characteristics string stored for the row labelled 1.
df.incident_characteristics[1]

'Officer Involved Incident'

In [17]:
# Plain, case-sensitive substring test against the characteristics string of the row labelled 1.
"Gang involvement" in df.incident_characteristics[1]

False

In [27]:
# Parse `incident_characteristics` for keywords and store each hit as a boolean column.
#
# The search is vectorised with `Series.str.contains` instead of a Python loop
# over row numbers. Besides being much faster on a frame this size, it does not
# depend on the index being exactly 0..n-1 (the previous loop mixed positional
# `.iloc[i]` with label-based `[i]` lookups).


def _contains(text, keyword):
    """Return a boolean Series: True where `keyword` occurs in `text`.

    `regex=False` makes this a literal substring match, which matters for
    keywords such as "Suicide^" that contain regex metacharacters.
    `na=False` maps missing characteristics to False.
    """
    return text.str.contains(keyword, regex=False, na=False)


characteristics = df["incident_characteristics"]
lowered = characteristics.str.lower()  # for the case-insensitive keywords

# Case-sensitive on purpose: the generic tag
# "Shot - Dead (murder, accidental, suicide)" contains a lowercase "suicide"
# and must not count as a suicide incident.
suicide = _contains(characteristics, "Suicide").tolist()
mass_shooting = _contains(lowered, "mass shooting").tolist()
gang = _contains(lowered, "gang involvement").tolist()
wounded = _contains(characteristics, "Wounded").tolist()
dead = _contains(characteristics, "Dead").tolist()

# "non-suicide" only excludes the "Suicide^" and "Suicide - Attempt" tags, so a
# row such as "Murder/Suicide" is flagged as both suicide and non-suicide.
# Rows with no characteristics at all stay False, like every other flag.
non_suicide = (
    characteristics.notna()
    & ~_contains(characteristics, "Suicide^")
    & ~_contains(characteristics, "Suicide - Attempt")
).tolist()

df["suicide"] = suicide
df["mass shooting"] = mass_shooting
df["gang"] = gang
df["wounded"] = wounded
df["dead"] = dead
df["non-suicide"] = non_suicide
df.head()

Unnamed: 0,index,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,...,sources,state_house_district,state_senate_district,year,suicide,mass shooting,gang,wounded,dead,non-suicide
0,278,95289,2014-01-01,Michigan,Muskegon,300 block of Monroe Avenue,0,0,http://www.gunviolencearchive.org/incident/95289,http://www.mlive.com/news/muskegon/index.ssf/2...,...,http://www.mlive.com/news/muskegon/index.ssf/2...,92.0,34.0,2014,False,False,False,False,False,True
1,279,92401,2014-01-01,New Jersey,Newark,Central Avenue,0,0,http://www.gunviolencearchive.org/incident/92401,http://www.nj.com/essex/index.ssf/2014/01/newa...,...,http://www.nj.com/essex/index.ssf/2014/01/newa...,29.0,29.0,2014,False,False,False,False,False,True
2,280,92383,2014-01-01,New York,Queens,113th Avenue,1,0,http://www.gunviolencearchive.org/incident/92383,http://www.timesledger.com/stories/2014/2/firs...,...,http://www.timesledger.com/stories/2014/2/firs...,33.0,14.0,2014,False,False,False,False,True,True
3,281,92142,2014-01-01,New York,Brooklyn,St. Johns Place,0,1,http://www.gunviolencearchive.org/incident/92142,http://www.nydailynews.com/new-york/nyc-crime/...,...,http://www.nydailynews.com/new-york/nyc-crime/...,43.0,20.0,2014,False,False,False,True,False,True
4,282,95261,2014-01-01,Missouri,Springfield,Beverly Hills and Temple,0,1,http://www.gunviolencearchive.org/incident/95261,http://www.ozarksfirst.com/story/deputies-6-ye...,...,http://www.ozarksfirst.com/story/deputies-6-ye...,131.0,30.0,2014,False,False,False,True,False,True


In [29]:
# Characteristics of incidents that carry both the "suicide" and "non-suicide" flags.
df.loc[df["suicide"] & df["non-suicide"], "incident_characteristics"]

264       Shot - Dead (murder, accidental, suicide)||Mur...
765                                          Murder/Suicide
854       Shot - Wounded/Injured||Shot - Dead (murder, a...
1524                                         Murder/Suicide
1792                                         Murder/Suicide
                                ...                        
223749    Attempted Murder/Suicide (one variable unsucce...
224801    Shot - Dead (murder, accidental, suicide)||Mur...
225026    Shot - Wounded/Injured||Shot - Dead (murder, a...
225542    Shot - Dead (murder, accidental, suicide)||Att...
232612    Shot - Dead (murder, accidental, suicide)||Att...
Name: incident_characteristics, Length: 171, dtype: object

In [20]:
# Preview the first few incidents flagged as mass shootings.
df.loc[df["mass shooting"]].head()

Unnamed: 0,index,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,...,sources,state_house_district,state_senate_district,year,suicide,mass shooting,gang,wounded,dead,non-suicide
72,351,92194,2014-01-01,Virginia,Norfolk,Rockingham Street and Berkley Avenue Extended,2,2,http://www.gunviolencearchive.org/incident/92194,http://www.dailypress.com/news/crime/dp-norfol...,...,http://www.dailypress.com/news/crime/dp-norfol...,80.0,6.0,2014,False,True,False,True,True,True
325,614,92704,2014-01-03,New York,Queens,Farmers Boulevard and 133rd Avenue,1,3,http://www.gunviolencearchive.org/incident/92704,http://abclocal.go.com/wabc/story,...,http://www.nydailynews.com/new-york/nyc-crime/...,32.0,10.0,2014,False,True,False,True,True,True
1486,1803,95500,2014-01-12,Louisiana,Tallulah,3600 block of Highway 80 W,0,6,http://www.gunviolencearchive.org/incident/95500,http://www.myarklamiss.com/crime/update-3-new-...,...,http://www.ksla.com/story/24428661/6-hurt-in-m...,19.0,34.0,2014,False,True,False,True,False,True
1513,1831,95579,2014-01-12,Illinois,Elgin,300 block of North Street,0,5,http://www.gunviolencearchive.org/incident/95579,http://www.chicagotribune.com/news/local/break...,...,http://www.chicagotribune.com/news/local/break...,43.0,22.0,2014,False,True,False,True,False,True
1573,1891,95550,2014-01-12,Alabama,Huntsville,University Drive,0,5,http://www.gunviolencearchive.org/incident/95550,http://blog.al.com/breaking/2014/01/huntsville...,...,http://www.waff.com/story/24426895/shooting-se...,53.0,2.0,2014,False,True,False,True,False,True


In [30]:
# Write the current DataFrame to "gunviolence.pickle" in the working directory.
df.to_pickle("gunviolence.pickle")

## Changing Index to Datetime

In [22]:
# # change index to datetime index
# newDate = pd.to_datetime(df.date)
# df.drop(columns=["date"], inplace=True)
# df.set_index(newDate, inplace=True)
# df["2015"].head()

In [23]:
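# NOTE: comparing with `!= np.nan` is always True (NaN never equals anything),
# so the original filters would not drop any rows; `.notna()` is the working form.
# df = df[df.latitude.notna()]
# df = df[df.longitude.notna()]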

In [24]:
# df.to_pickle("gunviolence.pickle")