In [1]:
import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

# Part 1- WEB SCRAPING

In [2]:
Date = []
State = []
City = []
Location = []
Killed = []
Injured = []
Incident = []

In [3]:
for i in range(0,14):
    url = 'http://www.gunviolencearchive.org/reports/mass-shooting?page=' + str(i)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    even = soup.find_all('tr', class_ = 'even')
    odd = soup.find_all('tr', class_ = 'odd')
    events = even + odd
    events = [el.find_all('td') for el in events]
    Date += [event[0].text for event in events]
    State += [event[1].text for event in events]
    City += [event[2].text for event in events]
    Location += [event[3].text for event in events]
    Killed += [event[4].text for event in events]
    Injured += [event[5].text for event in events]
    Incident += [str(event[6].find_all('a')[0]).split('"')[1] for event in events]

In [4]:
MS = pd.DataFrame({
        "Date": Date, 
        "State": State, 
        "Location": Location, 
        "Killed":Killed,
        "Injured":Injured,
        "Incident":Incident
    })

In [5]:
MS

Unnamed: 0,Date,Incident,Injured,Killed,Location,State
0,"December 12, 2017",/incident/1006446,4,0,1301 Rivercrest Ln,Texas
1,"December 4, 2017",/incident/1001566,2,2,7200 block of Everton St,Pennsylvania
2,"December 2, 2017",/incident/999969,4,0,5200 Woodward Avenue,Michigan
3,"November 24, 2017",/incident/994323,6,1,119 Martin Street,Mississippi
4,"November 18, 2017",/incident/991017,4,0,3600 block of Emerson Avenue North,Minnesota
5,"November 16, 2017",/incident/989838,7,0,1400 block of 15th Street,Georgia
6,"November 14, 2017",/incident/988824,2,2,900 Block of View Dr,California
7,"November 12, 2017",/incident/987134,5,0,S Calvin Ave,California
8,"November 11, 2017",/incident/985582,3,1,2532 Martin Luther King Jr Blvd,Texas
9,"November 11, 2017",/incident/985832,5,1,2835 Bragg Blvd,North Carolina


In [188]:
np.unique(MS.State)

array(['Alabama', 'Arizona', 'Arkansas', 'California', 'Colorado',
       'Connecticut', 'Delaware', 'District of Columbia', 'Florida',
       'Georgia', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
       'Louisiana', 'Maine', 'Maryland', 'Michigan', 'Minnesota',
       'Mississippi', 'Missouri', 'Montana', 'Nevada', 'New Jersey',
       'New Mexico', 'New York', 'North Carolina', 'Ohio', 'Oklahoma',
       'Pennsylvania', 'South Carolina', 'Tennessee', 'Texas', 'Utah',
       'Virginia', 'Washington', 'Wisconsin'], dtype=object)

In [6]:
Months = {
    'January': '01',
    'February': '02',
    'March':'03', 
    'April': '04',
    'May': '05',
    'June': '06',
    'July': '07',
    'August': '08',
    'September': '09',
    'October': '10',
    'November': '11',
    'December': '12'  
}

In [7]:
def fill_date(row):
    date = row['Date']
    date = date.strip().split(' ')
    month = Months[date[0]]
    day = date[1].replace(',','')
    year = date[2]
    row['Date'] = year+'-'+month+'-'+day
    return row

In [8]:
MS = MS.apply(lambda row: fill_date(row), axis = 1)

In [9]:
MS['Date'] = pd.to_datetime(MS.Date)

In [10]:
MS = MS.sort_values('Date')

In [11]:
MS.head()

Unnamed: 0,Date,Incident,Injured,Killed,Location,State
329,2017-01-01,/incident/739766,7,0,8000 block of NW 14th Ave,Florida
326,2017-01-01,/incident/739451,3,1,9220 Skillman Street,Texas
328,2017-01-01,/incident/743282,5,0,114 North Greyer Street,Mississippi
325,2017-01-03,/incident/742001,4,0,500 block of Hawthorne Drive,Texas
327,2017-01-04,/incident/742542,1,3,14520 Village Drive,California


In [12]:
MS.to_csv('MS_USA_2017.csv')

# Part 2- Analysis

### Compute Injured and Killed by State and Population

In [13]:
MS = pd.read_csv('MS_USA_2017.csv')

In [18]:
POP = pd.read_csv('data/pop_all_usa.csv')

In [20]:
POP = POP[['2016', 'Name']]

In [22]:
POP = POP.set_index('Name')

In [14]:
MS.head()

Unnamed: 0.1,Unnamed: 0,Date,Incident,Injured,Killed,Location,State
0,329,2017-01-01,/incident/739766,7,0,8000 block of NW 14th Ave,Florida
1,326,2017-01-01,/incident/739451,3,1,9220 Skillman Street,Texas
2,328,2017-01-01,/incident/743282,5,0,114 North Greyer Street,Mississippi
3,325,2017-01-03,/incident/742001,4,0,500 block of Hawthorne Drive,Texas
4,327,2017-01-04,/incident/742542,1,3,14520 Village Drive,California


In [79]:
Killed_Injured = MS[['Injured','Killed', 'State']].groupby(['State']).sum()

In [80]:
Killed_Injured.head()

Unnamed: 0_level_0,Injured,Killed
State,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,30,5
Arizona,15,5
Arkansas,35,2
California,143,37
Colorado,1,3


In [81]:
def killed_Injured_Pop(row):
    pop = POP['2016'][row.name]/1000000
    row['Inj_hab'] = row['Injured']/pop
    row['Kil_hab'] = row['Killed']/pop
    return row

In [82]:
Killed_Injured_Pop = Killed_Injured.apply(lambda row: killed_Injured_Pop(row), axis = 1)

In [83]:
Killed_Injured_Pop.to_csv('Killed_Injured_by_StatePOP.csv')

### Compute Injured and Killed by State and Firegun by Population

In [189]:
FG = pd.read_excel('data/usa.xlsx')

In [190]:
FG = FG.set_index('State')
FG = FG[['FG/100 h']]

In [192]:
Killed_Injured = MS[['Injured','Killed', 'State']].groupby(['State']).sum()

In [196]:
Killed_Injured.columns

Index(['Injured', 'Killed'], dtype='object')

In [197]:
FG['FG/100 h']['Virginia']

10

In [200]:
def killed_Injured_FG(row):
    firegun = FG['FG/100 h'][row.name]
    row['Inj_fg'] = row['Injured']/firegun
    row['Kil_fg'] = row['Killed']/firegun
    return row

In [201]:
Killed_Injured_FG = Killed_Injured.apply(lambda row: killed_Injured_FG(row), axis = 1)

In [203]:
Killed_Injured_FG.head()

Unnamed: 0_level_0,Injured,Killed,Inj_fg,Kil_fg
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alabama,30.0,5.0,1.764706,0.294118
Arizona,15.0,5.0,1.875,0.625
Arkansas,35.0,2.0,2.058824,0.117647
California,143.0,37.0,28.6,7.4
Colorado,1.0,3.0,0.066667,0.2


In [204]:
Killed_Injured_FG.to_csv('Killed_Injured_FG.csv')

### Compute Injured and Killed by State and Law