In [1]:
import os
import pickle

import pandas
import requests
from lxml import etree

In [2]:
def get_gunviolence_page(url, timeout=30):
    """
    Create a pandas.DataFrame from one listing page of gunviolence, e.g.
    http://www.gunviolencearchive.org/reports/mass-shootings/2014

    Every ``<tr class="even">`` / ``<tr class="odd">`` row of the listing
    becomes one row of the result. For each row the linked incident page is
    downloaded as well, to collect its source links and participant details.

    :param str url: gunviolence output page, e.g.
    http://www.gunviolencearchive.org/reports/mass-shootings/2014
    :param timeout: seconds to wait for each HTTP request before giving up
    (passed through to ``requests``)

    :rtype: pandas.core.frame.DataFrame
    :return: info from the violence page in a dataframe with the columns
    incident_uri, date, state, city_or_county, address, num_killed,
    num_injured, incident_url, source_url, incident_sources (a set of hrefs)
    and participants (a list with one dict per participant)

    :raises requests.exceptions.HTTPError: if an incident page answers with
    an HTTP error status
    :raises requests.exceptions.Timeout: if a request exceeds ``timeout``
    """
    headers = ['incident_uri',
               'date', 'state', 'city_or_county',
               'address', 'num_killed', 'num_injured',
               'incident_url', 'source_url', 'incident_sources',
               'participants']
    incident_base = 'http://www.gunviolencearchive.org'
    list_of_reports = []

    # One session for the listing and all of its incident pages, so the
    # underlying connection is reused instead of reopened per request.
    with requests.Session() as session:
        # The listing page's status is deliberately not checked: a page
        # without result rows simply yields an empty frame, as before.
        doc = _fetch_html(session, url, timeout, check_status=False)

        for tr_el in doc.xpath('//tr[@class="even" or @class="odd"]'):
            td_els = tr_el.getchildren()

            date = td_els[0].text
            state = td_els[1].text
            city_or_county = td_els[2].text
            address = td_els[3].text
            num_killed = int(td_els[4].text)
            num_injured = int(td_els[5].text)

            # The last column holds the links: incident first, source second.
            a_els = td_els[6].findall('ul/li/a')

            # get incident url; the uri is its last path segment
            incident_url = incident_base + a_els[0].get('href')
            incident_uri = incident_url.split('/')[-1]

            # get source url (stays empty unless exactly two links exist)
            source_url = ''
            if len(a_els) == 2:
                source_url = a_els[1].get('href')

            # An error page must not be mistaken for an incident without
            # sources or participants, hence the status check here.
            incident_doc = _fetch_html(session, incident_url, timeout,
                                       check_status=True)
            incident_sources = _extract_incident_sources(incident_doc)
            participants = _extract_participants(incident_doc)

            list_of_reports.append([incident_uri,
                                    date, state, city_or_county,
                                    address, num_killed, num_injured,
                                    incident_url, source_url,
                                    incident_sources, participants])

    return pandas.DataFrame(list_of_reports, columns=headers)


def _fetch_html(session, url, timeout, check_status):
    """Download ``url`` via ``session`` and return the parsed lxml HTML tree."""
    response = session.get(url, timeout=timeout)
    if check_status:
        response.raise_for_status()
    return etree.HTML(response.text)


def _extract_incident_sources(incident_doc):
    """Return the set of hrefs of links directly inside ``<li>`` items whose text contains 'URL:'."""
    sources = set()
    for li_el in incident_doc.xpath('//li'):
        if li_el.text is not None and 'URL:' in li_el.text:
            for a_el in li_el.xpath('a'):
                sources.add(a_el.get('href'))
    return sources


def _extract_participants(incident_doc):
    """Return one ``{attribute: value}`` dict per ``<ul>`` of the "Participants" section."""
    div_els = incident_doc.xpath('//div[h2[text()="Participants"]]')
    if not div_els:
        # Incident page without a "Participants" section.
        return []

    participants = []
    for ul_el in div_els[0].iterfind('div/ul'):
        participant = dict()
        for li_el in ul_el.iterfind('li'):
            text = li_el.text
            if text is None or ':' not in text:
                # Not an "attribute: value" item; nothing to record.
                continue
            # Split on the first colon only, so values that contain a colon
            # themselves are kept whole. Whitespace is left untouched.
            attr, value = text.split(':', 1)
            participant[attr] = value
        participants.append(participant)
    return participants

In [3]:
def paginate(base_url, debug=False):
    """
    Paginate over gunviolence urls and combine all pages into one frame.

    ``base_url`` itself is fetched first, followed by ``base_url?page=1``,
    ``base_url?page=2`` and so on. Paging stops at the first page whose
    frame equals the previously fetched one; that repeated page is dropped.
    Every requested ``?page=`` url is printed as progress output.

    :param str base_url: first listing page of one category, e.g.
    http://www.gunviolencearchive.org/reports/mass-shootings/2015
    :param bool debug: currently unused; kept so existing calls keep working

    :rtype: pandas.core.frame.DataFrame
    :return: all results from one category, re-indexed 0..n-1 so that row
    labels are unique across pages
    """
    previous_df = get_gunviolence_page(base_url)
    frames = [previous_df]

    page_number = 1
    while True:
        url = base_url + '?page=' + str(page_number)
        print(url)

        df = get_gunviolence_page(url)
        if df.equals(previous_df):
            # Same content as the page before: we ran past the last page.
            break

        frames.append(df)
        previous_df = df
        page_number += 1

    # Each page frame carries its own 0..k index; without ignore_index the
    # combined frame would contain duplicate index labels.
    return pandas.concat(frames, ignore_index=True)

In [4]:
# (output_path, base_url) pairs to scrape; commented-out entries are skipped
# on this run.
urls_and_paths = [#('frames/mass_shootings_2013', 'http://www.gunviolencearchive.org/reports/mass-shootings/2013'),
                  #('frames/mass_shootings_2014', 'http://www.gunviolencearchive.org/reports/mass-shootings/2014'),
                  ('frames/mass_shootings_2015', 'http://www.gunviolencearchive.org/reports/mass-shootings/2015')]
for output_path, base_url in urls_and_paths:
    # Make sure the target directory exists *before* the slow scrape, so a
    # missing folder cannot throw away the collected results at write time.
    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    df = paginate(base_url)
    print(output_path)
    # The frame is stored with pickle; only load such files from trusted sources.
    with open(output_path, 'wb') as outfile:
        pickle.dump(df, outfile)

http://www.gunviolencearchive.org/reports/mass-shootings/2015?page=1
http://www.gunviolencearchive.org/reports/mass-shootings/2015?page=2
http://www.gunviolencearchive.org/reports/mass-shootings/2015?page=3
http://www.gunviolencearchive.org/reports/mass-shootings/2015?page=4
http://www.gunviolencearchive.org/reports/mass-shootings/2015?page=5
http://www.gunviolencearchive.org/reports/mass-shootings/2015?page=6
http://www.gunviolencearchive.org/reports/mass-shootings/2015?page=7
http://www.gunviolencearchive.org/reports/mass-shootings/2015?page=8
http://www.gunviolencearchive.org/reports/mass-shootings/2015?page=9
http://www.gunviolencearchive.org/reports/mass-shootings/2015?page=10
http://www.gunviolencearchive.org/reports/mass-shootings/2015?page=11
http://www.gunviolencearchive.org/reports/mass-shootings/2015?page=12
http://www.gunviolencearchive.org/reports/mass-shootings/2015?page=13
http://www.gunviolencearchive.org/reports/mass-shootings/2015?page=14
frames/mass_shootings_2015
