In [1]:
from datetime import datetime, timezone

import pandas
import requests
from lxml import etree

In [2]:
def extract_paragraphs_city_state(url, timeout=30):
    """
    extract text from firerescue article

    :param str url: firerescue incident report
    :param timeout: seconds to wait for the server (passed to requests.get)

    :rtype: tuple
    :return: (paragraphs, city, state); paragraphs is a list with one entry
        per <p> child of the text span (None for a paragraph without any
        text), state is '' when the page gives 'N/A' or no state at all
    :raises requests.HTTPError: when the server answers with an error status
    :raises IndexError: when the page has no span[@class="text"] or no p/b
    """
    # a timeout keeps one stalled connection from blocking the whole scrape
    call = requests.get(url, timeout=timeout)
    call.raise_for_status()
    incident_doc = etree.HTML(call.text)

    text_el = incident_doc.xpath('//span[@class="text"]')[0]
    # itertext() also collects text inside and after inline child elements;
    # .text alone stops at the first child element and is None when the
    # paragraph starts with one. Paragraphs without any text stay None.
    paragraphs = [''.join(p_el.itertext()) or None
                  for p_el in text_el.iterfind('p')]

    loc_el = incident_doc.xpath('//p/b')[0]
    # tail is None when nothing follows the <b> element.
    # [4:-1] drops a 4-character prefix and one trailing character around the
    # location -- presumably separator/closing markup; verify against the page.
    location = (loc_el.tail or '')[4:-1]
    # a location without ', ' is treated as a city with unknown state
    parts = location.rsplit(', ', 1)
    city = parts[0]
    state = parts[1] if len(parts) == 2 else ''
    if state == 'N/A':
        state = ''

    return paragraphs, city, state

In [3]:
# base_url: site root that the relative hrefs found on the listing are appended to.
# url: incident-report listing page, fetched as the starting point of the scrape.
base_url, url = (
    'https://www.firerescue1.com',
    'https://www.firerescue1.com/incident-reports/',
)

In [4]:
# Fetch the incident listing page. The timeout stops a stalled connection from
# hanging the notebook, and raise_for_status() fails here on an HTTP error
# instead of letting an error page be parsed in the following cells.
call = requests.get(url, timeout=30)
call.raise_for_status()

In [5]:
# Parse the listing page's HTML into an lxml tree; the scraping loop queries
# `doc` with XPath to find the links to the individual incident reports.
doc = etree.HTML(call.text)

In [6]:
# Rows are collected in a plain list and turned into a DataFrame in one go
# afterwards; `headers` names the columns in the same order as `one_row`.
list_of_lists = []
headers = ['incident_uri', 'date', 
           'state', 'city_or_county', 'address',
           'incident_url', 'source_url',
           'incident_sources', 'participants',
           'incident', 'fire_department', 'incident_reporting_date']

counter = 0
for a_el in doc.xpath('//a[starts-with(@href, "/incident-reports")]'):
    href = a_el.get('href')

    # links ending in 'submit/' share the prefix but are skipped
    # (presumably the report-submission page rather than an incident)
    if href.endswith('submit/'):
        continue

    # progress marker: one line per 50 incident pages fetched
    counter += 1
    if counter % 50 == 0:
        print(counter, datetime.now())

    fire_department = a_el.find('b').text
    incident_url = base_url + href
    incident_sources = set()
    # identifier = 'FR' + the part of the path before the first '-'
    incident_uri = 'FR' + incident_url.replace(url, '').split('-')[0]

    main_span_el = a_el.find('span[@class="redDateText"]')
    child_span_el = main_span_el.find('span')
    data_timestamp = child_span_el.get('data-timestamp')
    # Convert the epoch seconds in UTC instead of the machine's local zone, so
    # the derived date is the same wherever the notebook runs. tzinfo is
    # dropped again to keep the stored datetimes timezone-naive as before.
    # NOTE(review): assumes the site's timestamps are meant as UTC -- confirm.
    incident_date = datetime.fromtimestamp(
        int(data_timestamp), tz=timezone.utc).replace(tzinfo=None)
    date = incident_date.strftime('%B %d, %Y')

    # one extra request per incident: text and location come from its own page
    incident, city, state = extract_paragraphs_city_state(incident_url)
    address = ''
    participants = dict()

    # the incident page is stored as both incident_url and source_url
    one_row = [incident_uri, date,
               state, city, address, 
               incident_url, incident_url,
               incident_sources, participants,
               incident, fire_department, incident_date]
    list_of_lists.append(one_row)

50 2017-03-10 14:34:34.989011
100 2017-03-10 14:34:53.816253
150 2017-03-10 14:35:15.179743
200 2017-03-10 14:35:33.838639
250 2017-03-10 14:35:52.002638
300 2017-03-10 14:36:13.152189
350 2017-03-10 14:36:31.581349
400 2017-03-10 14:36:52.265259


In [7]:
# Build the table in a single call from the collected rows; `headers` supplies
# the column names in the same order the values were placed in each row.
df = pandas.DataFrame(list_of_lists, columns=headers)

In [8]:
# Preview the first rows to sanity-check the scraped columns.
df.head()

Unnamed: 0,incident_uri,date,state,city_or_county,address,incident_url,source_url,incident_sources,participants,incident,fire_department,incident_reporting_date
0,FR455329,"February 20, 2009",Oregon,Tualatin Valley,,https://www.firerescue1.com/incident-reports/4...,https://www.firerescue1.com/incident-reports/4...,{},{},"[At 10:32pm last night (2/17), firefighters we...",Tualatin Valley Fire & Rescue,2009-02-20 01:00:00
1,FR455326,"February 20, 2009",Oregon,Lebanon,,https://www.firerescue1.com/incident-reports/4...,https://www.firerescue1.com/incident-reports/4...,{},{},"[House Fire at 10:16pm Thursday night, 2-19-20...",Lebanon Fire District,2009-02-20 01:00:00
2,FR453809,"February 13, 2009",Oregon,Portland,,https://www.firerescue1.com/incident-reports/4...,https://www.firerescue1.com/incident-reports/4...,{},{},[Portland Fire & Rescue responded to a fire in...,Portland Fire Rescue,2009-02-13 01:00:00
3,FR453057,"February 10, 2009",Michigan,Parma,,https://www.firerescue1.com/incident-reports/4...,https://www.firerescue1.com/incident-reports/4...,{},{},[],Parma-Sandstone,2009-02-10 01:00:00
4,FR448687,"January 22, 2009",California,Los Angeles,,https://www.firerescue1.com/incident-reports/4...,https://www.firerescue1.com/incident-reports/4...,{},{},"[On Sunday, January 18, 2009 at 8:56 PM, 9 Com...",LAFD,2009-01-22 01:00:00


In [9]:
# Persist the scraped table to 'firerescue.pickle' in the current working
# directory. Pickle is used here, so the set/dict/list cell values are stored
# as Python objects; only load this file back from a trusted source.
df.to_pickle('firerescue.pickle')