In [1]:
import requests
from lxml import etree
from datetime import datetime
import pandas

In [2]:
def extract_paragraphs(url):
    """
    extract text from firerescue article
    
    :param str url: firerescue incident report
    """
    call = requests.get(url)
    incident_doc = etree.HTML(call.text)
    text_el = incident_doc.xpath('//span[@class="text"]')[0]
    paragraphs = [p_el.text for p_el in text_el.iterfind('p')]
    
    return paragraphs

In [3]:
url = 'https://www.firerescue1.com/incident-reports/'
base_url = 'https://www.firerescue1.com'

In [4]:
call = requests.get(url)

In [5]:
doc = etree.HTML(call.text)

In [6]:
list_of_lists = []
headers = ['fire_department', 'incident_reporting_date', 
           'incident_url',
           'city', 'state_abbr', 'incident']

counter = 0
for a_el in doc.xpath('//a[starts-with(@href, "/incident-reports")]'):
    href = a_el.get('href')
    
    if href.endswith('submit/'):
        continue
    
    counter += 1
    if counter % 10 == 0:
        print(counter, datetime.now())
    fire_department = a_el.find('b').text
    incident_url = base_url + a_el.get('href')
    
    main_span_el = a_el.find('span[@class="redDateText"]')
    child_span_el = main_span_el.find('span')
    data_timestamp = child_span_el.get('data-timestamp')
    incident_date = datetime.fromtimestamp(int(data_timestamp))
    
    city, state_abbr = main_span_el.tail[2:-1].rsplit(', ', 1)
    incident = extract_paragraphs(incident_url)
    
    one_row = [fire_department, incident_date, 
               incident_url,
               city, state_abbr, incident]
    list_of_lists.append(one_row)

10 2017-02-06 22:53:44.204870
20 2017-02-06 22:53:51.510130
30 2017-02-06 22:53:55.493511
40 2017-02-06 22:54:01.092721
50 2017-02-06 22:54:04.680433
60 2017-02-06 22:54:10.082773
70 2017-02-06 22:54:14.191894
80 2017-02-06 22:54:18.876121
90 2017-02-06 22:54:23.300306
100 2017-02-06 22:54:28.353605
110 2017-02-06 22:54:32.503188
120 2017-02-06 22:54:37.012311
130 2017-02-06 22:54:42.314970
140 2017-02-06 22:54:46.960218
150 2017-02-06 22:54:51.607312
160 2017-02-06 22:54:55.314271
170 2017-02-06 22:55:01.016619
180 2017-02-06 22:55:05.145421
190 2017-02-06 22:55:09.985518
200 2017-02-06 22:55:14.085348
210 2017-02-06 22:55:19.330434
220 2017-02-06 22:55:22.813670
230 2017-02-06 22:55:29.888763
240 2017-02-06 22:55:34.539631
250 2017-02-06 22:55:39.354913
260 2017-02-06 22:55:43.013592
270 2017-02-06 22:55:49.529607
280 2017-02-06 22:55:53.714611
290 2017-02-06 22:55:58.384711
300 2017-02-06 22:56:02.668978
310 2017-02-06 22:56:08.511681
320 2017-02-06 22:56:12.959507
330 2017-02-06 22

In [7]:
df = pandas.DataFrame(list_of_lists, columns=headers)

In [8]:
df.head()

Unnamed: 0,fire_department,incident_reporting_date,incident_url,city,state_abbr,incident
0,Tualatin Valley Fire & Rescue,2009-02-20 01:00:00,https://www.firerescue1.com/incident-reports/4...,Tualatin Valley,OR,"[At 10:32pm last night (2/17), firefighters we..."
1,Lebanon Fire District,2009-02-20 01:00:00,https://www.firerescue1.com/incident-reports/4...,Lebanon,OR,"[House Fire at 10:16pm Thursday night, 2-19-20..."
2,Portland Fire Rescue,2009-02-13 01:00:00,https://www.firerescue1.com/incident-reports/4...,Portland,OR,[Portland Fire & Rescue responded to a fire in...
3,Parma-Sandstone,2009-02-10 01:00:00,https://www.firerescue1.com/incident-reports/4...,Parma,MI,[]
4,LAFD,2009-01-22 01:00:00,https://www.firerescue1.com/incident-reports/4...,Los Angeles,CA,"[On Sunday, January 18, 2009 at 8:56 PM, 9 Com..."


In [9]:
df.to_pickle('firerescue.pickle')