In [1]:
# Extract data from the iwaspoisoned.com website using web scraping,
# then load the scraped information into a MongoDB database.
# (To make it easier to share with the team, export the MongoDB data to a JSON file.)

In [2]:
# Dependencies
import pandas as pd
from sqlalchemy import create_engine
from pprint import pprint

from splinter import Browser
from bs4 import BeautifulSoup
import requests
import pymongo

In [3]:
# Open a PyMongo client against the MongoDB server at this connection URI
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)

In [4]:
# Select the 'etl_db' database on the connected client.
# (No collection is chosen here; collections are picked at insert/find time.)
db = client['etl_db']

In [None]:
# Start a visible (non-headless) Chrome session through splinter,
# pointing it at the chromedriver executable given below
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', headless=False, **executable_path)

In [5]:
# URL of the listing page to be scraped
url_iwp = 'https://iwaspoisoned.com'

# Retrieve the page with the requests module.
# - timeout: without it a stalled connection would block the kernel indefinitely
# - raise_for_status(): fail loudly on an HTTP error instead of silently
#   parsing an error page and ending up with zero results
response = requests.get(url_iwp, timeout=30)
response.raise_for_status()

# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

In [22]:
# Examine the results, then determine element that contains sought info.
# find_all() returns an iterable of the matching incident report boxes.
results = soup.find_all('div', class_='row div-report-box')

# Loop through returned results
for r in results:
    # Get the primary incident report info from the main box
    main_box = r.find('div', class_='report-first-box')
    if main_box is None:
        # This entry does not have the expected layout - nothing to parse
        continue

    # Date the incident occurred
    incident_date = main_box.find('p', class_='report-date').text.strip()

    # The anchor in the main box carries both the title and the
    # URL of the per-incident details, so look it up only once
    incident_link = main_box.find('a')
    incident_title = incident_link['title']
    incident_url = incident_link['href']

    # Report tags hold the symptoms, the report type and possibly other info
    report_tags = main_box.find_all('p', class_='report-tag')

    # Parse each report tag into its proper field.
    # Symptoms are always a list so the field has one consistent type.
    incident_symptoms = []
    incident_report_type = ""
    incident_misc = ""

    for rt in report_tags:
        # Get the text in this tag
        rt_info = rt.text.strip()

        # Symptoms: a comma-separated list spread over several lines.
        # Split on commas (not whitespace) so that a multi-word symptom
        # stays together as one entry.
        if "Symptoms:" in rt_info:
            symptom_text = rt_info.partition("Symptoms:")[2]
            incident_symptoms = [s.strip() for s in symptom_text.split(',') if s.strip()]

        # Report Type
        elif "Report Type:" in rt_info:
            incident_report_type = rt_info.partition("Report Type:")[2].strip()

        # Ok... no idea what this report tag contains
        else:
            incident_misc = rt_info

    pprint(main_box)
    print(f">>> Incident Date: {incident_date}")
    print(f">>> Incident Title: {incident_title}")
    print(f">>> Incident URL: {incident_url}")
    print(f">>> Incident Report Type: {incident_report_type}")
    print(f">>> Incident Symptoms: {incident_symptoms}")
    print(f">>> Incident Misc Info: {incident_misc}")
    print("-"*40)

    # Get the full description of the incident.
    # Assume this could be populated in multiple paragraphs; join them with
    # a space so the last word of one paragraph does not run into the next.
    desc_box = r.find('div', class_='report-second-box')
    desc_list = desc_box.find_all('p') if desc_box is not None else []
    desc_paragraphs = (d.text.strip() for d in desc_list)
    desc_info = " ".join(p for p in desc_paragraphs if p)

    print(f">>> Description: {desc_info}")
    print("-"*40)

    # Development aid: only process the first report for now
    break
#    try:
        # Dictionary to be inserted as a MongoDB document
        #post = {
        #    'title': title,
        #    'price': price,
        #    'url': link
        #}

        #db.iwp.insert_one(post)

#    except Exception as e:
#        print(e)

<div class="col-md-6 report-first-box">
<p class="report-date">Feb 17 2019 8:20pm</p>
<a href="https://iwaspoisoned.com/incident/chick-fil-a-north-fairfield-road-beavercreek-oh-usa-168576#emailscroll" title="Chick-fil-A, North Fairfield Road, Beavercreek, OH, USA - Got Food Poisoning? Report it now">
<h3 class="report-box-h">Chick-fil-A, North Fairfield Road, Beavercreek, OH, USA</h3></a>
<p class="report-tag">Symptoms:
                                                        Diarrhea,
                                                        Fever,
                                                        Nausea,
                                                        Vomiting
                            </p>
<p class="report-tag">Report Type: Food Poisoning</p>
</div>
>>> Incident Date: Feb 17 2019 8:20pm
>>> Incident Title: Chick-fil-A, North Fairfield Road, Beavercreek, OH, USA - Got Food Poisoning? Report it now
>>> Incident URL: https://iwaspoisoned.com/incident/chick-fil-a-north-fair

In [23]:
# URL of the single-incident detail page to be scraped
url_incident = 'https://iwaspoisoned.com/incident/chick-fil-a-north-fairfield-road-beavercreek-oh-usa-168576#emailscroll'

# Retrieve the page with the requests module.
# - timeout: without it a stalled connection would block the kernel indefinitely
# - raise_for_status(): fail loudly on an HTTP error instead of silently
#   parsing an error page
# NOTE: this rebinds `response` and `soup`, replacing any earlier page held in them.
response = requests.get(url_incident, timeout=30)
response.raise_for_status()

# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

In [24]:
# Collect every <div> whose class is 'single-incident' from the parsed page;
# the matches come back as an iterable result set
results = soup.find_all('div', attrs={'class': 'single-incident'})

In [73]:
for r in results:
    # Incident detail page - title
    # incident_detail_title = r.find('h1', class_='h1 post-title').text.strip()

    # Address: collapse the newlines / runs of spaces in the scraped text
    addr_span = r.find('span', class_='pl-1 py-0 text-muted')
    if addr_span is None:
        # No address element in this result - report it and move on
        print(">>> Incident Detail - Address: (not found)")
        print("-"*40)
        continue
    incident_address = ' '.join(addr_span.text.split())

    # Reset every parsed field on each pass so that an address which is not
    # parsed below prints empty fields instead of raising NameError or
    # showing values left over from a previous incident.
    incident_address_street = ""
    incident_address_street2 = ""
    incident_address_street3 = ""
    incident_address_city = ""
    incident_address_state = ""
    incident_address_zipcode = ""
    incident_address_country = ""
    # Fall back to the raw address when it cannot be broken up
    incident_address_standard = incident_address

    # Ok, we now have an address of the form:
    # 2360 North Fairfield Road, Beavercreek, 45431 Ohio, United States
    # But, would be nice to be able to break this up into
    # individual components to facilitate address matching,
    # Especially with the non-standard location of the zipcode

    if "United States" in incident_address:
        # Create a list of (whitespace-trimmed) address items
        ai_list = [item.strip() for item in incident_address.split(',')]
        ai_size = len(ai_list)

        # Some items are mandatory and are at the end of the list of length = N
        # N-1: Country e.g. "United States"
        # N-2: Zipcode and State e.g. "45431 Ohio"
        # N-3: City
        # Other entries 0 to N-4: Street/Apt/etc.

        # Country
        incident_address_country = ai_list[-1]

        # Split the next entry to get zipcode and state.
        # Only treat the first token as a zipcode when it starts with a digit,
        # so an entry without a zipcode (e.g. "New York") stays whole as the state.
        if ai_size >= 2:
            zs_info = ai_list[-2]
            zs_head, _, zs_tail = zs_info.partition(' ')
            if zs_head[:1].isdigit():
                incident_address_zipcode = zs_head
                incident_address_state = zs_tail.strip()
            else:
                incident_address_state = zs_info

        # City
        if ai_size >= 3:
            incident_address_city = ai_list[-3]

        # Everything before the city is "street" type information.
        # Keep the first two items separately; any further items are
        # folded into the third field so nothing is dropped.
        street_items = ai_list[:-3]
        if len(street_items) >= 1:
            incident_address_street = street_items[0]
        if len(street_items) >= 2:
            incident_address_street2 = street_items[1]
        if len(street_items) >= 3:
            incident_address_street3 = ", ".join(street_items[2:])

        # Reform the address - with standard formatting ("State Zipcode").
        # Joining only the non-empty parts avoids stray leading commas.
        state_zip = " ".join(
            part for part in (incident_address_state, incident_address_zipcode) if part
        )
        standard_parts = [
            incident_address_street,
            incident_address_street2,
            incident_address_street3,
            incident_address_city,
            state_zip,
            incident_address_country,
        ]
        incident_address_standard = ", ".join(part for part in standard_parts if part)

    print(f">>> Incident Detail - Address: {incident_address}")
    print(f">>> Incident Detail - Address - Street1: {incident_address_street}")
    print(f">>> Incident Detail - Address - Street2: {incident_address_street2}")
    print(f">>> Incident Detail - Address - Street3: {incident_address_street3}")
    print(f">>> Incident Detail - Address - City: {incident_address_city}")
    print(f">>> Incident Detail - Address - State: {incident_address_state}")
    print(f">>> Incident Detail - Address - Zipcode: {incident_address_zipcode}")
    print(f">>> Incident Detail - Address - Country: {incident_address_country}")
    print(f">>> Incident Detail - Address: {incident_address_standard}")
    print("-"*40)



>>> Incident Detail - Address: 2360 North Fairfield Road, Beavercreek, 45431 Ohio, United States
>>> Incident Detail - Address - Street1: 2360 North Fairfield Road
>>> Incident Detail - Address - Street2: 
>>> Incident Detail - Address - Street3: 
>>> Incident Detail - Address - City: Beavercreek
>>> Incident Detail - Address - State: Ohio
>>> Incident Detail - Address - Zipcode: 45431
>>> Incident Detail - Address - Country: United States
>>> Incident Detail - Address: 2360 North Fairfield Road, Beavercreek, Ohio 45431, United States
----------------------------------------


In [None]:
# Loop through returned results.
# NOTE(review): `results` is rebound by the single-incident cell; if this cell
# runs after it, the entries presumably lack a 'report-first-box' and are
# skipped by the guard below - confirm which page this cell is meant to parse.
for r in results:
    # Get the primary incident report info from the main box
    main_box = r.find('div', class_='report-first-box')
    if main_box is None:
        # This entry does not have the expected layout - nothing to parse
        continue

    # Date the incident occurred
    incident_date = main_box.find('p', class_='report-date').text.strip()

    # The anchor in the main box carries both the title and the
    # URL of the per-incident details, so look it up only once
    incident_link = main_box.find('a')
    incident_title = incident_link['title']
    incident_url = incident_link['href']

    # Report tags hold the symptoms, the report type and possibly other info
    report_tags = main_box.find_all('p', class_='report-tag')

    # Parse each report tag into its proper field.
    # Symptoms are always a list so the field has one consistent type.
    incident_symptoms = []
    incident_report_type = ""
    incident_misc = ""

    for rt in report_tags:
        # Get the text in this tag
        rt_info = rt.text.strip()

        # Symptoms: a comma-separated list spread over several lines.
        # Split on commas (not whitespace) so that a multi-word symptom
        # stays together as one entry.
        if "Symptoms:" in rt_info:
            symptom_text = rt_info.partition("Symptoms:")[2]
            incident_symptoms = [s.strip() for s in symptom_text.split(',') if s.strip()]

        # Report Type
        elif "Report Type:" in rt_info:
            incident_report_type = rt_info.partition("Report Type:")[2].strip()

        # Ok... no idea what this report tag contains
        else:
            incident_misc = rt_info

    pprint(main_box)
    print(f">>> Incident Date: {incident_date}")
    print(f">>> Incident Title: {incident_title}")
    print(f">>> Incident URL: {incident_url}")
    print(f">>> Incident Report Type: {incident_report_type}")
    print(f">>> Incident Symptoms: {incident_symptoms}")
    print(f">>> Incident Misc Info: {incident_misc}")
    print("-"*40)

    # Get the full description of the incident.
    # Assume this could be populated in multiple paragraphs; join them with
    # a space so the last word of one paragraph does not run into the next.
    desc_box = r.find('div', class_='report-second-box')
    desc_list = desc_box.find_all('p') if desc_box is not None else []
    desc_paragraphs = (d.text.strip() for d in desc_list)
    desc_info = " ".join(p for p in desc_paragraphs if p)

    print(f">>> Description: {desc_info}")
    print("-"*40)

    # Development aid: only process the first report for now
    break
#    try:
        # Dictionary to be inserted as a MongoDB document
        #post = {
        #    'title': title,
        #    'price': price,
        #    'url': link
        #}

        #db.iwp.insert_one(post)

#    except Exception as e:
#        print(e)

In [7]:
# Display items in MongoDB collection
#listings = db.items.find()

#for listing in listings:
#    print(listing)