<a href="https://colab.research.google.com/github/gaibelg/Death-Notices-Ireland/blob/main/Death_Notices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import datetime
from multiprocessing import Pool
import pandas as pd

def get_notice(dn_code, timeout=10):
  """Fetch a single notice page from rip.ie and extract its basic fields.

  Args:
    dn_code: Notice identifier, substituted into the `showdn.php?dn=` URL.
    timeout: Seconds to wait for the server before giving up, so a stalled
      connection cannot block a worker indefinitely.

  Returns:
    A dict with the keys `dn_code`, `link`, `title`, `publish_date`,
    `death_date` and `scraping_time`. `title`, `publish_date` and
    `death_date` stay `None` when the request fails, the response status is
    not 200, or the page title does not start with 'Death Notice of '.

  Errors raised while fetching or parsing are printed and not re-raised, so
  one bad page does not abort a bulk run.
  """

  url = f'https://rip.ie/showdn.php?dn={dn_code}'

  data = {'dn_code': dn_code,
          'link': url,
          'title': None,
          'publish_date': None,
          'death_date': None,
          'scraping_time': str(datetime.datetime.now())}

  try:
    response = requests.get(url, timeout=timeout)

    # Any status other than 200 is reported as an empty record.
    if response.status_code != 200:
      return data

    # Hand the raw bytes to BeautifulSoup so it can detect the encoding
    # itself, and name the parser explicitly so results do not depend on
    # which optional parsers happen to be installed.
    soup = BeautifulSoup(response.content, 'html.parser')
    title_tag = soup.title

    # No <title> at all: not an actual notice page.
    if title_tag is None:
      return data

    title = title_tag.text
    if not title.startswith('Death Notice of '):
      return data

    publish_date = soup.find('div', attrs = {'class': 'dates dpubl'})
    death_date = soup.find('div', attrs = {'class': 'dates ddeath textRight'})

    if publish_date is not None:
      data['publish_date'] = publish_date.text
    if death_date is not None:
      data['death_date'] = death_date.text
    data['title'] = title

  except Exception as ex:
    # Best-effort scraping: report the failure and fall through to return
    # the (possibly partially filled) record.
    print('Code: ', dn_code)
    print(ex)

  # Returned after the try block rather than from a `finally`, so that
  # KeyboardInterrupt / SystemExit are no longer silently swallowed.
  return data

def repeated_requests(dn_code, N = 2):

  """Call `get_notice` up to `N` times until it returns a record with a title.

  In case of a bad request, a repeated request may come out valid, so an
  empty record (title is `None`) triggers another attempt.

  Args:
    dn_code: Notice identifier passed through to `get_notice`.
    N: Maximum number of attempts. Values below 1 are treated as 1 so that a
      record is always returned.

  Returns:
    The dict from the first attempt that has a title, or the dict from the
    last attempt if none of them did.
  """

  data = None

  # Always make at least one attempt; otherwise `data` would never be set.
  for attempt in range(max(N, 1)):
    # Announce a retry only when one is actually about to happen.
    if attempt > 0:
      print(f'Attempting to retrieve dn_code {dn_code} again.')

    data = get_notice(dn_code)
    if data['title'] is not None:
      break

  return data

# The dn_codes to scrape, in descending order. Change the range bounds here
# if you want a different span of notices.
dn_codes = list(range(444855,0,-1))

# The argument to `Pool` is the number of simultaneous requests.

with Pool(10) as p:

  # One record (dict) per dn_code, in the same order as `dn_codes`.
  results = p.map(repeated_requests, dn_codes)

df = pd.DataFrame(results).set_index('dn_code')
df.to_csv('death_notices.csv')

print('\n**********\n')
# `DataFrame.info` prints its summary itself and returns None, so wrapping it
# in print() would emit a stray "None" line.
df.info()

In [None]:
# Download the CSV written by the scraping cell above.
# The `google.colab` module is presumably only importable inside a Colab runtime — verify before running elsewhere.
from google.colab import files
files.download('death_notices.csv')