# web scraping RIP.ie with beautifulsoup

In [1]:
import time
import pandas as pd
from bs4 import BeautifulSoup
import requests
import datefinder

def get_dn_page(n):
    """Get death notice text from page matching the id number"""
    
    url = 'https://rip.ie/showdn.php?dn=%s' %n
    req = requests.get(url)
    soup = BeautifulSoup(req.content, 'html.parser')
    if soup.title == None:
        title = ''
    else:        
        title  = soup.title.text.strip()
    name=''    
    for s in ['Death Notice of ','The death has occurred of ']:
        if title.startswith(s):
            name = title.split(s)[1]   
    elem = soup.find_all("div", id="dn_photo_and_text")
    
    if len(elem) == 0:
        return name, '', '', '', ''
    rows = elem[0].find_all('p')
    if len(rows) == 0:
        rows = elem[0].find_all('td')
    text = ';'.join([r.text.strip() for r in rows]).replace('\n','')
    #address
    addrelem = soup.find("span", class_='small_addr') 
    if addrelem != None:
        address = addrelem.text.strip()
    else:
        address = ''
    #county  
    ctyelem = soup.find("li", class_='fd_county') 
    if ctyelem != None:
        county = ctyelem.text.strip()
    else:
        county = ''
    #date
    dateelem = soup.find("div", class_='ddeath')
    if dateelem == None:
        dateelem = soup.find("div", class_='dpubl')
    s = dateelem.text.strip()
    try:
        date = list(datefinder.find_dates(s))[0]
    except:
        date = ''
    print (n, date, name, address, county)
    return name, date, county, address, text

In [3]:
get_dn_page(390045)

390045 2019-06-05 00:00:00 Margaret  Kelly Avenue Grove, Ballymodan Place, Bandon,  Cork Cork


('Margaret  Kelly',
 datetime.datetime(2019, 6, 5, 0, 0),
 'Cork',
 'Avenue Grove, Ballymodan Place, Bandon,  Cork',
 "Kelly (Avenue Grove, Ballymodan Place, Bandon and late of Currivreeda West) on June 5th 2019. Margaret, beloved daughter of the late Timothy and Ellen. Sadly missed by her loving sisters Ann and Eileen, brothers Teddy and John, Margaret's Partner William, brothers-in-law, sisters-in-law, nieces, nephews, relatives and good friend Malcolm.;;Rosary on Friday evening at 7pm in St. Patrick's Church, Bandon. Requiem Mass on Saturday at 12 noon, funeral afterwards to the adjoining cemetery.;;May Margaret Rest in Peace")

In [2]:
#df = pd.read_csv('rip_dn_scrape.csv')
df = pd.read_pickle('rip_dn_scrape.pkl')
len(df)

483921

## iterate over a range of ids and get info

In [None]:
#read current table in so we skip those already done
df = pd.read_pickle('rip_dn_scrape.pkl')
print (len(df))
ids = list(df.id)

results={}
for n in range(482000,484050):
    if n in ids:
        continue
    name,date,cty,addr,txt = get_dn_page(n)
    if name == '':
        continue
    results[n] = [name,date,cty,addr,txt]
    time.sleep(0.05)

In [4]:
res = pd.DataFrame.from_dict(results,orient='index',columns=['name','date','county','address','notice']).reset_index()
res = res.rename(columns={'index':'id'})

In [5]:
res

Unnamed: 0,id,name,date,county,address,notice
0,483950,David O'Shea,2022-01-23,Kerry,"Springfield Lodge, Rookery Close, Killarney, ...","Formerly of Turrenafersh, Blackwater, Kenmare...."
1,483951,Eamon Bracken,2022-01-23,Westmeath,"Ballybroder, Kilbeggan, Westmeath / Durrow, O...","Eamon Bracken, Ballybroder, Kilbeggan, Co. Wes..."
2,483952,Breda O'Leary (née Ronayne),2022-01-22,Galway,"Kilcolgan, Galway / Cork / Limerick / Dublin","Formerly of Cork, Limerick, Castleknock and Ca..."
3,483953,Eugene (Janey) Brady,2022-01-23,,"Camagh, Abbeylara, Longford",Died Sunday 23th January 2022 peacefully at hi...
4,483954,Margaret (Mairéad) Brophy (née Bracken),2022-01-22,Offaly,"Lackaroe, Cadamstown / Castletown, The Island,...","Margaret passed away peacefully, at home surro..."
...,...,...,...,...,...,...
95,484045,Glenn HOGAN,2022-01-22,Dublin,"Thomond Road, Ballyfermot, Dublin","Hogan, Glenn (Thomond Road, Ballyfermot) 22nd ..."
96,484046,Ken Rennison,2022-01-23,Limerick,"Rivers, Lisnagry, Limerick","Ken Rennison (Rivers, Lisnagry, Co. Limerick, ..."
97,484047,Daniel Tierney,2022-01-24,Dublin,"Donnycarney, Dublin","Tierney, Daniel (Dan) (late of Donnycarney and..."
98,484048,Michael Cowley,2022-01-23,Dublin,"Annadale Drive, Marino, Dublin","Cowley, Michael (late of Annadale Drive, Marin..."


In [8]:
x = pd.concat([df,res]).reset_index(drop=True)
x=x[~x.id.duplicated(keep='first')]
print (len(df),len(res),len(x))
x.to_pickle('rip_dn_scrape.pkl')

483921 100 484021


In [9]:
x.to_csv('rip_dn_scrape.csv')

## clean data

In [6]:
print (len(x))
x=x.replace('',None).dropna(axis=0,subset=['date'])
x['date'] = pd.to_datetime(x['date']).apply(lambda x: x.strftime('%d/%m/%Y'))
x=x.drop_duplicates(['name','notice'])
x=x.drop_duplicates(['name','address'])
x=x.drop_duplicates(['name','date','county'])
nc = ['Fermanagh','Armagh','Tyrone','Down','Antrim','Derry']
x = x[~x.county.isin(nc)]
x = x[~x.address.str.contains('|'.join(nc))]
x=x.sort_values('id')
print (len(x))
#x.to_csv('rip_dn_scrape_processed.csv')
x.to_pickle('rip_dn_scrape_processed.pkl')

483839
449109
