# Web scraping RIP.ie with BeautifulSoup

In [8]:
import time
import pandas as pd
from bs4 import BeautifulSoup
import requests
import datefinder

def get_dn_page(n, timeout=30):
    """Get death notice text from page matching the id number.

    Downloads ``https://rip.ie/showdn.php?dn=<n>`` and scrapes the notice
    details out of the returned HTML.

    Args:
        n: Notice id, interpolated into the ``dn`` query parameter.
        timeout: Seconds to wait on the server before ``requests`` gives up,
            so a stalled connection cannot hang a long scraping run.

    Returns:
        A ``(name, date, county, address, text)`` tuple.

        * ``name`` -- page title with its leading 'Death Notice of ' /
          'The death has occurred of ' prefix removed, or '' if the title is
          missing or has neither prefix.
        * ``date`` -- first date ``datefinder`` finds in the ``ddeath`` div
          (falling back to the ``dpubl`` div), or '' if none is found.
        * ``county`` / ``address`` -- stripped text of the ``fd_county`` list
          item and the ``small_addr`` span, or '' when absent.
        * ``text`` -- the notice paragraphs (or table cells when there are no
          paragraphs) joined with ';' and with newlines removed.

        When the page has no ``dn_photo_and_text`` div the result is
        ``(name, '', '', '', '')``.

    Raises:
        requests.exceptions.RequestException: if the HTTP request fails or
            times out.
    """

    url = 'https://rip.ie/showdn.php?dn=%s' % n
    req = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(req.content, 'html.parser')

    title = soup.title.text.strip() if soup.title is not None else ''
    name = ''
    for prefix in ('Death Notice of ', 'The death has occurred of '):
        if title.startswith(prefix):
            # Slice rather than split so a name that happens to contain the
            # prefix text again is not cut short.
            name = title[len(prefix):]
            break

    container = soup.find("div", id="dn_photo_and_text")
    if container is None:
        return name, '', '', '', ''
    # Some notices are laid out in a table instead of paragraphs.
    rows = container.find_all('p') or container.find_all('td')
    text = ';'.join(r.text.strip() for r in rows).replace('\n', '')

    # address
    addrelem = soup.find("span", class_='small_addr')
    address = addrelem.text.strip() if addrelem is not None else ''

    # county
    ctyelem = soup.find("li", class_='fd_county')
    county = ctyelem.text.strip() if ctyelem is not None else ''

    # date: prefer the date of death, fall back to the publication date
    dateelem = soup.find("div", class_='ddeath')
    if dateelem is None:
        dateelem = soup.find("div", class_='dpubl')
    date = ''
    # Pages with neither div used to crash on ``None.text``; treat them as
    # "no date" like any other unparseable date.
    if dateelem is not None:
        try:
            date = next(iter(datefinder.find_dates(dateelem.text.strip())), '')
        except Exception:
            # Date parsing is best-effort, but unlike a bare ``except`` this
            # still lets Ctrl-C interrupt a long scrape.
            date = ''

    print(n, date, name, address, county)
    return name, date, county, address, text

In [9]:
# Try the scraper on a single notice id; the notebook echoes the returned tuple.
get_dn_page(390045)

390045 2019-06-05 00:00:00 Margaret  Kelly Avenue Grove, Ballymodan Place, Bandon,  Cork Cork


('Margaret  Kelly',
 datetime.datetime(2019, 6, 5, 0, 0),
 'Cork',
 'Avenue Grove, Ballymodan Place, Bandon,  Cork',
 "Kelly (Avenue Grove, Ballymodan Place, Bandon and late of Currivreeda West) on June 5th 2019. Margaret, beloved daughter of the late Timothy and Ellen. Sadly missed by her loving sisters Ann and Eileen, brothers Teddy and John, Margaret's Partner William, brothers-in-law, sisters-in-law, nieces, nephews, relatives and good friend Malcolm.;;Rosary on Friday evening at 7pm in St. Patrick's Church, Bandon. Requiem Mass on Saturday at 12 noon, funeral afterwards to the adjoining cemetery.;;May Margaret Rest in Peace")

In [10]:
# Load the table of notices scraped so far (pickle is the format in use;
# the CSV variant below is kept commented out).
#df = pd.read_csv('rip_dn_scrape.csv')
df = pd.read_pickle('rip_dn_scrape.pkl')
# Number of rows currently stored
len(df)

484021

## Iterate over a range of ids and get the info for each

In [None]:
# Read the current table in so we skip the ids that are already done
df = pd.read_pickle('rip_dn_scrape.pkl')
print(len(df))
ids = list(df.id)
# Membership is tested once per candidate id; a set makes each test O(1)
# instead of a scan over the whole id list.
done = set(ids)

# Maps notice id -> [name, date, county, address, notice text]
results = {}
for n in range(482000, 484892):
    if n in done:
        continue
    name, date, cty, addr, txt = get_dn_page(n)
    # An empty name means the page had no recognisable notice title
    if name == '':
        continue
    results[n] = [name, date, cty, addr, txt]
    # Short pause between requests
    time.sleep(0.05)

In [12]:
# One row per scraped notice; the dict keys (notice ids) start out as the
# index and are then moved into an ordinary 'id' column.
res = pd.DataFrame.from_dict(
    results,
    orient='index',
    columns=['name', 'date', 'county', 'address', 'notice'],
)
res = res.rename_axis('id').reset_index()

In [13]:
# Display the newly scraped rows
res

Unnamed: 0,id,name,date,county,address,notice
0,484050,Veronica GENOCKEY (née Farrelly),2022-01-23,Dublin,"Malahide, Dublin","GENOCKEY (née Farrelly), Veronica (Malahide) -..."
1,484051,Joe (Joseph) Poole,2022-01-23,Dublin,"Ballymun, Dublin / Cabra, Dublin","POOLE, Joe (Joseph) (Ballymun, Dublin formerly..."
2,484052,Michael Maurice Moore,2022-01-23,Cork,"Glanmire, Cork","MOORE (Glanmire, Cork) : On January 23rd, 2022..."
3,484053,Baby Theo Ryan,2022-01-22,Tipperary,"Rootagh, Newport, Tipperary","Ryan, Baby Theo, Rootagh, Newport, Co. Tippera..."
4,484054,Vera Nicol (née O'Keeffe),2022-01-23,Dublin,"Ballyfermot, Dublin / Whitehall, Dublin","Nicol, Marie (Vera) (nee O’Keeffe), January 23..."
...,...,...,...,...,...,...
825,484887,Francis (Frank) Hughes,2022-01-29,Dublin,"Finglas, Dublin","Late of Chubb, Blackrock. Peacefully, at the M..."
826,484888,Joan Savage (née O'Hara),2022-01-31,Dublin,"Swords, Dublin","SAVAGE (née O'Hara), Joan (Swords) - January 3..."
827,484889,Ellen (Chrissie) Hayes (née O'Sullivan),2022-01-30,Cork,"Dromada Beg, Ladysbridge, Cork",Ellen (Chrissie) beloved wife of the late Rich...
828,484890,Emily ROWE (née Seery),2022-01-31,Offaly,"Ballykillen, Edenderry, Offaly",Suddenly. Emily will be sadly missed and fore...


In [14]:
# Append the new rows to the stored table with a fresh 0..n-1 index, keep the
# first occurrence of every notice id, and write the result back to disk.
new = pd.concat([df, res], ignore_index=True)
new = new.drop_duplicates(subset='id', keep='first')
print(len(df), len(res), len(new))
new.to_pickle('rip_dn_scrape.pkl')

484021 830 484851


In [9]:
#x.to_csv('rip_dn_scrape.csv')

## clean data

In [15]:
x = new
print(len(x))
# Turn empty strings into real missing values, then drop rows without a date.
# NaN is used instead of None because pandas < 1.4 ignored an explicit
# value=None in replace() and forward-filled the matches instead, which gave
# blank dates the previous row's date rather than marking them missing.
x = x.replace('', float('nan')).dropna(axis=0, subset=['date'])
# Store dates as dd/mm/yyyy strings
x['date'] = pd.to_datetime(x['date']).apply(lambda d: d.strftime('%d/%m/%Y'))
# Remove repeated notices, using progressively looser keys
x = x.drop_duplicates(['name', 'notice'])
x = x.drop_duplicates(['name', 'address'])
x = x.drop_duplicates(['name', 'date', 'county'])
x = x[~x.address.isnull()]
# Drop rows whose county, or whose address text, mentions one of these counties
nc = ['Fermanagh', 'Armagh', 'Tyrone', 'Down', 'Antrim', 'Derry']
x = x[~x.county.isin(nc)]
# NOTE(review): this is a case-sensitive substring match, so an address that
# merely contains one of these names inside a longer word is dropped too --
# confirm that is intended.
x = x[~x.address.str.contains('|'.join(nc))]
x = x.sort_values('id')
print(len(x))
#x.to_csv('rip_dn_scrape_processed.csv')
x.to_pickle('rip_dn_scrape_processed.pkl')

484851
450268
