# Web scraping RIP.ie with BeautifulSoup

In [1]:
import time
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import datefinder

In [2]:
def get_dn_page_old(n):
    """Get death notice details from the legacy ``showdn.php`` page for an id.

    Args:
        n: death notice id number, substituted into the page URL.

    Returns:
        Tuple ``(name, date, county, address, text)``.

        * ``name`` is taken from the page title after one of the known
          title prefixes, or ``''`` if the title has neither prefix.
        * ``date`` is the first date ``datefinder`` finds in the ``ddeath``
          div (falling back to the ``dpubl`` div), or ``''`` when neither
          div exists or no date can be parsed.
        * ``county`` / ``address`` are ``''`` when their elements are absent.
        * ``text`` is the ``;``-joined text of the ``<p>`` (or, failing
          that, ``<td>``) elements of the notice body, newlines removed.

        When the notice body div is missing, every field except ``name``
        is ``''``.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    url = 'https://rip.ie/showdn.php?dn=%s' % n

    user_agent = 'Chrome/107.0.5304.110'
    headers = {'User-Agent': user_agent}
    # requests never times out by default; bound the wait so one stalled
    # connection cannot hang a long scraping run.
    req = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(req.content, 'html.parser')

    title = '' if soup.title is None else soup.title.text.strip()
    name = ''
    for prefix in ('Death Notice of ', 'The death has occurred of '):
        if title.startswith(prefix):
            name = title[len(prefix):]

    body = soup.find("div", id="dn_photo_and_text")
    if body is None:
        return name, '', '', '', ''
    # Some pages lay the notice out in <p> tags, others in table cells.
    rows = body.find_all('p') or body.find_all('td')
    text = ';'.join(r.text.strip() for r in rows).replace('\n', '')

    # address
    addrelem = soup.find("span", class_='small_addr')
    address = addrelem.text.strip() if addrelem is not None else ''

    # county
    ctyelem = soup.find("li", class_='fd_county')
    county = ctyelem.text.strip() if ctyelem is not None else ''

    # date: prefer the date of death, fall back to the publication date
    dateelem = soup.find("div", class_='ddeath')
    if dateelem is None:
        dateelem = soup.find("div", class_='dpubl')
    date = ''
    if dateelem is not None:
        try:
            date = next(iter(datefinder.find_dates(dateelem.text.strip())), '')
        except Exception:
            # Best effort: an unparsable date string leaves the date empty
            # (KeyboardInterrupt is deliberately not swallowed).
            date = ''

    print (n, date, name, address, county)
    return name, date, county, address, text

In [2]:
def get_dn_page_new(n):
    """Get death notice details from the ``/death-notice/<id>`` page.

    Elements are located by CSS class substring (``[class*=...]``).

    Args:
        n: death notice id number, substituted into the page URL.

    Returns:
        Tuple ``(name, date, county, address, text)``.

        * ``date`` is the first date ``datefinder`` finds in the death-date
          element, or in the published-date element when there is no
          death-date element.
        * ``address`` is the text of the first tags-item element and
          ``county`` is its last whitespace-separated word; both are ``''``
          when that element is missing or empty.
        * ``text`` is the space-joined text of all description elements.

        Five empty strings are returned when the page has no person-name
        element, no date element, or a date that cannot be parsed.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    empty = ('', '', '', '', '')
    url = 'https://rip.ie/death-notice/%s' % n
    headers = {'User-Agent': 'Chrome/107.0.5304.110'}
    # requests never times out by default; bound the wait so one stalled
    # connection cannot hang a long scraping run.
    req = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(req.content, 'html.parser')

    name_elems = soup.select('[class*=DeathNotice_person-name]')
    if not name_elems:
        return empty
    name = name_elems[0].get_text(strip=True)

    # date elements: prefer the date of death, fall back to publication date
    ddateelem = soup.select('[class*=DeathNotice_dates-death-date]')
    pubdateelem = soup.select('[class*=DeathNotice_dates-published-date]')
    if ddateelem:
        date_text = ddateelem[0].get_text(strip=True)
    elif pubdateelem:
        print ('using pub date')
        date_text = pubdateelem[0].get_text(strip=True)
    else:
        print (n, name, 'no date')
        return empty

    # An unparsable date is handled like a missing one instead of raising
    # IndexError and aborting the whole scraping loop.
    try:
        date = next(iter(datefinder.find_dates(date_text)), None)
    except Exception:
        date = None
    if date is None:
        print (n, name, 'no date')
        return empty

    # The tags item may be absent or blank; fall back to empty strings
    # rather than indexing into an empty list.
    tag_elems = soup.select('[class*=DeathNotice_tags-item]')
    address = tag_elems[0].get_text(strip=True) if tag_elems else ''
    words = address.split()
    county = words[-1] if words else ''

    text = ' '.join(
        part.get_text(strip=True)
        for part in soup.select('[class*=DeathNotice_description]')
    )

    print (n, date, name, address, county)
    return name, date, county, address, text


In [None]:
# Try the new-site parser on a single id; the cell displays the returned tuple.
get_dn_page_new(50000)

In [None]:
# Load the previously scraped table; the cell displays its row count.
df = pd.read_pickle('rip_dn_scrape.pkl')
len(df)

In [48]:
#read current table in so we skip those already done
df = pd.read_pickle('rip_dn_scrape_new.pkl')
#df = pd.read_parquet('rip_dn_scrape.parquet')
print (len(df))
ids = list(df.id)

123913


## Iterate over a range of ids to fetch notice details

In [3]:
def get_ids(start, end):
    """Fetch the notices for ids in ``range(start, end)``.

    Ids already present in the module-level ``ids`` collection are skipped,
    as are ids for which ``get_dn_page_new`` returns an empty name.

    Args:
        start: first id to fetch (inclusive).
        end: id to stop before (exclusive).

    Returns:
        DataFrame with columns ``id``, ``name``, ``date``, ``county``,
        ``address`` and ``notice``, one row per notice fetched.
    """
    # Snapshot the known ids into a set once: testing membership in a list
    # of several hundred thousand ids for every candidate is a linear scan.
    done = set(ids)
    results = {}
    for n in range(start, end):
        if n in done:
            continue
        name, date, cty, addr, txt = get_dn_page_new(n)
        if name == '':
            continue
        results[n] = [name, date, cty, addr, txt]
        # NOTE(review): the pause only happens after a successful fetch;
        # ids that yield no notice are requested back to back.
        time.sleep(0.04)
    res = pd.DataFrame.from_dict(
        results,
        orient='index',
        columns=['name', 'date', 'county', 'address', 'notice'],
    ).reset_index()
    return res.rename(columns={'index': 'id'})
    
#res = get_ids(72000,73000)

In [4]:
def get_ids_parallel(start, end, n_cores=4):
    """Fetch ids in ``range(start, end)`` in blocks using a thread pool.

    The range is cut into ``n_cores`` contiguous half-open blocks
    ``[lo, hi)`` and each block is passed to ``get_ids`` on its own thread.

    Args:
        start: first id to fetch (inclusive).
        end: id to stop before (exclusive), matching ``get_ids``.
        n_cores: number of worker threads and of blocks (at least 1).

    Returns:
        The per-block DataFrames concatenated and sorted by ``id``.
    """
    #from multiprocessing import Pool
    from multiprocessing.pool import ThreadPool as Pool

    n_cores = max(1, int(n_cores))
    # n_cores + 1 edges give n_cores blocks. get_ids treats its end as
    # exclusive, so adjacent blocks share an edge; subtracting 1 from the
    # upper edge would silently skip the last id of every block.
    edges = [int(e) for e in np.linspace(start, end, n_cores + 1, dtype=int)]
    blocks = list(zip(edges[:-1], edges[1:]))
    print (blocks)

    pool = Pool(n_cores)
    try:
        pending = [pool.apply_async(get_ids, [lo, hi]) for lo, hi in blocks]
        result = [f.get(timeout=None) for f in pending]
    finally:
        # Always release the worker threads, even if a block raised.
        pool.close()
        pool.join()
    result = pd.concat(result).sort_values('id')
    print ('finished')
    return result

In [None]:
#read current table in so we skip those already done
df = pd.read_pickle('rip_dn_scrape.pkl')
#df = pd.read_parquet('rip_dn_scrape.parquet')
print (len(df))
ids = list(df.id)
#res = get_ids_parallel(530000,537000, n_cores=16)
res = get_ids(537000,529500)

In [None]:
# Display the rows returned by the scrape above.
res

In [53]:
# Append the freshly scraped rows to the existing table, keep the first row
# seen for each id, report the three row counts and overwrite the pickle.
new = (
    pd.concat([df, res])
    .reset_index(drop=True)
    .drop_duplicates(subset='id', keep='first')
)
#new=new.replace('',None).dropna(axis=0,subset=['date'])
#new['date'] = pd.to_datetime(new['date']).apply(lambda x: x.strftime('%d/%m/%Y'))
print (len(df),len(res),len(new))
#new.to_parquet('rip_dn_scrape.parquet')
new.to_pickle('rip_dn_scrape.pkl')

537095 89 537184


## clean data

In [54]:
# Clean the merged table: drop rows without a date, parse dates, remove
# duplicate notices, drop rows without an address or matching one of the
# county names in `nc`, then save the result sorted by id.
print (len(new))
x = new.replace('', None).dropna(axis=0, subset=['date'])
x = x.assign(date=pd.to_datetime(x['date'], format='mixed', errors='coerce'))
x = (
    x.drop_duplicates(['name','notice'])
    .drop_duplicates(['name','address'])
    .drop_duplicates(['name','date','county'])
)
x = x[x.address.notna()]
nc = ['Fermanagh','Armagh','Tyrone','Down','Antrim','Derry']
x = x[~x.county.isin(nc)]
# Also drop rows whose address text contains any of those county names.
nc_pattern = '|'.join(nc)
x = x[~x.address.str.contains(nc_pattern)]
x = x.sort_values('id')
print (len(x))
#x.to_pickle('rip_dn_scrape_processed.pkl')
x.to_parquet('rip_dn_scrape_processed.parquet')

537184
519427


  if _pandas_api.is_sparse(col):
