In [1]:
import re
import ast
import sys
import json
import random
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from dateutil.parser import parse as parseDate
from dateutil.parser import ParserError

PATH_TO_UTILS = "../"  # change based on your directory structure
sys.path.append(PATH_TO_UTILS)

from utils import check_mf_formatting

In [2]:
# Load the tab-separated Wikidata export; low_memory=False makes pandas read
# the file in one pass instead of inferring dtypes chunk by chunk.
wd_entries = pd.read_csv(
    "wikidata_entries.tsv",
    sep="\t",
    low_memory=False,
)

In [3]:
# Show the last five rows as a quick check of the loaded columns.
wd_entries.tail(n=5)

Unnamed: 0,?findAGraveID,?person,?name,?birthdays,?birthplaces,?deathdays,?deathplaces,?burials,?plots,?fathers,?mothers,?siblings
140550,9999,<http://www.wikidata.org/entity/Q555910>,Fred Quimby@en,1886-07-31T00:00:00Z,Minneapolis,1965-09-16T00:00:00Z,Santa Monica,Forest Lawn Memorial Park,,,,
140551,99992961,<http://www.wikidata.org/entity/Q3108870>,Pascual Pérez@en,1957-05-17T00:00:00Z,San Cristóbal,2012-11-01T00:00:00Z,San Gregorio de Nigua,,,,,
140552,99996384,<http://www.wikidata.org/entity/Q65801651>,Joseph Kain@en,1854-01-01T00:00:00Z,,1907-01-01T00:00:00Z,,,,,,
140553,99999223,<http://www.wikidata.org/entity/Q63616730>,John Rogers@en,1692-01-19T00:00:00Z,,1773-10-16T00:00:00Z,,,,,,
140554,99999919,<http://www.wikidata.org/entity/Q1687534>,Jeremiah D. M. Ford@en,1873-07-02T00:00:00Z,Cambridge,1958-11-13T00:00:00Z,Cambridge,Saint Pauls Cemetery,,,,


In [4]:
# Adapted from https://www.zenrows.com/blog/user-agent-web-scraping
# More here https://useragentstring.com/pages/useragentstring.php
# Pool of browser User-Agent strings; one is picked at random per request.
# Each entry needs its trailing comma: without it Python concatenates adjacent
# string literals, collapsing the pool into a single malformed entry.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
]

In [5]:
base_url = "https://www.findagrave.com/memorial/"

# Resume from the rows saved by a previous run, if there are any.
try:
    acc = pd.read_csv('findagrave_entries.csv').to_dict(orient="records")
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No checkpoint yet (or an empty one): start from scratch. Any other
    # failure (e.g. a corrupt file) is allowed to propagate so that existing
    # progress is not silently discarded and later overwritten.
    acc = []
# Index of the last row already processed; -1 when nothing has been scraped.
i = len(acc) - 1

def _parse_date_label(soup, label_id, strip_parenthetical=False):
    """Parse the date in the element with id ``label_id`` into an ISO string.

    Returns the parsed date as ``<isoformat>Z``, or NaN when the element is
    missing from the page or its text cannot be parsed (the raw text is
    printed in the latter case). With ``strip_parenthetical`` set, any
    ``(...)`` segment is removed from the text before parsing.
    """
    label = soup.find(id=label_id)
    if label is None:
        # The page has no such label at all; treat it like an unknown date.
        return np.nan
    raw_text = label.text
    date_text = re.sub(r"\([^)]+\)", "", raw_text) if strip_parenthetical else raw_text
    try:
        # NOTE(review): dateutil fills fields absent from the text (e.g. a
        # year-only date) from its `default`, which is today's date when not
        # supplied -- confirm that is acceptable for partial dates.
        return parseDate(date_text).isoformat() + 'Z'
    except ParserError:
        print("could not parse", raw_text)
        return np.nan


def _label_text(soup, label_id):
    """Return the stripped text of the element with id ``label_id``, or NaN if absent."""
    label = soup.find(id=label_id)
    return label.text.strip() if label is not None else np.nan


def construct_row(id, newId=None, timeout=30):
    """Scrape one Find a Grave memorial page into a row dict.

    Parameters
    ----------
    id : the memorial ID as listed in the Wikidata export; always stored
        under ``"?findAGraveID"``.
    newId : the ID to actually request when the original memorial was merged
        into another one; stored under ``"!newId"``.
    timeout : seconds to wait for the server before ``requests`` raises,
        so a stalled connection cannot hang the whole scrape.

    Returns
    -------
    dict
        For a 404 page without a " See Merged Memorial" link, only
        ``"?findAGraveID"`` and an empty ``"!newId"``. For a 404 page with
        such a link, the row scraped from the linked memorial. Otherwise the
        full row; fields whose element is missing from the page are NaN.
    """
    headers = {'User-Agent': random.choice(user_agents)}
    memorial_id = newId if newId else id
    with requests.get(base_url + str(memorial_id), headers=headers, timeout=timeout) as res:
        soup = BeautifulSoup(res.text, 'html.parser')
        if res.status_code == 404:
            merged_link = soup.find("a", string=" See Merged Memorial")
            if merged_link is None:
                # Memorial has been removed
                return {
                    "?findAGraveID": id,
                    "!newId": ""
                }
            # The third path segment of the link is used as the new memorial ID.
            return construct_row(id, merged_link.get("href").split('/')[2], timeout=timeout)

        # Prefer the cemetery name; fall back to the text around the country
        # element when no name is present.
        cemetery_name = soup.find(id="cemeteryNameLabel")
        cemetery_country = soup.find(id="cemeteryCountryName")
        if cemetery_name is not None:
            burials = cemetery_name.text.strip()
        elif cemetery_country is not None:
            burials = re.sub("[ \n]+", " ", cemetery_country.parent.text.strip())
        else:
            burials = np.nan

        sibling_nodes = soup.find_all(attrs={'aria-labelledby': 'siblingLabel'})
        siblings = ';'.join(
            re.sub(" +", " ", node.find("h3", recursive=True).text.strip())
            for node in sibling_nodes
        )

        return {
            "?findAGraveID": id,
            "!newId": newId,
            "?name": soup.find(id="bio-name").find(string=True).strip(),
            "?birthdays": _parse_date_label(soup, "birthDateLabel"),
            "?birthplaces": _label_text(soup, "birthLocationLabel"),
            # The death label can carry a parenthesised suffix; drop it before parsing.
            "?deathdays": _parse_date_label(soup, "deathDateLabel", strip_parenthetical=True),
            "?deathplaces": _label_text(soup, "deathLocationLabel"),
            "?burials": burials,
            "?plots": _label_text(soup, "plotValueLabel"),
            "?siblings": siblings
        }

# Rows scraped between intermediate checkpoint writes. Rewriting the whole CSV
# after every single row makes the total write cost grow quadratically with
# the number of rows, so save periodically and once more on exit instead.
CHECKPOINT_EVERY = 100

start_idx = len(acc)  # rows already present in `acc` are skipped
try:
    for i, grave_id in enumerate(tqdm(wd_entries["?findAGraveID"][start_idx:]), start=start_idx):
        try:
            acc.append(construct_row(grave_id))
        except Exception:
            # Report which memorial failed, then let the error surface.
            print(base_url + str(grave_id), "idx", i)
            raise
        if len(acc) % CHECKPOINT_EVERY == 0:
            pd.DataFrame(acc).to_csv("findagrave_entries.csv", index=False)
finally:
    # Always persist progress, whether the loop finished, failed, or was
    # interrupted. Skip the write when nothing was scraped so that no empty
    # checkpoint file is produced.
    if acc:
        pd.DataFrame(acc).to_csv("findagrave_entries.csv", index=False)


  0%|          | 208/140555 [02:04<24:57:41,  1.56it/s]

could not parse c.1286


  0%|          | 411/140555 [04:37<35:01:03,  1.11it/s]

could not parse unknown


  0%|          | 413/140555 [04:38<27:21:01,  1.42it/s]

could not parse unknown


  0%|          | 427/140555 [04:50<26:29:11,  1.47it/s]

https://www.findagrave.com/memorial/10061758 idx 427





AttributeError: 'NoneType' object has no attribute 'text'

In [6]:
# Spot-check the first record held in `acc`.
acc[0]

{'?findAGraveID': 100102475,
 '!newId': '88066179',
 '?name': 'George Lee Thurston III',
 '?birthdays': '1925-10-01T00:00:00Z',
 '?birthplaces': 'Culpeper, Culpeper County, Virginia, USA',
 '?deathdays': '2001-03-20T00:00:00Z',
 '?deathplaces': 'Tallahassee, Leon County, Florida, USA',
 '?burials': 'Oakland Cemetery',
 '?plots': 'Block / Lot or Sec # / Space J / LOT 28, SEC 2 / 1, Lot Descr',
 '?siblings': ''}

In [None]:
# `acc` is a plain list of row dicts and has no `to_csv`; wrap it in a
# DataFrame before writing.
# NOTE(review): confirm `acc` is the data intended for mismatches.csv.
pd.DataFrame(acc).to_csv("mismatches.csv", index=False)