# Find-a-grave

In [3]:
import ast
import json
import random
import re
import sys
import time
from collections import defaultdict
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil.parser import ParserError
from dateutil.parser import parse as parseDate
from tqdm import tqdm

PATH_TO_UTILS = "../"  # change based on your directory structure
sys.path.append(PATH_TO_UTILS)

from utils import check_mf_formatting

In [23]:
# Requested dtypes: the Find a Grave ID (used as the index) as int,
# "str" for any other column looked up in this mapping.
types = defaultdict(lambda: "str", {"?findAGraveID": int})
wd_entries = pd.read_csv(
    "wikidata_entries.tsv",
    sep="\t",
    dtype=types,
    parse_dates=["?birthdays", "?deathdays"],
    index_col="?findAGraveID",
)

## Fetching from Find a Grave

In [9]:
def make_request(url, max_retries=3, initial_backoff=2, multiplier=2, max_backoff=16, **request_params):
    """GET ``url``, retrying failed requests with capped exponential backoff.

    A 404 response is returned to the caller instead of being treated as an
    error; every other 4xx/5xx status is raised by ``raise_for_status`` and
    retried like a connection failure.

    Args:
        url: Address to fetch.
        max_retries: Total number of attempts before giving up.
        initial_backoff: Seconds to wait after the first failed attempt.
        multiplier: Factor applied to the wait after each further failure.
        max_backoff: Upper bound, in seconds, for a single wait.
        **request_params: Forwarded to ``requests.get``. ``timeout`` defaults
            to 30 seconds when the caller does not provide one.

    Returns:
        The ``requests.Response`` of the first attempt that succeeded or
        answered with a 404.

    Raises:
        Exception: When every attempt failed. The last request error is
            attached as ``__cause__``.
    """
    # requests applies no timeout by default, so a stalled connection would
    # block the whole scrape forever instead of being retried.
    request_params.setdefault("timeout", 30)

    backoff = initial_backoff
    last_error = None

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, **request_params)
            if response.status_code != 404:
                response.raise_for_status()
            return response
        except requests.RequestException as e:
            last_error = e
            print(f"Request failed: {e}")
            # No point in sleeping after the final attempt.
            if attempt < max_retries:
                print(f"Retrying in {backoff} seconds (retry {attempt}/{max_retries})")
                time.sleep(backoff)
                backoff = min(backoff * multiplier, max_backoff)

    raise Exception(f"Max retries reached, could not complete request for {url}") from last_error

# test
# make_request("http://foobar.com/")

In [5]:
# Adapted from https://www.zenrows.com/blog/user-agent-web-scraping
# More here https://useragentstring.com/pages/useragentstring.php
# Pool of browser User-Agent strings; one is picked per request.
# Each entry needs its trailing comma: adjacent string literals without one are
# silently concatenated by Python into a single (invalid) User-Agent.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
]

In [None]:
base_url = "https://www.findagrave.com/memorial/"

# Resume from the checkpoint written by the scraping loop, if there is one.
# Only "no usable checkpoint" is treated as a fresh start; any other failure
# (corrupt file, interrupt, ...) must surface, because starting from an empty
# list would make the loop overwrite the rows already scraped.
try:
    acc = pd.read_csv('findagrave_entries.csv').to_dict(orient="records")
except (FileNotFoundError, pd.errors.EmptyDataError):
    acc = []
# Position of the last scraped entry (-1 when nothing has been scraped yet).
i = len(acc) - 1

# Fallback for date components missing from the page (e.g. a bare year).
# dateutil otherwise fills them in from today's date, which makes the scraped
# value depend on the day the notebook is run.
_DATE_DEFAULT = datetime(1, 1, 1)


def _parse_memorial_date(date_text, shown_text=None):
    """Return ``date_text`` as an ISO-8601 string ending in 'Z', or NaN if unparsable."""
    try:
        return parseDate(date_text, default=_DATE_DEFAULT).isoformat() + 'Z'
    except ParserError:
        print("could not parse", date_text if shown_text is None else shown_text)
        return np.nan


def _label_text(soup, element_id):
    """Return the stripped text of the element with ``element_id``, or NaN if absent."""
    element = soup.find(id=element_id)
    return element.text.strip() if element is not None else np.nan


def construct_row(id, newId=None):
    """Scrape one Find a Grave memorial page into a flat record.

    Args:
        id: Memorial ID to look up; always reported back under "?findAGraveID".
        newId: Memorial ID to fetch instead of ``id``. Set on the recursive
            call made when the page for ``id`` is a 404 that links to a
            merged memorial.

    Returns:
        A dict keyed like the Wikidata export ("?name", "?birthdays",
        "?birthplaces", "?deathdays", "?deathplaces", "?burials", "?plots",
        "?siblings") plus "?findAGraveID" and "!newId". Fields missing from
        the page are NaN. For a 404 page without a merge link only
        "?findAGraveID" and an empty "!newId" are returned.

    Raises:
        AttributeError: If the page lacks the birth date, death date or name
            element. For a missing birth date the page HTML is printed first.
        Exception: Propagated from ``make_request`` when the page cannot be
            fetched.
    """
    headers = {'User-Agent': random.choice(user_agents)}
    res = make_request(base_url + str(newId if newId else id), headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')

    if res.status_code == 404:
        a = soup.find("a", string=" See Merged Memorial")
        if a is None:
            # Memorial has been removed
            return {
                "?findAGraveID": id,
                "!newId": ""
            }
        # Follow the merge link, keeping the original ID for the record.
        return construct_row(id, a.get("href").split('/')[2])

    try:
        birth_text = soup.find(id="birthDateLabel").text
    except AttributeError:
        # Unexpected page layout: dump the page so it can be inspected.
        print(res.text)
        raise
    birthdays = _parse_memorial_date(birth_text)

    death_text = soup.find(id="deathDateLabel").text
    # Drop parenthesised parts of the death label before parsing.
    deathdays = _parse_memorial_date(re.sub(r"\([^)]+\)", "", death_text), shown_text=death_text)

    # Prefer the cemetery name; otherwise fall back to the text around the
    # country element, with runs of spaces/newlines collapsed.
    cemetery_name = soup.find(id="cemeteryNameLabel")
    cemetery_country = soup.find(id="cemeteryCountryName")
    if cemetery_name is not None:
        burials = cemetery_name.text.strip()
    elif cemetery_country is not None:
        burials = re.sub("[ \n]+", " ", cemetery_country.parent.text.strip())
    else:
        burials = np.nan

    sibling_names = [
        re.sub(" +", " ", elem.find("h3", recursive=True).text.strip())
        for elem in soup.find_all(attrs={'aria-labelledby': 'siblingLabel'})
    ]

    return {
        "?findAGraveID": id,
        "!newId": newId,
        "?name": soup.find(id="bio-name").find(string=True).strip(),
        "?birthdays": birthdays,
        "?birthplaces": _label_text(soup, "birthLocationLabel"),
        "?deathdays": deathdays,
        "?deathplaces": _label_text(soup, "deathLocationLabel"),
        "?burials": burials,
        "?plots": _label_text(soup, "plotValueLabel"),
        "?siblings": ';'.join(sibling_names)
    }

# Scrape every entry that is not in the checkpoint yet. `i` continues from the
# last checkpointed position, and the CSV is rewritten after every entry
# (successful or not) so an interrupted run can be resumed.
for i, id in enumerate(tqdm(wd_entries.index[len(acc):]), start=i + 1):
    try:
        acc.append(construct_row(id))
    except Exception:
        # Report which memorial failed before letting the error propagate.
        print(base_url + str(id), "idx", i)
        raise
    finally:
        pd.DataFrame(acc).to_csv("findagrave_entries.csv", index=False)


In [11]:
# Show the first scraped record to sanity-check the parsed fields.
acc[0]

{'?findAGraveID': 1,
 '!newId': nan,
 '?name': 'Cleveland Abbe',
 '?birthdays': '1838-12-03T00:00:00Z',
 '?birthplaces': 'New York, New York County, New York, USA',
 '?deathdays': '1916-10-28T00:00:00Z',
 '?deathplaces': 'Chevy Chase, Montgomery County, Maryland, USA',
 '?burials': 'Rock Creek Cemetery',
 '?plots': 'Section M, Lot 292, Range 5',
 '?siblings': 'Walter Abbe'}

## Comparing

In [24]:
# Not fully implemented
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell re-runnable: a second run no longer fails on already-dropped columns.
wd_entries = wd_entries.drop(columns=["?fathers", "?mothers", "?siblings"], errors="ignore")

# Normalize dfs
# Give the Wikidata frame the same "!newId" column the scraped frame has;
# guarded because insert() raises if the column already exists.
if "!newId" not in wd_entries.columns:
    wd_entries.insert(loc=0, column="!newId", value=np.nan)
# Statement-GUID columns and the entity URI have no Find a Grave counterpart.
columns_to_drop = [col for col in wd_entries.columns if '_guid' in col]
comparable_wd_entries = wd_entries.drop(columns=[*columns_to_drop, "?person"])

In [25]:
# Last rows of the Wikidata frame, with "!newId" added and the unsupported columns dropped.
wd_entries.tail()

Unnamed: 0_level_0,!newId,?person,?name,?birthdays,?birthdays_guids,?birthplaces,?birthplaces_guids,?deathdays,?deathdays_guids,?deathplaces,?deathplaces_guids,?burials,?burials_guids,?plots,?plots_guids,?fathers_guids,?mothers_guids,?siblings_guids
?findAGraveID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
9999,,<http://www.wikidata.org/entity/Q555910>,Fred Quimby@en,1886-07-31T00:00:00Z,http://www.wikidata.org/entity/statement/q5559...,Fred Quimby,http://www.wikidata.org/entity/statement/q5559...,1965-09-16T00:00:00Z,http://www.wikidata.org/entity/statement/q5559...,Fred Quimby,http://www.wikidata.org/entity/statement/Q5559...,Fred Quimby,http://www.wikidata.org/entity/statement/q5559...,,,,,
99992961,,<http://www.wikidata.org/entity/Q3108870>,Pascual Pérez@en,1957-05-17T00:00:00Z,http://www.wikidata.org/entity/statement/Q3108...,Pascual Pérez,http://www.wikidata.org/entity/statement/Q3108...,2012-11-01T00:00:00Z,http://www.wikidata.org/entity/statement/Q3108...,Pascual Pérez,http://www.wikidata.org/entity/statement/Q3108...,,,,,,,
99996384,,<http://www.wikidata.org/entity/Q65801651>,Joseph Kain@en,1854-01-01T00:00:00Z,http://www.wikidata.org/entity/statement/Q6580...,,,1907-01-01T00:00:00Z,http://www.wikidata.org/entity/statement/Q6580...,,,,,,,,,
99999223,,<http://www.wikidata.org/entity/Q63616730>,John Rogers@en,1692-01-19T00:00:00Z,http://www.wikidata.org/entity/statement/Q6361...,,,1773-10-16T00:00:00Z,http://www.wikidata.org/entity/statement/Q6361...,,,,,,,,,
99999919,,<http://www.wikidata.org/entity/Q1687534>,Jeremiah D. M. Ford@en,1873-07-02T00:00:00Z,http://www.wikidata.org/entity/statement/Q1687...,Jeremiah D. M. Ford,http://www.wikidata.org/entity/statement/q1687...,1958-11-13T00:00:00Z,http://www.wikidata.org/entity/statement/Q1687...,Jeremiah D. M. Ford,http://www.wikidata.org/entity/statement/q1687...,Jeremiah D. M. Ford,http://www.wikidata.org/entity/statement/Q1687...,,,,,


In [26]:
# Last rows of the Wikidata frame reduced to the columns that are compared.
comparable_wd_entries.tail()

Unnamed: 0_level_0,!newId,?name,?birthdays,?birthplaces,?deathdays,?deathplaces,?burials,?plots
?findAGraveID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9999,,Fred Quimby@en,1886-07-31T00:00:00Z,Fred Quimby,1965-09-16T00:00:00Z,Fred Quimby,Fred Quimby,
99992961,,Pascual Pérez@en,1957-05-17T00:00:00Z,Pascual Pérez,2012-11-01T00:00:00Z,Pascual Pérez,,
99996384,,Joseph Kain@en,1854-01-01T00:00:00Z,,1907-01-01T00:00:00Z,,,
99999223,,John Rogers@en,1692-01-19T00:00:00Z,,1773-10-16T00:00:00Z,,,
99999919,,Jeremiah D. M. Ford@en,1873-07-02T00:00:00Z,Jeremiah D. M. Ford,1958-11-13T00:00:00Z,Jeremiah D. M. Ford,Jeremiah D. M. Ford,


In [27]:
# Load the scraped checkpoint, indexed by the same ID column as the Wikidata frame.
grave_entries = pd.read_csv("findagrave_entries.csv", index_col="?findAGraveID")

In [28]:
# Thanks to https://stackoverflow.com/a/61954604
def drop_uncommon_rows(df1: pd.DataFrame, df2: pd.DataFrame):
    """Return the rows of ``df1`` whose index label also appears in ``df2``'s index.

    Row order and columns of ``df1`` are kept; ``df2`` only supplies the labels.
    """
    shared = df1.index.isin(df2.index)
    return df1.loc[shared]
# Keep only the memorials present in both frames: first trim the Wikidata
# frame to the scraped IDs, then trim the scraped frame to what is left.
comparable_wd_entries = drop_uncommon_rows(comparable_wd_entries, grave_entries)
grave_entries = drop_uncommon_rows(grave_entries, comparable_wd_entries)

In [29]:
# Not sure if correctly distinguished between siblings or parents, so remove
# grave_entries is the result of a boolean filter; dropping via reassignment
# avoids mutating that filtered frame in place, and errors="ignore" keeps the
# cell re-runnable once the column is gone.
grave_entries = grave_entries.drop(columns=["?siblings"], errors="ignore")
grave_entries.tail()

Unnamed: 0_level_0,!newId,?name,?birthdays,?birthplaces,?deathdays,?deathplaces,?burials,?plots
?findAGraveID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9999,,Fred Quimby,1886-07-31T00:00:00Z,"Minneapolis, Hennepin County, Minnesota, USA",1965-09-16T00:00:00Z,"Santa Monica, Los Angeles County, California, USA",Forest Lawn Memorial Park,"Court of Freedom section, Map #G41, Lot 4265, ..."
99992961,,Pascual Gross Perez,1957-05-17T00:00:00Z,"San Cristóbal, Municipio de San Cristóbal, San...",2012-11-01T00:00:00Z,"San Gregorio de Nigua, Municipio de San Gregor...",Cementerio Municipal,
99996384,,Joseph Kain,1854-12-04T00:00:00Z,Scotland,1907-05-03T00:00:00Z,"Chicago, Cook County, Illinois, USA",Oakwood Cemetery,
99999223,248358353.0,Rev John Rogers Jr.,1692-06-27T00:00:00Z,"Ipswich, Essex County, Massachusetts, USA",1773-10-16T00:00:00Z,"Eliot, York County, Maine, USA",Leighton Family Cemetery,
99999919,,Jeremiah Denis Mathias Ford,1873-07-02T00:00:00Z,"Cambridge, Middlesex County, Massachusetts, USA",1958-11-13T00:00:00Z,"Arlington, Middlesex County, Massachusetts, USA",Saint Paul Cemetery,


In [30]:
# Both frames must expose the same fields for the same number of memorials
# before they are compared. `assert` is a statement, not a function, so it is
# written without call parentheses and with a message that shows what differs.
assert set(comparable_wd_entries.columns) == set(grave_entries.columns), (
    "column mismatch: "
    f"{sorted(set(comparable_wd_entries.columns) ^ set(grave_entries.columns))}"
)
assert len(comparable_wd_entries) == len(grave_entries), (
    f"row count mismatch: {len(comparable_wd_entries)} wikidata vs {len(grave_entries)} findagrave"
)

In [41]:
# Docs: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.compare.html
# Side-by-side differences per field; the name columns are then left out of the result.
diff = (
    comparable_wd_entries
    .compare(grave_entries, align_axis=1, result_names=("wikidata", "findagrave"))
    .drop(columns=["?name"])
)
diff.tail()

Unnamed: 0_level_0,!newId,!newId,?birthdays,?birthdays,?birthplaces,?birthplaces,?deathdays,?deathdays,?deathplaces,?deathplaces,?burials,?burials,?plots,?plots
Unnamed: 0_level_1,wikidata,findagrave,wikidata,findagrave,wikidata,findagrave,wikidata,findagrave,wikidata,findagrave,wikidata,findagrave,wikidata,findagrave
?findAGraveID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
9999,,,,,Fred Quimby,"Minneapolis, Hennepin County, Minnesota, USA",,,Fred Quimby,"Santa Monica, Los Angeles County, California, USA",Fred Quimby,Forest Lawn Memorial Park,,"Court of Freedom section, Map #G41, Lot 4265, ..."
99992961,,,,,Pascual Pérez,"San Cristóbal, Municipio de San Cristóbal, San...",,,Pascual Pérez,"San Gregorio de Nigua, Municipio de San Gregor...",,Cementerio Municipal,,
99996384,,,1854-01-01T00:00:00Z,1854-12-04T00:00:00Z,,Scotland,1907-01-01T00:00:00Z,1907-05-03T00:00:00Z,,"Chicago, Cook County, Illinois, USA",,Oakwood Cemetery,,
99999223,,248358353.0,1692-01-19T00:00:00Z,1692-06-27T00:00:00Z,,"Ipswich, Essex County, Massachusetts, USA",,,,"Eliot, York County, Maine, USA",,Leighton Family Cemetery,,
99999919,,,,,Jeremiah D. M. Ford,"Cambridge, Middlesex County, Massachusetts, USA",,,Jeremiah D. M. Ford,"Arlington, Middlesex County, Massachusetts, USA",Jeremiah D. M. Ford,Saint Paul Cemetery,,


In [40]:
# List the (field, source) column pairs present in the diff.
diff.columns

MultiIndex([(      '!newId',   'wikidata'),
            (      '!newId', 'findagrave'),
            (  '?birthdays',   'wikidata'),
            (  '?birthdays', 'findagrave'),
            ('?birthplaces',   'wikidata'),
            ('?birthplaces', 'findagrave'),
            (  '?deathdays',   'wikidata'),
            (  '?deathdays', 'findagrave'),
            ('?deathplaces',   'wikidata'),
            ('?deathplaces', 'findagrave'),
            (    '?burials',   'wikidata'),
            (    '?burials', 'findagrave'),
            (      '?plots',   'wikidata'),
            (      '?plots', 'findagrave')],
           )