# Find-a-grave

In [2]:
import re
import ast
import sys
import json
import time
import random
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from dateutil.parser import ParserError
from dateutil.parser import parse as parseDate

PATH_TO_UTILS = "../"  # change based on your directory structure
sys.path.append(PATH_TO_UTILS)

from utils import check_mf_formatting

In [23]:
%load_ext jupyter_black

In [24]:
# Every column is read as text except the Find a Grave ID, which is parsed as
# an integer and becomes the index of the frame.
types = defaultdict(lambda: "str")
types["?findAGraveID"] = int
wd_entries = pd.read_csv(
    "wikidata_entries.tsv",
    sep="\t",
    index_col="?findAGraveID",
    dtype=types,
)

In [None]:
# Lift the row limit so every duplicated Find a Grave ID is listed.
pd.set_option("display.max_rows", None)
# keep=False flags every member of a duplicate group, not only the repeats.
dupes = wd_entries.index.duplicated(keep=False)
wd_entries.loc[dupes].sort_index()

In [26]:
# Back to a short preview for the rest of the notebook.
pd.options.display.max_rows = 10

## Fetching from find a grave

In [None]:
def make_request(
    url,
    max_retries=3,
    initial_backoff=2,
    multiplier=2,
    max_backoff=16,
    **request_params,
):
    """GET ``url``, retrying failed attempts with exponential backoff.

    A 404 response is handed back to the caller instead of being treated as a
    failure. Any other error status (via ``raise_for_status``) or any
    ``requests.RequestException`` counts as a failed attempt.

    Args:
        url: Address to fetch.
        max_retries: Total number of attempts before giving up.
        initial_backoff: Seconds to wait after the first failed attempt.
        multiplier: Factor applied to the wait after each failed attempt.
        max_backoff: Upper bound, in seconds, on the wait between attempts.
        **request_params: Extra keyword arguments forwarded to ``requests.get``.
            A ``timeout`` of 30 seconds is supplied when the caller gives none.

    Returns:
        The response of the first attempt that succeeds or answers 404.

    Raises:
        Exception: When every attempt failed. The last request error is
            attached as ``__cause__`` so the root problem stays visible.
    """
    # requests waits indefinitely on a stalled connection unless told otherwise.
    request_params.setdefault("timeout", 30)
    last_error = None
    backoff = initial_backoff

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, **request_params)
            if response.status_code != 404:
                response.raise_for_status()
            return response
        except requests.RequestException as e:
            last_error = e
            print(f"Request failed: {e}")
            if attempt < max_retries:
                print(f"Retrying in {backoff} seconds (retry {attempt}/{max_retries})")
                time.sleep(backoff)
                backoff = min(backoff * multiplier, max_backoff)

    raise Exception(
        f"Max retries reached, could not complete request for {url}"
    ) from last_error

# test
# make_request("http://foobar.com/")

In [None]:
# Adapted from https://www.zenrows.com/blog/user-agent-web-scraping
# More here https://useragentstring.com/pages/useragentstring.php
# One entry per browser. The trailing commas matter: without them Python joins
# adjacent string literals into a single string.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
]

In [46]:
base_url = "https://www.findagrave.com/memorial/"
# Resume from a previous run when a checkpoint exists; otherwise start fresh.
# Only "no usable checkpoint yet" is handled here. Any other failure (for
# example a corrupt file) surfaces instead of silently restarting the scrape
# and overwriting the file.
try:
    acc = pd.read_csv("findagrave_entries.csv").to_dict(orient="records")
except (FileNotFoundError, pd.errors.EmptyDataError):
    acc = []
# Position of the last row already scraped (-1 when nothing has been scraped).
i = len(acc) - 1


def _label_text(soup, element_id):
    """Return the stripped text of the element with ``element_id``, or NaN if absent."""
    tag = soup.find(id=element_id)
    return tag.text.strip() if tag is not None else np.nan


def construct_row(id, newId=None):
    """Scrape one Find a Grave memorial page into a flat record.

    Args:
        id: Memorial ID as recorded in the Wikidata export.
        newId: ID of the memorial that ``id`` was merged into. It is filled in
            by the recursive call when a 404 page links to a merged memorial.

    Returns:
        A dict keyed by the notebook's column names. A removed memorial (404
        with no merge link) yields only ``?findAGraveID`` and an empty
        ``!newId``. Fields missing from the page are NaN; ``?siblings`` is a
        ``;``-joined string.

    Raises:
        AttributeError: When an expected element (birth/death date label or
            the name) is missing. For the birth label the raw HTML is printed
            first.
        Exception: Propagated from ``make_request`` when the page cannot be
            fetched.
    """
    headers = {"User-Agent": random.choice(user_agents)}
    res = make_request(base_url + str(newId if newId else id), headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    if res.status_code == 404:
        a = soup.find("a", string=" See Merged Memorial")
        if a is None:
            # Memorial has been removed
            return {"?findAGraveID": id, "!newId": ""}
        # Follow the merge link; the third path segment of its href is used as the new ID.
        return construct_row(id, a.get("href").split("/")[2])

    # NOTE(review): dateutil fills date parts missing from the label using
    # today's date when no `default` is passed, so a year-only label would get
    # the run date's month/day. Confirm whether that is intended.
    birth_label = soup.find(id="birthDateLabel")
    try:
        birthdays = parseDate(birth_label.text).isoformat() + "Z"
    except ParserError:
        print("could not parse", birth_label.text)
        birthdays = np.nan
    except AttributeError:
        # No birth label at all: dump the page for debugging, then fail.
        print(res.text)
        raise

    death_label = soup.find(id="deathDateLabel")
    try:
        # Strip any parenthesised suffix from the label before parsing.
        deathdays = (
            parseDate(re.sub(r"\([^)]+\)", "", death_label.text)).isoformat() + "Z"
        )
    except ParserError:
        print("could not parse", death_label.text)
        deathdays = np.nan

    name = soup.find(id="bio-name").find(string=True).strip()

    # Prefer the cemetery name; otherwise fall back to the whitespace-collapsed
    # text of the element that contains the country name.
    cemetery = soup.find(id="cemeteryNameLabel")
    country = soup.find(id="cemeteryCountryName")
    if cemetery is not None:
        burials = cemetery.text.strip()
    elif country is not None:
        burials = re.sub("[ \n]+", " ", country.parent.text.strip())
    else:
        burials = np.nan

    siblings = ";".join(
        re.sub(" +", " ", elem.find("h3", recursive=True).text.strip())
        for elem in soup.find_all(attrs={"aria-labelledby": "siblingLabel"})
    )

    return {
        "?findAGraveID": id,
        "!newId": newId,
        "?name": name,
        "?birthdays": birthdays,
        "?birthplaces": _label_text(soup, "birthLocationLabel"),
        "?deathdays": deathdays,
        "?deathplaces": _label_text(soup, "deathLocationLabel"),
        "?burials": burials,
        "?plots": _label_text(soup, "plotValueLabel"),
        "?siblings": siblings,
    }


# Scrape every Wikidata entry that is not in the checkpoint yet. The CSV is
# rewritten after each attempt (success or failure) so an interrupted run can
# pick up where it stopped.
for i, id in enumerate(tqdm(wd_entries.index[len(acc) :]), start=len(acc)):
    try:
        row = construct_row(id)
        acc.append(row)
    except Exception:
        # Show which memorial failed before re-raising.
        print(base_url + str(id), "idx", i)
        raise
    finally:
        pd.DataFrame(acc).to_csv("findagrave_entries.csv", index=False)

0it [00:00, ?it/s]


In [47]:
# Inspect the first scraped record as a sanity check.
acc[0]

{'?findAGraveID': 1,
 '!newId': nan,
 '?name': 'Cleveland Abbe',
 '?birthdays': '1838-12-03T00:00:00Z',
 '?birthplaces': 'New York, New York County, New York, USA',
 '?deathdays': '1916-10-28T00:00:00Z',
 '?deathplaces': 'Chevy Chase, Montgomery County, Maryland, USA',
 '?burials': 'Rock Creek Cemetery',
 '?plots': 'Section M, Lot 292, Range 5',
 '?siblings': 'Walter Abbe'}

## Comparing

In [27]:
# Remove every row whose Find a Grave ID appears more than once. The mask is
# recomputed from the current index, so the cell can be re-run and does not
# rely on a `dupes` array that matches an older version of the frame.
wd_entries = wd_entries[~wd_entries.index.duplicated(keep=False)]

In [28]:
# This is a bad idea b/c mismatch df expects nans
# wd_entries = wd_entries.fillna('')

In [29]:
# Not fully implemented
# Non-in-place drop with errors="ignore" keeps this cell re-runnable.
wd_entries = wd_entries.drop(
    columns=["?fathers", "?mothers", "?siblings", "?name"], errors="ignore"
)

# Normalize dfs
if "!newId" not in wd_entries.columns:
    wd_entries.insert(loc=0, column="!newId", value=np.nan)
columns_to_drop = [col for col in wd_entries.columns if "_guid" in col]
comparable_wd_entries = wd_entries.drop(columns=[*columns_to_drop, "?person"])

In [30]:
# Preview the Wikidata frame, which still carries ?person and the *_guids columns.
wd_entries.head()

Unnamed: 0_level_0,!newId,?person,?birthdays,?birthdays_guids,?birthplaces,?birthplaces_guids,?deathdays,?deathdays_guids,?deathplaces,?deathplaces_guids,?burials,?burials_guids,?plots,?plots_guids,?fathers_guids,?mothers_guids,?siblings_guids
?findAGraveID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5422,,<http://www.wikidata.org/entity/Q6227279>,1860-10-15T00:00:00Z,http://www.wikidata.org/entity/statement/q6227...,Chicago,http://www.wikidata.org/entity/statement/Q6227...,1938-11-11T00:00:00Z,http://www.wikidata.org/entity/statement/q6227...,,,Calvary Cemetery,http://www.wikidata.org/entity/statement/Q6227...,,,,,
15751646,,<http://www.wikidata.org/entity/Q96088451>,1897-01-01T00:00:00Z;1898-01-01T00:00:00Z,http://www.wikidata.org/entity/statement/Q9608...,,,1918-08-26T00:00:00Z;1918-08-20T00:00:00Z,http://www.wikidata.org/entity/statement/Q9608...,,,,,,,http://www.wikidata.org/entity/statement/Q9608...,http://www.wikidata.org/entity/statement/Q9608...,http://www.wikidata.org/entity/statement/Q9608...
16562169,,<http://www.wikidata.org/entity/Q4647456>,1911-03-27T00:00:00Z,http://www.wikidata.org/entity/statement/q4647...,Sulphur Rock,http://www.wikidata.org/entity/statement/Q4647...,1970-06-11T00:00:00Z,http://www.wikidata.org/entity/statement/q4647...,San Francisco,http://www.wikidata.org/entity/statement/Q4647...,,,,,,,
81351176,,<http://www.wikidata.org/entity/Q4647455>,1842-01-18T00:00:00Z,http://www.wikidata.org/entity/statement/Q4647...,Garden Prairie,http://www.wikidata.org/entity/statement/Q4647...,1911-11-16T00:00:00Z,http://www.wikidata.org/entity/statement/Q4647...,Minneapolis,http://www.wikidata.org/entity/statement/Q4647...,Lakewood Cemetery,http://www.wikidata.org/entity/statement/Q4647...,,,,,
114764126,,<http://www.wikidata.org/entity/Q116344568>,1905-01-01T00:00:00Z;1904-01-01T00:00:00Z,http://www.wikidata.org/entity/statement/Q1163...,Lancaster County,http://www.wikidata.org/entity/statement/Q1163...,1985-01-01T00:00:00Z,http://www.wikidata.org/entity/statement/Q1163...,Pennsylvania,http://www.wikidata.org/entity/statement/Q1163...,,,,,,,


In [31]:
# Preview the trimmed frame that has ?person and the *_guids columns removed.
comparable_wd_entries.tail()

Unnamed: 0_level_0,!newId,?birthdays,?birthplaces,?deathdays,?deathplaces,?burials,?plots
?findAGraveID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
194988853,,1900-04-15T00:00:00Z,Częstochowa,1954-06-18T00:00:00Z,Warsaw,Powązki Military Cemetery,
239983804,,1870-01-25T00:00:00Z,Lviv,1950-07-23T00:00:00Z,Zakopane,New Cemetery in Zakopane,
161613819,,1910-03-12T00:00:00Z,Stryi,1951-02-08T00:00:00Z,Warsaw,Powązki Military Cemetery,
253062274,,1824-05-01T00:00:00Z,Międzyrzecz,1887-08-14T00:00:00Z,Lviv,Lychakiv Cemetery,
11298961,,1845-10-28T00:00:00Z,Hrodna,1888-04-16T00:00:00Z,Kraków,Rakowicki Cemetery,


In [32]:
# Load the scraped records from the checkpoint CSV, indexed by Find a Grave ID.
grave_entries = pd.read_csv("findagrave_entries.csv", index_col="?findAGraveID")

In [33]:
# Thanks to https://stackoverflow.com/a/61954604
def drop_uncommon_rows(df1: pd.DataFrame, df2: pd.DataFrame):
    """Return the rows of ``df1`` whose index label also occurs in ``df2``'s index."""
    shared = df1.index.isin(df2.index)
    return df1.loc[shared]


# Keep only the Find a Grave IDs present in both frames, filtering each frame
# against the other's index.
comparable_wd_entries = drop_uncommon_rows(comparable_wd_entries, grave_entries)
grave_entries = drop_uncommon_rows(grave_entries, comparable_wd_entries)

In [34]:
# Not sure if correctly distinguished between siblings or parents, so remove
# Non-in-place drop with errors="ignore" keeps this cell re-runnable.
grave_entries = grave_entries.drop(columns=["?siblings", "?name"], errors="ignore")
grave_entries.tail()

Unnamed: 0_level_0,!newId,?birthdays,?birthplaces,?deathdays,?deathplaces,?burials,?plots
?findAGraveID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9999,,1886-07-31T00:00:00Z,"Minneapolis, Hennepin County, Minnesota, USA",1965-09-16T00:00:00Z,"Santa Monica, Los Angeles County, California, USA",Forest Lawn Memorial Park,"Court of Freedom section, Map #G41, Lot 4265, ..."
99992961,,1957-05-17T00:00:00Z,"San Cristóbal, Municipio de San Cristóbal, San...",2012-11-01T00:00:00Z,"San Gregorio de Nigua, Municipio de San Gregor...",Cementerio Municipal,
99996384,,1854-12-04T00:00:00Z,Scotland,1907-05-03T00:00:00Z,"Chicago, Cook County, Illinois, USA",Oakwood Cemetery,
99999223,248358353.0,1692-06-27T00:00:00Z,"Ipswich, Essex County, Massachusetts, USA",1773-10-16T00:00:00Z,"Eliot, York County, Maine, USA",Leighton Family Cemetery,
99999919,,1873-07-02T00:00:00Z,"Cambridge, Middlesex County, Massachusetts, USA",1958-11-13T00:00:00Z,"Arlington, Middlesex County, Massachusetts, USA",Saint Paul Cemetery,


## Construct mismatch-finder-compatible dataframe
It'll probably be easier to compare iteratively instead of using `.compare`, unfortunately.

In [35]:
# Wikidata property ID for each column name used in this notebook.
nameToPid = dict(
    [
        ("?birthdays", "P569"),
        ("?findAGraveID", "P535"),
        ("?birthplaces", "P19"),
        ("?deathdays", "P570"),
        ("?deathplaces", "P20"),
        ("?burials", "P119"),
        ("?plots", "P965"),
    ]
)

In [36]:
def _split_values(cell):
    """Split a ';'-joined cell into a list; a missing (NaN) cell becomes ``[nan]``."""
    if not isinstance(cell, str) and np.isnan(cell):
        return [np.nan]
    return cell.split(";")


def _same_value(wikiVal, extVal):
    """Return True when the two values are equal or are both NaN."""
    try:
        return bool(wikiVal == extVal or (np.isnan(wikiVal) and np.isnan(extVal)))
    except TypeError:
        # np.isnan rejects strings: the values differ and at least one is text.
        return False


def _shares_word(wikiVal, extVal):
    """Return True when any whitespace-separated word of ``wikiVal`` occurs in ``extVal``."""
    if not (isinstance(wikiVal, str) and isinstance(extVal, str)):
        return False
    return any(word in extVal for word in wikiVal.split())


def compare(wiki, ext):
    """Build the mismatch table between Wikidata values and scraped values.

    Args:
        wiki: Frame indexed like ``ext`` that holds a ``?person`` column plus,
            for each compared column, the value column and its ``_guids``
            column. Multiple values/guids in one cell are ``;``-separated.
        ext: Frame of external values. Columns whose name starts with ``!``
            are skipped.

    Returns:
        A DataFrame with one row per differing (value, guid) pair. The column
        set is fixed, so an empty result still has every column.
    """
    columns = [
        "item_id",
        "statement_guid",
        "property_id",
        "wikidata_value",
        "meta_wikidata_value",
        "external_value",
        "external_url",
        "type",
    ]
    mismatches = []

    for index, extRow in tqdm(ext.iterrows(), total=len(ext)):
        # The QID is the same for every column of a row, so extract it once.
        qid = re.split("[/>]", wiki["?person"].loc[index])[-2]

        for col in extRow.index:
            # TODO: properly generate id change mismatches
            if col[0] == "!":
                continue

            extVal = extRow.loc[col]
            wikiVals = _split_values(wiki[col].loc[index])
            wikiGuids = _split_values(
                wiki[col + "_guids"].loc[index] if col[0] == "?" else np.nan
            )

            # KNOWN ISSUE: the two lists can differ in length. E.g. Q505270 has
            # 2 birthdates, one specific and one less specific. The SPARQL query
            # grabs both guids, but only the first date. Not fixing because, at
            # present, mismatch finder can't automatically reconcile changes.
            # This also means a wikiVal may not line up with its wikiGuid.
            # assert(len(wikiVals) == len(wikiGuids))
            for wikiVal, wikiGuid in zip(wikiVals, wikiGuids):
                if _same_value(wikiVal, extVal):
                    continue

                # Test if any substrings exist. If yes, probably the same
                if _shares_word(wikiVal, extVal):
                    continue

                mismatches.append(
                    {
                        "item_id": qid,
                        "statement_guid": wikiGuid,
                        "property_id": nameToPid[col],
                        "wikidata_value": wikiVal,
                        "meta_wikidata_value": np.nan,
                        "external_value": extVal,
                        "external_url": "https://www.findagrave.com/memorial/"
                        + str(index),
                        "type": "statement",
                    }
                )

    return pd.DataFrame(mismatches, columns=columns)


# Compare against the full Wikidata frame: compare() reads its ?person and
# *_guids columns, which comparable_wd_entries no longer has.
diff = compare(wd_entries, grave_entries)

100%|██████████| 139029/139029 [00:36<00:00, 3832.25it/s]


In [37]:
# Preview the first mismatch rows.
diff.head()

Unnamed: 0,item_id,statement_guid,property_id,wikidata_value,meta_wikidata_value,external_value,external_url,type
0,Q505270,,P965,,,"Section M, Lot 292, Range 5",https://www.findagrave.com/memorial/1,statement
1,Q180989,http://www.wikidata.org/entity/statement/Q1809...,P570,1935-05-22T00:00:00Z,,1935-05-21T00:00:00Z,https://www.findagrave.com/memorial/10,statement
2,Q180989,,P965,,,"Front right section, Row 9, Lot 8",https://www.findagrave.com/memorial/10,statement
3,Q349690,http://www.wikidata.org/entity/statement/q3496...,P20,Cedars-Sinai Medical Center,,"Los Angeles, Los Angeles County, California, USA",https://www.findagrave.com/memorial/100,statement
4,Q349690,,P965,,,"Garden of the Exodus (formerly Pineland, Secti...",https://www.findagrave.com/memorial/100,statement


In [38]:
# Remove rows where Wikidata has a value but Find a Grave does not: with no
# external value there is nothing to report as a mismatch.
diff = diff.dropna(subset=["external_value"])

In [39]:
# Run the project's Mismatch Finder format check on the result before exporting it.
check_mf_formatting(diff)

All checks have passed! The data is ready to be uploaded to Mismatch Finder.


In [40]:
# Report how many mismatch rows remain.
print("Congrats! You've found", len(diff), "mismatches!")

Congrats! You've found 262419 mismatches!


In [41]:
# Write the mismatches to CSV without the positional index column.
diff.to_csv("findagrave_mismatches.csv", index=False)