In [1]:
import os
import re
import unicodedata
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# —————————————————————————————————————————————————————————————————————————
def compute_halfway_day(month: int, year: int) -> int:
    """Default midpoint day for months (Feb leap years, 30 vs 31 days)."""
    if month == 2:
        return 15 if ((year % 400 == 0) or ((year % 4 == 0) and (year % 100 != 0))) else 14
    return 16 if month in {1,3,5,7,8,10,12} else 15

def parse_dates(val: str):
    """
    Parse a single date string into (year, month, day, is_synthetic).
    - “before”/“after”/“circa” ⇒ is_synthetic=1; shift by ∓1 day for before/after.
    - Otherwise, your existing slash/or, month-name, and halfway-day logic.
    """
    if pd.isnull(val) or not str(val).strip():
        return (np.nan, np.nan, np.nan, 0)

    s0 = str(val).strip().lower()
    qualifier = next((q for q in ("before","after","circa") if q in s0), None)
    synthetic = int(qualifier is not None)
    # strip qualifier so it doesn’t confuse parsing
    s = re.sub(r'\b(before|after|circa)\b', '', s0).strip()

    month_names = {
        'january':1,'february':2,'march':3,'april':4,
        'may':5,'june':6,'july':7,'august':8,
        'september':9,'october':10,'november':11,'december':12
    }

    year = month = day = None

    # (A) Slash/or logic
    if '/' in s or ' or ' in s:
        m = re.fullmatch(r'\s*(\d{3,4})\s*/\s*(\d{1,4})\s*', s)
        if m:
            a,b = m.groups()
            year = int(b) if len(b)>=len(a) else int(a[:len(a)-len(b)] + b)
            month, day = 1, 1
            synthetic = 1
        else:
            parts = s.split('/') if '/' in s else s.split(' or ')
            y1 = re.search(r'\d{3,4}', parts[0])
            y2 = re.search(r'\d+',       parts[1]) if len(parts)>1 else None
            if not y1:
                return (np.nan, np.nan, np.nan, 1)
            sec = y2.group(0) if y2 else y1.group(0)
            year = (int(sec) if len(sec)>=len(y1.group(0))
                    else int(y1.group(0)[:len(y1.group(0))-len(sec)] + sec))
            # month?
            found = next((n for n in month_names if n in s), None)
            if not found:
                month, day = 7, 1
                synthetic = 1
            else:
                month = month_names[found]
                if re.match(r'\d', s):
                    nums = re.findall(r'\d+', s)
                    if not nums:
                        return (np.nan, np.nan, np.nan, 1)
                    cand = int(nums[0])
                    if cand<=31:
                        day = cand
                    else:
                        day = compute_halfway_day(month, year)
                        synthetic = 1
                else:
                    day = compute_halfway_day(month, year)
                    synthetic = 1

    # (B) No slash/or
    else:
        nums = re.findall(r'\d+', s)
        if not nums:
            return (np.nan, np.nan, np.nan, 1)
        found = next((n for n in month_names if n in s), None)

        if found:
            month = month_names[found]
            if re.match(r'\d', s) and len(nums)>=2:
                day, year = int(nums[0]), int(nums[-1])
            else:
                year = int(nums[-1])
                day = compute_halfway_day(month, year)
                synthetic = 1
        else:
            if len(nums)==1:
                year, month, day = int(nums[0]), 7, 1
                synthetic = 1
            elif re.match(r'\d', s) and len(nums)>=2:
                day, year = int(nums[0]), int(nums[-1])
                month = 7
            else:
                year, month, day = int(nums[-1]), 7, 1
                synthetic = 1

    # apply before/after shift
    try:
        dt = datetime(year, month, day)
        if qualifier == "before":
            dt -= timedelta(days=1)
        elif qualifier == "after":
            dt += timedelta(days=1)
        year, month, day = dt.year, dt.month, dt.day
    except Exception:
        pass

    return (year, month, day, synthetic)

def create_datetime(row, prefix: str):
    """
    Build a Timestamp or datetime for a row using columns:
      {prefix}_year, {prefix}_month, {prefix}_day.
    Returns pd.NaT if any component is null or invalid.
    """
    y, m, d = row[f'{prefix}_year'], row[f'{prefix}_month'], row[f'{prefix}_day']
    if pd.isnull(y) or pd.isnull(m) or pd.isnull(d):
        return pd.NaT
    y,m,d = int(y), int(m), int(d)
    try:
        # native datetime for y<1678, else pandas.Timestamp
        return datetime(y,m,d) if y<1678 else pd.Timestamp(year=y,month=m,day=d)
    except ValueError:
        return pd.NaT

In [2]:

# ====================== File Paths & Data Loading ======================
rawpath = '../data/thepeerage/raw'
bad_stuff = '../data/thepeerage/bad_stuff'

source = pd.read_csv(os.path.join(rawpath, 'sources.tsv'), sep='\t')
peers = pd.read_csv(os.path.join(rawpath, 'british_peers_and_orders.tsv'), sep='\t')
raw = pd.read_csv(os.path.join(rawpath, 'entire_thepeerage.tsv'), sep='\t')

print(f"There are {len(raw[raw['ID'].isnull()])} rows of missing ids in the raw dataset")
raw[raw['ID'].isnull()].to_csv(os.path.join(bad_stuff, 'missing_person_ID.csv'))
print(f"These are saved to {os.path.join(bad_stuff, 'missing_person_ID.csv')}.")
raw = raw[raw['ID'].notnull()]
print("Dropping them")
print(f"There are {len(raw[raw['Page'].isnull()])} rows of missing pages in the raw dataset")
raw[raw['Page'].isnull()].to_csv(os.path.join(bad_stuff, 'missing_person_Page.csv'))
print(f"These are saved to {os.path.join(bad_stuff, 'missing_person_Page.csv')}.")
print(f"There are {len(peers[peers['id'].isnull()])} rows of missing ids in the peers dataset")
print(f"There are {len(peers[peers['type'].isnull()])} rows of missing types in the peers dataset")
print(f"There are {len(source[source['Page'].isnull()])} rows of missing pages in the source dataset")
print(f"There are {len(source[source['SourceID'].isnull()])} rows of missing sourceID in the source dataset")
print(f"There are {len(source[source['Source'].isnull()])} rows of missing Source in the source dataset")
print(f"There are {len(peers[~peers['id'].isin(raw['ID'])])} peer IDs not in the raw...")
peers[~peers['id'].isin(raw['ID'])].to_csv(os.path.join(bad_stuff, 'peers_not_in_raw_ID.csv'))
print(f"These are saved to {os.path.join(bad_stuff, 'peers_not_in_raw_ID.csv')}.")
peers = peers[peers['id'].isin(raw['ID'])]
print("Dropping them")
peers = peers.set_index('id')
df = raw
df = df.set_index('ID')
print(f"We begin with {len(df)} rows of the raw df")

There are 97 rows of missing ids in the raw dataset
These are saved to ../data/thepeerage/bad_stuff/missing_person_ID.csv.
Dropping them
There are 0 rows of missing pages in the raw dataset
These are saved to ../data/thepeerage/bad_stuff/missing_person_Page.csv.
There are 0 rows of missing ids in the peers dataset
There are 0 rows of missing types in the peers dataset
There are 0 rows of missing pages in the source dataset
There are 0 rows of missing sourceID in the source dataset
There are 0 rows of missing Source in the source dataset
There are 4 peer IDs not in the raw...
These are saved to ../data/thepeerage/bad_stuff/peers_not_in_raw_ID.csv.
Dropping them
We begin with 757192 rows of the raw df


In [3]:

print("Now, let's merge on our peerage data.")
peers = peers.rename({'type': 'type_of_peer'}, axis=1)
df['is_peer'] = 0
df['type_of_peer'] = ''
for index, row in peers.iterrows():
    key     = 'type_of_peer'
    current = df.loc[index, key]
    new_val = row['type_of_peer']
    if current:
        df.loc[index, key] = f"{current};{new_val}"
    else:
        df.loc[index, key] = new_val
    # …and likewise for is_* flags…

df['is_child_of_peer'] = 0
df['is_grandchild_of_peer'] = 0
df['Extracted Parental Peerage'] = '' # for harony with our hollingsworth processing

Now, let's merge on our peerage data.


In [4]:
import numpy as np
import pandas as pd

# ──────────────────────────────────────────────────────────────
# Helper: append a peer‐type into a semicolon‐delimited cell,
# avoiding duplicates and empty tokens.
# Returns a sorted, semicolon‐delimited string of unique types.
# ──────────────────────────────────────────────────────────────
def append_peerage(cell: str, new_type: str) -> str:
    tokens = [t for t in cell.split(';') if t]
    types  = set(tokens)
    types.add(new_type)
    return ';'.join(sorted(types))

# ──────────────────────────────────────────────────────────────
# 0. Ensure all needed columns exist, and reset peerage columns
# ──────────────────────────────────────────────────────────────
for flag in ['is_child_of_peer', 'is_grandchild_of_peer']:
    if flag not in df.columns:
        df[flag] = 0
    else:
        df[flag] = df[flag].astype(int).clip(0,1)

for col in ['Extracted Parental Peerage', 'Extracted Grandparental Peerage']:
    df[col] = ''  # reset to empty string

# ──────────────────────────────────────────────────────────────
# 1. Ensure 'child' column is string‐typed (no NaNs)
# ──────────────────────────────────────────────────────────────
df['child'] = df['child'].fillna('').astype(str)

# ──────────────────────────────────────────────────────────────
# 2. Traverse peers → children → grandchildren
# ──────────────────────────────────────────────────────────────
child_not_found      = set()
grandchild_not_found = set()

for peer_id in peers.index:
    peer_type = df.at[peer_id, 'type_of_peer']
    child_list = df.at[peer_id, 'child']
    if not child_list:
        continue  # no children → skip

    # ── Handle direct children ─────────────────────────────────
    for child_str in child_list.split(';'):
        if not child_str:
            continue
        cid = int(child_str)
        if cid not in df.index:
            child_not_found.add(cid)
            continue

        # Mark as child and append parental peerage
        df.at[cid, 'is_child_of_peer'] = 1
        existing_pp = df.at[cid, 'Extracted Parental Peerage']
        df.at[cid, 'Extracted Parental Peerage'] = append_peerage(existing_pp, peer_type)

        # ── Handle grandchildren of this peer ──────────────────
        grand_list = df.at[cid, 'child']
        if not grand_list:
            continue

        for grand_str in grand_list.split(';'):
            if not grand_str:
                continue
            gid = int(grand_str)
            if gid not in df.index:
                grandchild_not_found.add(gid)
                continue

            # Mark as grandchild and append grandparental peerage
            df.at[gid, 'is_grandchild_of_peer'] = 1
            existing_gpp = df.at[gid, 'Extracted Grandparental Peerage']
            df.at[gid, 'Extracted Grandparental Peerage'] = append_peerage(existing_gpp, peer_type)

# At the end:
# - df['Extracted Parental Peerage']  contains unique parent‐peer types
# - df['Extracted Grandparental Peerage'] contains unique grandparent‐peer types
# - child_not_found and grandchild_not_found list any missing IDs


In [5]:
df['Extracted Parental Peerage'].value_counts()

Extracted Parental Peerage
                                                                                                            697578
baronet                                                                                                      27060
baron                                                                                                         7621
baron;baron;baronet;baronet                                                                                   2731
baron;earl;earl;viscount;baron;viscount                                                                       2116
                                                                                                             ...  
baron_by_writ;duke;duke;earl;baron_by_writ;earl                                                                  1
baron;baronet;earl;jacobite;marquess;marquess;earl;viscount;baron;baronet;jacobite;jacobite;viscount             1
baron;earl;marquess;marquess;earl;baron;baron;baron;v

In [6]:
import re
import unicodedata
import pandas as pd

# Expanded list of honourifics/ranks...
honorifics = [
    # Civilian titles
    'Mr', 'Mrs', 'Ms', 'Miss', 'Mx',
    'Dr', 'Prof', 'Professor',
    'Sir', 'Dame', 'Lord', 'Lady',
    'Rev', 'Reverend', 'Father', 'Fr', 'Pastor',
    'Rabbi', 'Imam',
    'Hon', 'Honorable', 'Right Honourable', 'Rt Hon',
    # Army ranks
    'Private', 'Pvt',
    'Corporal', 'Cpl',
    'Sergeant', 'Sgt',
    'Lieutenant', 'Lt', 
    'Captain', 'Capt',
    'Major', 'Maj',
    'Lieutenant Colonel', 'Lt Col', 'Lt-Col',
    'Colonel', 'Col',
    'Brigadier', 'Brigadier General', 'Brigadier-General',
    'Major General', 'Maj Gen', 'Maj-Gen',
    'Lieutenant General', 'Lt Gen', 'Lt-Gen',
    'General', 'Gen',
    'Field Marshal',
    # Naval ranks
    'Seaman', 'Able Seaman', 'AB',
    'Petty Officer', 'Chief Petty Officer',
    'Midshipman',
    'Lieutenant Commander',
    'Commander',
    'Captain RN',
    'Commodore',
    'Rear Admiral',
    'Vice Admiral',
    'Admiral',
    # Air-Force ranks
    'Pilot Officer',
    'Flying Officer',
    'Flight Lieutenant',
    'Squadron Leader',
    'Wing Commander',
    'Group Captain',
    'Air Commodore',
    'Air Vice Marshal', 'Air Vice-Marshal',
    'Air Marshal',
    'Air Chief Marshal',
    'Marshal of the RAF',
    # Religious / other
    'Saint', 'St'
]

def escape_with_optional_dot(title: str) -> str:
    parts = title.split()
    return r'\s+'.join(re.escape(p) + r'\.?' for p in parts)

# Compile honourific‐stripping pattern
escaped_fragments = (escape_with_optional_dot(h) for h in honorifics)
hon_pattern = re.compile(
    r'^(?:' + r'|'.join(escaped_fragments) + r')\b\s*',
    flags=re.IGNORECASE
)

def strip_honorifics(name: str) -> str:
    """
    1. Drop any 'formerly ...' suffix.
    2. Remove leading honourific/rank.
    """
    if pd.isna(name):
        return name
    # 1) Keep only text before 'formerly'
    name = re.split(r'\bformerly\b', name, flags=re.IGNORECASE)[0].strip()
    # 2) Strip leading honorific
    return hon_pattern.sub('', name).strip()

def clean_string(s: str) -> str:
    """
    1. Remove parenthesized content.
    2. Normalize to NFKD and drop diacritics.
    3. Replace dashes with space.
    4. Keep only ASCII letters and spaces.
    5. Collapse multiple spaces and trim.
    """
    if pd.isna(s):
        return s
    s = re.sub(r'\(.*?\)', '', s)
    s = unicodedata.normalize('NFKD', s)
    s = re.sub(r'[-–—]+', ' ', s)
    s = re.sub(r'[^A-Za-z ]+', '', s)
    return re.sub(r'\s{2,}', ' ', s).strip()

# Apply to DataFrame
df['fullname_clean'] = df['fullname'].apply(strip_honorifics).str.upper()
df['First Forename'] = df['fullname_clean'].str.split().str[0]
df['Last Surname']   = df['fullname_clean'].str.split().str[-1]

for col in ['fullname_clean', 'First Forename', 'Last Surname']:
    df[col] = df[col].apply(clean_string)

# View the cleaned columns
df[['fullname_clean', 'First Forename', 'Last Surname']]


Unnamed: 0_level_0,fullname_clean,First Forename,Last Surname
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,CHARLES PHILIP ARTHUR GEORGE MOUNTBATTEN WINDSOR,CHARLES,MOUNTBATTEN WINDSOR
2.0,OTTON GRAF VON BALLENSTEDT,OTTON,BALLENSTEDT
3.0,JOSEPH GRAF VON HOHENZOLLERN SIGMARINGEN,JOSEPH,HOHENZOLLERN SIGMARINGEN
4.0,KARL FRIEDRICH GRAF VON HOHENZOLLERN SIGMARINGEN,KARL,HOHENZOLLERN SIGMARINGEN
5.0,EITEL FRIEDRICH IV GRAF VON HOHENZOLLERN HECHI...,EITEL,HOHENZOLLERN HECHINGEN
...,...,...,...
759726.0,ANASTATIA BEEVOR,ANASTATIA,BEEVOR
759727.0,JOHN BEEVOR,JOHN,BEEVOR
759728.0,LAMBERT BLACKWELL FOSTER,LAMBERT,FOSTER
759729.0,MARY GREENE BROWNE,MARY,BROWNE


In [7]:
            
print(f"We have {len(child_not_found)} children not found")
print(f"We have {len(grandchild_not_found)} grandchildren not found")

# Save these two sets to CSV files.
child_not_found_df = pd.DataFrame({'child_not_found': list(child_not_found)})
child_not_found_file = os.path.join(bad_stuff, 'child_not_found.csv')
child_not_found_df.to_csv(child_not_found_file, index=False)
print(f"Child not found list saved to: {child_not_found_file}")

grandchild_not_found_df = pd.DataFrame({'grandchild_not_found': list(grandchild_not_found)})
grandchild_not_found_file = os.path.join(bad_stuff, 'grandchild_not_found.csv')
grandchild_not_found_df.to_csv(grandchild_not_found_file, index=False)
print(f"Grandchild not found list saved to: {grandchild_not_found_file}")

We have 0 children not found
We have 0 grandchildren not found
Child not found list saved to: ../data/thepeerage/bad_stuff/child_not_found.csv
Grandchild not found list saved to: ../data/thepeerage/bad_stuff/grandchild_not_found.csv


In [8]:
# 1) PARSE both born & died into (year,month,day,is_synthetic)
df[['born_year','born_month','born_day','is_synthetic_birthdate']] = (
    df['born']
      .apply(lambda x: pd.Series(parse_dates(x)))
)

df[['died_year','died_month','died_day','is_synthetic_deathdate']] = (
    df['died']
      .apply(lambda x: pd.Series(parse_dates(x)))
)

# 2) BUILD actual datetime columns, using the same prefixes:
df['born_datetime'] = df.apply(
    lambda row: create_datetime(row, prefix='born'),
    axis=1
)
df['died_datetime'] = df.apply(
    lambda row: create_datetime(row, prefix='died'),
    axis=1
)

# 3) (Optional) A human-readable string form:
df['born_datetime_str'] = df['born_datetime'].apply(
    lambda x: x.strftime('%d-%m-%Y') if pd.notnull(x) else np.nan
)

df['died_datetime_str'] = df['died_datetime'].apply(
    lambda x: x.strftime('%d-%m-%Y') if pd.notnull(x) else np.nan
)


In [9]:
# Lets now make it as equivilent as possible to hollingsworth:
df = df.rename({'fullname_clean': 'Full_Name_l'}, axis=1)
df = df.rename({'First Forename': 'First Forename_l'}, axis=1)
df = df.rename({'Last Surname': 'Last Surname_l'}, axis=1)
df = df.rename({'gender': 'Gender_l'}, axis=1)
df = df.rename({'born_day': 'born_day_l'}, axis=1)
df = df.rename({'born_month': 'born_month_l'}, axis=1)
df = df.rename({'born_year': 'born_year_l'}, axis=1)
df = df.rename({'is_synthetic_birthdate': 'born_accuracy_l'}, axis=1)
df = df.rename({'died_day': 'died_day_l'}, axis=1)
df = df.rename({'died_month': 'died_month_l'}, axis=1)
df = df.rename({'died_year': 'died_year_l'}, axis=1)
df = df.rename({'is_synthetic_deathdate': 'died_accuracy_l'}, axis=1)
df = df.rename({'born_datetime': 'born_datetime_l'}, axis=1)
df = df.rename({'died_datetime': 'died_datetime_l'}, axis=1)
df = df.rename({'Extracted Parental Peerage': 'Extracted Parental Peerage_l'}, axis=1)

In [10]:
df = df[['Full_Name_l',
         'First Forename_l',
         'Last Surname_l',
         'Gender_l',
         'born_day_l',
         'born_month_l',
         'born_year_l',
         'born_accuracy_l',
         'died_day_l',
         'died_month_l',
         'died_year_l',
         'died_accuracy_l',
         'born_datetime_l',
         'died_datetime_l',
         'Extracted Parental Peerage_l']]

In [11]:
print(f"We end with {len(df)} rows of the df")
print(f"Saving out to ../data/thepeerage/wrangled/wrangled_peerage.csv")
df.to_csv('../data/thepeerage/wrangled/wrangled_peerage.csv')

We end with 757192 rows of the df
Saving out to ../data/thepeerage/wrangled/wrangled_peerage.csv
