In [2]:
import pandas as pd
from indralib.indra_time import IndraTime
import re

In [3]:
# https://en.wikipedia.org/wiki/List_of_timelines

In [4]:
def remove_footnotes(text, numeric_only=True, single_letter_alpha=True):
    if numeric_only is False:
        text = re.sub(r"\[.*?\]", "", text)
    else:
        text = re.sub(r"\[\d+\]", "", text)
        if single_letter_alpha is True:
            text = re.sub(r"\[\w\]", "", text)
    return text

In [33]:
def extract_date_remarks(date):
    remarks = ""
    approxies = ["ca.", "c.", "circa", "fl.", "born", "died in", "Died", "died", "buried", "early", "or later", "later", "late", "mid", "or before", "before", 
                 "after", "around", "approximately", "~", ">", "<", "≈", "≥", "≤", "(?)", "?"]
    for ap in approxies:
        if ap in date:
            date = date.replace(ap, "").strip()
            if date != "":
                remarks = f"{ap}"
    if ' or ' in date:
        idx = date.find(' or ')
        remarks = date[idx+1:] 
        date = date[:idx]
    if date.endswith('s'):
        remarks = date
        date = date[:-1]
    if "±" in date:
        dates = date.split("±")
        date = dates[0].strip()
        append = dates[1].strip().split(" ")
        if len(append) > 1:
            date = date + " " + append[1]
            remarks = "±" + append[0]
        else:
            remark = "±" + dates[1].strip()
    date = date.replace("  ", " ")
    return date, remarks


In [34]:
def pre_clean(date):
    if isinstance(date, str):
        date = remove_footnotes(date)
        date = date.replace("–", " - ").replace("–", "-").replace("—", "-").replace(" ", " ").replace("\u2009", " ") \
                   .replace(" to ", " - ").replace(",", "").replace("\xa0", " ").replace("  ", " ") \
                   .replace("AD", "").replace("BCE", "BC").replace("  ", " ").replace(" mya", " ma bp").strip()
    return date

In [35]:
def decenturize(date):
    bc = False
    cent = False
    remarks = ""
    org_date = date
    postf = ['st', 'nd', 'rd', 'th']
    if ' century BC' in date:
        bc = True
        fpf = False
        for pf in postf:
            if pf in date:
                date = date.replace(pf, '')
                fpf = True
                break
        if fpf is True:
            date = date.replace(' century BC', '')
            cent = True
    elif ' century' in date:
        if 'c. ' in date:
            date = date.replace("c. ", "")
        fpf = False
        for pf in postf:
            if pf in date:
                date = date.replace(pf, '')
                fpf = True
                break
        if fpf is True:
            date = date.replace(' century', '')
            cent = True
    if cent is False:
        return org_date, remarks
    century = date.strip()
    try:
        int_cent = int(century)
    except:
        return org_date, remarks
    if bc is True:
        if int_cent == 1:
            date = f"100 BC - 1 BC"
        else:
            date = f"{int_cent}00 BC - {int_cent-1}00 BC"
    else:
        if int_cent == 1:
            date = "1 - 100"
        else:
            date = f"{int_cent-1}00 - {int_cent}00"
    return date, org_date

In [36]:
def check_date_parts(parts):
    for p in parts:
        for c in p:
            if c <'0' or c>'9':
                return False
    return True

In [37]:
def date_clean(date, default_scale = None):
    remarks = ""
    if isinstance(date, str):
        date = pre_clean(date)
        date_sub = date.split("-")
        if len(date_sub)==3 or (len(date_sub)==2 and " - " not in date and check_date_parts(date_sub)):
            return date, remarks
        if default_scale is not None:
            date = date.split("-")
            if len(date) == 1:
                date = date[0].strip()+f" {default_scale}"
            else:
                date = f"{date[0].strip()} {default_scale} - {date[1].strip()} {default_scale}"
        else:            
            dates = date.split(" - ")
            new_dates = []
            for di in dates:
                dj, rem = extract_date_remarks(di)
                if len(rem)>0:
                    if len(remarks) == 0:
                        remarks = rem
                    else:
                        remarks += f", {rem}"
                dj, rem = decenturize(dj)
                if len(rem)>0:
                    if len(remarks) == 0:
                        remarks = rem
                    else:
                        remarks += f", {rem}"
                if "/" in dj:
                    dj_parts = dj.split("/")
                    dj = dj_parts[0].strip()
                    alt_dj_stub = dj_parts[1].strip()
                    alt_dj = dj[:len(dj)-len(alt_dj_stub)] + alt_dj_stub
                    rem = f"Alt.: {alt_dj}"       
                    if len(remarks) == 0:
                        remarks = rem
                    else:
                        remarks += f", {rem}"
                new_dates.append(dj)
            date = " - ".join(new_dates)
            dates = date.split(" - ")
            if len(dates) == 2:
                sub_dates0 = dates[0].split(" ")
                sub_dates1 = dates[1].split(" ")
                if len(sub_dates0)==1 and len(sub_dates1)==2:
                    date = f"{sub_dates0[0]} {sub_dates1[1]} - {dates[1]}"
    else:
        if default_scale is not None:
            date = f"{date} {default_scale}"
        else:
            date = str(int(date))
    jd_dates = IndraTime.string_time_to_julian(date)
    
    if len(jd_dates) > 1 and jd_dates[1] is not None:
        date_start = IndraTime.julian_to_string_time(jd_dates[0])
        date_end = IndraTime.julian_to_string_time(jd_dates[1])
        date = f"{date_start} - {date_end}"
    else:
        date = IndraTime.julian_to_string_time(jd_dates[0])

    return date, remarks

In [38]:
def date_merge(year, rest):
    day = None
    month = None
    year = pre_clean(year)
    rest = rest.strip()
    year = year.strip()
    if rest is not None and len(rest) > 0:
        rparts = rest.split(" ")
        if len(rparts) == 1:
            month = rparts[0].strip().lower()
        else:
            try:
                month = rparts[1].strip().lower()
                day = int(rparts[0].strip())
            except ValueError:
                month = None
                day = None
            if month is None and day is None:
                try:
                    month = rparts[0].strip().lower()
                    day = int(rparts[1].strip())
                except ValueError:
                    month = None
                    day = None
    else:
        month = None
        day = None
    if month is not None:
        val_months = ["january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]
        try:
            month_id = val_months.index(month) + 1
        except ValueError:
            month_id = None
            return f"{year}"
        year_parts = str(year).replace("\xa0"," ").split(" ", 1)
        if len(year_parts) > 1:
            year = year_parts[0]
            appendix = year_parts[1]
        else:
            appendix = ""
        if day is not None:
            date = f"{year}-{month_id:02d}-{day:02d}"
            if appendix != "":
                date = date + f" {appendix}"
        else:
            date = f"{year}-{month_id:02d}"
            if appendix != "":
                date = date + f" {appendix}"
        return date
    else:
        return f"{year}"

In [39]:
url= "https://de.wikipedia.org/wiki/Liste_homininer_Fossilien"
tables = pd.read_html(url)

In [50]:
tables[1]

Unnamed: 0,Abbildung,Name,Alter,Art,Jahr der Entdeckung,Fundort,Erstbeschreibung
0,,KNM-ER 1813,"1,9 mya",Homo habilis,1973.0,Kenia,Kamoya Kimeu
1,,KNM-ER 1470,"1,9 mya",Homo rudolfensis (Holotypus),1972.0,Kenia,Bernard Ngeneo
2,,SK 48,"1,8 mya",Paranthropus robustus,1948.0,Südafrika,Robert Broom
3,,OH 5 „Zinj“ oder „Nussknackermensch“ (+ Unter...,"1,8 mya",Paranthropus boisei (Holotypus),1959.0,Tansania,Mary Leakey
4,OH 8 (Hand) bei talkorigins.org,OH 8,"1,8 mya",Homo habilis,1960.0,Tansania,Jonathan Leakey
5,,OH 24 („Twiggy“),"1,8 mya",Homo habilis,1968.0,Tansania,Peter Nzube
6,OH 62 bei msu.edu,OH 62 („Dik-dik hominid“),"1,8 mya",Homo habilis,1986.0,Tansania,Tim White
7,OH 65 bei talkorigins.org,OH 65,"1,8 mya",Homo habilis,1995.0,Tansania,Fidelis Masao
8,,D 2700,"1,8 mya",Homo erectus ergaster georgicus,2001.0,Georgien,
9,,OH 7 „Jonnys Child“,"1,75 mya",Homo habilis (Holotypus),1960.0,Tansania,Jonathan Leakey


In [49]:
hist = []
for table_index in range(5):
    print(f"Table {table_index}")
    for i, row in enumerate(tables[table_index].iterrows()):
        remarks = ""
        date_raw = row[1].iloc[2]
        if pd.isna(date_raw):
            continue
        date_raw = date_raw.replace(',', '.')
        date_inter = date_raw
        dps = date_raw.split(' ')
    
        if len(dps)>1 and dps[1] == 'mya':
            if '–' in dps[0]:
                dpsi = dps[0].split('–')
                date_inter = f"{dpsi[0]} mya - {dpsi[1]} " + ' '.join(dps[1:])
                
            
        # print(f"Date_raw: {date_raw}")
        if pd.isna(date_inter):
            continue
        date, remarks = date_clean(date_inter)
        print(f"cv d:{date}, r:{remarks}")
        name = row[1].iloc[1]
        if pd.isna(name):
            continue
        jd = IndraTime.string_time_to_julian(date)
        if len(jd) == 1:
            jd = (jd[0], jd[0])
        hist.append((jd, date, remarks, name))

entries = sorted(hist, key=lambda x: x[0])
for entry in entries:
    _, date, remarks, name = entry
    print(f"| {date} | {remarks} | {name} |")

Table 0
cv d:7000.0 kya BP - 6000.0 kya BP, r:
cv d:6000.0 kya BP, r:
cv d:5500.0 kya BP - 5800.0 kya BP, r:
cv d:4400.0 kya BP, r:
cv d:4200.0 kya BP - 3800.0 kya BP, r:
cv d:3600.0 kya BP, r:
cv d:3800.0 kya BP - 3600.0 kya BP, r:
cv d:3580.0 kya BP, r:
cv d:3500.0 kya BP, r:
cv d:3300.0 kya BP, r:(?)
cv d:3300.0 kya BP, r:
cv d:3200.0 kya BP - 3000.0 kya BP, r:
cv d:3400.0 kya BP, r:
cv d:3500.0 kya BP - 3000.0 kya BP, r:
cv d:3200.0 kya BP, r:
cv d:3200.0 kya BP, r:
cv d:3000.0 kya BP, r:
cv d:2800.0 kya BP - 2750.0 kya BP, r:
cv d:2800.0 kya BP - 2600.0 kya BP, r:
cv d:2500.0 kya BP, r:
cv d:2500.0 kya BP, r:
cv d:2500.0 kya BP, r:
cv d:2500.0 kya BP, r:
cv d:2400.0 kya BP, r:
cv d:2400.0 kya BP, r:
cv d:2300.0 kya BP, r:
cv d:2160.0 kya BP - 2050.0 kya BP, r:
cv d:2040.0 kya BP - 1950.0 kya BP, r:
cv d:2000.0 kya BP, r:
cv d:2000.0 kya BP, r:
Table 1
cv d:1900.0 kya BP, r:
cv d:1900.0 kya BP, r:
cv d:1800.0 kya BP, r:
cv d:1800.0 kya BP, r:
cv d:1800.0 kya BP, r:
cv d:1800.0 kya 

ValueError: Invalid date format: 1.4

In [43]:
5,5–5,8 mya

SyntaxError: invalid character '–' (U+2013) (1958696806.py, line 1)