In [1]:
import pandas as pd
from indralib.indra_time import IndraTime
import re
import polars
import pyarrow as pa

In [2]:
# https://en.wikipedia.org/wiki/List_of_timelines

In [3]:
def remove_footnotes(text, numeric_only=True, single_letter_alpha=True):
    """Strip bracketed footnote markers such as ``[12]`` or ``[a]`` from ``text``.

    Args:
        text: String to clean.
        numeric_only: When exactly ``False``, every ``[...]`` group is removed
            (non-greedy). Otherwise only all-digit markers are removed.
        single_letter_alpha: Only consulted when ``numeric_only`` is not
            ``False``. When exactly ``True``, markers holding one word
            character (letter, digit or underscore) are removed as well.

    Returns:
        The text with the selected markers deleted.
    """
    if numeric_only is False:
        return re.sub(r"\[.*?\]", "", text)

    cleaned = re.sub(r"\[\d+\]", "", text)
    if single_letter_alpha is True:
        cleaned = re.sub(r"\[\w\]", "", cleaned)
    return cleaned

In [4]:
def extract_date_remarks(date):
    """Separate qualifiers and uncertainty notes from a single date string.

    The following are peeled off ``date`` in order:

    1. Approximation markers ("c.", "circa", "before", "?", ...). Each marker
       found is deleted from the date; the last one found that leaves a
       non-empty date becomes the remark.
    2. An alternative introduced by " or " ("1500 or 1501" -> date "1500",
       remark "or 1501").
    3. A trailing "s" ("1990s" -> date "1990", remark "1990s").
    4. A "±" tolerance ("100 ± 5 BC" -> date "100 BC", remark "±5";
       "100±5" -> date "100", remark "±5").

    Later steps overwrite the remark set by earlier ones.

    Args:
        date: A single (non-range) date string.

    Returns:
        Tuple ``(date, remarks)`` with the stripped date and the remark text
        ("" when nothing was found).
    """
    remarks = ""
    # Order matters: longer markers must be removed before their substrings
    # (e.g. "ca." before "c.", "or later" before "later" before "late").
    approxies = ["ca.", "c.", "circa", "fl.", "born", "died in", "Died", "died", "buried", "early", "or later", "later", "late", "mid", "or before", "before", 
                 "after", "around", "approximately", "~", ">", "<", "≈", "≥", "≤", "(?)", "?"]
    for ap in approxies:
        if ap in date:
            date = date.replace(ap, "").strip()
            if date != "":
                remarks = f"{ap}"
    if ' or ' in date:
        idx = date.find(' or ')
        # Keep "or <alternative>" (without the leading space) as the remark.
        remarks = date[idx+1:]
        date = date[:idx]
    if date.endswith('s'):
        # Decade/century style such as "1990s": remember the original wording.
        remarks = date
        date = date[:-1]
    if "±" in date:
        dates = date.split("±")
        date = dates[0].strip()
        append = dates[1].strip().split(" ")
        if len(append) > 1:
            # "<value> ± <tolerance> <unit>": re-attach the unit to the value.
            date = date + " " + append[1]
            remarks = "±" + append[0]
        else:
            # Fixed: this branch used to assign to a misspelled local
            # (`remark`), so a unit-less tolerance was silently lost.
            remarks = "±" + dates[1].strip()
    date = date.replace("  ", " ")
    return date, remarks


In [5]:
def pre_clean(date):
    """Normalise the punctuation and era spelling of a raw date string.

    For string input: footnote markers are removed via ``remove_footnotes``,
    dash variants are rewritten with ASCII hyphens, " to " becomes " - ",
    commas are deleted, thin and non-breaking spaces become plain spaces,
    "AD" is dropped, "BCE" becomes "BC", " mya" becomes " ma bp", and the
    result is stripped. Non-string input is returned unchanged.

    Returns:
        The cleaned string, or the original object when it is not a ``str``.
    """
    if isinstance(date, str):
        date = remove_footnotes(date)
        # NOTE(review): several of the dash/space literals below look identical
        # when rendered; presumably they are distinct Unicode code points —
        # confirm before de-duplicating. The replacement order is significant
        # (e.g. commas are removed outright, double spaces are collapsed twice).
        date = date.replace("–", " - ").replace("–", "-").replace("—", "-").replace("–", " - ").replace(" ", " ").replace("\u2009", " ") \
                   .replace(" to ", " - ").replace(",", "").replace("\xa0", " ").replace("  ", " ") \
                   .replace("AD", "").replace("BCE", "BC").replace("  ", " ").replace(" mya", " ma bp").strip()
    return date

In [6]:
def decenturize(date):
    """Turn an ordinal century ("5th century BC") into an explicit year range.

    Examples: "1st century" -> "1 - 100", "20th century" -> "1900 - 2000",
    "1st century BC" -> "100 BC - 1 BC", "5th century BC" -> "500 BC - 400 BC".
    A "c. " prefix is ignored for both eras.

    Args:
        date: A single date string.

    Returns:
        ``(range, original_text)`` when ``date`` is a recognisable century;
        otherwise ``(date, "")`` with the input untouched.
    """
    remarks = ""
    org_date = date

    # Check the BC wording first: " century" is a substring of " century BC".
    if ' century BC' in date:
        bc = True
        marker = ' century BC'
    elif ' century' in date:
        bc = False
        marker = ' century'
    else:
        return org_date, remarks

    # The "c. " prefix used to be stripped only for AD centuries, so
    # "c. 5th century BC" was never converted; handle both eras alike.
    date = date.replace("c. ", "")

    # Drop the first ordinal suffix (in this fixed order) that occurs.
    for pf in ('st', 'nd', 'rd', 'th'):
        if pf in date:
            date = date.replace(pf, '')
            break
    else:
        return org_date, remarks

    century = date.replace(marker, '').strip()
    try:
        int_cent = int(century)
    except ValueError:
        # Leftover words ("early 5", ...) mean this is not a plain century.
        return org_date, remarks
    if int_cent < 1:
        # There is no 0th or negative century; do not fabricate a range.
        return org_date, remarks

    if bc is True:
        if int_cent == 1:
            date = "100 BC - 1 BC"
        else:
            date = f"{int_cent}00 BC - {int_cent-1}00 BC"
    else:
        if int_cent == 1:
            date = "1 - 100"
        else:
            date = f"{int_cent-1}00 - {int_cent}00"
    return date, org_date

In [7]:
def check_date_parts(parts):
    """Return True when every character of every part is an ASCII digit 0-9.

    Empty parts, and an empty ``parts`` sequence, count as valid.
    """
    return all('0' <= ch <= '9' for part in parts for ch in part)

In [8]:
def date_clean(date, default_scale = None):
    """Clean a raw date or date range and round-trip it through IndraTime.

    Args:
        date: Raw value. Strings are normalised with ``pre_clean``; any other
            value is formatted directly (via ``int`` when no scale is given).
        default_scale: Optional unit/era suffix appended to each bound. When
            given, qualifier extraction is skipped entirely.

    Returns:
        ``(date, remarks)``. ``date`` is the text produced by
        ``IndraTime.julian_to_string_time`` (a "start - end" pair for ranges),
        except that strings with three hyphen-separated fields, or two
        all-digit fields without " - ", are returned right after
        ``pre_clean``. ``remarks`` is the comma-joined list of qualifiers,
        original century wording and "Alt.:" alternatives.
    """
    notes = []

    if not isinstance(date, str):
        if default_scale is None:
            date = str(int(date))
        else:
            date = f"{date} {default_scale}"
    else:
        date = pre_clean(date)
        hyphen_fields = date.split("-")
        field_count = len(hyphen_fields)
        three_fields = field_count == 3
        two_numeric_fields = (
            field_count == 2 and " - " not in date and check_date_parts(hyphen_fields)
        )
        if three_fields or two_numeric_fields:
            # Presumably already an ISO-like date (e.g. 1990-12-08): pass through.
            return date, ""

        if default_scale is not None:
            if field_count == 1:
                date = hyphen_fields[0].strip()+f" {default_scale}"
            else:
                date = f"{hyphen_fields[0].strip()} {default_scale} - {hyphen_fields[1].strip()} {default_scale}"
        else:
            cleaned_bounds = []
            for bound in date.split(" - "):
                bound, note = extract_date_remarks(bound)
                if len(note) > 0:
                    notes.append(note)
                bound, note = decenturize(bound)
                if len(note) > 0:
                    notes.append(note)
                if "/" in bound:
                    # "1605/6": keep the first reading and record the
                    # alternative built by overwriting its trailing digits.
                    slash_fields = bound.split("/")
                    bound = slash_fields[0].strip()
                    stub = slash_fields[1].strip()
                    alternative = bound[:len(bound)-len(stub)] + stub
                    notes.append(f"Alt.: {alternative}")
                cleaned_bounds.append(bound)
            date = " - ".join(cleaned_bounds)
            # Re-split: decenturize may itself have produced a range.
            bounds = date.split(" - ")
            if len(bounds) == 2:
                start_tokens = bounds[0].split(" ")
                end_tokens = bounds[1].split(" ")
                if len(start_tokens) == 1 and len(end_tokens) == 2:
                    # "48 - 30 ka": the start bound inherits the end's suffix.
                    date = f"{start_tokens[0]} {end_tokens[1]} - {bounds[1]}"

    julian = IndraTime.string_time_to_julian(date)
    if len(julian) > 1 and julian[1] is not None:
        start_text = IndraTime.julian_to_string_time(julian[0])
        end_text = IndraTime.julian_to_string_time(julian[1])
        date = f"{start_text} - {end_text}"
    else:
        date = IndraTime.julian_to_string_time(julian[0])

    return date, ", ".join(notes)

In [9]:
def date_merge(year, rest):
    """Combine a year with a free-text day/month remainder into one date string.

    Args:
        year: Year, optionally followed by a suffix ("44 BC"). Passed through
            ``pre_clean``; non-string values are formatted with ``str``.
        rest: Remainder such as "March", "15 March" or "March 15". ``None``,
            other non-string values and blank strings mean "no day/month".

    Returns:
        "YEAR-MM-DD", "YEAR-MM" or just the year, with any suffix of the year
        appended after a space ("44-03-15 BC"). If the month is not an English
        month name, only the year is returned.
    """
    year = pre_clean(year)
    if isinstance(year, str):
        year = year.strip()
    # `rest` used to be stripped before its None check, so a missing value
    # raised AttributeError; treat anything that is not text as empty.
    rest = rest.strip() if isinstance(rest, str) else ""

    day = None
    month = None
    if rest:
        # split() without argument also tolerates repeated blanks.
        rparts = rest.split()
        if len(rparts) == 1:
            month = rparts[0].lower()
        else:
            try:
                # "15 March"
                day = int(rparts[0])
                month = rparts[1].lower()
            except ValueError:
                try:
                    # "March 15"
                    day = int(rparts[1])
                    month = rparts[0].lower()
                except ValueError:
                    day = None
                    month = None

    if month is None:
        return f"{year}"

    val_months = ["january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]
    if month not in val_months:
        return f"{year}"
    month_id = val_months.index(month) + 1

    # Separate a trailing era/unit ("BC") so it ends up after the full date.
    year_text, _, appendix = str(year).replace("\xa0", " ").partition(" ")
    date = f"{year_text}-{month_id:02d}"
    if day is not None:
        date += f"-{day:02d}"
    if appendix != "":
        date += f" {appendix}"
    return date

In [10]:
# Source: German Wikipedia list of hominin fossils. `tables` holds the parsed
# HTML tables and is indexed by position in the cells below.
url= "https://de.wikipedia.org/wiki/Liste_homininer_Fossilien"
tables = pd.read_html(url)

In [11]:
# Inspect the table at index 4 (the last one consumed by the conversion loop below).
tables[4]

Unnamed: 0,Abbildung,Name,Alter,Art,Jahr der Entdeckung,Fundort,Erstbeschreibung
0,,Le Moustier,45 ka,Homo neanderthalensis,1909,Frankreich,Otto Hauser
1,,Denisova-Mensch,48–30 ka,ungeklärt; (vermutlich Schwestergruppe der Ne...,2010,"Russland, Tibet",
2,,Neandertal 1,42 ka,Homo neanderthalensis (Holotypus),1856,"Neandertal, Deutschland",Johann Carl Fuhlrott
3,,Spy I,42 ka,Homo neanderthalensis,1886,"Grotte de Spy, Jemeppe-sur-Sambre, Belgien","Marcel de Puydt, Max Lohest"
4,,Spy II,42 ka,Homo neanderthalensis,1886,"Grotte de Spy, Jemeppe-sur-Sambre, Belgien","Marcel de Puydt, Max Lohest"
5,,Mungo Man,40 ka,Homo sapiens,1974,Australien,Jim Bowler
6,,Mungo Lady,40 ka,Homo sapiens,1969,Australien,Jim Bowler
7,Qafzeh VI,Qafzeh 6,,Homo sapiens,,Qafzeh-Höhle,
8,Abb. des Unterkiefer-Fragments,Tianyuan 1,40 ka,Homo sapiens,2003,VR China,Hong Shang et al.
9,,Hofmeyr-Schädel,36 ka,Homo sapiens,1952,Südafrika,


In [28]:
# Convert the first five scraped tables into one chronologically sorted
# Markdown table. Each `hist` entry is
# (julian_pair, date, remarks, name, species, discovery_date, location, describer).
hist = []
last_date = "<unknown>"
last_name = "<unknown>"
for table_index in range(5):
    for _, row in tables[table_index].iterrows():
        pre_rem = ""
        post_rem = ""
        na_rem = ""
        name = row.iloc[1]
        date_raw = row.iloc[2]
        if pd.isna(date_raw) or date_raw == "" or date_raw == "??":
            # Undated entry: reuse the date of the previous dated entry.
            # NOTE(review): if the very first row is undated, the placeholder
            # "<unknown>" is passed on to date_clean — confirm that is acceptable.
            na_rem = f"<NA>, date from previous table entry {last_name} used! "
            date_raw = last_date
        else:
            last_date = date_raw
            last_name = name
        # str() keeps numeric cells from crashing the regex-based cleaner.
        date_raw = remove_footnotes(str(date_raw))

        # Split free-text commentary off the age into pre/post remarks.
        ind = date_raw.find("ka,")
        if ind != -1:
            post_rem = date_raw[ind+3:]
            date_raw = date_raw[:ind] + "ka"
        if date_raw.startswith("ca. "):
            pre_rem = date_raw[:4]
            date_raw = date_raw[4:]
        dri = date_raw.split('(')
        if len(dri) > 1:
            date_raw = dri[0]
            post_rem = '('+'('.join(dri[1:])
        ind = date_raw.find('?')
        if ind != -1:
            post_rem = date_raw[ind:]
            date_raw = date_raw[:ind]

        # Decimal comma -> decimal point; hyphen -> en dash (range separator).
        date_raw = date_raw.replace(',', '.').replace('-', '–')
        date_inter = date_raw
        dps = date_raw.split(' ')
        if len(dps) > 1 and dps[1] == 'mya' and '–' in dps[0]:
            # "7–6 mya ..." -> put the unit on both bounds.
            dpsi = dps[0].split('–')
            date_inter = f"{dpsi[0]} mya - {dpsi[1]} " + ' '.join(dps[1:])

        date, remarks = date_clean(date_inter)
        remarks = pre_rem + remarks + post_rem + na_rem

        # Species, discovery year, location, first describer (columns 3-6).
        spec, disc_date, locat, disc = (
            "" if pd.isna(cell) else remove_footnotes(str(cell))
            for cell in (row.iloc[3], row.iloc[4], row.iloc[5], row.iloc[6])
        )

        jd = IndraTime.string_time_to_julian(date)
        # date_clean treats a missing/None second element as "single date";
        # mirror that here so the sort key never compares None with a number.
        if len(jd) == 1 or jd[1] is None:
            jd = (jd[0], jd[0])
        hist.append((jd, date, remarks, name, spec, disc_date, locat, disc))

entries = sorted(hist, key=lambda x: x[0])
print("| Date | Remarks | Name | Species | Discovery date | Location | First description |")
print( "| ---- | ------- | ---- | ------- | -------------- | -------- | ---------- |")
for _jd, date, remarks, name, spec, disc_date, locat, disc in entries:
    # Show a stand-alone "1 BC" as "0 BC". The former plain str.replace also
    # rewrote "11 BC", "2001 BC" etc.; the look-behind restricts the
    # substitution to a year that is exactly 1.
    date = re.sub(r"(?<![\d.\-])1 BC\b", "0 BC", date)
    print(f"| {date} | {remarks} | {name} | {spec} | {disc_date} | {locat} | {disc} |")

| Date | Remarks | Name | Species | Discovery date | Location | First description |
| ---- | ------- | ---- | ------- | -------------- | -------- | ---------- |
| 7000.0 kya BP - 6000.0 kya BP |  | TM 266-01-060-1  „Toumai“ | Sahelanthropus tchadensis  (Holotypus) | 2001 | Tschad | Michel Brunet |
| 6000.0 kya BP |  | BAR 1000a'00  BAR 1000b'00 | Orrorin tugenensis  (Holotypus) | 2000 | Kenia | Martin Pickford, Kiptalam Cheboi, Dominique Gommery, Pierre Mein, Brigitte Senut |
| 5500.0 kya BP - 5800.0 kya BP |  | ALA-VP 1/20 | Ardipithecus kadabba  (Holotypus) | 2001 | Äthiopien | Yohannes Haile-Selassie |
| 4400.0 kya BP |  | ARA-VP-6/500  „Ardi“ | Ardipithecus ramidus | 1994 | Äthiopien | Tim White et al. |
| 4200.0 kya BP - 3800.0 kya BP |  | KNM-KP 29281 | Australopithecus anamensis (Holotypus) | 1995 | Kanapoi (Kenia) | Meave Leakey et al. |
| 3800.0 kya BP - 3600.0 kya BP |  | LH 4 | Australopithecus afarensis (Holotypus) | 1974 | Laetoli (Tansania) | Mary Leakey |
| 3600.0 kya BP

In [31]:
# Second data source: the English Wikipedia "Flyby anomaly" article.
# This rebinds `url` and `tables`, replacing the hominin tables loaded earlier.
url= "https://en.wikipedia.org/wiki/Flyby_anomaly"
tables = pd.read_html(url)
# Convert the first parsed table to a polars DataFrame and show it.
ptab = polars.from_pandas(tables[0])
print(ptab)


shape: (11, 13)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ CraftData ┆ Galileo I ┆ Galileo   ┆ NEAR      ┆ … ┆ Juno      ┆ Hayabusa2 ┆ OSIRIS-RE ┆ BepiColo │
│ ---       ┆ ---       ┆ II        ┆ ---       ┆   ┆ ---       ┆ ---       ┆ x[8]      ┆ mbo[9]   │
│ str       ┆ str       ┆ ---       ┆ str       ┆   ┆ str       ┆ str       ┆ ---       ┆ ---      │
│           ┆           ┆ str       ┆           ┆   ┆           ┆           ┆ str       ┆ str      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ Date      ┆ 1990-12-0 ┆ 1992-12-0 ┆ 1998-01-2 ┆ … ┆ 2013-10-0 ┆ 2015-12-0 ┆ 2017-09-2 ┆ 2020-04- │
│           ┆ 8         ┆ 8         ┆ 3         ┆   ┆ 9         ┆ 3         ┆ 2         ┆ 10       │
│ Speed at  ┆ 8.949     ┆ 8.877     ┆ 6.851     ┆ … ┆ null      ┆ 4.7       ┆ null      ┆ null     │
│ infinity, ┆           ┆           ┆           ┆   ┆           ┆          

In [34]:

# Transpose the table; include_header=True keeps the original column names
# as the first column of the result (see the "column" column in the output).
ptabt = ptab.transpose(include_header=True)
print(ptabt)

shape: (13, 12)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ column    ┆ column_0  ┆ column_1  ┆ column_2  ┆ … ┆ column_7  ┆ column_8  ┆ column_9  ┆ column_1 │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ 0        │
│ str       ┆ str       ┆ str       ┆ str       ┆   ┆ str       ┆ str       ┆ str       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ str      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ CraftData ┆ Date      ┆ Speed at  ┆ Speed at  ┆ … ┆ Deflectio ┆ Speed     ┆ Speed     ┆ Gained   │
│           ┆           ┆ infinity, ┆ perigee,  ┆   ┆ n angle,  ┆ increment ┆ increment ┆ energy,  │
│           ┆           ┆ km/s      ┆ km/s      ┆   ┆ degrees   ┆ at        ┆ at        ┆ J/kg     │
│           ┆           ┆           ┆           ┆   ┆           ┆ infinity,

In [42]:
# Emit the transposed table as Markdown: one "| a | b |" line per row, with a
# dashed separator (sized to each header cell) right after the first row.
for i, row in enumerate(ptabt.iter_rows(named=True)):
    line = "|"
    for key in row:
        line += f" {row[key]} |"
    print(line)
    if i == 0:
        rule = "|"
        for key in row:
            rule += f" {'-'*len(row[key])} |"
        print(rule)

| CraftData | Date | Speed at infinity, km/s | Speed at perigee, km/s | Impact parameter, km | Minimal altitude, km | Spacecraft mass, kg | Trajectory inclination to equator, degrees | Deflection angle, degrees | Speed increment at infinity, mm/s | Speed increment at perigee, mm/s | Gained energy, J/kg |
| --------- | ---- | ----------------------- | ---------------------- | -------------------- | -------------------- | ------------------- | ------------------------------------------ | ------------------------- | --------------------------------- | -------------------------------- | ------------------- |
| Galileo I | 1990-12-08 | 8.949 | 13.738 | 11261 | 956 | 2497.1 | 142.9 | 47.46 | 3.92±0.08 | 2.560±0.050 | 35.1±0.7 |
| Galileo II | 1992-12-08 | 8.877 | 8.877 | None | 303 | 2223.0 | 138.9 | 51.1 | −4.60±1.00 | −9.200±0.600 | None |
| NEAR | 1998-01-23 | 6.851 | 12.739 | 12850 | 532 | 730.40 | 108.0 | 66.92 | 13.46±0.13 | 7.210±0.0700 | 92.2±0.9 |
| Cassini | 1999-08-18 | 16.01 | 19

In [102]:
# NOTE(review): the recorded output below is a transposed frame with an
# `index` column, which the read_html call above would not produce — it looks
# like `tables[0]` was reassigned in a cell that is no longer in this
# notebook. Confirm this survives Restart Kernel -> Run All.
tables[0]

CraftData,index,Date,"Speed at infinity, km/s","Speed at perigee, km/s","Impact parameter, km","Minimal altitude, km","Spacecraft mass, kg","Trajectory inclination to equator, degrees","Deflection angle, degrees","Speed increment at infinity, mm/s","Speed increment at perigee, mm/s","Gained energy, J/kg"
0,Galileo I,1990-12-08,8.949,13.738,11261.0,956,2497.1,142.9,47.46,3.92±0.08,2.560±0.050,35.1±0.7
1,Galileo II,1992-12-08,8.877,8.877,,303,2223.0,138.9,51.1,−4.60±1.00,−9.200±0.600,
2,NEAR,1998-01-23,6.851,12.739,12850.0,532,730.40,108.0,66.92,13.46±0.13,7.210±0.0700,92.2±0.9
3,Cassini,1999-08-18,16.01,19.03,8973.0,1172,4612.1,25.4,19.66,−2±1,−1.700±0.9000,
4,Rosetta-I,2005-03-04,3.863,10.517,22680.49,1954,2895.2,144.9,99.396,1.82±0.05,0.670±0.0200,7.03±0.19
5,MESSENGER,2005-08-02,4.056,10.389,22319.0,2336,1085.6,133.1,94.7,0.02±0.01,0.008±0.004,
6,Rosetta-II,2007-11-13,,12.49,,5322,2895,,,~0,~0.000±0.000,
7,Rosetta-III,2009-11-13,,13.34,,2483,2895,,,~0,−0.004±0.044,
8,Juno,2013-10-09,,14.93,,561[10],~2720,,,0±0.8[5],,
9,Hayabusa2,2015-12-03,4.7,10.3,,3090[11],590,,80.0,?,?,?
