In [1]:
import pandas as pd
from indralib.indra_time import IndraTime
import re

In [90]:
def remove_footnotes(text, numeric_only=True, single_letter_alpha=True):
    if numeric_only is False:
        text = re.sub(r"\[.*?\]", "", text)
    else:
        text = re.sub(r"\[\d+\]", "", text)
        if single_letter_alpha is True:
            text = re.sub(r"\[\w\]", "", text)
    return text

In [None]:
def extract_date_remarks(date):
    remarks = ""
    approxies = ["ca.", "c.", "circa", "around", "approximately", "~", ">", "<", "≈", "≥", "≤", "?"]
    for ap in approxies:
        if ap in date:
            date = date.replace(ap, "").strip()
            if date != "":
                remarks = f"{ap}"
    if "±" in date:
        dates = date.split("±")
        date = dates[0].strip()
        append = dates[1].strip().split(" ")
        if len(append) > 1:
            date = date + " " + append[1]
            remarks = "±" + append[0]
        else:
            remark = "±" + dates[1].strip()
    return date, remarks


In [98]:
def date_clean(date, default_scale = None):
    remarks = ""
    if isinstance(date, str):
        date = date.remove_footnotes(date)
        date = date.replace("–", " - ").replace(",", "").replace("\xa0", " ").replace("  ", " ").replace("BCE", "BC").strip()
        date_sub = date.split("-")
        if len(date_sub)==3 or (len(date_sub)==2 and " - " not in date):
            return date, remarks
        if default_scale is not None:
            date = date.split("-")
            if len(date) == 1:
                date = date[0].strip()+f" {default_scale}"
            else:
                date = f"{date[0].strip()} {default_scale} - {date[1].strip()} {default_scale}"
        else:
            dates = date.split(" - ")
            new_dates = []
            for di in dates:
                dj, rem = extract_date_remarks(di)
                new_dates.append(dj)
                if len(remarks) == 0:
                    remarks = rem
                else:
                    remarks += f", {rem}"
            date = " - ".join(new_dates)
            dates = date.split(" - ")
            if len(dates) == 2:
                print(f"split dates: {dates}")
                sub_dates1 = dates[1].split(" ")
                sub_dates2 = dates[0].split(" ")
                if len(sub_dates2)>1 and len(sub_dates1)==1:
                    date = f"{sub_dates1[0]} {sub_dates2[1]} - {dates[1]}"
    else:
        if default_scale is not None:
            date = f"{date} {default_scale}"
        else:
            date = str(date)
    if "/" in date:
        date_parts = date.split("/")
        date = date_parts[0].strip()
        alt_date_stub = date_parts[1].strip()
        alt_date = date[:len(date)-len(alt_date_stub)] + alt_date_stub
        remarks = f"Alt.: {alt_date}"       
    jd_dates = IndraTime.string_time_2_julian(date)
    if len(jd_dates) > 1 and jd_dates[1] is not None:
        date_start = IndraTime.julian_2_string_time(jd_dates[0])
        date_end = IndraTime.julian_2_string_time(jd_dates[1])
        date = f"{date_start} - {date_end}"
    else:
        date = IndraTime.julian_2_string_time(jd_dates[0])

    return date, remarks

SyntaxError: 'tuple' is an illegal expression for augmented assignment (2684515319.py, line 19)

In [89]:
def date_merge(year, rest):
    day = None
    month = None
    rest = rest.strip()
    year = year.strip()
    if rest is not None and len(rest) > 0:
        rparts = rest.split(" ")
        if len(rparts) == 1:
            month = rparts[0].strip().lower()
        else:
            try:
                month = rparts[1].strip().lower()
                day = int(rparts[0].strip())
            except ValueError:
                month = None
                day = None
            if month is None and day is None:
                try:
                    month = rparts[0].strip().lower()
                    day = int(rparts[1].strip())
                except ValueError:
                    month = None
                    day = None
    else:
        month = None
        day = None
    if month is not None:
        val_months = ["january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]
        try:
            month_id = val_months.index(month) + 1
        except ValueError:
            month_id = None
            return f"{year}"
        year_parts = str(year).replace("\xa0"," ").split(" ", 1)
        if len(year_parts) > 1:
            year = year_parts[0]
            appendix = year_parts[1]
        else:
            appendix = ""
        if day is not None:
            date = f"{year}-{month_id:02d}-{day:02d}"
            if appendix != "":
                date = date + f" {appendix}"
        else:
            date = f"{year}-{month_id:02d}"
            if appendix != "":
                date = date + f" {appendix}"
        return date
    else:
        return f"{year}"

In [91]:
url = "https://en.wikipedia.org/wiki/Timeline_of_Iranian_history"
tables = pd.read_html(url)

In [92]:
tables[0]

Unnamed: 0,Year,Date,Event
0,3200 BC,,Elam civilization in the far west and southwes...


In [93]:
tables[32]

Unnamed: 0,Year,Date,Event
0,2001,8 June,"Iranian presidential election, 2001: President..."
1,2003,December,"40,000 people are killed in an earthquake in s..."
2,2005,24 June,"Iranian presidential election, 2005: Ahmadinej..."
3,2009,12 June,"Iranian presidential election, 2009: Ahmadinej..."
4,2009,13 June,2009–10 Iranian election protests: Protests in...
5,2013,3 August,Hassan Rouhani replaces Ahmadinejad as President.
6,2014,,"My Stealthy Freedom, an online movement in whi..."
7,2015,14 July,Signing of Iran nuclear deal between Iran and ...
8,2018,8 May,United States withdraws from the Iran nuclear ...
9,2017–19,,Iranian woman protest against compulsory hijab...


In [94]:
print("| Date | Event in Persian (Iranian) history |")
print("| ---- | ---- |")
for index in range(0,33):
    for i, row in enumerate(tables[index].iterrows()):
        year = str(row[1].iloc[0])
        if pd.isna(year) or year == "nan":
            continue
        year = remove_footnotes(year)
        rest = row[1].iloc[1]
        if pd.isna(rest):
            rest = ""
        rest = remove_footnotes(rest)
        date = date_merge(year, rest)
        date, remarks = date_clean(date)
        event = row[1].iloc[2]
        event = remove_footnotes(event)
        if pd.isna(event):
            continue
        if remarks != "":
            event = f"Date: {remarks}, {event}"
        print(f"| {date} | {event} |")

| Date | Event in Persian (Iranian) history |
| ---- | ---- |
| 3200 BC | Elam civilization in the far west and southwest of modern-day Iran and modern-day southeast Iraq. |
| 1250 BC | Untash-Napirisha, king of Elam, builds the Chogha Zanbil ziggurat complex in present-day Khuzestan Province. |
| 1210 BC | Elamite Empire reaches the height of its power. |
| 770 BC | The Persians start driving the Elamites of Anshan towards Susa. |
| 727 BC | Deioces founds the Median government. |
| 705 BC | Birth of Achaemenes (died c. 675 BC), the eponymous ancestor of the Achaemenid dynasty. |
| 647 BC | Assyrian Empire defeats Elam Empire in the Battle of Susa, resulting in looting and total destruction of Susa. |
| 633 BC | The Scythians invade Media. |
| 624 BC | The Medians repel the Scythians. |
| 624 BC | Cyaxares the Great becomes the king of the Medes. |
| 612 BC | Together with the Babylonians, Cyaxares the Great captures the Assyrian capital Nineveh, which leads to the eventual collapse o

In [95]:
url = "https://en.wikipedia.org/wiki/Egyptian_pyramids"
tables = pd.read_html(url)

In [96]:
tables[12]

Unnamed: 0,Pyramid (Pharaoh),Reign,Field,Height
0,Pyramid of Djoser (Djoser),c. 2670 BCE,Saqqara,62 meters (203 feet)
1,Red Pyramid (Sneferu),c. 2612–2589 BCE,Dahshur,104 meters (341 feet)
2,Meidum Pyramid (Sneferu),c. 2612–2589 BCE,Meidum,65 meters (213 feet) (ruined) Would have been ...
3,Great Pyramid of Giza (Khufu),c. 2589–2566 BCE,Giza,146.7 meters (481 feet) or 280 Egyptian Royal ...
4,Pyramid of Djedefre (Djedefre),c. 2566–2558 BCE,Abu Rawash,60 meters (197 feet)
5,Pyramid of Khafre (Khafre),c. 2558–2532 BCE,Giza,136.4 meters (448 feet) Originally: 143.5 m (4...
6,Pyramid of Menkaure (Menkaure),c. 2532–2504 BCE,Giza,65 meters (213 feet) or 125 Egyptian Royal cubits
7,Pyramid of Userkaf (Userkaf),c. 2494–2487 BCE,Saqqara,48 meters (161 feet)
8,Pyramid of Sahure (Sahure),c. 2487–2477 BCE,Abusir,47 meters (155 feet)
9,Pyramid of Neferirkare (Neferirkare Kakai),c. 2477–2467 BCE,Abusir,72.8 meters (239 feet)


In [97]:
for i, row in enumerate(tables[12].iterrows()):
    date = row[1].iloc[1]
    if pd.isna(date) or date == "nan":
        continue
    date = remove_footnotes(str(date))
    print(date)
    date, remarks = date_clean(date)
    pyramid = row[1].iloc[0]
    pyramid = remove_footnotes(pyramid)
    pyr_phar = pyramid.split("(")
    pyramid = pyr_phar[0].strip()
    if len(pyr_phar) > 1:
        pharao = pyr_phar[1].split(")")[0]
    else:
        pharao = ""
    if pd.isna(pyramid):
        continue
    field = row[1].iloc[2]
    field = remove_footnotes(height)
    height = row[1].iloc[3]
    height = remove_footnotes(height)
    if remarks != "":
        pyramid = f"Date: {remarks}, {pyramid}"
    print(f"| {date} | {pyramid} | {pharao} | {field} | {height} |")

c. 2670 BCE
| 2670 BC | Date: c., Pyramid of Djoser | Djoser | 40 meters (132 feet) or 50 meters (164 feet) | 62 meters (203 feet) |
c. 2612–2589 BCE
split dates: ['c. 2612', '2589 BC']
| 2612-01-01 - 2589 BC | Date: c., Red Pyramid | Sneferu | 62 meters (203 feet) | 104 meters (341 feet) |
c. 2612–2589 BCE
split dates: ['c. 2612', '2589 BC']
| 2612-01-01 - 2589 BC | Date: c., Meidum Pyramid | Sneferu | 104 meters (341 feet) | 65 meters (213 feet) (ruined) Would have been 91.65 meters (301 feet)[citation needed] or 175 Egyptian Royal cubits. |
c. 2589–2566 BCE
split dates: ['c. 2589', '2566 BC']
| 2589-01-01 - 2566 BC | Date: c., Great Pyramid of Giza | Khufu | 65 meters (213 feet) (ruined) Would have been 91.65 meters (301 feet)[citation needed] or 175 Egyptian Royal cubits. | 146.7 meters (481 feet) or 280 Egyptian Royal cubits |
c. 2566–2558 BCE
split dates: ['c. 2566', '2558 BC']
| 2566-01-01 - 2558 BC | Date: c., Pyramid of Djedefre | Djedefre | 146.7 meters (481 feet) or 280 Egyp