# Princeton Geniza Project


In [1]:
import pandas as pd


pgp_documents_csv = "https://github.com/princetongenizalab/pgp-metadata/raw/main/data/documents.csv"

documents = pd.read_csv(pgp_documents_csv)

In [5]:
# limit to documents with dates
docs_with_dates = documents[documents.doc_date_standard.notna() | documents.inferred_date_standard.notna()]
docs_with_docdate = documents[documents.doc_date_standard.notna()].copy()
docs_with_inferreddate = documents[documents.inferred_date_standard.notna()]

print(f"""
Total documents:      {len(documents):,}
Documents with dates: {len(docs_with_dates):,}
    date on document: {len(docs_with_docdate):,}
     inferred dating:  {len(docs_with_inferreddate):,}""")


Total documents:      35,091
Documents with dates: 4,380
    date on document: 4,064
     inferred dating:  321


In [6]:
docs_with_docdate[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']].head(10)

Unnamed: 0,pgpid,doc_date_original,doc_date_calendar,doc_date_standard
5,449,1570,Seleucid,1259
16,463,19 Adar 1427,Seleucid,1116-03-05
23,472,1337,Seleucid,1025-08-28/1026-09-14
36,491,,,1131
41,499,"Wednesday, 15 Kislev 1500",Seleucid,1188-12-07
43,502,Tevet 1548,Seleucid,1236-11-30/1236-12-28
47,506,Elul 1428,Seleucid,1117-08-01/1117-08-29
55,516,First decade of Ḥeshvan 1442,Seleucid,1130
61,524,"Thursday, 12 Sivan 4795",Anno Mundi,1035-05-22
62,525,Shawwāl 425,Hijrī,1034-08-29/1034-09-07


In [7]:
from lark.visitors import VisitError

# first, how far can we get with the standard dates? can we parse as edtf and sort, render?
from undate import Undate 

def parse_standard_date(value):
    try:
        return Undate.parse(value, "EDTF")
    except VisitError as err:
        print(f"Parse error on {value}: {err}")
                        

# ignore gregorian/julian thing for now
# from pgp code:
# Julian Thursday, 4 October 1582, being followed by Gregorian Friday, 15 October
# cut off between gregorian/julian dates, in julian days
#gregorian_start_jd = convertdate.julianday.from_julian(1582, 10, 5)

docs_with_docdate['undate'] = docs_with_docdate.doc_date_standard.apply(parse_standard_date)

Parse error on 1217-02-20/1217-02-29: Error trying to process rule "date":

Day out of range in datetime string "1217-02-29"
Parse error on 1747-02-29: Error trying to process rule "date":

Day out of range in datetime string "1747-02-29"


In [8]:
# what are the records with standardized dates that couldn't be parsed?

# this is probably a data error in the original

docs_with_docdate[docs_with_docdate.undate.isna()][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'last_modified']]

Unnamed: 0,pgpid,doc_date_original,doc_date_calendar,doc_date_standard,last_modified
3190,3957,middle decade of Adar 1528,Seleucid,1217-02-20/1217-02-29,2023-02-09 07:22:14.481118+00:00
34445,40006,,,1747-02-29,2024-08-07 18:24:19.425288+00:00


In [9]:
docs_with_docdate.doc_date_calendar.value_counts()

doc_date_calendar
Seleucid      1581
Anno Mundi    1128
Hijrī          859
Kharājī          8
Name: count, dtype: int64

In [10]:
# example hebrew dates
docs_with_docdate[docs_with_docdate.doc_date_calendar == "Anno Mundi"][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]

Unnamed: 0,pgpid,doc_date_original,doc_date_calendar,doc_date_standard
61,524,"Thursday, 12 Sivan 4795",Anno Mundi,1035-05-22
90,561,10 Nisan 4716,Anno Mundi,0956-03-24
111,582,"Thursday, 6 Adar 4996",Anno Mundi,1236-02-14
119,591,"Sunday, 29 Tammuz 4898",Anno Mundi,1138-07-10
131,603,4805/4806,Anno Mundi,1044-08-27/1045-09-13
...,...,...,...,...
34831,40401,5408,Anno Mundi,1647-09-30/1648-09-16
34994,40566,5594,Anno Mundi,1833-09-14/1834-10-03
35052,40624,21 Nisan 5376,Anno Mundi,1616-04-08
35063,40635,5555,Anno Mundi,1794-09-25/1795-09-13


In [11]:
# how many end with AM ?
hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == "Anno Mundi"][docs_with_docdate.doc_date_original.notna()]
hebrew_dates[hebrew_dates.doc_date_original.str.endswith("AM")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]

  hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == "Anno Mundi"][docs_with_docdate.doc_date_original.notna()]


Unnamed: 0,pgpid,doc_date_original,doc_date_calendar,doc_date_standard
702,1223,"Wednesday, 9 Tammuz 4912 AM",Anno Mundi,1152-06-13
16702,19975,"Sunday, 10 Kislev 5583 AM",Anno Mundi,1822-11-24
25421,30550,Tammuz 5537 AM,Anno Mundi,1777-07-06/1777-08-03


In [12]:
# how many include periods
docs_with_docdate[docs_with_docdate.doc_date_original.notna() & docs_with_docdate.doc_date_original.str.contains("\\.")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]

Unnamed: 0,pgpid,doc_date_original,doc_date_calendar,doc_date_standard
1556,2163,first third of Tammuz 500[.],Anno Mundi,1244/1249
1567,2175,End of Sivan 152[.],Seleucid,1209/1218
1753,2460,13[..],Seleucid,988/1088
2018,2745,1[.] Kislev 48[..],Anno Mundi,1039-11-30/1138-11-24
3044,3805,13[..],Seleucid,988/1087
...,...,...,...,...
30595,35955,12 Muḥarram 52[.],Hijrī,1126/1134
31232,36738,54[.],Hijrī,1145/1154
32554,38077,14[...],Seleucid,1088-09-19/1188-09-23
34660,40226,49[.],Hijrī,1096-12-19/1106-09-01


In [13]:
# how many use ordinals instead of numerals?
hebrew_dates[hebrew_dates.doc_date_original.str.contains("st") | hebrew_dates.doc_date_original.str.contains("rd") | hebrew_dates.doc_date_original.str.contains("th")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]

Unnamed: 0,pgpid,doc_date_original,doc_date_calendar,doc_date_standard
635,1154,Last decade of Kislev 5004,Anno Mundi,1243-12
1172,1750,11th Tammuz 4767,Anno Mundi,1007
1173,1751,"Monday, 27th Ṭevet 4797",Anno Mundi,1037-01-23
1556,2163,first third of Tammuz 500[.],Anno Mundi,1244/1249
5142,6795,last decade of Tishrei 4991,Anno Mundi,1230-09-29/1230-10-08
5223,6892,last decade of Iyyar 4906,Anno Mundi,1146-05-04/1146-05-13
5664,7409,last third of Ḥeshvan 4965,Anno Mundi,1204-10-17/1204-10-25
5812,7581,middle third of Adar 4876,Anno Mundi,1116-05
7024,9068,Last decade of Ṭevet 4898,Anno Mundi,1138-01
8639,11215,Middle third of Av 4889,Anno Mundi,1129-07-29/1129-08-07


In [14]:
import re

# test removing ordinals without removing the numbers
for val in ['11th Tammuz 4767', "27th Tevet", "8th Kislev"]:
    cleand_val = re.sub(r'(\d+)(st|nd|rd|th)', "\\1", val)
    print(f"{val}: { cleand_val}")

11th Tammuz 4767: 11 Tammuz 4767
27th Tevet: 27 Tevet
8th Kislev: 8 Kislev


In [16]:
# parse hijri, anno mundi, and seleucid dates as undates

import re
from lark.exceptions import UnexpectedEOF

def parse_original_date(row):
    # print(f"PGPID {row.pgpid} {row.doc_date_original} ({row.doc_date_calendar})")
    undate_calendar = None
    if row.doc_date_calendar == "Anno Mundi":
        undate_calendar = "Hebrew"
    elif row.doc_date_calendar == "Hijrī":
        undate_calendar = "Islamic"
    elif row.doc_date_calendar == "Seleucid":
        # handle seleucid as hebrew with offset (adapt from pgp code)
        undate_calendar = "Seleucid"

    
    if undate_calendar:
        value = row.doc_date_original

        # some dates have unknown digits, e.g. 1[.] Kislev 48[..] or 152[.]
        # ... parser doesn't support this, but undate DOES
        if '[.' in value:
            print(f"ignoring missing digits for now {value}")
            value = value.replace("[.]", "0").replace("[..]", "00").replace("[...]", "000")         
        
        # some dates have inferred numbers, e.g. Friday, [25] Nisan [4810] or 8 Elul (4)811'
        # for now, just strip out brackets before parsing 
        # (in future, could potentially infer uncertainty based on these)
        value = value.replace('[', '').replace(']', '').replace('(', '').replace(')', '')

        # also remove unsupported modifiers:
        #   Late Tevet 4903, Last decade of Kislev 5004, first third of ...
        #   some dates include of, e.g. day of month
        modifiers = ["Late ", "(first|middle|last)( third|half|decade|tenth)? (of )?", "(Beginning|end) of ", "last day", "First 10 days", " of", "spring", "decade ", "night, "]
        for mod in modifiers:
            value = re.sub(mod, "", value, flags=re.I)

        # there are a handful of misspelled wednesdays...
        value = value.replace("Wedensday", "Wednesday")
        # and a Thrusday
        value = value.replace("Thrusday", "Thursday")

        # three Hebrew calendar dates include text "AM" at end; at least one AH date
        if value.endswith(" AM") or value.endswith(" AH"):
            value = value[:-3]
        if value.endswith("."):  # strip off trailing period
            value = value[:-1]
        # 

        # about 62 have ordinals; strip them out
        value = re.sub(r'(\d+)(st|nd|rd|th)', "\\1", value)
        
        try:
            return Undate.parse(value, undate_calendar)
        except (VisitError, ValueError, UnexpectedEOF) as err:
            print(f"Parse error on PGPID {row.pgpid} {value} ({undate_calendar}): {err}")

            # there are a handful of cases in PGP where calendars are mixed,
            # i.e. hebrew months used for hijri calendar

            # some dates are entered in ISO format for another calendar; can we parse and set calendar?
            if "-" in value and "/" not in value:  # exclude intervals for now
                try:
                    parsed = Undate.parse(value, "ISO8601")
                    if parsed:
                        parsed = parsed.as_calendar(undate_calendar)
                        print(f"parsed {value} with ISO8601 format and calendar {undate_calendar}, result is {parsed} ({parsed.earliest}/{parsed.latest})")
                        return parsed
                except ValueError as err:
                    print(f"Could not parse {value} as ISO date: {err}")

docs_with_docdate['undate_orig'] = docs_with_docdate.apply(parse_original_date, axis=1)

Parse error on PGPID 603 4805/4806 (Hebrew): Could not parse '4805/4806' as a Hebrew date
Parse error on PGPID 613 Tishrei–Ṭevet 1495 (Seleucid): Could not parse 'Tishrei–Ṭevet 1495' as a Hebrew date
Parse error on PGPID 658 Ḥannuka 1548 (Seleucid): Could not parse 'Ḥannuka 1548' as a Hebrew date
Parse error on PGPID 860 Marheshvan 1460 (Seleucid): Could not parse 'Marheshvan 1460' as a Hebrew date
Parse error on PGPID 997 Second third Av 1414 (Seleucid): Could not parse 'Second third Av 1414' as a Hebrew date
Parse error on PGPID 1098 Early Elul 1476 (Seleucid): Could not parse 'Early Elul 1476' as a Hebrew date
Parse error on PGPID 1111 Second third Tammuz 1529 (Seleucid): Could not parse 'Second third Tammuz 1529' as a Hebrew date
Parse error on PGPID 1139 Passover 1537 (Seleucid): Could not parse 'Passover 1537' as a Hebrew date
Parse error on PGPID 1140 Sivan, 1564 (Seleucid): Could not parse 'Sivan, 1564' as a Hebrew date
Parse error on PGPID 1339 426–30 (Islamic): Could not pars

In [19]:
# how many hebrew/islamic dates were parsed / not parsed?

orig_dates_parsed = docs_with_docdate[docs_with_docdate.undate_orig.notna()].copy()

orig_dates_unparsed = docs_with_docdate[docs_with_docdate.doc_date_original.notna() & docs_with_docdate.doc_date_calendar.isin(['Anno Mundi', 'Hijrī', 'Seleucid']) & docs_with_docdate.undate_orig.isna()] 

total_parsed = len(orig_dates_parsed)
total_unparsed = len(orig_dates_unparsed)
print(f"""original dates parsed: {total_parsed}
original dates unparsed: {total_unparsed} (anno mundi, hijri, and seleucid calendars)
proportion parsed: {(total_parsed/(total_parsed + total_unparsed))*100:0.2f}%""")

original dates parsed: 3401
original dates unparsed: 167 (anno mundi, hijri, and seleucid calendars)
proportion parsed: 95.32%


In [20]:
# what is the date granularity of the dates we were able to parse?

orig_dates_parsed['orig_date_precision'] = orig_dates_parsed.undate_orig.apply(lambda x: str(x.precision).lower())
orig_dates_parsed[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate', 'undate_orig', 'orig_date_precision']].head()

Unnamed: 0,pgpid,doc_date_original,doc_date_calendar,doc_date_standard,undate,undate_orig,orig_date_precision
5,449,1570,Seleucid,1259,1259,1570,year
16,463,19 Adar 1427,Seleucid,1116-03-05,1116-03-05,1427-12-19,day
23,472,1337,Seleucid,1025-08-28/1026-09-14,1025-08-28/1026-09-14,1337,year
41,499,"Wednesday, 15 Kislev 1500",Seleucid,1188-12-07,1188-12-07,1500-09-15,day
43,502,Tevet 1548,Seleucid,1236-11-30/1236-12-28,1236-11-30/1236-12-28,1548-10,month


In [21]:
# this is skewed because of the kinds of dates we're not able to parse or modifiers we're omitting entirely
orig_dates_parsed.orig_date_precision.value_counts()

orig_date_precision
day      1566
month    1013
year      822
Name: count, dtype: int64

In [22]:
# check on the seleucid date parsing

orig_dates_parsed[orig_dates_parsed.doc_date_calendar == 'Seleucid'][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate', 'undate_orig', 'orig_date_precision']].head()

Unnamed: 0,pgpid,doc_date_original,doc_date_calendar,doc_date_standard,undate,undate_orig,orig_date_precision
5,449,1570,Seleucid,1259,1259,1570,year
16,463,19 Adar 1427,Seleucid,1116-03-05,1116-03-05,1427-12-19,day
23,472,1337,Seleucid,1025-08-28/1026-09-14,1025-08-28/1026-09-14,1337,year
41,499,"Wednesday, 15 Kislev 1500",Seleucid,1188-12-07,1188-12-07,1500-09-15,day
43,502,Tevet 1548,Seleucid,1236-11-30/1236-12-28,1236-11-30/1236-12-28,1548-10,month


In [17]:
for row in orig_dates_parsed[orig_dates_parsed.doc_date_calendar == 'Seleucid'][:10].itertuples():
    print(f" original: {row.doc_date_original} undate:{row.undate_orig} pgp standard {row.doc_date_standard} earliest:{row.undate_orig.earliest} latest:{row.undate_orig.latest}")

 original: 1570 undate:1570 pgp standard 1259 earliest:1258-09-07 latest:1259-09-26
 original: 19 Adar 1427 undate:1427-12-19 pgp standard 1116-03-05 earliest:1116-03-12 latest:1116-03-12
 original: 1337 undate:1337 pgp standard 1025-08-28/1026-09-14 earliest:1025-09-03 latest:1026-09-20
 original: Wednesday, 15 Kislev 1500 undate:1500-09-15 pgp standard 1188-12-07 earliest:1188-12-14 latest:1188-12-14
 original: Tevet 1548 undate:1548-10 pgp standard 1236-11-30/1236-12-28 earliest:1236-12-07 latest:1237-01-04
 original: Elul 1428 undate:1428-06 pgp standard 1117-08-01/1117-08-29 earliest:1117-08-08 latest:1117-09-05
 original: First decade of Ḥeshvan 1442 undate:1442-08 pgp standard 1130 earliest:1130-10-13 latest:1130-11-10
 original: Ḥeshvan 1453 undate:1453-08 pgp standard 1141 earliest:1141-10-11 latest:1141-11-08
 original: Sunday, 21 Kislev 1355 undate:1355-09-21 pgp standard 1043-11-26 earliest:1043-12-02 latest:1043-12-02
 original: Monday, 16 Tammuz 1540 undate:1540-04-16 pgp

In [24]:
# check calendar agreement, how many were wrong?

calendar_mapping = {
    "hebrew": "Anno Mundi",
    "islamic": "Hijrī",
    "seleucid": "Seleucid"
}

orig_dates_parsed['undate_calendar'] = orig_dates_parsed.undate_orig.apply(lambda x: calendar_mapping.get(x.calendar, x.calendar))

In [28]:
# which records appear to have mismatched original calendars? (i.e. can be parsed by the opposite parser)
# only 4!  
# PGPIDs 3637, 5902, 6058, 9198

# however, looking at the PGP records indicates sometimes the authors mixed hebrew and arabic months
# from description of PGPID 3637: [It is unusual but not unheard of to combine Hebrew months with the Hijrī calendar.] 

orig_dates_parsed[['undate', 'undate_calendar', 'doc_date_calendar']].tail()

Unnamed: 0,undate,undate_calendar,doc_date_calendar
35063,1794-09-25/1795-09-13,Anno Mundi,Anno Mundi
35070,1755-09-06/1756-09-24,Anno Mundi,Anno Mundi
35071,1519-10-09,Hijrī,Hijrī
35072,1563-04-05,Hijrī,Hijrī
35073,1563-04-25,Hijrī,Hijrī


In [21]:
# can we sort by parsed original dates? 
# doesn't work currently because of overlapping dates / different granularity
#orig_dates_parsed.sort_values(by='undate_orig') #, key=lambda col: col.value.earliest)

In [29]:
# set earliest/latest for graphing

# IMPORTANT: we have to cast type to something pandas/altair supports

orig_dates_parsed['orig_date_earliest'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest).astype('datetime64[s]')
orig_dates_parsed['orig_date_latest'] = orig_dates_parsed.undate_orig.apply(lambda x: x.latest).astype('datetime64[s]')
orig_dates_parsed['orig_date_mid'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest + (x.latest - x.earliest)/2).astype('datetime64[s]')

In [20]:
orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'orig_date_mid', 'pgpid', 'doc_date_calendar']].head(10)

Unnamed: 0,orig_date_earliest,orig_date_latest,orig_date_mid,pgpid,doc_date_calendar
5,1258-09-07,1259-09-26,1259-03-18,449,Seleucid
16,1116-03-12,1116-03-12,1116-03-12,463,Seleucid
23,1025-09-03,1026-09-20,1026-03-13,472,Seleucid
41,1188-12-14,1188-12-14,1188-12-14,499,Seleucid
43,1236-12-07,1237-01-04,1236-12-21,502,Seleucid
47,1117-08-08,1117-09-05,1117-08-22,506,Seleucid
55,1130-10-13,1130-11-10,1130-10-27,516,Seleucid
61,1035-05-28,1035-05-28,1035-05-28,524,Anno Mundi
62,1034-08-25,1034-09-22,1034-09-08,525,Hijrī
73,1141-10-11,1141-11-08,1141-10-25,537,Seleucid


In [30]:
orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'orig_date_mid']].dtypes

orig_date_earliest    datetime64[s]
orig_date_latest      datetime64[s]
orig_date_mid         datetime64[s]
dtype: object

In [31]:

# are these data errors?

orig_dates_parsed[orig_dates_parsed.orig_date_earliest > Undate(2100).earliest][['pgpid', 'orig_date_earliest', 'orig_date_latest', 'orig_date_mid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'orig_date_precision']]

Unnamed: 0,pgpid,orig_date_earliest,orig_date_latest,orig_date_mid,doc_date_original,doc_date_calendar,doc_date_standard,orig_date_precision


In [32]:
import altair as alt

# exclude dates after 2100
graphable_data = orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'pgpid', 'doc_date_calendar']][orig_dates_parsed.orig_date_earliest < Undate(2100).earliest]

bar_chart = alt.Chart(graphable_data).mark_bar(opacity=0.5).encode(
    x=alt.X('orig_date_earliest:T', title="original date (range)"), # , axis=alt.Axis(format="r")),
    x2='orig_date_latest:T',
    y=alt.Y('count(pgpid)', title='Count of Documents')
).properties(width=1200, height=200)

earliest_chart = bar_chart.mark_point(opacity=0.2, color="green", interpolate="monotone").encode(
 x=alt.X('orig_date_earliest:T', title="Date (earliest)"), # axis=alt.Axis(format="r")),
 y=alt.Y('count(pgpid)', title='Count of Documents')
).properties(width=1200, height=200)

latest_chart = bar_chart.mark_point(opacity=0.2, color="blue", interpolate="monotone").encode(
 x=alt.X('orig_date_latest:T', title="Date (latest)"), # axis=alt.Axis(format="r")),
 y=alt.Y('count(pgpid)', title='Count of Documents')
).properties(width=1200, height=200)

# (bar_chart & line_chart).properties(title="Documents by date (1000-1300)")
(bar_chart & (latest_chart + earliest_chart)).interactive()

In [33]:

# plot points for all the documents with date, using jitter to scatter them vertically

jitter_plot = alt.Chart(graphable_data).mark_circle(size=8, opacity=0.5).encode(
    x="orig_date_earliest:T",   # maybe could eventually use jitter to plot between earliest/latest
    y=alt.Y("jitter:Q", title="").axis(None),
    color=alt.Color('doc_date_calendar:N', title="Calendar") #.legend(None)
).transform_calculate(
    # Generate Gaussian jitter with a Box-Muller transform
    jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
).properties(width=1200, height=200)


jitter_plot 


In [225]:
(bar_chart & jitter_plot).interactive()

## compare weekdays

In [34]:
weekday_dates = orig_dates_parsed[orig_dates_parsed.doc_date_original.str.contains('day ')][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate', 'undate_orig', 'orig_date_precision']]
weekday_dates

Unnamed: 0,pgpid,doc_date_original,doc_date_calendar,doc_date_standard,undate,undate_orig,orig_date_precision
851,1377,"Wednesday night, 28 Sivan 1581",Seleucid,1270,1270,1581-03-28,day
1835,2550,"Monday night, 5 Av 1443",Seleucid,1132,1132,1443-05-05,day
1929,2649,"Sunday night, 25 Kislev 1444",Seleucid,1133,1133,1444-09-25,day
2013,2739,Wednesday 29th Elul 1354,Seleucid,1043-09-07,1043-09-07,1354-06-29,day
3257,4026,"Wednesday night, 29 Tishrei 1541",Seleucid,1229-09-18,1229-09-18,1541-07-29,day
...,...,...,...,...,...,...,...
29309,34623,"Sunday night, 20 Ṭevet 1578",Seleucid,1266/1267,1266/1267,1578-10-20,day
29930,35264,Wednesday 13 Ṭevet 1526,Seleucid,1214/1215,1214/1215,1526-10-13,day
34016,39564,Monday 16 Tevet 1339,Seleucid,1027-12-18,1027-12-18,1339-10-16,day
34474,40035,Monday 1st Iyyar 1437,Seleucid,1126-04-26,1126-04-26,1437-02-01,day


In [35]:
weekday_dates.orig_date_precision.value_counts()

orig_date_precision
day    104
Name: count, dtype: int64

In [36]:
days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# get numeric weekday
weekday_dates['undate_weekday'] = weekday_dates.undate_orig.apply(lambda x: x.earliest.weekday)
weekday_dates['undate_weekday_name'] = weekday_dates.undate_weekday.apply(lambda x: days[x])
# extract weekday from date label
weekday_dates['orig_weekday'] = weekday_dates.doc_date_original.str.extract('([a-zA-Z]+day)', expand=False).str.strip()
# correct misspellings
misspelled_days = {
    "Wedensday": "Wednesday",
    "Thrusday": "Thursday",
}
weekday_dates['orig_weekday'] = weekday_dates.orig_weekday.apply(lambda x: misspelled_days.get(x, x))

# shift night to next day, e.g. Wednesday night should be Thursday
# NOTE: this must be done immediately after the day extraction, otherwise repeated runs continue shifting to the next day
def next_day(weekday):
    return days[(days.index(weekday) +1) % 7]

weekday_dates['orig_weekday'] = weekday_dates.apply(lambda row: next_day(row.orig_weekday) if " night" in row.doc_date_original else row.orig_weekday, axis=1)
weekday_dates[weekday_dates.doc_date_original.str.contains(" night")]

weekday_dates

Unnamed: 0,pgpid,doc_date_original,doc_date_calendar,doc_date_standard,undate,undate_orig,orig_date_precision,undate_weekday,undate_weekday_name,orig_weekday
851,1377,"Wednesday night, 28 Sivan 1581",Seleucid,1270,1270,1581-03-28,day,3,Thursday,Thursday
1835,2550,"Monday night, 5 Av 1443",Seleucid,1132,1132,1443-05-05,day,1,Tuesday,Tuesday
1929,2649,"Sunday night, 25 Kislev 1444",Seleucid,1133,1133,1444-09-25,day,0,Monday,Monday
2013,2739,Wednesday 29th Elul 1354,Seleucid,1043-09-07,1043-09-07,1354-06-29,day,2,Wednesday,Wednesday
3257,4026,"Wednesday night, 29 Tishrei 1541",Seleucid,1229-09-18,1229-09-18,1541-07-29,day,3,Thursday,Thursday
...,...,...,...,...,...,...,...,...,...,...
29309,34623,"Sunday night, 20 Ṭevet 1578",Seleucid,1266/1267,1266/1267,1578-10-20,day,0,Monday,Monday
29930,35264,Wednesday 13 Ṭevet 1526,Seleucid,1214/1215,1214/1215,1526-10-13,day,2,Wednesday,Wednesday
34016,39564,Monday 16 Tevet 1339,Seleucid,1027-12-18,1027-12-18,1339-10-16,day,0,Monday,Monday
34474,40035,Monday 1st Iyyar 1437,Seleucid,1126-04-26,1126-04-26,1437-02-01,day,0,Monday,Monday


In [37]:
weekday_dates[weekday_dates.doc_date_original.str.contains(" night")]

Unnamed: 0,pgpid,doc_date_original,doc_date_calendar,doc_date_standard,undate,undate_orig,orig_date_precision,undate_weekday,undate_weekday_name,orig_weekday
851,1377,"Wednesday night, 28 Sivan 1581",Seleucid,1270,1270,1581-03-28,day,3,Thursday,Thursday
1835,2550,"Monday night, 5 Av 1443",Seleucid,1132,1132,1443-05-05,day,1,Tuesday,Tuesday
1929,2649,"Sunday night, 25 Kislev 1444",Seleucid,1133,1133,1444-09-25,day,0,Monday,Monday
3257,4026,"Wednesday night, 29 Tishrei 1541",Seleucid,1229-09-18,1229-09-18,1541-07-29,day,3,Thursday,Thursday
5511,7237,"Tuesday night, 22 Kislev 1435",Seleucid,1123-12-12,1123-12-12,1435-09-22,day,2,Wednesday,Wednesday
5854,7637,"Monday night, 29 Ṭevet 1438",Seleucid,1127,1127,1438-10-29,day,4,Friday,Tuesday
5857,7642,"Thursday night, 23 Tammuz 1538",Seleucid,1227-07-09,1227-07-09,1538-04-23,day,4,Friday,Friday
6419,8332,"Friday night, 20 Iyar 4957",Anno Mundi,1197-05,1197-05,4957-02-20,day,5,Saturday,Saturday
29309,34623,"Sunday night, 20 Ṭevet 1578",Seleucid,1266/1267,1266/1267,1578-10-20,day,0,Monday,Monday


In [42]:
# how many match?
matches = weekday_dates[weekday_dates.undate_weekday_name == weekday_dates.orig_weekday]

mismatches = weekday_dates[weekday_dates.undate_weekday_name != weekday_dates.orig_weekday]

print(f"{len(matches)} matches, {len(mismatches)} mismatches ({(len(matches)/(len(matches)+len(mismatches)))*100:0.2f}%)")
mismatches.head(20)

44 matches, 60 mismatches (42.31%)


Unnamed: 0,pgpid,doc_date_original,doc_date_calendar,doc_date_standard,undate,undate_orig,orig_date_precision,undate_weekday,undate_weekday_name,orig_weekday
5271,6947,Monday 3 Iyyar 1740,Seleucid,1429-04-07,1429-04-07,1740-02-03,day,3,Thursday,Monday
5854,7637,"Monday night, 29 Ṭevet 1438",Seleucid,1127,1127,1438-10-29,day,4,Friday,Tuesday
8649,11227,Monday 24 Jumādā I 517,Hijrī,1123-07-20,1123-07-20,0517-05-24,day,4,Friday,Monday
16400,19649,Thursday 26 Iyyar 5306,Anno Mundi,1546-04-28,1546-04-28,5306-02-26,day,2,Wednesday,Thursday
17728,21094,Saturday 20 Rajab 550,Hijrī,1155-09-19,1155-09-19,0550-07-20,day,0,Monday,Saturday
23105,27479,Tuesday 11 Tammuz 5525,Anno Mundi,1765-06-30,1765-06-30,5525-04-11,day,6,Sunday,Tuesday
23110,27484,Friday 20th Shevat 5405,Anno Mundi,1645,1645,5405-11-20,day,3,Thursday,Friday
23111,27485,Sunday 22 Adar 5590,Anno Mundi,1830-03-17,1830-03-17,5590-12-22,day,2,Wednesday,Sunday
23113,27487,Thursday 15th Shevat 5450,Anno Mundi,1690,1690,5450-11-15,day,2,Wednesday,Thursday
23115,27489,Sunday 6 Nisan 5528,Anno Mundi,1768-03-24,1768-03-24,5528-01-06,day,3,Thursday,Sunday


In [30]:
mismatches.doc_date_calendar.value_counts()

doc_date_calendar
Anno Mundi    55
Seleucid       3
Hijrī          2
Name: count, dtype: int64

In [31]:
mismatches.orig_weekday.value_counts()

orig_weekday
Wednesday    17
Sunday       12
Monday       10
Thursday      9
Tuesday       7
Friday        4
Saturday      1
Name: count, dtype: int64

In [43]:
# how many mismatches are due to night?
night_mismatches = mismatches[mismatches.doc_date_original.str.contains(" night")]
print(f"{len(night_mismatches)} mismatches that include text 'night'")
night_mismatches

1 mismatches that include text 'night'


Unnamed: 0,pgpid,doc_date_original,doc_date_calendar,doc_date_standard,undate,undate_orig,orig_date_precision,undate_weekday,undate_weekday_name,orig_weekday
5854,7637,"Monday night, 29 Ṭevet 1438",Seleucid,1127,1127,1438-10-29,day,4,Friday,Tuesday


In [44]:
# plot frequency by day, just for fun

# get numeric weekday
orig_dates_parsed['undate_weekday'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest.weekday)
orig_dates_parsed['undate_weekday_name'] = orig_dates_parsed.undate_weekday.apply(lambda x: days[x])

# restrict to dates with day precision; the rest are just using earliest day
orig_dates_days = orig_dates_parsed[orig_dates_parsed.orig_date_precision == 'day']


alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid']]).mark_rect().encode(
    alt.X('undate_weekday_name', sort=days, title='weekday'),
    alt.Color('count(pgpid)', title='# of documents')
).properties(title='document frequency by weekday')


In [45]:
orig_dates_days.undate_weekday_name.value_counts()

undate_weekday_name
Monday       300
Thursday     280
Tuesday      233
Sunday       223
Wednesday    223
Friday       211
Saturday      96
Name: count, dtype: int64

In [46]:
# get rough century (gregorian calendar)
weekday_dates['century'] = orig_dates_days.undate_orig.apply(lambda x: f"{("%04d" % x.earliest.year)[:2]}00s")

weekday_dates[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate', 'undate_orig', 'century']].head()


Unnamed: 0,pgpid,doc_date_original,doc_date_calendar,doc_date_standard,undate,undate_orig,century
851,1377,"Wednesday night, 28 Sivan 1581",Seleucid,1270,1270,1581-03-28,1200s
1835,2550,"Monday night, 5 Av 1443",Seleucid,1132,1132,1443-05-05,1100s
1929,2649,"Sunday night, 25 Kislev 1444",Seleucid,1133,1133,1444-09-25,1100s
2013,2739,Wednesday 29th Elul 1354,Seleucid,1043-09-07,1043-09-07,1354-06-29,1000s
3257,4026,"Wednesday night, 29 Tishrei 1541",Seleucid,1229-09-18,1229-09-18,1541-07-29,1200s


In [47]:
weekday_dates.century.value_counts()

century
1700s    48
1600s    19
1100s    11
1800s     9
1200s     6
1000s     5
1500s     4
0900s     1
1400s     1
Name: count, dtype: int64

In [48]:

alt.Chart(weekday_dates[['undate_weekday', 'undate_weekday_name', 'pgpid', 'century']]).mark_rect().encode(
    alt.X('undate_weekday_name', sort=days, title='weekday'),
    alt.Y('century'),
    alt.Color('count(pgpid)')
).properties(title='document frequency by weekday and century')


In [49]:
# what about heat map by month?


# get numeric month
orig_dates_parsed['undate_month'] = orig_dates_parsed.undate_orig.apply(lambda x: x.month)
# orig_dates_parsed['undate_weekday_name'] = orig_dates_parsed.undate_weekday.apply(lambda x: days[x])

has_month = orig_dates_parsed[orig_dates_parsed.undate_month.notna()]
#orig_dates_months = [


alt.Chart(has_month[['undate_month', 'pgpid', 'doc_date_calendar']]).mark_rect().encode(
    alt.X('undate_month', title='month'),
    alt.Color('count(pgpid)', title='# of documents')
).facet(
    row=alt.Facet('doc_date_calendar', title="Original Calendar")
).properties(title='Document frequency by month and calendar')

In [50]:
has_month.doc_date_calendar.value_counts()

doc_date_calendar
Seleucid      1183
Anno Mundi     888
Hijrī          508
Name: count, dtype: int64

In [51]:
orig_dates_days[orig_dates_days.undate_weekday_name.notna()].shape

(1566, 39)

In [52]:
# weekday frequency by month?

orig_dates_days['undate_month'] = orig_dates_days.undate_orig.apply(lambda x: x.month)

alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid', 'undate_month', 'doc_date_calendar']]).mark_rect().encode(
    alt.X('undate_weekday_name', sort=days, title='weekday'),
    alt.Y('undate_month', title="month"),
    alt.Color('count(pgpid)')
).facet(
    column=alt.Facet('doc_date_calendar', title="Original Calendar")
).properties(title='Document frequency by weekday and month (1,557 documents)')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  orig_dates_days['undate_month'] = orig_dates_days.undate_orig.apply(lambda x: x.month)


In [53]:

# orig_dates_days['undate_month'] = orig_dates_days.undate_orig.apply(lambda x: x.month)

# alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid', 'undate_month', 'doc_date_calendar']]).mark_rect().encode(
#     alt.X('undate_weekday_name', sort=days, title='weekday'),
#     alt.Y('undate_month', title="month"),
#     alt.Color('count(pgpid)')
# ).facet(
#     column=alt.Facet('doc_date_calendar', title="Original Calendar")
# ).properties(title='document frequency by weekday and month')



alt.Chart(weekday_dates[['undate_weekday', 'undate_weekday_name', 'pgpid', 'doc_date_calendar']]).mark_rect().encode(
    alt.X('undate_weekday_name', sort=days, title='weekday'),
    # alt.Y('doc_date_calendar'),
    alt.Color('count(pgpid)')
).facet(row=alt.Facet('doc_date_calendar', title="Original Calendar")
).properties(title='document frequency by weekday')

In [54]:
weekday_dates.doc_date_calendar.value_counts()

doc_date_calendar
Anno Mundi    82
Seleucid      20
Hijrī          2
Name: count, dtype: int64