# Analysis of Works Before 1800
This notebook analyzes the evolution of the number of works per century (before 1800) that have:
- A date (using ALL date-related properties)
- A country/place (using ALL country/place-related properties)
- Content: a full-text/URL property or a sitelink (not metadata like title, genre, etc.)

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import random

In [None]:
# Load the extracted dataset.
# JSON is UTF-8 by specification, so the encoding is passed explicitly instead
# of relying on the platform's default locale encoding.
with open('output/extracted/extracted_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

print(f"Total items in dataset: {len(data):,}")

In [None]:
# Draw a reproducible random sample of item IDs (at most 100,000).
random.seed(42)  # fixed seed: the same IDs are drawn on every run
all_ids = [*data.keys()]
sample_size = len(all_ids) if len(all_ids) < 100_000 else 100_000
sampled_ids = random.sample(all_ids, sample_size)
print(f"Sampled {sample_size:,} items")

In [None]:
def extract_year(date_str):
    """Extract the year from a timestamp string.

    Handles strings such as '2021-01-01T00:00:00Z' as well as timestamps with
    a leading minus sign such as '-0500-01-01T00:00:00Z' (returned as -500).

    Args:
        date_str: Timestamp string whose first '-'-separated field is the year.

    Returns:
        The year as an int (negative when the string starts with '-'), or
        None when the value is empty, is not a string, or has no parseable
        year.
    """
    if not date_str:
        return None
    try:
        if date_str.startswith('-'):
            # '-0500-01-01...'.split('-') yields ['', '0500', '01', ...],
            # so the year is the second field.
            return -int(date_str.split('-')[1])
        return int(date_str.split('-')[0])
    except (AttributeError, TypeError, ValueError, IndexError):
        # Only "this value is not a parseable date" failures are swallowed;
        # a bare except would also hide KeyboardInterrupt / SystemExit.
        return None

def get_century(year):
    """Map a year to the start-year label of its century bucket.

    Positive years are bucketed so that a year ending in 00 closes its
    century: 1401..1500 -> 1400, 1501..1600 -> 1500, 1..100 -> 0.
    Years <= 0 are floored to the hundred: -500..-401 -> -500, 0 -> 0.

    Returns:
        The bucket label as a number, or None when `year` is None.
    """
    if year is None:
        return None
    # Shift positive years back by one so that e.g. 1500 lands in the 1400 bucket.
    shifted = year if year <= 0 else year - 1
    return (shifted // 100) * 100

# Quick sanity check of get_century on a few sample years.
for demo_year in (1450, 1501, -500):
    print(f"Year {demo_year} -> Century: {get_century(demo_year)}")

In [None]:
# Property IDs grouped by the criterion they satisfy. Each mapping is
# {property ID: label}; the trailing comment records how many items in the
# dataset carry that property. The *_PROPERTIES lists keep the mapping order.

# ALL date properties found in the dataset
_DATE_PROPERTY_LABELS = {
    'P577': 'publication date',            # 40,130 items
    'P571': 'inception',                   # 4,631 items
    'P585': 'point in time',               # 1,918 items
    'P580': 'start time',                  # 698 items
    'P582': 'end time',                    # 431 items
    'P1191': 'date of first performance',  # 429 items
    'P2031': 'work period (start)',        # 4 items
    'P2032': 'work period (end)',          # 4 items
    'P3893': 'public domain date',         # 3 items
    'P1319': 'earliest date',              # 1 items
}
DATE_PROPERTIES = list(_DATE_PROPERTY_LABELS)

# ALL country/place properties found in the dataset
_COUNTRY_PROPERTY_LABELS = {
    'P495': 'country of origin',     # 20,028 items
    'P291': 'place of publication',  # 13,563 items
    'P17': 'country',                # 9,848 items
    'P276': 'location',              # 5,288 items
    'P840': 'narrative location',    # 372 items
}
COUNTRY_PROPERTIES = list(_COUNTRY_PROPERTY_LABELS)

# Content = Full text/URL properties ONLY (not metadata like title, genre, etc.)
_CONTENT_PROPERTY_LABELS = {
    'P953': 'full work available at URL',  # 10,365 items
    'P973': 'described at URL',            # 3,564 items
    'P996': 'document file on Commons',    # 3,223 items
    'P18': 'image',                        # 2,116 items
    'P1343': 'described by source',        # 764 items
}
CONTENT_PROPERTIES = list(_CONTENT_PROPERTY_LABELS)
# + sitelinks (wikisource, etc.)

print(f"Date properties: {len(DATE_PROPERTIES)}")
print(f"Country/Place properties: {len(COUNTRY_PROPERTIES)}")
print(f"Content properties (URL/full text): {len(CONTENT_PROPERTIES)} + sitelinks")

In [None]:
def has_date(item):
    """Return the year of the first parseable date found on an item.

    Properties are tried in DATE_PROPERTIES order. Within a property every
    value is tried in order, so an unparseable leading value does not hide a
    valid later one.

    Args:
        item: Mapping with an optional 'properties' entry that maps a
            property ID to a mapping holding a 'values' list.

    Returns:
        The year produced by extract_year for the first value that parses,
        or None when no date property yields a parseable year.
    """
    props = item.get('properties', {})
    for prop in DATE_PROPERTIES:
        if prop not in props:
            continue
        # `or ()` treats a missing, empty or None 'values' entry alike.
        for value in props[prop].get('values', []) or ():
            year = extract_year(value)
            if year is not None:
                return year
    return None

def has_country(item):
    """Tell whether an item carries at least one country/place value.

    Returns:
        True as soon as any property listed in COUNTRY_PROPERTIES has a
        non-empty 'values' entry on the item, otherwise False.
    """
    item_props = item.get('properties', {})
    return any(
        pid in item_props and bool(item_props[pid].get('values', []))
        for pid in COUNTRY_PROPERTIES
    )

def has_content(item):
    """Tell whether an item has content: a content property value or a sitelink.

    Returns:
        True when any property listed in CONTENT_PROPERTIES has a non-empty
        'values' entry, or when the item's 'sitelinks' entry is non-empty;
        otherwise False.
    """
    item_props = item.get('properties', {})
    found_in_props = any(
        pid in item_props and bool(item_props[pid].get('values', []))
        for pid in CONTENT_PROPERTIES
    )
    # Sitelinks (e.g., wikisource links) are only consulted when no content
    # property matched.
    return found_in_props or bool(item.get('sitelinks', []))

In [None]:
# Walk the sampled items and keep the century of every item that is dated
# before 1800 and has both a country/place and content.
centuries_before_1800 = []

for qid in sampled_ids:
    item = data[qid]
    year = has_date(item)

    # Checks run in the same order as the filters are listed above:
    # date present, before 1800, country/place, content.
    if year is not None and year < 1800 and has_country(item) and has_content(item):
        centuries_before_1800.append(get_century(year))

print(f"Items before 1800 with date, country, and content: {len(centuries_before_1800):,}")

In [None]:
# Count items per century
century_counts = Counter(centuries_before_1800)

# One row per century, ordered chronologically. `columns` is passed explicitly
# so both columns exist even when nothing matched; without it an empty frame
# has no 'century_start' column and sort_values raises KeyError.
df = pd.DataFrame(
    [
        {'century_start': century, 'count': count}
        for century, count in century_counts.items()
    ],
    columns=['century_start', 'count'],
).sort_values('century_start')

def century_label(year):
    """Format a century start year as a human-readable label.

    Args:
        year: Century start year, e.g. 1400 for the 15th century or -500 for
            the 5th century BCE.

    Returns:
        A string such as '15th c.' or '5th c. BCE', using the proper English
        ordinal suffix ('1st', '2nd', '3rd', '11th', ...).
    """
    if year < 0:
        number, era = abs(year) // 100, " BCE"
    else:
        number, era = year // 100 + 1, ""
    # 11, 12 and 13 take 'th' even though they end in 1, 2 and 3.
    if 11 <= number % 100 <= 13:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")
    return f"{number}{suffix} c.{era}"

# Attach a readable label to every century bucket, then show the table.
df['century_label'] = [century_label(start) for start in df['century_start']]

print(df)

In [None]:
# Bar chart of the number of works per century.
fig, ax = plt.subplots(figsize=(14, 7))

# Keep only the centuries that actually have items.
df_plot = df[df['count'] > 0].copy()

ax.bar(df_plot['century_label'], df_plot['count'], color='steelblue', edgecolor='black', linewidth=0.5)

ax.set_xlabel('Century', fontsize=12)
ax.set_ylabel('Number of Works', fontsize=12)
ax.set_title('Evolution of Number of Works Before 1800\n(Items with Date, Country/Place, and Full Text/URL Content)\nContent = URL, sitelinks, document files', fontsize=14)
plt.xticks(rotation=45, ha='right')

# Write each bar's count just above it (1% of the tallest bar); the running
# index is used as the bar's x position.
label_gap = df_plot['count'].max() * 0.01
for bar_pos, bar_count in enumerate(df_plot['count']):
    ax.text(bar_pos, bar_count + label_gap, str(bar_count),
            ha='center', va='bottom', fontsize=9)

fig.tight_layout()
fig.savefig('output/works_before_1800_by_century_all_properties.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"\nGraph saved to output/works_before_1800_by_century_all_properties.png")

In [None]:
# Final recap: sample size, number of matches, properties used and the
# per-century counts shown in the chart.
banner = "=" * 60
matched_total = len(centuries_before_1800)

print(banner)
print("SUMMARY")
print(banner)
print(f"Total items sampled: {sample_size:,}")
print(f"Items before 1800 with date + country + content: {matched_total:,}")
print(f"Percentage: {matched_total/sample_size*100:.2f}%")
print("\nProperties used:")
print(f"  Date ({len(DATE_PROPERTIES)}): {DATE_PROPERTIES}")
print(f"  Country/Place ({len(COUNTRY_PROPERTIES)}): {COUNTRY_PROPERTIES}")
print(f"  Content ({len(CONTENT_PROPERTIES)} + sitelinks): {CONTENT_PROPERTIES}")
print("\nItems by century:")
for label, n_items in zip(df_plot['century_label'], df_plot['count']):
    print(f"  {label}: {n_items:,}")