In [3]:
from SPARQLWrapper import SPARQLWrapper, JSON
import csv

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

sparql.setQuery("""
SELECT ?writer ?writerLabel ?wikidataID ?birthYear ?countryLabel WHERE {
  ?writer wdt:P31 wd:Q5 ;          # human
          wdt:P106 wd:Q36180 ;     # occupation: writer
          wdt:P569 ?birthDate .    # date of birth

  BIND(YEAR(?birthDate) AS ?birthYear)
  FILTER(?birthYear < 1300)

  BIND(STRAFTER(STR(?writer), "entity/") AS ?wikidataID)

  OPTIONAL { ?writer wdt:P27 ?country . }

  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en".
  }
}
ORDER BY ?birthYear
""")

sparql.setReturnFormat(JSON)
results = sparql.query().convert()

output_file = "writers_born_before_1300.csv"

with open(output_file, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["wikidata_id", "writer_name", "birth_year", "country"])

    for r in results["results"]["bindings"]:
        w.writerow([
            r["wikidataID"]["value"],
            r["writerLabel"]["value"],
            r["birthYear"]["value"],
            r.get("countryLabel", {}).get("value", "")
        ])

print(f"CSV written to {output_file}")


CSV written to writers_born_before_1300.csv


In [18]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
from tqdm import tqdm
import time
import random
from urllib.error import HTTPError

# -------------------------------------------------
# Read writers CSV
# -------------------------------------------------
writers_df = pd.read_csv(
    "writers_born_before_1300.csv",
    dtype={
        "wikidata_id": str,
        "writer_name": str,
        "birth_year": "Int64",
        "country": str
    }
)

writers_df = writers_df.sample(100, random_state=42).reset_index(drop=True)
# -------------------------------------------------
# SPARQL setup (IMPORTANT: User-Agent)
# -------------------------------------------------
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)
sparql.addCustomHttpHeader(
    "User-Agent",
    "AncientWritersBot/1.0 (research; contact: youremail@example.com)"
)

# -------------------------------------------------
# Function: get works + Wikisource (rate-limit safe)
# -------------------------------------------------
def get_works_for_writer(writer_qid: str, max_retries=5):
    query = f"""
    SELECT ?work ?workLabel ?workID ?wikisource WHERE {{
      ?work wdt:P50 wd:{writer_qid} .
      BIND(STRAFTER(STR(?work), "entity/") AS ?workID)

      OPTIONAL {{
        ?wikisource schema:about ?work ;
                    schema:isPartOf <https://en.wikisource.org/> .
      }}

      SERVICE wikibase:label {{
        bd:serviceParam wikibase:language "en".
      }}
    }}
    """

    sparql.setQuery(query)

    for attempt in range(max_retries):
        try:
            results = sparql.query().convert()

            return [
                {
                    "work_wikidata_id": r["workID"]["value"],
                    "work_title": r["workLabel"]["value"],
                    "wikisource_url": r.get("wikisource", {}).get("value", "")
                }
                for r in results["results"]["bindings"]
            ]

        except HTTPError as e:
            if e.code == 429:
                # Exponential backoff + jitter
                sleep_time = (2 ** attempt) + random.uniform(0.5, 1.5)
                time.sleep(sleep_time)
            else:
                raise e

    # If all retries fail, return empty list
    return []

# -------------------------------------------------
# Build works dataframe (slow + safe)
# -------------------------------------------------
rows = []

for _, row in tqdm(
    writers_df.iterrows(),
    total=len(writers_df),
    desc="Fetching works + Wikisource",
    unit="writer"
):
    works = get_works_for_writer(row["wikidata_id"])

    for w in works:
        rows.append({
            "writer_wikidata_id": row["wikidata_id"],
            "writer_name": row["writer_name"],
            "birth_year": row["birth_year"],
            "country": row["country"],
            "work_wikidata_id": w["work_wikidata_id"],
            "work_title": w["work_title"],
            "wikisource_url": w["wikisource_url"]
        })

    # Hard throttle between writers
    time.sleep(1.0)

works_df = pd.DataFrame(rows)

# -------------------------------------------------
# Save result
# -------------------------------------------------
works_df.to_csv(
    "writers_before_1300_with_works_and_wikisource.csv",
    index=False
)

print("Saved data_extraction/writers_before_1300_with_works_and_wikisource.csv")


Fetching works + Wikisource: 100%|██████████| 100/100 [02:06<00:00,  1.26s/writer]

Saved data_extraction/writers_before_1300_with_works_and_wikisource.csv





In [29]:
df_wiki = works_df[works_df['wikisource_url'] != '']
df_wiki.to_csv("writers_with_wikisource_urls.csv", index=False)



In [35]:
df_wiki

Unnamed: 0,writer_wikidata_id,writer_name,birth_year,country,work_wikidata_id,work_title,wikisource_url
8,Q175053,Adam of Saint Victor,1122,Kingdom of France,Q52064174,Laudes crucis attollamus,https://en.wikisource.org/wiki/Laudes_crucis_a...
20,Q160441,Rabanus Maurus,776,Carolingian Empire,Q1410832,Veni Creator Spiritus,https://en.wikisource.org/wiki/Veni_Creator_Sp...
49,Q10133,Suetonius,70,Ancient Rome,Q1229963,The Twelve Caesars,https://en.wikisource.org/wiki/The_Lives_of_th...
50,Q10133,Suetonius,70,Ancient Rome,Q56071308,Lives of the Caesars 6. Nero,https://en.wikisource.org/wiki/The_Lives_of_th...
119,Q354523,Thomas of Celano,1190,,Q83771,Dies Irae,https://en.wikisource.org/wiki/Dies_Irae_(Thom...
120,Q354523,Thomas of Celano,1190,,Q19082344,Dies Irae,https://en.wikisource.org/wiki/Dies_Irae_(Irons)
143,Q1048,Julius Caesar,-99,Ancient Rome,Q544948,Commentarii de Bello Civili,https://en.wikisource.org/wiki/Commentaries_on...
144,Q1048,Julius Caesar,-99,Ancient Rome,Q21163272,Commentaries on the Gallic War,https://en.wikisource.org/wiki/Commentaries_on...
772,Q337773,Liu Zongyuan,773,Tang dynasty,Q18858003,Q18858003,https://en.wikisource.org/wiki/Gems_of_Chinese...
773,Q337773,Liu Zongyuan,773,Tang dynasty,Q18840804,Q18840804,https://en.wikisource.org/wiki/Gems_of_Chinese...


In [34]:
import requests
from bs4 import BeautifulSoup

url = "https://en.wikisource.org/wiki/Gems_of_Chinese_Literature/Liu_Tsung-yüan-Pas_Trop_Gouverner"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Get main content container
    main_content = soup.find('div', {'class': 'mw-parser-output'})
    
    if main_content:
        # Extract clean text
        text = main_content.get_text(separator='\n', strip=True)
        
        # Save to file
        with open('wikisource_content.txt', 'w', encoding='utf-8') as f:
            f.write(text)
        
        print("Content extracted successfully!")
        print(f"Total characters: {len(text)}")
        print("\nPreview:")
        print(text[:500])
    else:
        print("Could not find main content")
else:
    print(f"Error: {response.status_code}")

Content extracted successfully!
Total characters: 4089

Preview:
←
Is There a God?
Gems of Chinese Literature
(
1922
)
translated by
Herbert Allen Giles
Pas Trop Gouverner
by
Liu Tsung-yüan
→
sister projects
:
Wikidata item
​
LIU TSUNG-YÜAN.
a.d. 773-819.
[A most versatile writer, and one of the intimate friends of
Han Wên-kung
(
q.v.
), like whom he was banished on political grounds to a distant official post, where he died. His breadth of intelligence allowed him to tolerate Buddhism, in direct opposition to the utterances of Han Wên-kung, who perceived in 


In [37]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time

def extract_wikisource_content(url):
    """
    Extract main content from a Wikisource page
    
    Args:
        url: URL of the Wikisource page
    
    Returns:
        Extracted text content as string, or None if error
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=10)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Get main content container
            main_content = soup.find('div', {'class': 'mw-parser-output'})
            
            if main_content:
                # Extract clean text
                text = main_content.get_text(separator='\n', strip=True)
                return text
            else:
                return None
        else:
            return None
            
    except Exception as e:
        # Silently continue on error
        return None

def extract_all_wikisource_pages(df):
    """
    Extract content from all Wikisource URLs in dataframe
    
    Args:
        df: DataFrame with 'wikisource_url' column
    
    Returns:
        DataFrame with added 'content' column
    """
    contents = []
    
    for url in tqdm(df['wikisource_url'], desc="Extracting content"):
        content = extract_wikisource_content(url)
        contents.append(content)
        
        # Be nice to the server - small delay between requests
        time.sleep(0.5)
    
    df['content'] = contents
    return df

# Usage:
df_wiki = extract_all_wikisource_pages(df_wiki)

# Save results
df_wiki.to_csv('wikisource_with_content.csv', index=False)

# Check how many succeeded
print(f"Successfully extracted: {df_wiki['content'].notna().sum()} / {len(df_wiki)}")

Extracting content: 100%|██████████| 18/18 [00:10<00:00,  1.68it/s]

Successfully extracted: 4 / 18



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['content'] = contents


In [38]:
df_wiki

Unnamed: 0,writer_wikidata_id,writer_name,birth_year,country,work_wikidata_id,work_title,wikisource_url,content
8,Q175053,Adam of Saint Victor,1122,Kingdom of France,Q52064174,Laudes crucis attollamus,https://en.wikisource.org/wiki/Laudes_crucis_a...,←\nEnglish-language translations of\nLaudes cr...
20,Q160441,Rabanus Maurus,776,Carolingian Empire,Q1410832,Veni Creator Spiritus,https://en.wikisource.org/wiki/Veni_Creator_Sp...,
49,Q10133,Suetonius,70,Ancient Rome,Q1229963,The Twelve Caesars,https://en.wikisource.org/wiki/The_Lives_of_th...,
50,Q10133,Suetonius,70,Ancient Rome,Q56071308,Lives of the Caesars 6. Nero,https://en.wikisource.org/wiki/The_Lives_of_th...,
119,Q354523,Thomas of Celano,1190,,Q83771,Dies Irae,https://en.wikisource.org/wiki/Dies_Irae_(Thom...,
120,Q354523,Thomas of Celano,1190,,Q19082344,Dies Irae,https://en.wikisource.org/wiki/Dies_Irae_(Irons),
143,Q1048,Julius Caesar,-99,Ancient Rome,Q544948,Commentarii de Bello Civili,https://en.wikisource.org/wiki/Commentaries_on...,
144,Q1048,Julius Caesar,-99,Ancient Rome,Q21163272,Commentaries on the Gallic War,https://en.wikisource.org/wiki/Commentaries_on...,
772,Q337773,Liu Zongyuan,773,Tang dynasty,Q18858003,Q18858003,https://en.wikisource.org/wiki/Gems_of_Chinese...,←\nCongratulations on a Fire\nGems of Chinese ...
773,Q337773,Liu Zongyuan,773,Tang dynasty,Q18840804,Q18840804,https://en.wikisource.org/wiki/Gems_of_Chinese...,


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
from urllib.parse import urljoin, urlparse

def extract_wikisource_content(url):
    """
    Extract main content from a Wikisource page
    
    Args:
        url: URL of the Wikisource page
    
    Returns:
        Extracted text content as string, or None if error
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=10)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Get main content container
            main_content = soup.find('div', {'class': 'mw-parser-output'})
            
            if main_content:
                # Extract clean text
                text = main_content.get_text(separator='\n', strip=True)
                return text
            else:
                return None
        else:
            return None
            
    except Exception as e:
        return None

def find_wikisource_links(url):
    """
    Find all Wikisource links within a page
    
    Args:
        url: URL of the Wikisource page
    
    Returns:
        List of Wikisource URLs found on the page
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=10)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Get main content container
            main_content = soup.find('div', {'class': 'mw-parser-output'})
            
            if main_content:
                links = []
                
                # Find all links in the content
                for a_tag in main_content.find_all('a', href=True):
                    href = a_tag['href']
                    
                    # Convert relative URLs to absolute
                    full_url = urljoin(url, href)
                    
                    # Only keep Wikisource links (exclude external, special pages, etc.)
                    if 'wikisource.org/wiki/' in full_url and ':' not in full_url.split('/wiki/')[-1]:
                        links.append(full_url)
                
                # Remove duplicates
                return list(set(links))
            else:
                return []
        else:
            return []
            
    except Exception as e:
        return []

def extract_page_and_linked_content(url, max_depth=1):
    """
    Extract content from a page and optionally from linked pages
    
    Args:
        url: URL of the main Wikisource page
        max_depth: How many levels deep to follow links (0 = only main page, 1 = main + linked pages)
    
    Returns:
        Dictionary with main content and linked contents
    """
    result = {
        'main_url': url,
        'main_content': None,
        'linked_pages': []
    }
    
    # Extract main page content
    print(f"Extracting main page: {url}")
    result['main_content'] = extract_wikisource_content(url)
    
    if max_depth > 0:
        # Find links on the main page
        print(f"Finding links on main page...")
        links = find_wikisource_links(url)
        print(f"Found {len(links)} links")
        
        # Extract content from each linked page
        for link in tqdm(links, desc="Extracting linked pages"):
            linked_content = extract_wikisource_content(link)
            result['linked_pages'].append({
                'url': link,
                'content': linked_content
            })
            time.sleep(0.5)  # Be nice to the server
    
    return result

def extract_all_pages_with_links(df, max_depth=1):
    """
    Extract content from all pages in dataframe, including linked pages
    
    Args:
        df: DataFrame with 'wikisource_url' column
        max_depth: How many levels deep to follow links
    
    Returns:
        List of dictionaries with all extracted content
    """
    all_results = []
    
    for url in tqdm(df['wikisource_url'], desc="Processing pages"):
        result = extract_page_and_linked_content(url, max_depth=max_depth)
        all_results.append(result)
        time.sleep(0.5)
    
    return all_results

# Usage:
# # Extract just the main pages
# results = extract_all_pages_with_links(df_wiki, max_depth=0)
#
# # Extract main pages + their linked pages
# results_with_links = extract_all_pages_with_links(df_wiki, max_depth=1)
#
# # Access results:
# print(results[0]['main_content'])  # Main page content
# print(results[0]['linked_pages'])  # List of linked pages with their content

{'main_url': 'https://en.wikisource.org/wiki/The_Elements_of_Euclid_for_the_Use_of_Schools_and_Colleges',
 'main_content': '',
 'linked_pages': []}