In [0]:
# Install libs not already included in instance, then restart the Python
# process so that the freshly installed packages can be imported by the
# cells below.
# NOTE(review): the packages are installed without a version pin — consider
# pinning them so that re-runs of this notebook stay reproducible.
%pip install requests-sse
%pip install pywikibot
%restart_python

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import json
import os
from pywikibot.comms.eventstreams import EventStreams
from datetime import datetime, timedelta


In [0]:
# Catalog that holds every schema and volume created by this notebook
uc_catalog = 'wikimedia_db'
spark.sql(f'create catalog if not exists {uc_catalog}')

# Schema that receives the raw events
uc_schema_raw_events = 'raw_events'
spark.sql(f'create schema if not exists {uc_catalog}.{uc_schema_raw_events}')

# Temporary volume for the raw event files; its name carries the current
# date formatted as yy_mm_dd
tmp_volume_time = datetime.now()
tmp_volume = "events_tmp_" + tmp_volume_time.strftime('%y_%m_%d')
spark.sql(f'create volume if not exists {uc_catalog}.{uc_schema_raw_events}.{tmp_volume}')


DataFrame[]

In [0]:
# Schema dedicated to the last-event cache (created only when missing)
uc_schema_last_event_cache = 'last_event_cache'
spark.sql(f'create schema if not exists {uc_catalog}.{uc_schema_last_event_cache}')

# Volume inside that schema where the cache file is stored
last_event_cache_volume = 'data'
spark.sql(f'create volume if not exists {uc_catalog}.{uc_schema_last_event_cache}.{last_event_cache_volume}')

DataFrame[]

In [0]:

# Small helper: tells whether the given path is present on the filesystem
def check_file_exists(last_event_cache_path: str) -> bool:
    """Return True when ``last_event_cache_path`` exists, False otherwise."""
    path_is_present = os.path.exists(last_event_cache_path)
    return path_is_present

# Build the stream object: start from a fixed look-back window before
# `start_time` on the first run, and from the cached timestamp of the last
# processed event on subsequent runs.
def set_stream(last_event_cache_path: str, start_time: datetime) -> EventStreams:
    """Create the EventStreams object, resuming from the cache when possible.

    Args:
        last_event_cache_path: Path of the text file that stores the timestamp
            of the last processed event.
        start_time: Reference time of the current run. It is only used when no
            usable cached timestamp is available.

    Returns:
        An ``EventStreams`` object subscribed to the "recentchange" and
        "revision-create" streams, with ``since`` set to the cached timestamp
        or, failing that, to the look-back date formatted as ``YYYYMMDD``.
    """
    last_time_stamp = ""
    if check_file_exists(last_event_cache_path):
        with open(last_event_cache_path, 'r') as f:
            # Strip surrounding whitespace so that a trailing newline is not
            # sent along, and so that an empty cache file (e.g. left behind by
            # an interrupted write) is treated like a missing one instead of
            # being handed to EventStreams as `since`.
            last_time_stamp = f.read().strip()

    if last_time_stamp:
        # resume from the cached timestamp
        since = last_time_stamp
    else:
        # NOTE(review): the original comments said "7 days ago" while the code
        # subtracts 8 days; the behaviour is kept at 8 — confirm the intended
        # window.
        stream_start_date_raw = start_time - timedelta(days=8)
        since = stream_start_date_raw.strftime('%Y%m%d')

    return EventStreams(streams=["recentchange", "revision-create"], since=since)



In [0]:
# Reference time of this run; set_stream() receives it as its start_time
start_time = datetime.now()

# Streaming window: `duration` minutes counted from start_time
duration = 0.5
stop_time = start_time + timedelta(minutes=duration)

# Text file inside the last_event_cache volume that is passed to set_stream()
last_event_cache_path = "/".join((
    "",
    "Volumes",
    uc_catalog,
    uc_schema_last_event_cache,
    last_event_cache_volume,
    "last_event_cache.txt",
))

# Stream object for this run
stream = set_stream(last_event_cache_path, start_time)


In [0]:
# Peek at the first five events using a separate stream object
tmp_stream = set_stream(last_event_cache_path, start_time)

for event_no in range(5):
    event = next(tmp_stream)
    pretty_event = json.dumps(event, indent=2)
    print(pretty_event)


{
  "$schema": "/mediawiki/recentchange/1.0.0",
  "meta": {
    "uri": "https://ar.wikipedia.org/wiki/%D8%AE%D8%A7%D8%B5:%D8%AF%D8%AE%D9%88%D9%84_%D8%A7%D9%84%D9%85%D8%B3%D8%AA%D8%AE%D8%AF%D9%85",
    "request_id": "63249159-52e2-4dc3-8a64-9325542535ac",
    "id": "604c6a76-ee54-41f9-bd71-6c3e0563d441",
    "domain": "ar.wikipedia.org",
    "stream": "mediawiki.recentchange",
    "dt": "2025-11-09T18:49:44.795Z",
    "topic": "eqiad.mediawiki.recentchange",
    "partition": 0,
    "offset": 5960682412
  },
  "type": "log",
  "namespace": -1,
  "title": "\u062e\u0627\u0635:\u062f\u062e\u0648\u0644 \u0627\u0644\u0645\u0633\u062a\u062e\u062f\u0645",
  "title_url": "https://ar.wikipedia.org/wiki/%D8%AE%D8%A7%D8%B5:%D8%AF%D8%AE%D9%88%D9%84_%D8%A7%D9%84%D9%85%D8%B3%D8%AA%D8%AE%D8%AF%D9%85",
  "comment": "",
  "timestamp": 1762714184,
  "user": "NoWwWeR",
  "bot": false,
  "log_id": 0,
  "log_type": "abusefilter",
  "log_action": "hit",
  "log_params": {
    "action": "autocreateaccount",
   

In [0]:
FAVORITE_TITLE = "Cristiano_Ronaldo"  # e.g. your favourite player/topic
# NOTE(review): FAVORITE_TITLE is not used by the filter below — confirm
# whether a title filter was intended.

# Keep only edits in the main article namespace of fr.wikipedia.org
stream.register_filter(
    server_name="fr.wikipedia.org",
    type="edit",
    namespace=0   # main article namespace, still lots of events
)

# build path for temp volume
raw_volume_path = f"/Volumes/{uc_catalog}/{uc_schema_raw_events}/{tmp_volume}"

# Loop to get streaming data until stop_time is reached. The deadline is only
# checked between two events.
# NOTE(review): stop_time is computed in an earlier cell, so time spent in the
# cells in between counts against the streaming window — confirm this is wanted.
while datetime.now() < stop_time:
    change = next(stream)

    # ISO 8601 timestamp of the event; this is the value written to the cache
    event_timestamp = change['meta']['dt']

    # Prefer the new revision id for the file name. Events that carry no
    # revision (or a null one) fall back to the unique event id from `meta`,
    # so that they no longer all overwrite the same "event_unknown.json" file.
    revision_id = (change.get('revision') or {}).get('new')
    if revision_id is None:
        revision_id = change['meta'].get('id', 'unknown')

    event_path = f"{raw_volume_path}/event_{revision_id}.json"

    # write event to file
    with open(event_path, 'w') as f:
        json.dump(change, f)

    # update last_event_cache with the event TIMESTAMP, only after the event
    # itself has been written
    with open(last_event_cache_path, 'w') as f:
        f.write(event_timestamp)