# Assignment: Wikipedia Page Views Pipeline

In [9]:
# Set your username here - use it consistently across all resources
USERNAME = "konstantinos"

In [10]:
import datetime
import json
import boto3
import requests

In [11]:
# Try different dates to see how the data changes
DATE_PARAM = "2025-11-21"

date = datetime.datetime.strptime(DATE_PARAM, "%Y-%m-%d")

# Construct the API URL
url = url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia.org/all-access/{date.strftime('%Y/%m/%d')}"
print(f"Requesting REST API URL: {url}")

# Make the API request
wiki_server_response = requests.get(url, headers={"User-Agent": "curl/7.68.0"})
wiki_response_status = wiki_server_response.status_code
wiki_response_body = wiki_server_response.text

print(f"Wikipedia REST API Response body: {wiki_response_body[:500]}...")
print(f"Wikipedia REST API Response Code: {wiki_response_status}")

# Validate response
if wiki_response_status != 200:
    raise Exception(f"Received non-OK status code from Wiki Server: {wiki_response_status}")
print(f"Successfully retrieved Wikipedia data, content-length: {len(wiki_response_body)}")

Requesting REST API URL: https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia.org/all-access/2025/11/21
Wikipedia REST API Response body: {"items":[{"project":"en.wikipedia","access":"all-access","year":"2025","month":"11","day":"21","articles":[{"article":"Main_Page","views":5958394,"rank":1},{"article":"Special:Search","views":860337,"rank":2},{"article":"Miss_Universe_2025","views":610834,"rank":3},{"article":"Google_Chrome","views":308863,"rank":4},{"article":"Wikipedia:Featured_pictures","views":253217,"rank":5},{"article":"Wicked:_For_Good","views":196127,"rank":6},{"article":"Spencer_Lofranco","views":159357,"rank":7},{"art...
Wikipedia REST API Response Code: 200
Successfully retrieved Wikipedia data, content-length: 56291


In [12]:
# Parse the API response and extract top views
wiki_response_parsed = wiki_server_response.json()
top_views = wiki_response_parsed["items"][0]["articles"]
print(f"Extracted {len(top_views)} top viewed pages")

# Transform to JSON Lines format
current_time = datetime.datetime.now(datetime.timezone.utc)
json_lines = ""
for page in top_views[:5]:
    record = {
        "title": page["article"],
        "views": page["views"],
        "rank": page["rank"],
        "date": date.strftime("%Y-%m-%d"),
        "retrieved_at": current_time.replace(tzinfo=None).isoformat(),
    }
    json_lines += json.dumps(record) + "\n"

print(f"Transformed {len(top_views)} records to JSON Lines")
print(f"First few lines:\n{json_lines[:500]}...")

Extracted 1000 top viewed pages
Transformed 1000 records to JSON Lines
First few lines:
{"title": "Main_Page", "views": 5958394, "rank": 1, "date": "2025-11-21", "retrieved_at": "2025-12-11T13:28:16.624017"}
{"title": "Special:Search", "views": 860337, "rank": 2, "date": "2025-11-21", "retrieved_at": "2025-12-11T13:28:16.624017"}
{"title": "Miss_Universe_2025", "views": 610834, "rank": 3, "date": "2025-11-21", "retrieved_at": "2025-12-11T13:28:16.624017"}
{"title": "Google_Chrome", "views": 308863, "rank": 4, "date": "2025-11-21", "retrieved_at": "2025-12-11T13:28:16.624017"}
{"tit...


In [13]:
S3_WIKI_BUCKET = USERNAME + "-wikidata"
s3 = boto3.client("s3")

bucket_names = [bucket["Name"] for bucket in s3.list_buckets()["Buckets"]]
if S3_WIKI_BUCKET not in bucket_names:
    s3.create_bucket(
        Bucket=S3_WIKI_BUCKET,
        CreateBucketConfiguration={"LocationConstraint": boto3.session.Session().region_name},
    )
    print(f"Created new bucket: {S3_WIKI_BUCKET}")
else:
    print(f"Using existing bucket: {S3_WIKI_BUCKET}")
s3_object_key = f"wikipedia_top_views_{date.strftime('%Y_%m_%d')}.json"

Using existing bucket: konstantinos-wikidata


In [14]:
# Test Lab 1
assert USERNAME != "<username>", "Please set your USERNAME at the top of the notebook"
assert S3_WIKI_BUCKET.endswith("-wikidata"), "Bucket name must end with '-wikidata'"

try:
    s3.head_bucket(Bucket=S3_WIKI_BUCKET)
    print(f"Bucket {S3_WIKI_BUCKET} exists!")
except Exception as e:
    print(f"Bucket {S3_WIKI_BUCKET} not found: {e}")
    raise

Bucket konstantinos-wikidata exists!


In [15]:
# LAB 2: Upload json_lines directly to S3
s3_key = f"raw-views/raw-views-{DATE_PARAM}.json"
s3.put_object(  
    Bucket=S3_WIKI_BUCKET,
    Key=s3_key,
    Body=json_lines,
)


{'ResponseMetadata': {'RequestId': 'S5QY3066CEBZPV3Y',
  'HostId': '6Jj04MhMHYuyPOi6jmdF7Ij5u/slRjHsRjxcgSPrDtREdk/EeJCSzRJn0pfGW/W+os7MqnTpfsrvG2HOWsTZnNZF4wMM7Yk4',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '6Jj04MhMHYuyPOi6jmdF7Ij5u/slRjHsRjxcgSPrDtREdk/EeJCSzRJn0pfGW/W+os7MqnTpfsrvG2HOWsTZnNZF4wMM7Yk4',
   'x-amz-request-id': 'S5QY3066CEBZPV3Y',
   'date': 'Thu, 11 Dec 2025 13:28:17 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"58170c02250d6acc0dfe590966e78590"',
   'x-amz-checksum-crc32': 'RF5gog==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"58170c02250d6acc0dfe590966e78590"',
 'ChecksumCRC32': 'RF5gog==',
 'ChecksumType': 'FULL_OBJECT',
 'ServerSideEncryption': 'AES256'}

In [16]:
# Test Lab 2
expected_key = f"raw-views/raw-views-{date.strftime('%Y-%m-%d')}.json"
try:
    s3.head_object(Bucket=S3_WIKI_BUCKET, Key=expected_key)
    print(f"File uploaded successfully to s3://{S3_WIKI_BUCKET}/{expected_key}")
except Exception as e:
    print(f"File not found at s3://{S3_WIKI_BUCKET}/{expected_key}")
    raise

File uploaded successfully to s3://konstantinos-wikidata/raw-views/raw-views-2025-11-21.json
