<a href="https://colab.research.google.com/github/giuliobarde/web_data_mining_project/blob/main/News_API_Scraper_and_S3_Upload_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# News API Scraper and S3 Upload Notebook

This notebook scrapes the past month's news content from the News API, partitions the articles into three compartments based on authorship (multiple authors, single author, and no authors), and uploads the resulting notebooks to your team's S3 bucket folder.

In [2]:
!pip install boto3

Collecting boto3
  Downloading boto3-1.37.10-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<1.38.0,>=1.37.10 (from boto3)
  Downloading botocore-1.37.10-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.12.0,>=0.11.0 (from boto3)
  Downloading s3transfer-0.11.4-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.37.10-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.6/139.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.37.10-py3-none-any.whl (13.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m94.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Downloading s3transfer-0.11.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.4/84.4 kB[0m [31m6.3 MB/s[0m eta [36m0:0

In [3]:
import os
import json
import requests
import datetime
import boto3
from botocore.config import Config
from botocore import UNSIGNED

# Create a temporary directory if it doesn't exist
if not os.path.exists('tmp'):
    os.makedirs('tmp')

In [4]:
# S3 Configuration
TEAM = "TEAM_1/"  # Your team folder
BUCKET_NAME = "cus635-spring2025"

# Create an anonymous S3 client (using UNSIGNED credentials)
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

In [13]:
# News API Scraping with adjusted date range
news_key = '116a9d5a70be4a3689c7b732a1a547d4'

# Calculate date range for the past month
end_date = datetime.datetime.utcnow()
start_date = end_date - datetime.timedelta(days=30)

allowed_start = datetime.datetime(2025, 2, 11)
if start_date < allowed_start:
    start_date = allowed_start

# Format dates as YYYY-MM-DD
end_str = end_date.strftime('%Y-%m-%d')
start_str = start_date.strftime('%Y-%m-%d')

# Use the 'everything' endpoint with a generic query 'news'
url = f'https://newsapi.org/v2/everything?q=news&from={start_str}&to={end_str}&sortBy=popularity&apiKey={news_key}'
response = requests.get(url)
data = response.json()

# Print response for debugging
print("Response from News API:", data)

if data.get('status') != 'ok':
    raise Exception('Error fetching data from News API')

articles = data.get('articles', [])
print(f"Fetched {len(articles)} articles from {start_str} to {end_str}")

Fetched 72 articles from 2025-02-11 to 2025-03-11


In [15]:
from collections import defaultdict

sources = defaultdict(list)
for article in articles:
    source_name = article.get('source', {}).get('name', 'Unknown')
    sources[source_name].append(article)

for src, art_list in sources.items():
    print(f"{src}: {len(art_list)} articles")

Yahoo Entertainment: 3 articles
Wired: 11 articles
The Verge: 35 articles
Gizmodo.com: 23 articles


In [21]:
import re

def sanitize_filename(name):
    # Replace spaces and non-alphanumeric characters with underscores
    return re.sub(r'\W+', '_', name)

# Create a directory for source notebooks if it doesn't exist
source_dir = "tmp/sources"
if not os.path.exists(source_dir):
    os.makedirs(source_dir)


# Create a directory for source JSON files if it doesn't exist
raw_source_dir = "tmp/sources_raw"
if not os.path.exists(raw_source_dir):
    os.makedirs(raw_source_dir)

# Create a dictionary for each source with an 'id' key for each article (assuming each article has a unique id)
for src, art_list in sources.items():
    # For this example, we simply use the index in the list
    articles_by_id = {str(index): article for index, article in enumerate(art_list)}
    sanitized_src = sanitize_filename(src)
    filepath = f"{raw_source_dir}/{sanitized_src}.json"
    with open(filepath, 'w') as f:
        json.dump(articles_by_id, f, indent=2)
    print(f"Created raw JSON file for {src}: {filepath}")

Created raw JSON file for Yahoo Entertainment: tmp/sources_raw/Yahoo_Entertainment.json
Created raw JSON file for Wired: tmp/sources_raw/Wired.json
Created raw JSON file for The Verge: tmp/sources_raw/The_Verge.json
Created raw JSON file for Gizmodo.com: tmp/sources_raw/Gizmodo_com.json


In [22]:
# Upload each raw JSON file from the sources_raw directory to S3
for filename in os.listdir(raw_source_dir):
    local_path = f"{raw_source_dir}/{filename}"
    # Upload files to the "sources" subfolder under your team folder in S3
    s3_key = TEAM + "sources/" + filename
    s3.upload_file(local_path, BUCKET_NAME, s3_key)
    print(f"Uploaded {local_path} to s3://{BUCKET_NAME}/{s3_key}")

Uploaded tmp/sources_raw/Gizmodo_com.json to s3://cus635-spring2025/TEAM_1/sources/Gizmodo_com.json
Uploaded tmp/sources_raw/Yahoo_Entertainment.json to s3://cus635-spring2025/TEAM_1/sources/Yahoo_Entertainment.json
Uploaded tmp/sources_raw/The_Verge.json to s3://cus635-spring2025/TEAM_1/sources/The_Verge.json
Uploaded tmp/sources_raw/Wired.json to s3://cus635-spring2025/TEAM_1/sources/Wired.json


In [23]:
# List files in the S3 bucket under the TEAM_1/sources/ folder to verify uploads
response = s3.list_objects_v2(Bucket=BUCKET_NAME, Prefix=TEAM + "sources/")
if 'Contents' in response:
    print('Files in S3 Bucket under', TEAM + "sources/", ':')
    for obj in response['Contents']:
        print('-', obj['Key'])
else:
    print('No files found in the bucket under sources.')

Files in S3 Bucket under TEAM_1/sources/ :
- TEAM_1/sources/Gizmodo_com.json
- TEAM_1/sources/The_Verge.json
- TEAM_1/sources/Wired.json
- TEAM_1/sources/Yahoo_Entertainment.json
