<a href="https://colab.research.google.com/github/giuliobarde/web_data_mining_project/blob/main/News_API_Scraper_and_S3_Upload_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# News API Scraper and S3 Upload Notebook

This notebook scrapes the past month's news content from the News API, partitions the articles into three compartments based on authorship (multiple authors, single author, and no authors), and uploads the resulting notebooks to your team's S3 bucket folder.

In [3]:
!pip install boto3

Collecting boto3
  Downloading boto3-1.37.11-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<1.38.0,>=1.37.11 (from boto3)
  Downloading botocore-1.37.11-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.12.0,>=0.11.0 (from boto3)
  Downloading s3transfer-0.11.4-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.37.11-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.6/139.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.37.11-py3-none-any.whl (13.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Downloading s3transfer-0.11.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.4/84.4 kB[0m [31m7.4 MB/s[0m eta [36m0:0

In [8]:
import os
import json
import requests
import datetime
import boto3
from botocore.config import Config
from botocore import UNSIGNED
from collections import defaultdict

# Create a temporary directory if it doesn't exist
if not os.path.exists('tmp'):
    os.makedirs('tmp')

In [9]:
# S3 Configuration
TEAM = "TEAM_1/"  # Your team folder
BUCKET_NAME = "cus635-spring2025"

# Create an anonymous S3 client (using UNSIGNED credentials)
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

In [17]:
# News API Scraping with adjusted date range
news_key = '116a9d5a70be4a3689c7b732a1a547d4'

# Calculate date range for the past month
end_date = datetime.datetime.utcnow()
start_date = end_date - datetime.timedelta(days=30)

allowed_start = datetime.datetime(2025, 2, 11)
if start_date < allowed_start:
    start_date = allowed_start

# Format dates as YYYY-MM-DD
end_str = end_date.strftime('%Y-%m-%d')
start_str = start_date.strftime('%Y-%m-%d')

# Defined query to get sports articles
query1 = 'sport'
url1 = (f'https://newsapi.org/v2/top-headlines?q={query1}&from={start_str}&to={end_str}&sortBy=popularity&apiKey={news_key}')
response1 = requests.get(url1)

# Same for bitcoin articles
query2 = 'bitcoin'
url2 = (f'https://newsapi.org/v2/everything?q={query2}&from={start_str}&to={end_str}&sortBy=popularity&apiKey={news_key}')
response2 = requests.get(url2)

# Same for government articles
query3 = 'government'
url3 = (f'https://newsapi.org/v2/top-headlines?q={query3}&from={start_str}&to={end_str}&sortBy=popularity&apiKey={news_key}')
response3 = requests.get(url3)

data1 = response1.json()
data2 = response2.json()
data3 = response3.json()

# Gets the source for each article
def print_source_counts(data, label):
    sources = defaultdict(list)
    for article in data.get('articles', []):
        source_name = article.get('source', {}).get('name', 'Unknown')
        sources[source_name].append(article)
    for src, articles in sources.items():
        print(f"  {src}: {len(articles)} articles")

# Process and print the results for each dataset by source
print_source_counts(data1, 'data1')
print_source_counts(data2, 'data2')
print_source_counts(data3, 'data3')

  Bild: 2 articles
  Fox Sports: 1 articles
  BBC Sport: 3 articles
  FourFourTwo: 1 articles
  Il Sole 24 Ore: 1 articles
  Google News (India): 1 articles
  Le Monde: 1 articles
  Politico: 1 articles
  TalkSport: 1 articles
  Bleacher Report: 2 articles
  ANSA.it: 1 articles
  L'equipe: 1 articles
  The Irish Times: 1 articles
  The Hindu: 1 articles
  The Times of India: 1 articles
  Wired: 1 articles
  The Verge: 2 articles
  Gizmodo.com: 9 articles
  BBC News: 3 articles
  CNET: 1 articles
  Business Insider: 6 articles
  NPR: 3 articles
  Slashdot.org: 2 articles
  Yahoo Entertainment: 10 articles
  Time: 1 articles
  ESPN: 1 articles
  Xataka.com: 10 articles
  ABC News: 9 articles
  Fox News: 1 articles
  Le Monde: 1 articles
  Digital Trends: 1 articles
  The Atlantic: 1 articles
  Obscura.net: 2 articles
  Kaspersky.com: 1 articles
  Zeteo.com: 1 articles
  Github.com: 1 articles
  Startupbaniya.com: 1 articles
  Pluralistic.net: 1 articles
  Educatedguesswork.org: 1 article

In [13]:
# Combine articles from the 3 datasets
all_articles = data1.get('articles', []) + data2.get('articles', []) + data3.get('articles', [])

# Print the total number of all articles
print(f"Total articles: {len(all_articles)}")


Total articles: 137


In [14]:
import re

def sanitize_filename(name):
    # Replace spaces and non-alphanumeric characters with underscores
    return re.sub(r'\W+', '_', name)

# Create a directory for source notebooks if it doesn't exist
source_dir = "tmp/sources"
if not os.path.exists(source_dir):
    os.makedirs(source_dir)


# Create a directory for source JSON files if it doesn't exist
raw_source_dir = "tmp/sources_raw"
if not os.path.exists(raw_source_dir):
    os.makedirs(raw_source_dir)

# Create a dictionary for each source with an 'id' key for each article (assuming each article has a unique id)
for src, art_list in sources.items():
    # For this example, we simply use the index in the list
    articles_by_id = {str(index): article for index, article in enumerate(art_list)}
    sanitized_src = sanitize_filename(src)
    filepath = f"{raw_source_dir}/{sanitized_src}.json"
    with open(filepath, 'w') as f:
        json.dump(articles_by_id, f, indent=2)
    print(f"Created raw JSON file for {src}: {filepath}")

Created raw JSON file for Bild: tmp/sources_raw/Bild.json
Created raw JSON file for BBC Sport: tmp/sources_raw/BBC_Sport.json
Created raw JSON file for Fox Sports: tmp/sources_raw/Fox_Sports.json
Created raw JSON file for FourFourTwo: tmp/sources_raw/FourFourTwo.json
Created raw JSON file for Il Sole 24 Ore: tmp/sources_raw/Il_Sole_24_Ore.json
Created raw JSON file for Le Monde: tmp/sources_raw/Le_Monde.json
Created raw JSON file for Politico: tmp/sources_raw/Politico.json
Created raw JSON file for TalkSport: tmp/sources_raw/TalkSport.json
Created raw JSON file for Bleacher Report: tmp/sources_raw/Bleacher_Report.json
Created raw JSON file for ANSA.it: tmp/sources_raw/ANSA_it.json
Created raw JSON file for L'equipe: tmp/sources_raw/L_equipe.json
Created raw JSON file for The Irish Times: tmp/sources_raw/The_Irish_Times.json
Created raw JSON file for The Hindu: tmp/sources_raw/The_Hindu.json
Created raw JSON file for The Times of India: tmp/sources_raw/The_Times_of_India.json
Created ra

In [15]:
# Upload each raw JSON file from the sources_raw directory to S3
for filename in os.listdir(raw_source_dir):
    local_path = f"{raw_source_dir}/{filename}"
    # Upload files to the "sources" subfolder under your team folder in S3
    s3_key = TEAM + "sources/" + filename
    s3.upload_file(local_path, BUCKET_NAME, s3_key)
    print(f"Uploaded {local_path} to s3://{BUCKET_NAME}/{s3_key}")

Uploaded tmp/sources_raw/CoinDesk.json to s3://cus635-spring2025/TEAM_1/sources/CoinDesk.json
Uploaded tmp/sources_raw/Il_Sole_24_Ore.json to s3://cus635-spring2025/TEAM_1/sources/Il_Sole_24_Ore.json
Uploaded tmp/sources_raw/Fox_News.json to s3://cus635-spring2025/TEAM_1/sources/Fox_News.json
Uploaded tmp/sources_raw/Bloomberg.json to s3://cus635-spring2025/TEAM_1/sources/Bloomberg.json
Uploaded tmp/sources_raw/Flowingdata_com.json to s3://cus635-spring2025/TEAM_1/sources/Flowingdata_com.json
Uploaded tmp/sources_raw/Genbeta_com.json to s3://cus635-spring2025/TEAM_1/sources/Genbeta_com.json
Uploaded tmp/sources_raw/Bleacher_Report.json to s3://cus635-spring2025/TEAM_1/sources/Bleacher_Report.json
Uploaded tmp/sources_raw/Digital_Trends.json to s3://cus635-spring2025/TEAM_1/sources/Digital_Trends.json
Uploaded tmp/sources_raw/CBS_News.json to s3://cus635-spring2025/TEAM_1/sources/CBS_News.json
Uploaded tmp/sources_raw/Slashdot_org.json to s3://cus635-spring2025/TEAM_1/sources/Slashdot_o

In [16]:
# List files in the S3 bucket under the TEAM_1/sources/ folder to verify uploads
response = s3.list_objects_v2(Bucket=BUCKET_NAME, Prefix=TEAM + "sources/")
if 'Contents' in response:
    print('Files in S3 Bucket under', TEAM + "sources/", ':')
    for obj in response['Contents']:
        print('-', obj['Key'])
else:
    print('No files found in the bucket under sources.')

Files in S3 Bucket under TEAM_1/sources/ :
- TEAM_1/sources/ABC_News.json
- TEAM_1/sources/ABC_News_AU_.json
- TEAM_1/sources/ANSA_it.json
- TEAM_1/sources/AppleInsider.json
- TEAM_1/sources/Associated_Press.json
- TEAM_1/sources/BBC_News.json
- TEAM_1/sources/BBC_Sport.json
- TEAM_1/sources/Bild.json
- TEAM_1/sources/Bleacher_Report.json
- TEAM_1/sources/Bloomberg.json
- TEAM_1/sources/Breitbart_News.json
- TEAM_1/sources/Business_Insider.json
- TEAM_1/sources/CBC_News.json
- TEAM_1/sources/CBS_News.json
- TEAM_1/sources/CNET.json
- TEAM_1/sources/CoinDesk.json
- TEAM_1/sources/Crypto_Coins_News.json
- TEAM_1/sources/Digital_Trends.json
- TEAM_1/sources/ESPN.json
- TEAM_1/sources/Educatedguesswork_org.json
- TEAM_1/sources/Flowingdata_com.json
- TEAM_1/sources/FourFourTwo.json
- TEAM_1/sources/Fox_News.json
- TEAM_1/sources/Fox_Sports.json
- TEAM_1/sources/Genbeta_com.json
- TEAM_1/sources/Github_com.json
- TEAM_1/sources/Gizmodo_com.json
- TEAM_1/sources/Gizmodo_jp.json
- TEAM_1/sour