# Web scraping with GitHub

## We start with a few useful imports

In [120]:
BASE_URL = "https://www.blog.gov.uk/"

In [121]:
import json
from collections import defaultdict
from datetime import datetime, timedelta
from pathlib import Path

import requests
from bs4 import BeautifulSoup

## Let's create a folder to store results

In [123]:
data_dir = Path('data')
data_dir.mkdir(exist_ok=True)

## Set a cutoff date

Useful to test, and in production to not crawl the websites again unnecessarily.

While in development we can set a sensible value, say ten days ago:

In [122]:
# Naive local timestamp ten days in the past. parse_entry() also produces
# naive datetimes, so fetch_entries_for_page() can compare the two directly;
# entries published before this moment are not collected.
CUTOFF_DATE = datetime.now() - timedelta(days=10)
CUTOFF_DATE

datetime.datetime(2024, 12, 2, 18, 41, 32, 61135)

## Examine the target URL:

Check the base URL: https://www.blog.gov.uk/all-posts/ and, if it paginates, check what happens with the second page https://www.blog.gov.uk/all-posts/page/2/, the third https://www.blog.gov.uk/all-posts/page/3/, and the last page https://www.blog.gov.uk/all-posts/page/174/. Also, what happens if we visit page number one using the full URL https://www.blog.gov.uk/all-posts/page/1/? And what if we visit one that is beyond the last page, such as https://www.blog.gov.uk/all-posts/page/10000/?

Examine the pages using the developer tools & using requests.

In [124]:
page_response = requests.get('https://www.blog.gov.uk/all-posts/')
page_soup = BeautifulSoup(page_response.text, "html.parser")

In [125]:
entry_list = page_soup.find('ul', {'class':'blogs-list'})
entry_list_elements = entry_list.find_all('li')
first_element = entry_list_elements[0]
print(first_element.find('h3').text)

Making it quicker and easier to search on GOV.UK


### Try different pages, using page 1

In [126]:
page_response = requests.get('https://www.blog.gov.uk/all-posts/page/1')
page_soup = BeautifulSoup(page_response.text, "html.parser")

In [127]:
entry_list = page_soup.find('ul', {'class':'blogs-list'})
entry_list_elements = entry_list.find_all('li')
first_element = entry_list_elements[0]
print(first_element.find('h3').text)

Making it quicker and easier to search on GOV.UK


### Try different pages, using page 2

In [128]:
page_response = requests.get('https://www.blog.gov.uk/all-posts/page/2')
page_soup = BeautifulSoup(page_response.text, "html.parser")

In [129]:
entry_list = page_soup.find('ul', {'class':'blogs-list'})
entry_list_elements = entry_list.find_all('li')
first_element = entry_list_elements[0]
print(first_element.find('h3').text)

Do schools close because of bad weather? Everything parents need to know


### How about when we reach the end of the pagination?

In [130]:
page_response = requests.get('https://www.blog.gov.uk/all-posts/page/10000')
page_soup = BeautifulSoup(page_response.text, "html.parser")

In [131]:
entry_list = page_soup.find('ul', {'class':'blogs-list'})
entry_list_elements = entry_list.find_all('li')
element = entry_list_elements[0]
element['class']

['noresults']

## Start writing a `fetch_entries_for_page` function based on the experiments

In [132]:
def parse_entry(entry):
    """Turn one ``<li>`` of the all-posts listing into a plain dict.

    The first anchor inside the entry is used as the post link and the second
    anchor as the publishing blog (its text is the blog name, its ``href`` the
    blog link). The ``datetime`` attribute of the ``<time>`` tag is cut to its
    first 19 characters (``YYYY-MM-DDTHH:MM:SS``) before parsing, so any
    trailing offset is dropped and the resulting ``datetime`` is naive.

    Returns:
        dict with the keys ``title``, ``link``, ``time``, ``blog``,
        ``blog_link`` and ``description``.
    """
    heading = entry.find('h3').text
    links = entry.find_all('a')
    post_url = links[0]['href']
    published_raw = entry.find('time')['datetime']
    publisher = links[1]
    publisher_name = publisher.text
    publisher_url = publisher['href']
    summary = entry.find('p').text

    return {
        'title': heading,
        'link': post_url,
        'time': datetime.strptime(published_raw[:19], '%Y-%m-%dT%H:%M:%S'),
        'blog': publisher_name,
        'blog_link': publisher_url,
        'description': summary,
    }

In [133]:
parse_entry(first_element)

{'title': 'Do schools close because of bad weather? Everything parents need to know',
 'link': 'https://educationhub.blog.gov.uk/2024/12/05/do-schools-close-because-of-bad-weather-everything-parents-need-to-know/',
 'time': datetime.datetime(2024, 12, 5, 18, 33, 29),
 'blog': 'The Education Hub',
 'blog_link': 'https://educationhub.blog.gov.uk',
 'description': "Schools will make every effort to stay open in adverse weather conditions, however, the safety of pupils and teachers is a top priority. Here's everything you need to know."}

In [134]:
def fetch_entries_for_page(page_number, cutoff_date, timeout=30):
    """Fetch one page of the all-posts index and return its entries up to a cutoff.

    Args:
        page_number: number of the listing page to request (inserted into the
            ``/all-posts/page/<n>`` URL).
        cutoff_date: naive ``datetime``; parsing stops at the first entry
            whose ``time`` is earlier than this value.
        timeout: seconds to wait for the server before ``requests`` raises,
            so a stalled connection cannot hang the crawl indefinitely.

    Returns:
        A list of dicts as produced by ``parse_entry``. The list is empty when
        the page carries the ``noresults`` marker or when its first entry is
        already older than ``cutoff_date``.

    Raises:
        RuntimeError: if the response contains no ``<ul class="blogs-list">``.
    """
    page_url = f"{BASE_URL.rstrip('/')}/all-posts/page/{page_number}"
    page_response = requests.get(page_url, timeout=timeout)
    # NOTE(review): raise_for_status() is deliberately not called — a page past
    # the end of the pagination might answer with a non-2xx status while still
    # rendering the 'noresults' marker this function relies on; confirm.
    page_soup = BeautifulSoup(page_response.text, "html.parser")

    entry_list = page_soup.find('ul', {'class': 'blogs-list'})
    if entry_list is None:
        raise RuntimeError(
            f"No 'blogs-list' element found at {page_url} "
            f"(HTTP status {page_response.status_code})"
        )

    # A single <li class="noresults"> marks a page beyond the last one.
    if entry_list.find('li', {'class': 'noresults'}):
        return []

    entries = []
    for entry in entry_list.find_all('li'):
        parsed_entry = parse_entry(entry)
        # The early exit assumes the listing is ordered newest first.
        if parsed_entry['time'] < cutoff_date:
            break
        entries.append(parsed_entry)
    return entries

In [135]:
entries = fetch_entries_for_page(2, CUTOFF_DATE)
print(len(entries))
entries[:3]

21


[{'title': 'Do schools close because of bad weather? Everything parents need to know',
  'link': 'https://educationhub.blog.gov.uk/2024/12/05/do-schools-close-because-of-bad-weather-everything-parents-need-to-know/',
  'time': datetime.datetime(2024, 12, 5, 18, 33, 29),
  'blog': 'The Education Hub',
  'blog_link': 'https://educationhub.blog.gov.uk',
  'description': "Schools will make every effort to stay open in adverse weather conditions, however, the safety of pupils and teachers is a top priority. Here's everything you need to know."},
 {'title': 'Heavy vehicle testing and approvals: what we’re working on',
  'link': 'https://movingon.blog.gov.uk/2024/12/05/heavy-vehicle-testing-and-approvals-what-were-working-on/',
  'time': datetime.datetime(2024, 12, 5, 15, 47, 5),
  'blog': 'Moving On',
  'blog_link': 'https://movingon.blog.gov.uk',
  'description': 'I wanted to update you on some of the work we’re doing behind the scenes at DVSA in the heavy vehicle testing and approvals worl

## Then we can just web scrape the whole index!

In [136]:
# Safety cap on the number of listing pages requested in one run; the loop
# normally stops much earlier, on the first page that yields no entries.
MAX_PAGES = 999

all_current_entries = []

for page in range(1, MAX_PAGES + 1):
    entries = fetch_entries_for_page(page, CUTOFF_DATE)
    # An empty page means we are past the cutoff date or past the last page.
    if not entries:
        break
    all_current_entries.extend(entries)
else:
    # Only reached when the cap was hit without ever seeing an empty page.
    print(f"Stopped after {MAX_PAGES} pages without reaching the cutoff; results may be incomplete.")

In [137]:
len(all_current_entries)

61

In [139]:
sorted_entries = sorted(all_current_entries, key=lambda x: x['time'], reverse=True)
sorted_entries[0], sorted_entries[-1]

({'title': 'Making it quicker and easier to search on GOV.UK',
  'link': 'https://insidegovuk.blog.gov.uk/2024/12/12/making-it-quicker-and-easier-to-search-on-gov-uk/',
  'time': datetime.datetime(2024, 12, 12, 15, 54, 26),
  'blog': 'Inside GOV.UK',
  'blog_link': 'https://insidegovuk.blog.gov.uk',
  'description': 'We’ve made some changes to the interface of site search on GOV.UK to create a simpler, more user-friendly experience.'},
 {'title': 'Working together for Nature Recovery in Greater Manchester\xa0\xa0',
  'link': 'https://naturalengland.blog.gov.uk/2024/12/03/working-together-for-nature-recovery-in-greater-manchester/',
  'time': datetime.datetime(2024, 12, 3, 8, 47, 48),
  'blog': 'Natural England',
  'blog_link': 'https://naturalengland.blog.gov.uk',
  'description': 'The launch of Biodiversity Net Gain (BNG) has been a major achievement this year – this world-leading initiative means that for the first time, development must leave nature measurably improved. I was lucky 

## Now we can focus on the article page!

Let's take a page, for example

In [140]:
article_url = sorted_entries[0]['link']
article_url

'https://insidegovuk.blog.gov.uk/2024/12/12/making-it-quicker-and-easier-to-search-on-gov-uk/'

In [141]:
article_response = requests.get(article_url)
article_soup = BeautifulSoup(article_response.text, "html.parser")

In [142]:
article_tag = article_soup.find('article')
article_title = article_tag.find('h1').text
print(article_title)

Making it quicker and easier to search on GOV.UK


In [143]:
def parse_article_metadata(article_soup):
    """Extract title, authors, publication time and categories from an article page.

    All lookups are scoped to the page's ``<article>`` tag: the ``<h1>`` text
    is the title, anchors with class ``author`` are the authors, the
    ``datetime`` attribute of the ``<time>`` tag is the publication time, and
    anchors with ``rel="category"`` are the categories.

    Args:
        article_soup: parsed article page (``BeautifulSoup`` object).

    Returns:
        dict with the keys ``title`` (str), ``authors`` (list of str),
        ``posted_on`` (naive ``datetime``; only the first 19 characters of
        the attribute are parsed, so any offset is dropped) and
        ``categories`` (list of str).
    """
    article_tag = article_soup.find('article')
    article_title = article_tag.find('h1').text
    article_authors = [
        author.text for author in article_tag.find_all('a', {'class': 'author'})
    ]
    article_posted_on = article_tag.find('time')['datetime']
    article_categories = [
        category.text for category in article_tag.find_all('a', {'rel': 'category'})
    ]

    return {
        'title': article_title,
        'authors': article_authors,
        'posted_on': datetime.strptime(article_posted_on[:19], '%Y-%m-%dT%H:%M:%S'),
        'categories': article_categories,
    }

In [144]:
parse_article_metadata(article_soup)

{'title': 'Making it quicker and easier to search on GOV.UK',
 'authors': ['Catriona Fraser, Product Manager, Search, GOV.UK',
  'Monica Crusellas, Senior Interaction Designer, GOV.UK'],
 'posted_on': datetime.datetime(2024, 12, 12, 15, 54, 26),
 'categories': ['Product changes', "What we're working on"]}

## On to focus on the content?

In [145]:
# Direct children of the content container that are kept as article content.
consumable_tags = { "p", "h2", "h3", "h4" }

def parse_article_content(article_soup):
    """Collect the paragraphs and headings of an article body, in document order.

    Only direct children of ``<div class="entry-content">`` whose tag name is
    in ``consumable_tags`` are kept; every other child (bare text nodes,
    other tags) is ignored. Blocks whose text is empty or whitespace-only
    (for example a paragraph holding just a non-breaking space) are skipped.

    Args:
        article_soup: parsed article page (``BeautifulSoup`` object).

    Returns:
        A list of ``{'content': <text>, 'tag': <tag name>}`` dicts. The list
        is empty when the page has no ``entry-content`` container, so one
        unusual page does not abort a whole crawl.
    """
    entry_container = article_soup.find('div', {'class': 'entry-content'})
    if entry_container is None:
        return []

    return [
        {'content': child.text, 'tag': child.name}
        for child in entry_container.children
        # strip() is used only for the emptiness check; stored text is unchanged.
        if child.name in consumable_tags and child.text.strip()
    ]

In [146]:
content = parse_article_content(article_soup)
content[:3]

[{'content': 'GOV.UK site search is one of the main ways people find information on GOV.UK. It’s used more than 4 million times a month. In our strategy for growth, we said we wanted to improve site search to help make it quicker and easier for users to access government information and services.',
  'tag': 'p'},
 {'content': 'In October we blogged about how we chose, integrated and launched a new search engine to power site search. Building on that foundation, we’ve now made several changes to GOV.UK’s search interface (where users enter, filter and read search results) to create a simpler, more user-friendly experience.',
  'tag': 'p'},
 {'content': 'These updates include a new autocomplete feature, a streamlined design for filters and sorting, and improved readability of search results. This blog post explains these changes and what they mean for GOV.UK users.',
  'tag': 'p'}]

In [147]:
def fetch_article(article_entry, timeout=30):
    """Download one article and merge its page data with its listing entry.

    Args:
        article_entry: dict with at least the keys ``link``, ``blog``,
            ``blog_link`` and ``description`` (the shape returned by
            ``parse_entry``).
        timeout: seconds to wait for the server before ``requests`` raises,
            so a single stalled download cannot hang the whole crawl.

    Returns:
        The dict from ``parse_article_metadata`` extended with ``content``
        (from ``parse_article_content``) and with ``link``, ``blog``,
        ``blog_link`` and ``description`` copied from ``article_entry``.
    """
    article_response = requests.get(article_entry['link'], timeout=timeout)
    article_soup = BeautifulSoup(article_response.text, "html.parser")

    article = parse_article_metadata(article_soup)
    article['content'] = parse_article_content(article_soup)
    # Carry over the fields that only the listing page provides.
    for key in ('link', 'blog', 'blog_link', 'description'):
        article[key] = article_entry[key]

    return article

In [148]:
fetched_article = fetch_article(sorted_entries[-2])
fetched_article

{'title': "Following in my family's forestry footsteps",
 'authors': ['Ian Everard'],
 'posted_on': datetime.datetime(2024, 12, 3, 9, 0),
 'categories': ['Careers'],
 'content': [{'content': 'As a second-generation forester, my family tree is made up of passionate foresters. My parents both worked in forestry, and my son David is following in our family’s footsteps working for Forestry Scotland. As a family we have a deep connection to the outdoors. We want to make a difference to people, nature and the environment, and leave a legacy for the next generation.',
   'tag': 'p'},
  {'content': 'Growing up in forestry', 'tag': 'h2'},
  {'content': 'As I cast my mind back to the things that shaped and inspired me as a young lad growing up in Hampshire, Northumberland and Penarth, I’m reminded of fond family holidays spent in Cornwall, roaming the picturesque countryside and forests that form our magnificent English landscape. I recall from an early age that I only ever wanted to be outdoors

## ⚠️ Let's web scrape all the articles!

In [149]:
len(all_current_entries)

61

In [150]:
full_articles = []

# One HTTP request per listing entry, issued sequentially. Appending inside
# the loop keeps the articles fetched so far in full_articles if a later
# request raises.
for entry in all_current_entries:
    full_articles.append(fetch_article(entry))

In [151]:
full_articles[0]

{'title': 'Making it quicker and easier to search on GOV.UK',
 'authors': ['Catriona Fraser, Product Manager, Search, GOV.UK',
  'Monica Crusellas, Senior Interaction Designer, GOV.UK'],
 'posted_on': datetime.datetime(2024, 12, 12, 15, 54, 26),
 'categories': ['Product changes', "What we're working on"],
 'content': [{'content': 'GOV.UK site search is one of the main ways people find information on GOV.UK. It’s used more than 4 million times a month. In our strategy for growth, we said we wanted to improve site search to help make it quicker and easier for users to access government information and services.',
   'tag': 'p'},
  {'content': 'In October we blogged about how we chose, integrated and launched a new search engine to power site search. Building on that foundation, we’ve now made several changes to GOV.UK’s search interface (where users enter, filter and read search results) to create a simpler, more user-friendly experience.',
   'tag': 'p'},
  {'content': 'These updates in

## How are we going to save this data?

Since we are going to store data in git, we want to avoid both big files and lots of files.

Let's write a function to bucket articles on a weekly basis.

In [168]:
from collections import defaultdict
from datetime import datetime, timedelta

def bucket_articles_weekly(articles):
    """Group articles by the Monday of the week in which they were posted.

    Args:
        articles: iterable of dicts, each holding a ``datetime`` under
            ``posted_on``.

    Returns:
        A dict mapping each week's Monday (a ``date``) to the list of
        articles posted in that week. Weeks are ordered newest first, and so
        are the articles inside each week.
    """
    newest_first = sorted(articles, key=lambda item: item['posted_on'], reverse=True)

    by_week = {}
    for item in newest_first:
        posted_day = item['posted_on'].date()
        # weekday() is 0 on Monday, so this steps back to the week's Monday.
        monday = posted_day - timedelta(days=posted_day.weekday())
        by_week.setdefault(monday, []).append(item)

    return {monday: by_week[monday] for monday in sorted(by_week, reverse=True)}


In [169]:
bucketed_articles = bucket_articles_weekly(full_articles)

for week_start, week_articles in bucketed_articles.items():
    print(f"Week starting {week_start}:")
    for article in week_articles:
        print(f"  - {article['title']} ({article['posted_on']})")
    print()

Week starting 2024-12-09:
  - Making it quicker and easier to search on GOV.UK (2024-12-12 15:54:26)
  - Portsmouth’s New Emergency Department Opens with support from Defence Medical Services personnel (2024-12-12 13:24:55)
  - Magic Maps: change to service platform   (2024-12-12 12:18:11)
  - Ambitious and future-thinking: advising on digital government (2024-12-12 10:34:14)
  - Farewell to Ipsum  (2024-12-12 10:32:52)
  - Strengthening actuarial practice through collaboration (2024-12-12 09:26:35)
  - Celebrating one year of Projects for Nature (2024-12-12 09:24:08)
  - Our consultation events are over but there’s still time to share your thoughts (2024-12-12 09:00:55)
  - Transforming Defra together: the journey from paper to online forms (2024-12-12 08:00:00)
  - Guidance now available for improved Countryside Stewardship Higher Tier offer  (2024-12-11 17:14:46)
  - Getting ready to apply for Countryside Stewardship Higher Tier (2024-12-11 16:40:36)
  - Puberty blockers: what you n

## Let's write weekly files!

In [166]:
class DateTimeCodec(json.JSONEncoder):
    """JSON encoder for ``datetime`` values, with a matching decoding hook.

    Encoding: ``datetime`` objects are written as ISO 8601 strings.
    Decoding: ``decode`` is meant to be passed as ``object_hook`` to
    ``json.load``; it turns string values stored under any of the
    ``datetime_fields`` keys back into ``datetime`` objects.
    """

    def __init__(self, datetime_fields=None, *args, **kwargs):
        # Everything except datetime_fields is forwarded to JSONEncoder.
        super().__init__(*args, **kwargs)
        self.datetime_fields = set(datetime_fields) if datetime_fields else set()

    def default(self, obj):
        """Serialise ``datetime`` as ISO text; defer to the base class otherwise."""
        if isinstance(obj, datetime):
            return obj.isoformat()
        return super().default(obj)

    def decode(self, obj):
        """Return a copy of dict ``obj`` with the configured string fields parsed."""
        restored = {}
        for key, value in obj.items():
            if key in self.datetime_fields and isinstance(value, str):
                value = datetime.fromisoformat(value)
            restored[key] = value
        return restored

def write_weekly_data(date, records):
    """Merge ``records`` into the JSON file for one week and rewrite it.

    The file lives at ``data_dir / '<date>.json'``. Records already stored in
    it are loaded first (with ``posted_on`` restored to ``datetime``); a new
    record is added only if no stored or previously added record has the same
    ``link``. The merged list is sorted by ``posted_on``, newest first, and
    written back with a four-space indent.

    Args:
        date: value used to name the file (formatted into ``<date>.json``).
        records: iterable of article dicts with at least ``link`` and
            ``posted_on``.
    """
    datetime_fields = ['posted_on']
    target_file = data_dir / f'{date}.json'

    merged = []
    if target_file.exists():
        restore_datetimes = DateTimeCodec(datetime_fields).decode
        with open(target_file) as source:
            merged = json.load(source, object_hook=restore_datetimes)
    seen_links = {stored['link'] for stored in merged}

    for candidate in records:
        link = candidate['link']
        if link not in seen_links:
            seen_links.add(link)
            merged.append(candidate)

    merged.sort(key=lambda stored: stored['posted_on'], reverse=True)
    with open(target_file, 'w') as sink:
        json.dump(merged, sink, indent=4, cls=DateTimeCodec, datetime_fields=datetime_fields)

In [167]:
for week_start, week_articles in bucketed_articles.items():
    write_weekly_data(week_start, week_articles)

## Remember that cutoff date?

We can now save it in a file to keep track of our progress so far. We will read this file to get the next cutoff date.

In [171]:
# Persist the newest publication time seen in this run so the next run can
# use it as its cutoff. max() avoids relying on full_articles being ordered
# newest first, and the guard leaves any existing file untouched when nothing
# was fetched (full_articles[0] would raise IndexError on an empty list).
if full_articles:
    newest_posted_on = max(article['posted_on'] for article in full_articles)
    with open(data_dir / 'cutoff_date.txt', 'w') as f:
        f.write(newest_posted_on.isoformat())

In [173]:
with open(data_dir / 'cutoff_date.txt') as f:
    # strip() tolerates a trailing newline or stray whitespace (e.g. added by
    # an editor), which datetime.fromisoformat() would otherwise reject.
    CUTOFF_DATE = datetime.fromisoformat(f.read().strip())

In [174]:
CUTOFF_DATE

datetime.datetime(2024, 12, 12, 15, 54, 26)