In [3]:
import os
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Index page whose links are scanned for .html files
base_url = "https://www.ncedc.org/mt/"

# Local directory that receives the downloads (created on first run)
output_folder = "ncedc_html_files"
os.makedirs(output_folder, exist_ok=True)

# Inclusive Last-Modified window; both bounds are naive datetimes at 00:00:00
start_date = datetime(2023, 6, 20)
end_date = datetime(2024, 6, 21)

def get_last_modified(url, timeout=30):
    """Return the ``Last-Modified`` time reported for *url*.

    Args:
        url: Address to query with an HTTP HEAD request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A naive ``datetime`` parsed from the ``Last-Modified`` header, or
        ``None`` when the request fails, the header is absent, or the header
        does not match the expected HTTP date format.
    """
    try:
        # requests.head() does not follow redirects by default; follow them so
        # the header describes the same resource a later GET would fetch.
        response = requests.head(url, allow_redirects=True, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to query: {url} ({exc})")
        return None

    last_modified = response.headers.get("Last-Modified")
    if last_modified is None:
        return None

    try:
        # %Z consumes the zone name (e.g. "GMT") but the result stays naive,
        # which keeps it comparable with naive datetimes.
        return datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z")
    except ValueError:
        # A malformed header should skip this one file, not abort the run.
        return None

def download_file(file_url, filename, timeout=30):
    """Download *file_url* and save it as *filename* inside ``output_folder``.

    Prints a "Downloaded" line on success and a "Failed to download" line
    when the server answers with a non-200 status or the request itself fails.

    Args:
        file_url: Address of the file to fetch.
        filename: Name of the file to create inside ``output_folder``.
        timeout: Seconds to wait for the server before giving up.
    """
    file_path = os.path.join(output_folder, filename)
    try:
        # The context manager releases the connection, which a streamed
        # response would otherwise hold open until garbage collection.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download: {filename} (Status Code: {response.status_code})")
                return
            with open(file_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
    except requests.RequestException as exc:
        # NOTE: a failure in the middle of the stream may leave a partial file.
        print(f"Failed to download: {filename} ({exc})")
        return
    print(f"Downloaded: {filename}")

# Scrape the index page and download every linked .html file whose
# Last-Modified time falls inside [start_date, end_date].
response = requests.get(base_url, timeout=30)
if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")

    for link in soup.find_all("a", href=True):
        file_url = link["href"]
        if not file_url.endswith(".html"):
            continue

        # urljoin resolves relative, root-relative and absolute hrefs alike;
        # plain string concatenation is only right for the relative case.
        full_url = urljoin(base_url, file_url)

        last_modified = get_last_modified(full_url)
        if last_modified is not None and start_date <= last_modified <= end_date:
            # Save under the last path segment of the link
            download_file(full_url, file_url.split("/")[-1])
else:
    print(f"Failed to access {base_url} (Status Code: {response.status_code})")


Downloaded: nc71133194_MT.html
Downloaded: nc73908361_MT.html
Downloaded: nc73908746_MT.html
Downloaded: nc73909066_MT.html
Downloaded: nc73918291_MT.html
Downloaded: nc73922596_MT.html
Downloaded: nc73923661_MT.html
Downloaded: nc73925281_MT.html
Downloaded: nc73928421_MT.html
Downloaded: nc73934641_MT.html
Downloaded: nc73938626_MT.html
Downloaded: nc73938706_MT.html
Downloaded: nc73938736_MT.html
Downloaded: nc73943821_MT.html
Downloaded: nc73943841_MT.html
Downloaded: nc73943846_MT.html
Downloaded: nc73944166_MT.html
Downloaded: nc73944271_MT.html
Downloaded: nc73947486_MT.html
Downloaded: nc73947830_MT.html
Downloaded: nc73947835_MT.html
Downloaded: nc73948665_MT.html
Downloaded: nc73952475_MT.html
Downloaded: nc73952605_MT.html
Downloaded: nc73961261_MT.html
Downloaded: nc73975501_MT.html
Downloaded: nc73992726_MT.html
Downloaded: nc73994586_MT.html
Downloaded: nc73997816_MT.html
Downloaded: nc73998256_MT.html
Downloaded: nc73999211_MT.html
Downloaded: nc74000441_MT.html
Download