In [1]:
import os
import requests
from kaggle_secrets import UserSecretsClient
from bs4 import BeautifulSoup
import shutil
import json

# ---------------------------------------------------
# Step 1. Force-refresh the Kaggle dataset by downloading its latest version
# Full dataset slug in "<owner>/<dataset-name>" form, as passed to the Kaggle CLI below.
DATASET_SLUG = "lorentzyeung/price-paid-data-202304"

# Temporary directory to hold the refreshed dataset files.
# It is deleted and recreated on every run, so nothing from an earlier download survives.
REFRESHED_DATASET_DIR = "./temp_kaggle_data/"
if os.path.exists(REFRESHED_DATASET_DIR):
    shutil.rmtree(REFRESHED_DATASET_DIR)
os.makedirs(REFRESHED_DATASET_DIR, exist_ok=True)

print("Downloading the latest version of Kaggle dataset...")
# IPython shell escape: {NAME} is substituted with the value of the Python variable NAME.
# NOTE(review): the exit status of the CLI is not checked, so the "Download complete"
# message below is printed even if the download failed.
!kaggle datasets download -d {DATASET_SLUG} -p {REFRESHED_DATASET_DIR} --unzip
print("Download complete. Using data from:", REFRESHED_DATASET_DIR)

# Use the refreshed folder as our input directory (read by the helper functions below)
KAGGLE_INPUT_DIR = REFRESHED_DATASET_DIR
print("Kaggle input directory set to:", KAGGLE_INPUT_DIR)

# ---------------------------------------------------
# Step 2. Set up local working directories
# DATA_DIR is the staging folder: files are copied/downloaded into it and it is the
# folder later uploaded as the new dataset version.
DATA_DIR = "./data/"
os.makedirs(DATA_DIR, exist_ok=True)
# Text file holding the month label recorded by the previous successful run.
LAST_UPDATE_FILE = os.path.join(DATA_DIR, "last_update.txt")

# Official Gov.uk Price Paid Data page (scraped for the current month and download links)
URL = "https://www.gov.uk/government/statistical-data-sets/price-paid-data-downloads"

def get_current_month(timeout=30):
    """
    Fetch the Price Paid Data page and extract the current month label.

    The label is read from the h2 tag whose id contains "data-current-month".
    Expected h2 tag format:
    <h2 id="december-2024-data-current-month">December 2024 data (current month)</h2>

    Args:
        timeout: Seconds to wait for the server to connect/send data before
            giving up. Without a timeout a stalled connection would block
            the notebook forever.

    Returns:
        A ``(current_month, soup)`` tuple:
        * ``("December 2024 data", soup)`` when the heading is found;
        * ``(None, soup)`` when the page loaded but has no such heading;
        * ``(None, None)`` when the page could not be fetched or parsed.
    """
    try:
        response = requests.get(URL, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        h2_tag = soup.find("h2", id=lambda x: x and "data-current-month" in x)
        if h2_tag:
            text = h2_tag.get_text(strip=True)
            # Remove the " (current month)" part
            current_month = text.split(" (")[0].strip()  # e.g. "December 2024 data"
            return current_month, soup
        else:
            print("Could not find the current month information on the page.")
            return None, soup
    except Exception as e:
        # Deliberately broad: any failure here means "no information", which the
        # caller handles by exiting without changing anything.
        print(f"Error fetching the page: {e}")
        return None, None

def get_stored_update():
    """
    Return the month label recorded by the previous run.

    last_update.txt is taken from the refreshed Kaggle dataset folder and copied
    to LAST_UPDATE_FILE; when the dataset has no such file, an empty
    LAST_UPDATE_FILE is created instead. The file's content is returned with
    surrounding whitespace stripped (an empty string for a fresh file).
    """
    dataset_copy = os.path.join(KAGGLE_INPUT_DIR, "last_update.txt")

    if not os.path.exists(dataset_copy):
        print("last_update.txt not found in dataset. Creating a new one.")
        # Writing an empty string creates (or truncates) the local file.
        with open(LAST_UPDATE_FILE, "w") as handle:
            handle.write("")
    else:
        print("Found last_update.txt in dataset. Loading...")
        shutil.copy(dataset_copy, LAST_UPDATE_FILE)

    # Always read back from the local copy so both branches share one code path.
    with open(LAST_UPDATE_FILE, "r") as handle:
        stored_value = handle.read()
    return stored_value.strip()

def store_update(new_value):
    """Overwrite last_update.txt with ``new_value`` and print a confirmation."""
    with open(LAST_UPDATE_FILE, "w") as marker_file:
        marker_file.write(new_value)

    print("Updated last_update.txt with new value.")

def load_existing_monthly_files():
    """
    Copy the monthly CSV files from KAGGLE_INPUT_DIR into DATA_DIR.

    Only names ending in '.csv' are considered, and pp-complete.csv is left out.
    A file that already exists in DATA_DIR is not overwritten. If
    KAGGLE_INPUT_DIR does not exist, a message is printed and nothing is copied.
    """
    if not os.path.exists(KAGGLE_INPUT_DIR):
        print(f"Kaggle input directory {KAGGLE_INPUT_DIR} not found!")
        return
    print("Files in Kaggle input directory:", os.listdir(KAGGLE_INPUT_DIR))

    skipped_names = ("pp-complete.csv", "last_update.txt", "dataset-metadata.json")
    for entry in os.listdir(KAGGLE_INPUT_DIR):
        # Guard clause: ignore non-CSV files and the explicitly excluded names.
        if not entry.endswith('.csv') or entry in skipped_names:
            continue

        origin = os.path.join(KAGGLE_INPUT_DIR, entry)
        target = os.path.join(DATA_DIR, entry)
        print(f"Checking file: {entry}")
        print(f"  Source: {origin}")
        print(f"  Destination: {target}")

        if os.path.exists(target):
            print(f"Monthly file {entry} already exists locally; skipping copy.")
            continue

        print(f"Copying existing monthly file: {entry}")
        shutil.copy(origin, target)

def find_monthly_csv_download_link(soup):
    """
    Return the href of the first <a> tag whose href contains
    'pp-monthly-update-new-version.csv', or None when no anchor matches.
    """
    wanted = "pp-monthly-update-new-version.csv"
    hrefs = (anchor["href"] for anchor in soup.find_all("a", href=True))
    # next() with a default yields the first match in document order, else None.
    return next((href for href in hrefs if wanted in href), None)

def find_csv_download_link(soup):
    """
    Return the href of the first <a> tag whose href contains 'pp-complete.csv'
    (the complete dataset), or None when no anchor matches.
    """
    wanted = "pp-complete.csv"
    hrefs = (anchor["href"] for anchor in soup.find_all("a", href=True))
    # next() with a default yields the first match in document order, else None.
    return next((href for href in hrefs if wanted in href), None)

def get_monthly_file_name(soup):
    """
    Build the monthly CSV file name from the "current month" h2 heading.

    The heading is the h2 whose id contains "data-current-month". Everything
    from the first " (" onwards is dropped and ".csv" is appended, so
    'December 2024 data (current month)' becomes 'December 2024 data.csv'.
    Returns None when the heading is not present.
    """
    def _is_current_month_id(value):
        # `value` is None for h2 tags without an id, hence the truthiness guard.
        return value and "data-current-month" in value

    heading = soup.find("h2", id=_is_current_month_id)
    if not heading:
        return None

    label = heading.get_text(strip=True).split(" (")[0]  # e.g. "December 2024 data"
    return f"{label}.csv"

def download_file(url, output_filename, timeout=60):
    """
    Download ``url`` into DATA_DIR under ``output_filename``.

    The body is streamed to a temporary ``<name>.part`` file and only moved to
    its final name once the whole response has been written, so a failed or
    interrupted download never leaves a truncated file at the final path.

    Args:
        url: Absolute URL of the file to fetch.
        output_filename: File name (relative to DATA_DIR) to save as.
        timeout: Seconds to wait for the server to connect / send more data
            before aborting. This is not a limit on the total download time.

    Returns:
        True if the file was downloaded and moved into place, False otherwise.
        Errors are printed rather than raised.
    """
    file_path = os.path.join(DATA_DIR, output_filename)
    partial_path = file_path + ".part"
    print(f"Downloading to {file_path}...")
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            downloaded = 0
            # 1 MiB chunks: progress is printed once per chunk, and tiny chunks
            # would flood the notebook output on multi-gigabyte files.
            chunk_size = 1024 * 1024
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        print(f"\rDownloaded {downloaded / 1e6:.2f} MB", end='')
        # Atomic move: replaces any previous file of the same name in one step.
        os.replace(partial_path, file_path)
        print("\nDownload complete!")
        return True
    except Exception as e:
        print(f"Error downloading file: {e}")
        # Best-effort cleanup so the partial file is not mistaken for data.
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass
        return False

def main():
    """
    Check the Gov.uk page for a new "current month" and fetch the new files.

    Steps:
    1. Copy previously accumulated monthly CSVs into DATA_DIR.
    2. Read the current month label from the page; exit if unavailable.
    3. If the label differs from the stored one, download the monthly file
       (unless already present) and then the complete file (always).
    4. Record the new month only when both files are present afterwards, so a
       missing link or failed download is retried on the next run instead of
       being silently marked as done.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    load_existing_monthly_files()
    print("Contents of DATA_DIR before update:", os.listdir(DATA_DIR))

    current_month, soup = get_current_month()
    if not current_month:
        print("No current month information found. Exiting.")
        return

    stored_month = get_stored_update()

    if stored_month == current_month:
        print("No update detected for the current month. Skipping download.")
        return

    print(f"New update detected: {current_month} (Previous: {stored_month})")

    # --- Download the monthly file if not already present ---
    monthly_ok = False
    monthly_link = find_monthly_csv_download_link(soup)
    monthly_filename = get_monthly_file_name(soup)
    if monthly_link and monthly_filename:
        monthly_file_path = os.path.join(DATA_DIR, monthly_filename)
        if os.path.exists(monthly_file_path):
            print(f"Monthly file {monthly_filename} already exists. Skipping download for monthly file.")
        else:
            # urljoin resolves "/path", "//host/path" and relative hrefs against
            # the page URL and leaves absolute URLs untouched.
            monthly_link = urljoin(URL, monthly_link)
            print(f"Monthly file link found: {monthly_link}")
            print(f"Downloading monthly file as: {monthly_filename}")
            download_file(monthly_link, monthly_filename)
        monthly_ok = os.path.isfile(monthly_file_path)
    else:
        print("Could not find the monthly file download link or file name.")

    # --- Download the complete (huge) file (always replace) ---
    complete_ok = False
    complete_link = find_csv_download_link(soup)
    if complete_link:
        complete_link = urljoin(URL, complete_link)
        complete_filename = "pp-complete.csv"
        print(f"Complete file link found: {complete_link}")
        print(f"Downloading complete file as: {complete_filename}")
        download_file(complete_link, complete_filename)
        complete_ok = os.path.isfile(os.path.join(DATA_DIR, complete_filename))
    else:
        print("Could not find the complete file download link.")

    # Update the stored update key only when both files are in place;
    # otherwise leave it unchanged so the next run tries again.
    if monthly_ok and complete_ok:
        store_update(current_month)
    else:
        print("Update incomplete; last_update.txt left unchanged so the next run retries.")

if __name__ == "__main__":
    main()

Downloading the latest version of Kaggle dataset...
Dataset URL: https://www.kaggle.com/datasets/lorentzyeung/price-paid-data-202304
License(s): other
Downloading price-paid-data-202304.zip to ./temp_kaggle_data
 98%|██████████████████████████████████████▎| 1.39G/1.41G [00:05<00:00, 256MB/s]
100%|███████████████████████████████████████| 1.41G/1.41G [00:05<00:00, 254MB/s]
Download complete. Using data from: ./temp_kaggle_data/
Kaggle input directory set to: ./temp_kaggle_data/
Files in Kaggle input directory: ['December 2024 data.csv', 'pp-complete.csv', 'March 2024 data.csv', 'last_update.txt']
Checking file: December 2024 data.csv
  Source: ./temp_kaggle_data/December 2024 data.csv
  Destination: ./data/December 2024 data.csv
Copying existing monthly file: December 2024 data.csv
Checking file: March 2024 data.csv
  Source: ./temp_kaggle_data/March 2024 data.csv
  Destination: ./data/March 2024 data.csv
Copying existing monthly file: March 2024 data.csv
Contents of DATA_DIR before

In [2]:
# ---------------------------------------------------
# Step 3. Update the Kaggle dataset with the new files
user_secrets = UserSecretsClient()
kaggle_api_key = user_secrets.get_secret("kaggle-api")
os.environ["KAGGLE_USERNAME"] = "lorentzyeung"
os.environ["KAGGLE_KEY"] = kaggle_api_key

# Define metadata for the dataset update
metadata = {
    "title": "UK Property Price official data (Monthly Update)",
    "id": "lorentzyeung/price-paid-data-202304",
    "licenses": [{"name": "CC0-1.0"}]
}

metadata_path = os.path.join(DATA_DIR, "dataset-metadata.json")
with open(metadata_path, "w") as f:
    json.dump(metadata, f, indent=4)
print(f"dataset-metadata.json created at {metadata_path}")

# If there are files in DATA_DIR, create a new dataset version
if len(os.listdir(DATA_DIR)) > 0:
    print("Files found in ./data/. Proceeding with dataset update...")
    !kaggle datasets version -p ./data -m "Automatic update: last_update.txt retrieved and updated, new monthly file added, pp-complete.csv replaced" --dir-mode=tar
else:
    print("No files in ./data/. Skipping dataset update.")

dataset-metadata.json created at ./data/dataset-metadata.json
Files found in ./data/. Proceeding with dataset update...
Starting upload for file December 2024 data.csv
100%|██████████████████████████████████████| 13.0M/13.0M [00:00<00:00, 38.6MB/s]
Upload successful: December 2024 data.csv (13MB)
Starting upload for file March 2024 data.csv
100%|██████████████████████████████████████| 16.1M/16.1M [00:00<00:00, 53.9MB/s]
Upload successful: March 2024 data.csv (16MB)
Starting upload for file last_update.txt
100%|█████████████████████████████████████████| 18.0/18.0 [00:00<00:00, 95.2B/s]
Upload successful: last_update.txt (18B)
Dataset version is being created. Please check progress at https://www.kaggle.com/lorentzyeung/price-paid-data-202304
