In [1]:
import os
import requests
from kaggle_secrets import UserSecretsClient
from bs4 import BeautifulSoup  # Fix missing import


# My Kaggle dataset
KAGGLE_DATASET = "price-paid-data-202304"

# Kaggle input directory where dataset files are stored
KAGGLE_INPUT_DIR = f"/kaggle/input/{KAGGLE_DATASET}/"
# /kaggle/input/price-paid-data-202304
print(KAGGLE_INPUT_DIR)

# Work directory inside the notebook
DATA_DIR = "./data/"
os.makedirs(DATA_DIR, exist_ok=True)

# File path for last_update.txt inside the notebook
LAST_UPDATE_FILE = os.path.join(DATA_DIR, "last_update.txt")


# Official Statistical data set Price Paid Data
URL = "https://www.gov.uk/government/statistical-data-sets/price-paid-data-downloads"

def get_current_month():
    """
    Fetch the webpage and extract the current month information from the h2 tag.
    Expected h2 tag format:
    <h2 id="december-2024-data-current-month">December 2024 data (current month)</h2>
    """
    try:
        response = requests.get(URL)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        
        h2_tag = soup.find("h2", id=lambda x: x and "data-current-month" in x)
        if h2_tag:
            text = h2_tag.get_text(strip=True)
            # Remove the " (current month)" part
            current_month = text.split(" (")[0].strip()  # e.g. "December 2024 data"
            return current_month, soup
        else:
            print("Could not find the current month information on the page.")
            return None, soup
    except Exception as e:
        print(f"Error fetching the page: {e}")
        return None, None

def get_stored_update():
    """
    Retrieve last_update.txt from the Kaggle dataset.
    If it does not exist, create an empty one.
    """
    kaggle_last_update_path = "/kaggle/input/price-paid-data-202304/last_update.txt"

    if os.path.exists(kaggle_last_update_path):
        print("Found last_update.txt in dataset. Loading...")
        os.system(f"cp {kaggle_last_update_path} {LAST_UPDATE_FILE}")  # Copy the file
    else:
        print("last_update.txt not found in dataset. Creating a new one.")
        with open(LAST_UPDATE_FILE, "w") as file:
            file.write("")  # Create an empty file

    # Read and return the stored value
    with open(LAST_UPDATE_FILE, "r") as file:
        return file.read().strip()

def store_update(new_value):
    """Store the new update value in last_update.txt"""
    with open(LAST_UPDATE_FILE, "w") as file:
        file.write(new_value)

    print("Updated last_update.txt with new value.")


def load_existing_monthly_files():
    """
    Copy all CSV files from the Kaggle dataset folder in INPUT_DIR (except pp-complete.csv,
    dataset-metadata.json, and last_update.txt) into the local DATA_DIR.
    This will preserve previously accumulated monthly files.
    """
    if not os.path.exists(KAGGLE_INPUT_DIR):
        print(f"Kaggle input directory {KAGGLE_INPUT_DIR} not found!")
        return
    for filename in os.listdir(KAGGLE_INPUT_DIR):
        # Consider CSV files that are not pp-complete.csv, last_update.txt, or dataset-metadata.json
        if filename.endswith('.csv') and filename not in ["pp-complete.csv", "last_update.txt", "dataset-metadata.json"]:
            src = os.path.join(KAGGLE_INPUT_DIR, filename)
            dest = os.path.join(DATA_DIR, filename)
            if not os.path.exists(dest):
                print(f"Copying existing monthly file: {filename}")
                os.system(f"cp {src} {dest}")
            else:
                print(f"Monthly file {filename} already exists locally; skipping copy.")


def find_monthly_csv_download_link(soup):
    """
    Find the CSV download link for the monthly file by searching for 
    'pp-monthly-update-new-version.csv' in the href.
    """
    for a in soup.find_all("a", href=True):
        if "pp-monthly-update-new-version.csv" in a["href"]:
            return a["href"]
    return None

def find_csv_download_link(soup):
    """
    Find the CSV download link for the complete dataset by searching for 
    'pp-complete.csv' in the href.
    """
    for a in soup.find_all("a", href=True):
        if "pp-complete.csv" in a["href"]:
            return a["href"]
    return None

def get_monthly_file_name(soup):
    """
    Extract the monthly file name from the h2 tag.
    For example, if the h2 text is 'December 2024 data (current month)',
    return 'December 2024 data.csv'.
    """
    h2_tag = soup.find("h2", id=lambda x: x and "data-current-month" in x)
    if h2_tag:
        text = h2_tag.get_text(strip=True)
        name = text.split(" (")[0]  # e.g. "December 2024 data"
        return f"{name}.csv"
    return None

def download_file(url, output_filename):
    """
    Download the file from the given URL to DATA_DIR using the specified output filename.
    """
    file_path = os.path.join(DATA_DIR, output_filename)
    
    print(f"Downloading to {file_path}...")
    
    try:
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            total_size = int(r.headers.get('content-length', 0))
            downloaded = 0
            chunk_size = 8192  # 8KB chunks
            
            with open(file_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        print(f"\rDownloaded {downloaded / 1e6:.2f} MB", end='')
        print("\nDownload complete!")
    except Exception as e:
        print(f"Error downloading file: {e}")

def main():
    """
    Main function to check for an update based on the current month.
    If the current month (extracted from the h2 tag) has changed since the last run,
    download the monthly file (if not already downloaded) first, then the complete file.
    """

    load_existing_monthly_files()

    
    current_month, soup = get_current_month()
    if not current_month:
        print("No current month information found. Exiting.")
        return
    
    stored_month = get_stored_update()
    
    if stored_month == current_month:
        print("No update detected for the current month. Skipping download.")
    else:
        print(f"New update detected: {current_month} (Previous: {stored_month})")
        
        # --- Download the monthly file if not already present ---
        monthly_link = find_monthly_csv_download_link(soup)
        monthly_filename = get_monthly_file_name(soup)
        if monthly_link and monthly_filename:
            monthly_file_path = os.path.join(DATA_DIR, monthly_filename)
            if os.path.exists(monthly_file_path):
                print(f"Monthly file {monthly_filename} already exists. Skipping download for monthly file.")
            else:
                if monthly_link.startswith("/"):
                    monthly_link = "https://www.gov.uk" + monthly_link
                print(f"Monthly file link found: {monthly_link}")
                print(f"Downloading monthly file as: {monthly_filename}")
                download_file(monthly_link, monthly_filename)
        else:
            print("Could not find the monthly file download link or file name.")
        
        # --- Download the complete (huge) file (always replace) ---
        complete_link = find_csv_download_link(soup)
        if complete_link:
            if complete_link.startswith("/"):
                complete_link = "https://www.gov.uk" + complete_link
            # Use a constant filename for the complete file
            complete_filename = "pp-complete.csv"
            print(f"Complete file link found: {complete_link}")
            print(f"Downloading complete file as: {complete_filename}")
            download_file(complete_link, complete_filename)
        else:
            print("Could not find the complete file download link.")
        
        # Update the stored update key with the current month
        store_update(current_month)

if __name__ == "__main__":
    main()


/kaggle/input/price-paid-data-202304/
Copying existing monthly file: December 2024 data.csv
Copying existing monthly file: December 2023 data.csv
Found last_update.txt in dataset. Loading...
New update detected: December 2024 data (Previous: October 2024 data
November 2024 data)
Monthly file link found: http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-monthly-update-new-version.csv
Downloading monthly file as: December 2024 data.csv
Downloading to ./data/December 2024 data.csv...
Downloaded 13.59 MB
Download complete!
Complete file link found: http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv
Downloading complete file as: pp-complete.csv
Downloading to ./data/pp-complete.csv...
Downloaded 5219.07 MB
Download complete!
Updated last_update.txt with new value.


In [2]:
import os
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
kaggle_api_key =  user_secrets.get_secret("kaggle-api")

os.environ["KAGGLE_USERNAME"] = "lorentzyeung"
os.environ["KAGGLE_KEY"] = kaggle_api_key

In [3]:
import json

# Define metadata
metadata = {
    "title": "UK Property Price official data (Monthly Update)", 
    "id": "lorentzyeung/price-paid-data-202304",  # (URL slug)
    "licenses": [{"name": "CC0-1.0"}]  # License (default is CC0-1.0 for public datasets)
}

# Save metadata to the ./data/ folder
metadata_path = "./data/dataset-metadata.json"
with open(metadata_path, "w") as f:
    json.dump(metadata, f, indent=4)

print(f"dataset-metadata.json created at {metadata_path}")


dataset-metadata.json created at ./data/dataset-metadata.json


In [4]:
import os

# Check if at least one file exists in ./data/
if len(os.listdir(DATA_DIR)) > 0:
    print("Files found in ./data/. Proceeding with dataset update...")
    
    # Update dataset
    !kaggle datasets version -p ./data -m "Automatic update: last_update.txt retrieved and updated, new monthly file added, pp-complete.csv replaced" --dir-mode=tar
else:
    print("No files in ./data/. Skipping dataset update.")


Files found in ./data/. Proceeding with dataset update...
Starting upload for file last_update.txt
100%|█████████████████████████████████████████| 18.0/18.0 [00:00<00:00, 28.1B/s]
Upload successful: last_update.txt (18B)
Starting upload for file December 2024 data.csv
100%|██████████████████████████████████████| 13.0M/13.0M [00:00<00:00, 17.7MB/s]
Upload successful: December 2024 data.csv (13MB)
Starting upload for file pp-complete.csv
100%|██████████████████████████████████████| 4.86G/4.86G [01:14<00:00, 70.0MB/s]
Upload successful: pp-complete.csv (5GB)
Dataset version is being created. Please check progress at https://www.kaggle.com/lorentzyeung/price-paid-data-202304


In [5]:
!ls -lh /kaggle/input/price-paid-data-202304


total 4.9G
-rw-r--r-- 1 nobody nogroup  13M Feb 18 20:30 'December 2023 data.csv'
-rw-r--r-- 1 nobody nogroup  13M Feb 18 20:30 'December 2024 data.csv'
-rw-r--r-- 1 nobody nogroup   37 Feb 18 20:30  last_update.txt
-rw-r--r-- 1 nobody nogroup 4.9G Feb 18 20:32  pp-complete.csv
