In [10]:

import geopandas as gpd
import glob 
import pandas as pd
import os
import concurrent.futures
from osgeo import ogr
import pandas as pd
import os
import shutil 
import tempfile
import threading

import sys 
sys.path.append('/Users/gracecolverd/New_dataset')
from src.buildings import get_bounding_boxes , calculate_bounding_boxes, generate_new_filename


year = 2022
# input_gpk = '/Volumes/T9/Data_downloads/Versik_building_data/2024_03_full_building_data/6101/Data/new_verisk_2022.gpkg'
input_gpk = '/Volumes/T9/2024_Data_downloads/Versik_building_data/2024_03_22_updated_data/UKBuildings_Edition_15_new_format_upn.gpkg'


ds = ogr.Open(input_gpk)
layer = ds.GetLayer()
extent = layer.GetExtent()
bounding_boxes = calculate_bounding_boxes(extent, chunk_height=10000, chunk_width=10000)


# Ensure thread_local is defined at the global level
thread_local = threading.local()


def get_thread_temp_file(log_file):
    if not hasattr(thread_local, 'temp_file'):
        temp_dir = os.path.dirname(log_file)
        temp_file = tempfile.NamedTemporaryFile(delete=False, mode='w+', suffix='.csv', prefix='temp_log_', dir=temp_dir)
        thread_local.temp_file = temp_file.name
        thread_local.temp_file_first_write = True
    return thread_local.temp_file, thread_local.temp_file_first_write


def process_batch(bbox_list, output_directory, log_file="proc/processing_log.csv"):
    print('starting batch')
    results = []
    for bbox in bbox_list:
        filepath, bb, status, length = process_bbox(bbox, output_directory)  # Ensure this is the correct return format
        results.append((filepath, bb, status, length))
    df = pd.DataFrame(results, columns=['filepath', 'bbox', 'status', 'len'])
    print('results done, starting save')
    # Get the temp file path and whether it's the first write operation
    temp_file_path, is_first_write = get_thread_temp_file(log_file)
    print('temp file path is ' , temp_file_path )
    # Open the file with 'a' mode to append and ensure headers are written correctly
    with open(temp_file_path, 'a') as f:
        df.to_csv(f, header=is_first_write, index=False)
    
    # Update the flag to indicate the header should not be written next time
    if is_first_write:
        thread_local.temp_file_first_write = False

    print('temp file saved for batch')



def run_batching(bounding_boxes, output_directory, log_file="proc/processing_log.csv"):
    batch_size = 5
    with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
        # Store futures if you need to wait for them or check for exceptions
        futures = []
        for i in range(0, len(bounding_boxes), batch_size):
            batch = bounding_boxes[i:i+batch_size]
            future = executor.submit(process_batch, batch, output_directory, log_file)
            futures.append(future)
        # Optionally wait for all futures if needed
        concurrent.futures.wait(futures)


def process_bbox(bbox, output_directory, log_file="proc/processing_log.csv"):
    # Assuming generate_new_filename is defined elsewhere
    output_path = generate_new_filename(bbox, directory=output_directory)
    
    try:
        # Assuming input_gpk is defined elsewhere
        gdf = gpd.read_file(input_gpk, bbox=bbox)
        if gdf.empty:
            # print(f"Empty GeoDataFrame for bbox {bbox}")
            return output_path, bbox, 'completed', 0 
        
        gdf.to_file(output_path, driver="GPKG")
        return output_path, bbox, 'completed', len(gdf) 
    except Exception as e:
        print(f"An error occurred while processing bbox {bbox}: {e}")
        return output_path, bbox, 'error', None 




def merge_temp_logs_to_main(log_file="proc/processing_log.csv"):
    """
    Merge all temporary log files created by threads into the main log file.

    Parameters:
    - log_file: str, the path to the main log file.
    """
    # Assume temp files are in the same directory as the main log file
    temp_files = [f for f in os.listdir(os.path.dirname(log_file)) if f.startswith('temp_log_') and f.endswith('.csv')]
    print('num of temp files found ', len(temp_files) ) 
    # Create or append to the main log file
    if len(temp_files) != 0:
        for temp_file in temp_files:
            temp_file_path = os.path.join(os.path.dirname(log_file), temp_file)
            df_temp = pd.read_csv(temp_file_path )
            
            if os.path.exists(log_file):
                df_temp.to_csv(log_file, mode='a', header=False, index=False)
            else:
                df_temp.to_csv(log_file, mode='w', header=True, index=False)
    print('fn complete')
            
    

def cleanup_temp_files(temp_file_prefix="temp_log_", temp_dir="proc"):
    
    for filename in os.listdir(temp_dir):
        if filename.startswith(temp_file_prefix) and filename.endswith('.csv'):
            os.remove(os.path.join(temp_dir, filename))
            print(f"Deleted temporary file: {filename}") 
    


def generate_bbox_list(bounding_boxes, log= 'proc/processing_log.csv'): 
    print('Start generate_bbox_list')
    if os.path.exists(log):
        log = pd.read_csv(log)
        complete = log[log['status']=='completed']['bbox'].tolist() 
        bbox_to_process = [bbox for bbox in bounding_boxes if str(bbox) not in complete] 
    
    else:
        bbox_to_process = bounding_boxes
    print('num to process ',   len(bbox_to_process) ) 
    return bbox_to_process 

def cleanup_output_files(output_dir   ):
    log = pd.read_csv('proc/processing_log.csv') 
    if len(log) == 0:
        print('log not found')
        return
    folder = glob.glob(output_dir + '/*' ) 
    sucess_files = log[log['status']=='completed']['filepath'].tolist()   

    not_succ = [f for f in folder if f not in sucess_files]  
    print('files to remove: ', len(not_succ ))
    # remove files that are not in the log
    for f in not_succ:
        os.remove(f) 

In [2]:
# input_gpk = '/Volumes/T9/Data_downloads/Versik_building_data/2024_03_full_building_data/6101/Data/new_verisk_2022.gpkg'
# input_gpk = '/Volumes/T9/Data_downloads/Versik_building_data/2024_03_full_building_data/6101/Data/UKBuildings_Edition_15_new_format.gpkg'
# bbox = bounding_boxes[4100]
# gdf = gpd.read_file(input_gpk, bbox=bbox)

In [3]:
# gdf = gpd.GeoDataFrame.from_features([feature for feature in features], crs=25833)

In [11]:
bounding_boxes

[(9252.36, 6435.25920425788, 19252.36, 16435.25920425788),
 (9252.36, 16435.25920425788, 19252.36, 26435.25920425788),
 (9252.36, 26435.25920425788, 19252.36, 36435.259204257876),
 (9252.36, 36435.259204257876, 19252.36, 46435.259204257876),
 (9252.36, 46435.259204257876, 19252.36, 56435.259204257876),
 (9252.36, 56435.259204257876, 19252.36, 66435.25920425788),
 (9252.36, 66435.25920425788, 19252.36, 76435.25920425788),
 (9252.36, 76435.25920425788, 19252.36, 86435.25920425788),
 (9252.36, 86435.25920425788, 19252.36, 96435.25920425788),
 (9252.36, 96435.25920425788, 19252.36, 106435.25920425788),
 (9252.36, 106435.25920425788, 19252.36, 116435.25920425788),
 (9252.36, 116435.25920425788, 19252.36, 126435.25920425788),
 (9252.36, 126435.25920425788, 19252.36, 136435.25920425786),
 (9252.36, 136435.25920425786, 19252.36, 146435.25920425786),
 (9252.36, 146435.25920425786, 19252.36, 156435.25920425786),
 (9252.36, 156435.25920425786, 19252.36, 166435.25920425786),
 (9252.36, 166435.2592

In [5]:
# import fiona
# input_gpk = '/Volumes/T9/2024_Data_downloads/Versik_building_data/2024_03_full_building_data/6101/Data/UKBuildings_Edition_15_new_format.gpkg'
# # Open the shapefile
# with fiona.open(input_gpk) as shapefile:
#     # Iterate over the records
#     for record in shapefile:
#         # Print the record
#         print(record)

In [15]:
# test chunking works for one box

import fiona
# input_gpk = '/Volumes/T9/Data_downloads/Versik_building_data/2024_03_full_building_data/6101/Data/UKBuildings_Edition_15_new_format.gpkg'
# Open the shapefile
# with fiona.open(input_gpk) as shapefile:
#     # Iterate over the records
#     for record in shapefile:
#         # Print the record
#         print(record)
#         import fiona

from shapely.geometry import shape
input_gpk = '/Volumes/T9/2024_Data_downloads/Versik_building_data/2024_03_22_updated_data/UKBuildings_Edition_15_new_format_upn.gpkg'


# Define your bounding box (minx, miny, maxx, maxy)
gpkg_path = input_gpk
# input_gpk = '/Volumes/T9/Data_downloads/Versik_building_data/2024_03_full_building_data/6101/Data/UKBuildings_Edition_15_new_format.gpkg'
# Define your bounding box (minx, miny, maxx, maxy)
bbox = bounding_boxes[1]


# Initialize a list to hold the features and their IDs
features = []
ids = []

# Use fiona to open the file and load only features within the bounding box
with fiona.open(gpkg_path, 'r') as src:
    # Use the filter method with the bbox parameter to limit features
    for feature in src.filter(bbox=bbox):
        # Append the feature, including its geometry converted to a shapely geometry, to the list
        features.append({
            'properties': feature['properties'],
            'geometry': shape(feature['geometry']),
        })
        # Separately store the feature ID
        ids.append(feature['id'])

# Convert the list of features into a GeoDataFrame
gdf = gpd.GeoDataFrame.from_features(features)

# Add the ID column
gdf['upn'] = ids

# Set the CRS (Coordinate Reference System) of the GeoDataFrame to match the source layer
gdf.crs = src.crs



print(gdf)


ValueError: Assigning CRS to a GeoDataFrame without a geometry column is not supported. Use GeoDataFrame.set_geometry to set the active geometry column.

In [16]:
merge_temp_logs_to_main()

cleanup_temp_files()

bbox_to_process = generate_bbox_list(bounding_boxes )      

output_directory = f"/Volumes/T9/postcode_data/data/verisk_y2022_V4"

os.makedirs(output_directory, exist_ok=True)

cleanup_output_files(output_directory   )

FileNotFoundError: [Errno 2] No such file or directory: 'proc'

In [None]:


run_batching(bbox_to_process, output_directory )

starting batch
starting batch
starting batch
starting batch
starting batch
starting batch


  out[:] = [_pygeos_to_shapely(geom) for geom in data]
  out[:] = [_pygeos_to_shapely(geom) for geom in data]
  out[:] = [_pygeos_to_shapely(geom) for geom in data]


results done, starting save
temp file path is  /Users/gracecolverd/New_dataset/proc/temp_log_hr0e0q8f.csv
temp file saved for batch
starting batch
results done, starting save
temp file path is  /Users/gracecolverd/New_dataset/proc/temp_log_8_ss4ix2.csv
temp file saved for batch
starting batch
results done, starting save
temp file path is  /Users/gracecolverd/New_dataset/proc/temp_log_8_ss4ix2.csv
temp file saved for batch
starting batch
results done, starting save
temp file path is  /Users/gracecolverd/New_dataset/proc/temp_log_i6b0i7bi.csv
temp file saved for batch
starting batch
results done, starting save
temp file path is  /Users/gracecolverd/New_dataset/proc/temp_log_hr0e0q8f.csv
temp file saved for batch
starting batch
results done, starting save
temp file path is  /Users/gracecolverd/New_dataset/proc/temp_log_rfoly7cs.csv
temp file saved for batch
starting batch
results done, starting save
temp file path is  /Users/gracecolverd/New_dataset/proc/temp_log_8_ss4ix2.csv
temp file sa