# Data Analysis and Reporting Notebook

## Setup of Analysis

**Imports**

Libraries needed to run the analysis and create the report.

In [None]:
import geopandas as gpd
import shapely
from shapely.errors import TopologicalError
import pandas as pd
import os
from docx import Document
from docx.shared import Pt

**Dataset Folder for Analysis**

In [None]:
# Name of the dataset to analyse (empty by default); it is used to build the
# folder path below.
dataset = ""

# Folder of the selected dataset, relative to the working directory
dataset_folder = f"datasets/{dataset}"

**Filepath Setup**

In [None]:
def find_geospatial_files(folder_path):
    """
    Returns a sorted list of all geospatial files (.shp, .gpkg, or .csv) found in the given folder path.
    If no geospatial files are found, returns an empty list.

    Args:
        folder_path (str): Path to the folder to search (sub-folders are not searched).

    Returns:
        list: Matching file names (not full paths) in alphabetical order.
    """
    supported_extensions = ('.shp', '.gpkg', '.csv')
    # os.listdir returns entries in arbitrary order; sorting means a caller that
    # picks an entry by position gets the same file on every run and platform.
    return sorted(
        file_name for file_name in os.listdir(folder_path)
        if file_name.endswith(supported_extensions)
    )

# Location of the input files for the selected dataset
folder_path = f"{dataset_folder}/files"
print(f"Looking for geospatial files in folder: {folder_path}")

# Collect the supported geospatial files in that folder
geospatial_files = find_geospatial_files(folder_path)

# Report the outcome: nothing found, or one line per file
if not geospatial_files:
    print("No geospatial files found in the folder.")
else:
    print("Geospatial files found:")
    for file in geospatial_files:
        print(f"- {file}")


In [None]:
# Select the geospatial file to analyse (the first one found in the folder)
if not geospatial_files:
    # Fail with a clear message instead of an opaque IndexError on the lookup below
    raise FileNotFoundError(f"No geospatial files found in folder: {folder_path}")
geospatial_file = geospatial_files[0]
print(geospatial_file)

# Define paths for output files
# Sample of the data as GeoJSON; the name embeds the source file name
geojson_output_file = f"{dataset_folder}/{dataset}_{geospatial_file}_sample.geojson"
print(f"GeoJSON output file path: {geojson_output_file}")

features_plot_file = f"{dataset_folder}/{dataset}_plot.png"
print(f"Features plot file path: {features_plot_file}")

# No extension here: the report step appends the file name and ".docx"
report_output_file = f"{dataset_folder}/{dataset}_analysis_report"
print(f"Analysis report file path: {report_output_file}")

Read in the selected geospatial file and inspect the first 5 rows (and column headers).

In [None]:
def load_geospatial_file(folder_path, file_name):
    """
    Loads a geospatial file (either .shp, .gpkg, or .csv) into a GeoDataFrame.
    Attempts to handle invalid geometries using pyogrio's on_invalid option.
    For CSV files, point geometries are built from the 'latitude' and 'longitude'
    columns when both are present; otherwise the table is returned without geometries.

    Args:
        folder_path (str): Path to the folder containing the file.
        file_name (str): Name of the file to load.

    Returns:
        GeoDataFrame: A GeoDataFrame containing the data. For a CSV without
        'latitude' and 'longitude' columns, a plain pandas DataFrame is returned.

    Raises:
        ValueError: If the file extension is not .shp, .gpkg, or .csv.
        Exception: Any error raised while reading is printed and re-raised.
    """
    file_path = os.path.join(folder_path, file_name)

    try:
        if file_name.endswith(('.shp', '.gpkg')):
            # Load shapefile or GeoPackage using pyogrio
            gdf = gpd.read_file(file_path, engine="pyogrio", on_invalid="ignore")

        elif file_name.endswith('.csv'):
            # Load CSV file and create geometries if lat/lon columns are present
            table = pd.read_csv(file_path)
            if {'latitude', 'longitude'}.issubset(table.columns):
                # Points are (x, y) = (longitude, latitude).
                # NOTE(review): EPSG:4326 (WGS 84) is assumed for the coordinates — confirm per dataset.
                gdf = gpd.GeoDataFrame(
                    table,
                    geometry=gpd.points_from_xy(table['longitude'], table['latitude']),
                    crs="EPSG:4326",
                )
            else:
                print("CSV has no 'latitude' and 'longitude' columns; loaded without geometries.")
                gdf = table

        else:
            raise ValueError("Unsupported file format. Only .shp, .gpkg, and .csv files are supported.")

    except TopologicalError as e:
        print(f"TopologicalError encountered: {e}")
        raise
    except Exception as e:
        print(f"An error occurred: {e}")
        raise

    return gdf

# Name of the selected file; it is joined onto folder_path by the loader
file_path = f"{geospatial_file}"

# Load the selected geospatial file (shapefile, GeoPackage or CSV)
gdf = load_geospatial_file(folder_path, file_path)
print(f"Geospatial data loaded with {len(gdf)} records.")
# Display the first 5 rows as the cell output
gdf.head()


## Analyse folder and shapefile

In [None]:
# Analyse the contents of the folder
def analyze_folder(folder_path):
    """Describes every entry of a folder by its size on disk.

    Args:
        folder_path (str): Folder whose entries are listed.

    Returns:
        dict: Maps each entry name to "Size: <n> bytes", followed by a
        "Total Folder Size" key holding the summed size as "<n> bytes".
    """
    # Look up the size of each entry once, keeping the listing order
    sizes = {
        entry: os.path.getsize(os.path.join(folder_path, entry))
        for entry in os.listdir(folder_path)
    }

    report = {entry: f"Size: {size} bytes" for entry, size in sizes.items()}
    report["Total Folder Size"] = f"{sum(sizes.values())} bytes"
    return report

# Describe the contents of the dataset's files folder; the bare name on the
# last line displays the result as the cell output.
folder_description = analyze_folder(folder_path)
folder_description

In [None]:
def analyze_geospatial_file(gdf, geojson_output_file, file_path):
    """
    Analyzes a geospatial file and returns important information for the report.
    Timestamp columns are converted to strings to handle JSON serialization; this
    is done on a copy of the sample, so the GeoDataFrame passed in is not modified.

    Args:
        gdf (GeoDataFrame): The GeoDataFrame to analyze.
        geojson_output_file (str): The path to save the sample GeoJSON file.
        file_path (str): The path to the original geospatial file.

    Returns:
        dict: Dictionary containing analysis information, with the keys
        "geometry_type", "record_count", "features", "crs", "geojson_sample"
        and "file_size".
        If saving or converting the sample fails, the tuple
        (None, {"error": <message>}) is returned instead.
    """

    # Work on a copy of the first 50 records so the caller's data stays untouched
    sample = gdf.head(50).copy()

    # Convert any Timestamp columns (naive or timezone-aware) to strings for JSON serialization
    for column in sample.select_dtypes(include=['datetime', 'datetimetz']):
        sample[column] = sample[column].astype(str)

    # Save the sample as GeoJSON (first 50 records)
    try:
        sample.to_file(geojson_output_file, driver="GeoJSON")
        print(f"Sample GeoJSON saved as {geojson_output_file}")
    except Exception as e:
        return None, {"error": f"Error saving GeoJSON: {e}"}

    # Analyze the geospatial data.
    # Records without a geometry have no geometry type; drop those entries so
    # the join below only receives strings.
    geometry_type = gdf.geom_type.dropna().unique()
    record_count = len(gdf)
    features = gdf.columns.tolist()
    crs = str(gdf.crs) if gdf.crs is not None else "Unknown"

    # Convert the same sample to a GeoJSON string for preview
    try:
        geojson_sample = sample.to_json()
    except Exception as e:
        return None, {"error": f"Error converting to GeoJSON: {e}"}

    # Get the file size
    file_size = os.path.getsize(file_path)

    # Compile the analysis into a dictionary
    geospatial_data = {
        "geometry_type": ', '.join(geometry_type),
        "record_count": record_count,
        "features": ', '.join(features),
        "crs": crs,
        "geojson_sample": geojson_sample,
        "file_size": file_size
    }

    return geospatial_data


# Full path to the selected source file; the analysis uses it to read the file size
path = f"{folder_path}/{file_path}"

# Run the analysis; the bare name on the last line displays the result as the cell output
geospatial_data = analyze_geospatial_file(gdf, geojson_output_file, path)
geospatial_data

## Create Report

In [None]:
def generate_report(folder_data, geospatial_data, output_file):
    """
    Generates a DOCX report summarising the dataset

    Besides its arguments, this function reads the module-level names
    `dataset` (report title), `geospatial_file` (section heading and output
    file name) and `geojson_output_file` (sample embedded in the report).

    Args:
        folder_data (dict): Information about folder contents.
        geospatial_data (dict): Information about the shapefile. Must provide the keys
            'geometry_type', 'record_count', 'features', 'crs' and 'file_size'.
        output_file (str): The file path to save the report, without extension;
            "_(<geospatial_file>).docx" is appended to it.
    """
    doc = Document()

    # Add title
    doc.add_heading(f"{dataset} Analysis Report", 0)

    # Add section for folder analysis
    doc.add_heading('Folder Contents:', level=1)
    doc.add_paragraph('The following files were found in the folder:')
    for file, details in folder_data.items():
        doc.add_paragraph(f"{file}: {details}", style='List Bullet')

    # Add file analysis section; the heading runs are resized and set bold
    custom_heading = doc.add_heading(f"{geospatial_file} File Analysis:", level=0)
    for run in custom_heading.runs:
        run.font.size = Pt(16)
        run.bold = True
    doc.add_paragraph(f"Geometry Type: {geospatial_data['geometry_type']}", style='List Bullet')
    doc.add_paragraph(f"Number of Records: {geospatial_data['record_count']}", style='List Bullet')
    doc.add_paragraph(f"Features/Attributes: {geospatial_data['features']}", style='List Bullet')
    doc.add_paragraph(f"Coordinate Reference System (CRS): {geospatial_data['crs']}", style='List Bullet')
    doc.add_paragraph(f"File Size: {geospatial_data['file_size']} bytes", style='List Bullet')

    # NOTE(review): only the heading and caption are written here; no image is
    # inserted into the document by this function.
    doc.add_heading("Plot of Dataset")
    doc.add_paragraph("Plot of first row of data:")

    doc.add_heading("GeoJSON Sample Data", level=1)
    try:
        # GeoJSON is UTF-8; state the encoding explicitly so the platform default
        # (e.g. cp1252 on Windows) cannot garble or reject non-ASCII attribute values.
        with open(geojson_output_file, 'r', encoding='utf-8') as geojson_file:
            geojson_content = geojson_file.read(2000)  # Read the first 2000 characters
            doc.add_paragraph("Sample GeoJSON content:")
            doc.add_paragraph(geojson_content)
    except Exception as e:
        # Best effort: a missing or unreadable sample is noted in the report instead of aborting it
        doc.add_paragraph(f"Error loading GeoJSON file: {e}")

    # Save the report to a DOCX file
    output = f"{output_file}_({geospatial_file}).docx"

    doc.save(output)
    print(f"Report saved as {output}")


# NOTE(review): image_files is not referenced by the code shown here — confirm
# whether it is still needed.
image_files = [features_plot_file]

# Build the DOCX report from the folder summary and the file analysis
generate_report(folder_description, geospatial_data, report_output_file)

## View data in interactive map

In [None]:
# simple interactive
# check subset of rows: only the first row is passed to the interactive map
gdf.iloc[:1].explore()

# Whole table
#gdf.explore()

## clip to a subset

#row = 1
#col = 'geometry'

# draw a bounding box using https://wktmap.com/
#bbox_wkt = f"{gdf.loc[row, col]}"
#bbox_poly = shapely.wkt.loads(bbox_wkt)

#gdf_clipped = gdf.clip(bbox_poly)

#print(bbox_wkt)
#print(len(gdf_clipped))

#gdf_clipped.explore()