In [1]:
import json
from pathlib import Path
from typing import Dict, List, Any

def load_enriched_dataset(filepath: str) -> List[Dict[str, Any]]:
    """
    Read the enriched dataset from a JSON file and return its parsed content.

    On success the number of loaded records is printed. On any failure
    (opening the file, decoding it, parsing the JSON, or measuring the parsed
    value) a message is printed and the original exception is re-raised.

    Note that the parsed value is returned as-is; its shape is not validated
    against the annotated return type.

    Args:
        filepath: Path to the enriched dataset JSON file

    Returns:
        The parsed JSON content, expected to be a list of record dictionaries

    Raises:
        Exception: Whatever error occurred, re-raised unchanged after being
            reported on stdout
    """
    try:
        with open(filepath, mode='r', encoding='utf-8') as source:
            records = json.load(source)
        record_count = len(records)
        print(f"Successfully loaded {record_count} records from the enriched dataset")
        return records
    except Exception as err:
        # Report the problem, then let the caller decide how to handle it.
        print(f"Error loading enriched dataset: {err}")
        raise

def restructure_record(record: Dict[str, Any]) -> Dict[str, Any]:
    """
    Build a restructured copy of a single record.

    The result contains the fixed set of book-level fields plus an 'authors'
    field. When 'authors' is a non-empty list, the top-level 'author_avatar'
    and 'author_about' values (if truthy) are merged into a copy of the first
    author entry. Those two top-level keys are not carried over to the result.

    The input record is never modified: the authors list and its first entry
    are copied before being enriched. Remaining author entries are shared
    with the input (shallow copy).

    Args:
        record: Dictionary containing a single record's data

    Returns:
        Restructured record dictionary

    Raises:
        KeyError: If any of the required book-level fields is missing
    """
    # Book-level fields copied verbatim; a missing one raises KeyError.
    book_fields = (
        "title",
        "subtitle",
        "issued",
        "publisher",
        "language",
        "categories",
        "ISBN 10",
        "ISBN 13",
        "main_image_url",
    )
    restructured = {field: record[field] for field in book_fields}

    # A missing 'authors' key defaults to an empty list; any other value that
    # is not a non-empty list is passed through unchanged.
    authors = record.get("authors", [])
    if authors and isinstance(authors, list):
        # Copy the list so that replacing its first element does not write
        # back into the caller's record.
        authors = list(authors)
        author_info = authors[0].copy()
        # Only truthy enrichment values are merged; empty/None ones are skipped.
        for key in ("author_avatar", "author_about"):
            value = record.get(key)
            if value:
                author_info[key] = value
        authors[0] = author_info

    restructured["authors"] = authors

    return restructured

def clean_dataset(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Restructure every record of the dataset.

    Each record is passed through restructure_record, in order, and the
    results are collected into a new list. The number of processed records
    is printed once all of them have been handled.

    Args:
        data: List of dictionaries containing all records

    Returns:
        List of restructured record dictionaries, in the input order
    """
    restructured_records = [restructure_record(entry) for entry in data]

    total = len(restructured_records)
    print(f"Successfully restructured {total} records")
    return restructured_records

def save_dataset(data: List[Dict[str, Any]], output_path: str):
    """
    Write the restructured dataset to a JSON file.

    The data is serialized as UTF-8 with a 4-space indent and with non-ASCII
    characters written literally rather than escaped. A confirmation is
    printed on success; on failure a message is printed and the original
    exception is re-raised.

    Args:
        data: List of dictionaries containing the restructured records
        output_path: Path where the cleaned dataset should be saved

    Raises:
        Exception: Whatever error occurred, re-raised unchanged after being
            reported on stdout
    """
    try:
        with open(output_path, mode='w', encoding='utf-8') as sink:
            json.dump(data, fp=sink, indent=4, ensure_ascii=False)
        print(f"Successfully saved restructured dataset to {output_path}")
    except Exception as err:
        # Report the problem, then let the caller decide how to handle it.
        print(f"Error saving dataset: {err}")
        raise

def main(input_path: str, output_path: str):
    """
    Run the full restructuring pipeline: load, restructure, save.

    Progress messages are printed before each stage, and a short summary with
    the record counts before and after restructuring is printed at the end.
    Errors raised by any stage propagate to the caller.

    Args:
        input_path: Path to the enriched dataset JSON file
        output_path: Path where the restructured dataset should be saved
    """
    print("Starting dataset restructuring process...")

    # Stage 1: read the enriched dataset from disk
    print("\nLoading enriched dataset...")
    source_records = load_enriched_dataset(input_path)

    # Stage 2: restructure every record
    print("\nRestructuring dataset...")
    restructured_records = clean_dataset(source_records)

    # Stage 3: write the result to disk
    print("\nSaving restructured dataset...")
    save_dataset(restructured_records, output_path)

    # Summary of the run
    original_count = len(source_records)
    restructured_count = len(restructured_records)
    print("\nDataset restructuring complete!")
    print(f"Original record count: {original_count}")
    print(f"Restructured record count: {restructured_count}")

if __name__ == "__main__":
    # Script entry point: restructure the enriched dataset using fixed file names.
    main(input_path="enriched_dataset.json", output_path="final_dataset.json")
    

Starting dataset restructuring process...

Loading enriched dataset...
Successfully loaded 100866 records from the enriched dataset

Restructuring dataset...
Successfully restructured 100866 records

Saving restructured dataset...
Successfully saved restructured dataset to final_dataset.json

Dataset restructuring complete!
Original record count: 100866
Restructured record count: 100866
