Wrangle the incidents JSON files and generate the incidents.csv file

In [1]:
from pathlib import Path
import os
import pandas as pd
import json

In [2]:
# Input (JSON) and output (CSV) folders both live in the grandparent of the
# current working directory, so resolve that directory once and derive both from it.
project_root = Path.cwd().parents[1]
json_folder_path = project_root / 'json_renamed'
csv_folder_path = project_root / 'csv_datasets'

# Create the CSV output folder (including missing parents); a no-op if it already exists
csv_folder_path.mkdir(parents=True, exist_ok=True)

In [3]:
def flatten_json(data, parent_key='', sep='_'):
    """
    Collapse arbitrarily nested dicts/lists into a single-level dict.

    Each key of the result is the path to a leaf value, with the path
    components joined by `sep`; list elements contribute their positional
    index to the path. A leaf is any value that is neither a dict nor a list.

    Args:
        data: The dict or list to flatten.
        parent_key: Path prefix accumulated so far (empty at the top level).
        sep: Separator placed between path components.

    Returns:
        A flat dict mapping path keys to leaf values. Empty containers
        contribute no entries, and a `data` that is neither dict nor list
        yields an empty dict.
    """
    if isinstance(data, dict):
        children = data.items()
    elif isinstance(data, list):
        # List positions are used as string keys ('0', '1', ...)
        children = ((str(index), element) for index, element in enumerate(data))
    else:
        return {}

    flat = {}
    for name, child in children:
        path = f"{parent_key}{sep}{name}" if parent_key else name
        if isinstance(child, (dict, list)):
            # Descend into nested containers, carrying the path built so far
            flat.update(flatten_json(child, path, sep=sep))
        else:
            flat[path] = child
    return flat

def extract_incidents(json_data):
    """
    Build a DataFrame with one row per entry of json_data['incidents'].

    Every incident is passed through flatten_json, so the flattened keys
    become the columns. When the 'incidents' key is absent (or its list is
    empty) the returned DataFrame is empty.
    """
    incidents = json_data.get('incidents', [])
    rows = [flatten_json(incident) for incident in incidents]
    return pd.DataFrame(rows)

# One DataFrame per successfully processed incidents file
incidents_dataframes = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# (and the column order pd.concat later derives from the frames) reproducible
# from one run to the next.
for filename in sorted(os.listdir(json_folder_path)):
    # Only JSON files whose name contains 'incidents' are processed
    if not (filename.endswith('.json') and 'incidents' in filename):
        continue

    try:
        json_file_path = os.path.join(json_folder_path, filename)

        with open(json_file_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        # One flattened row per incident found in this file
        df = extract_incidents(json_data)

        # The 2nd and 3rd underscore-separated parts of the filename are stored
        # as 'date' and 'code'. A 4th part must exist (it is discarded); if it
        # does not, the unpacking raises ValueError and the file is skipped below.
        date, code, _ = filename.split('_')[1:4]
        df.insert(0, 'date', date)
        df.insert(1, 'code', code)

        incidents_dataframes.append(df)

    except Exception as e:
        # Best effort: report the failing file and carry on with the others
        print(f"Error processing file {filename}: {str(e)}")

In [4]:
# Keep only DataFrames that actually carry data (drop empty and all-NA ones)
incidents_dataframes = [df for df in incidents_dataframes if not df.empty and not df.isna().all().all()]

# pd.concat fails with a bare "No objects to concatenate" on an empty list,
# so stop early with a message that says what to check instead.
if not incidents_dataframes:
    raise ValueError(
        "No non-empty incident DataFrames to concatenate; "
        "check the JSON input folder and any per-file errors reported while loading"
    )

# Stack the per-file DataFrames into one table with a fresh 0..n-1 index
result_df = pd.concat(incidents_dataframes, ignore_index=True)

# Write the consolidated table to incidents.csv without the index column
incidents_csv_path = csv_folder_path / 'incidents.csv'
result_df.to_csv(incidents_csv_path, index=False)

print(f"Consolidated data saved to {incidents_csv_path}")

Consolidated data saved to /Users/fernandaalves/Documents/code_studies/palmeiras_analytics_br/csv_datasets/incidents.csv


Key Adjustments:

    Function Renaming: Changed ExtractorIncidents to extract_incidents to follow Python naming conventions (snake_case).
    Efficient Data Extraction: Removed redundant checks; each flattened incident is now appended directly to a list of records instead of being copied into a predefined template dictionary.
    Error Handling: Improved error handling to provide more informative messages when processing files.
    Code Clarity: Simplified comments for better readability while maintaining essential explanations.
    Improved DataFrame Management: Added filtering for empty or all-NA DataFrames before concatenating.
    Dynamic Extraction with flatten_json: The flatten_json function is used to recursively flatten each incident, capturing all fields and subfields, regardless of the depth of nesting.
    Increased Flexibility: The code is now adaptable to different JSON structures, ensuring that all data is extracted, even if the format varies between files.
    Elimination of Empty Fields: The dynamic use of keys eliminates the need for predefined lists, capturing all available data and minimizing empty fields in the final CSV.