Wrangle the events JSON files and generate the events.csv file

In [1]:
from pathlib import Path
import os
import pandas as pd
import json

In [2]:
# Both data folders live two levels above the current working directory,
# so resolve that shared root once and derive the input/output paths from it.
project_root = Path.cwd().parents[1]
json_folder_path = project_root / 'json_renamed'   # JSON input
csv_folder_path = project_root / 'csv_datasets'    # CSV output

# Create the CSV output folder (and any missing parents); no-op if it exists
csv_folder_path.mkdir(parents=True, exist_ok=True)

In [None]:
def flatten_json(data, parent_key='', sep='_'):
    """
    Collapse a nested structure of dicts and lists into a single flat dict.

    Each leaf value is stored under a key made of the path to reach it,
    with path segments joined by `sep`. Dict entries contribute their key
    as a segment; list elements contribute their positional index.

    Args:
        data: The dict or list to flatten. Any other type yields an empty dict.
        parent_key: Path prefix accumulated so far (empty at the top level).
        sep: Separator placed between path segments.

    Returns:
        A flat dict mapping path keys to leaf values. Empty dicts and empty
        lists contribute no entries.
    """
    # Normalise both container types into one stream of (segment, value) pairs
    if isinstance(data, dict):
        pairs = data.items()
    elif isinstance(data, list):
        pairs = ((str(index), element) for index, element in enumerate(data))
    else:
        return {}

    flat = {}
    for segment, value in pairs:
        # Only prepend the prefix when there is one, so top-level keys stay bare
        path = f"{parent_key}{sep}{segment}" if parent_key else segment
        if isinstance(value, (dict, list)):
            # Nested container: merge its flattened leaves under this path
            flat.update(flatten_json(value, path, sep=sep))
        else:
            flat[path] = value

    return flat

def extract_event_info(event_data):
    """
    Build a flat record for one event.

    The whole event structure is flattened with `flatten_json`, then the
    event's top-level 'id' and 'slug' values are stored under the explicit
    keys 'event_id' and 'event_slug' (None when the event has no such key).
    These two keys overwrite any flattened key of the same name.

    Args:
        event_data: The event mapping to flatten; must support `.get`.

    Returns:
        A flat dict of the event's values plus 'event_id' and 'event_slug'.
    """
    record = flatten_json(event_data)

    # Expose the identifying fields under stable, explicit column names
    record['event_id'] = event_data.get('id')
    record['event_slug'] = event_data.get('slug')

    return record

# Accumulates one single-row DataFrame per successfully processed event file
events_df = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the consolidated output reproducible across runs and machines.
for filename in sorted(os.listdir(json_folder_path)):
    # Process only JSON files whose name contains 'event'
    if not (filename.endswith('.json') and 'event' in filename):
        continue

    try:
        with open(json_folder_path / filename, 'r', encoding='utf-8') as file:
            ojson = json.load(file)

        # Files without a top-level 'event' key carry nothing to extract
        if 'event' not in ojson:
            continue

        ojson_df = extract_event_info(ojson['event'])
        df = pd.DataFrame([ojson_df])

        # 'date' and 'code' are the 2nd and 3rd underscore-separated tokens of
        # the filename; a name with fewer tokens fails the unpacking and is
        # reported by the handler below.
        # NOTE(review): if 'code' is the last token it will still include the
        # '.json' extension - confirm against the actual filename pattern.
        date, code = filename.split('_')[1:3]
        df.insert(0, 'date', date)
        df.insert(1, 'code', code)

        events_df.append(df)

    except Exception as e:
        # Best effort: report the failing file and keep processing the rest
        print(f"Error processing file {filename}: {str(e)}")


In [1]:
# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so fail early with a message that points at the likely cause.
if not events_df:
    raise ValueError(
        "No event data was collected: check that the JSON folder contains "
        "'.json' files with 'event' in their name and a top-level 'event' key."
    )

# Combine all single-row DataFrames into one
result_df = pd.concat(events_df, ignore_index=True)

# Save the consolidated DataFrame to a CSV file
events_csv_path = csv_folder_path / 'events.csv'
result_df.to_csv(events_csv_path, index=False)

print(f"Consolidated data saved to {events_csv_path}")


Consolidated data saved to /Users/fernandaalves/Documents/code_studies/palmeiras_analytics_br/csv_datasets/events.csv


Key Adjustments

    flatten_json: Recursively walks every level of nested dictionaries and lists and returns a single flat dictionary. Each key is the path to a value, with nested keys and list indices joined by underscores (e.g. `a_b_0`).
    extract_event_info: Now extracts complete data from the JSON, regardless of the depth of nesting.
    JSON Files Loop: The code iterates over all JSON files in the specified folder and consolidates the data into a single CSV.
    Addition of 'date' and 'code' Columns: Takes the second and third underscore-separated parts of each filename as "date" and "code" and inserts them as the first two columns of that file's row in the final DataFrame.