In [3]:
from pathlib import Path
import os
import pandas as pd
import json

In [4]:
# Input folder (renamed JSON files) and output folder (CSV datasets); both sit
# in the grandparent of the current working directory
json_folder_path = Path.cwd().parents[1].joinpath('json_renamed')
csv_folder_path = Path.cwd().parents[1].joinpath('csv_datasets')

# Create the output folder, including missing parents, unless it already exists
csv_folder_path.mkdir(exist_ok=True, parents=True)

In [7]:
def flatten_json(data, parent_key='', sep='_'):
    """
    Flattens a nested dictionary into a single-level dictionary.

    Nested dictionary keys are joined to their parent key with `sep`.
    List items get their position appended directly to the key (no
    separator); items that are dictionaries are flattened recursively,
    any other item (including a nested list) is stored unchanged.
    Empty dictionaries and empty lists contribute no keys.

    Args:
        data: Dictionary to flatten.
        parent_key: Prefix for every generated key ('' means no prefix).
        sep: Separator placed between a parent key and a child key.

    Returns:
        A new flat dictionary; `data` is not modified.
    """
    flat = {}
    for key, value in data.items():
        path = key if not parent_key else f"{parent_key}{sep}{key}"

        if isinstance(value, dict):
            flat.update(flatten_json(value, path, sep=sep))
            continue

        if not isinstance(value, list):
            flat[path] = value
            continue

        # Lists: one entry (or one flattened group) per position
        for index, element in enumerate(value):
            indexed_path = f"{path}{index}"
            if isinstance(element, dict):
                flat.update(flatten_json(element, indexed_path, sep=sep))
            else:
                flat[indexed_path] = element
    return flat

def process_json_file(json_path):
    """
    Builds a DataFrame with one row per player from a single JSON file.

    The file stem is split on '_'; its second and third parts become the
    'date' and 'code' columns. For each of the top-level 'home' and 'away'
    entries present in the file, every item of the entry's 'players' list
    is flattened with flatten_json and combined with the entry's flattened
    team-level fields (keys prefixed with '<team>_info') plus the 'team',
    'date' and 'code' columns.

    The 'players' list is excluded from the team-level fields: each player
    already gets its own row, and flattening the list into the team info
    would copy every player of the team into every row.

    Args:
        json_path: pathlib.Path of the JSON file to process.

    Returns:
        pandas.DataFrame with one row per player; empty when the file has
        no 'home'/'away' entry or no players.

    Raises:
        IndexError: If the file stem has fewer than three '_'-separated
            parts.
    """
    # Extract 'date' and 'code' from the filename (split only once)
    name_parts = json_path.stem.split('_')
    date, code = name_parts[1], name_parts[2]

    # Load the JSON file
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    rows = []

    # Process each team ('home' and 'away') that is present in the file
    for team in ('home', 'away'):
        if team not in data:
            continue
        team_data = data[team]

        # Team-level fields only; the players are handled row by row below
        team_fields = {k: v for k, v in team_data.items() if k != 'players'}
        team_info = flatten_json(team_fields, parent_key=f"{team}_info")

        # `or []` also covers an explicit `"players": null` in the JSON
        for player in team_data.get('players') or []:
            row = flatten_json(player)
            row.update(team_info)  # Add team information to each player
            row['team'] = team     # Which side the player belongs to (home/away)
            row['date'] = date
            row['code'] = code
            rows.append(row)

    return pd.DataFrame(rows)

Consolidated data saved to /Users/fernandaalves/Documents/code_studies/palmeiras_analytics_br/csv_datasets/lineups.csv


In [None]:
# List to store one DataFrame per JSON file
all_dataframes = []

# Iterate over all matching JSON files in the folder. The paths are sorted so
# the row order of the CSV does not depend on the filesystem's listing order.
for json_file in sorted(json_folder_path.glob('j_*.json')):
    df = process_json_file(json_file)  # Process each JSON file
    all_dataframes.append(df)

output_path = csv_folder_path / 'lineups.csv'

if all_dataframes:
    # Concatenate all DataFrames into a single DataFrame and save it as CSV
    final_df = pd.concat(all_dataframes, ignore_index=True)
    final_df.to_csv(output_path, index=False)
    print(f"Consolidated data saved to {output_path}")
else:
    # pd.concat raises ValueError on an empty list, so report the situation
    # instead and leave any existing CSV untouched
    final_df = pd.DataFrame()
    print(f"No files matching 'j_*.json' found in {json_folder_path}; nothing was saved.")

Key Improvements

    Enhanced Flattening with flatten_json: The flatten_json function recursively flattens nested dictionaries and lists of dictionaries, so nested fields at any depth are included in the final output. List items that are themselves lists are kept as single values rather than flattened further.

    Dynamic Data Handling: The function now dynamically handles different structures within the JSON, making it more robust to variations in data formatting.

    Full Data Coverage: The revised approach ensures that every field, including deeply nested ones, is captured in the output, reducing the likelihood of missing data.