In [1]:
from pathlib import Path
import os
import pandas as pd
import json

In [2]:
# Input (JSON) and output (CSV) folders both sit two levels above the
# current working directory.
json_folder_path, csv_folder_path = (
    Path.cwd().parents[1] / folder_name
    for folder_name in ('json_renamed', 'csv_datasets')
)

# Create the CSV output folder (and any missing parents) if it is not there yet
csv_folder_path.mkdir(exist_ok=True, parents=True)

In [7]:
# Define a mapping template to convert shotmap data into a standardized dictionary format

# Output column -> path of keys to follow inside a single shot entry.
# The order of this table is the column order of the resulting template.
_SHOT_FIELDS = (
    # Basic shot information
    ('is_home', ('isHome',)),
    ('shot_type', ('shotType',)),
    ('situation', ('situation',)),
    ('body_part', ('bodyPart',)),
    ('reversed_period_time', ('reversedPeriodTime',)),
    ('reversed_period_time_sec', ('reversedPeriodTimeSeconds',)),
    ('goal_mouth_loc', ('goalMouthLocation',)),
    ('time', ('time',)),
    ('time_sec', ('timeSeconds',)),
    ('incident_type', ('incidentType',)),
    # Player information
    ('player_name', ('player', 'name')),
    ('player_slug', ('player', 'slug')),
    ('player_id', ('player', 'id')),
    ('player_position', ('player', 'position')),
    ('player_usercount', ('player', 'userCount')),
    # Expected goals (xG) information
    ('xg', ('xg',)),
    ('xgot', ('xgot',)),
    # Player coordinates
    ('player_x', ('playerCoordinates', 'x')),
    ('player_y', ('playerCoordinates', 'y')),
    ('player_z', ('playerCoordinates', 'z')),
    # Goal mouth coordinates
    ('goal_x', ('goalMouthCoordinates', 'x')),
    ('goal_y', ('goalMouthCoordinates', 'y')),
    ('goal_z', ('goalMouthCoordinates', 'z')),
    # Block coordinates
    ('block_x', ('blockCoordinates', 'x')),
    ('block_y', ('blockCoordinates', 'y')),
    ('block_z', ('blockCoordinates', 'z')),
    # Drawing information for visualization
    ('draw_start_x', ('draw', 'start', 'x')),
    ('draw_end_x', ('draw', 'end', 'x')),
    ('draw_goal_x', ('draw', 'goal', 'x')),
    ('draw_start_y', ('draw', 'start', 'y')),
    ('draw_end_y', ('draw', 'end', 'y')),
    ('draw_goal_y', ('draw', 'goal', 'y')),
)


def _get_nested(record, path):
    """Follow `path` (a sequence of keys) through nested dicts; return None if any step is missing or not a dict."""
    current = record
    for key in path:
        # A missing key and an explicit JSON null both end up here as a
        # non-dict value; treat them the same instead of raising.
        if not isinstance(current, dict):
            return None
        current = current.get(key)
    return current


def ExtractShotmap(shotmap_data):
    """
    Convert the shot entries of a shotmap payload into a column-oriented dict.

    Parameters:
        shotmap_data: dict holding the list of shot entries under the
            'shotmap' key.

    Returns:
        A dict mapping each output column name to a list with one value per
        shot, in the order the shots appear. Fields that are missing, or whose
        parent object is missing or null, are recorded as None.

    Raises:
        KeyError: if `shotmap_data` has no 'shotmap' key.
    """
    template = {column: [] for column, _ in _SHOT_FIELDS}

    # `or []` lets a null 'shotmap' value yield an empty template instead of
    # failing on iteration.
    for info in shotmap_data['shotmap'] or []:
        for column, path in _SHOT_FIELDS:
            template[column].append(_get_nested(info, path))

    # Return the filled template
    return template


In [None]:
# Collect one DataFrame per shotmap JSON file
shotmaps_df = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the resulting CSV reproducible across runs and machines.
for filename in sorted(os.listdir(json_folder_path)):
    # Only JSON files with 'shotmap' in their name are processed
    if not (filename.endswith('.json') and 'shotmap' in filename):
        continue

    try:
        # Construct the full path to the file
        json_file_path = os.path.join(json_folder_path, filename)

        # Open the file and load the JSON data
        with open(json_file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Files without a 'shotmap' key contribute nothing
        if 'shotmap' not in data:
            continue

        # Extract the shotmap data and convert it into a DataFrame
        shotmap_template = ExtractShotmap(data)
        df = pd.DataFrame(shotmap_template)

        # The 2nd and 3rd underscore-separated parts of the filename are
        # stored as the 'date' and 'code' columns; the 4th part is required
        # to exist but is not used.
        date, code, _ = filename.split('_')[1:4]
        df.insert(0, 'date', date)
        df.insert(1, 'code', code)
        df['filename'] = filename  # Keep the source file for traceability

        shotmaps_df.append(df)

    # Best effort: report the failing file and carry on with the rest
    except Exception as e:
        print(f'Error processing the file {filename}: {str(e)}')

# Filter out DataFrames that are either empty or contain only NA values
shotmaps_df = [df for df in shotmaps_df if not df.empty and not df.isna().all().all()]

# Concatenate all non-empty DataFrames into a single DataFrame
if shotmaps_df:
    result_df = pd.concat(shotmaps_df, ignore_index=True)

    # Save the concatenated DataFrame to a CSV file
    result_df.to_csv(csv_folder_path / 'shotmaps.csv', index=False)
else:
    print('No DataFrame to concatenate.')

In [3]:
def flatten_json(data, parent_key='', sep='_'):
    """
    Recursively flatten nested dicts and lists into a single-level dict.

    Keys of nested dicts and indices of list elements are joined to the key
    of their parent with `sep`, e.g. {'a': {'b': 1}, 'c': [5]} becomes
    {'a_b': 1, 'c_0': 5}.

    Parameters:
        data: the value to flatten (dict, list, or scalar).
        parent_key: prefix prepended to every generated key; empty for the
            top-level call.
        sep: separator placed between the parts of a generated key.

    Returns:
        A flat dict of generated key -> scalar value. Empty dicts and empty
        lists contribute no keys. A scalar `data` is returned under
        `parent_key` itself.
    """
    items = {}
    if isinstance(data, dict):
        for k, v in data.items():
            new_key = f"{parent_key}{sep}{k}" if parent_key else k
            # Recursion covers nested dicts, lists and scalars alike
            items.update(flatten_json(v, new_key, sep=sep))
    elif isinstance(data, list):
        for i, item in enumerate(data):
            # Without a parent, use the bare index so the key does not start
            # with a dangling separator (matches the dict branch above).
            new_key = f"{parent_key}{sep}{i}" if parent_key else str(i)
            items.update(flatten_json(item, new_key, sep=sep))
    else:
        items[parent_key] = data
    return items

def extract_shotmap(shotmap_data):
    """
    Build a DataFrame with one row per shot from a shotmap payload.

    Each entry of the list under the 'shotmap' key is flattened with
    `flatten_json`, so the columns are whatever keys the entries contain.

    Parameters:
        shotmap_data: dict that may hold a list of shot entries under the
            'shotmap' key.

    Returns:
        A pandas DataFrame of the flattened shots. It is empty when the
        'shotmap' key is missing, null, or an empty list.
    """
    # `.get('shotmap', [])` would still return None for an explicit null
    # value; `or []` covers both the missing and the null case.
    shots = shotmap_data.get('shotmap') or []

    # One flat dict per shot; pandas aligns differing key sets into columns
    return pd.DataFrame([flatten_json(shot) for shot in shots])

# List to store DataFrames for each file
shotmaps_dataframes = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the consolidated output reproducible across runs and machines.
for filename in sorted(os.listdir(json_folder_path)):
    # Process only JSON files that contain 'shotmap' in their name
    if not (filename.endswith('.json') and 'shotmap' in filename):
        continue

    try:
        json_file_path = os.path.join(json_folder_path, filename)

        with open(json_file_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        # Extract shotmap data from the JSON file
        df = extract_shotmap(json_data)

        # The 2nd and 3rd underscore-separated parts of the filename are
        # stored as the 'date' and 'code' columns; the 4th part is required
        # to exist but is not used.
        date, code, _ = filename.split('_')[1:4]
        df.insert(0, 'date', date)
        df.insert(1, 'code', code)

        # Append the DataFrame to the list
        shotmaps_dataframes.append(df)

    # Best effort: report the failing file and carry on with the rest
    except Exception as e:
        print(f"Error processing file {filename}: {str(e)}")

In [4]:
# Keep only the frames that have rows and at least one non-NA value
shotmaps_dataframes = [
    frame
    for frame in shotmaps_dataframes
    if not (frame.empty or frame.isna().all().all())
]

if not shotmaps_dataframes:
    # Nothing survived the filter, so there is nothing to write
    print('No DataFrame to concatenate.')
else:
    # Stack every per-file frame into one table with a fresh 0..n-1 index
    result_df = pd.concat(shotmaps_dataframes, ignore_index=True)

    # Write the consolidated table to disk and report where it went
    result_df.to_csv(csv_folder_path / 'shotmaps.csv', index=False)
    print(f"Consolidated data saved to {csv_folder_path / 'shotmaps.csv'}")

Consolidated data saved to /Users/fernandaalves/Documents/code_studies/palmeiras_analytics_br/csv_datasets/shotmaps.csv


Key Adjustments

    Dynamic Extraction with flatten_json: The flatten_json function has been enhanced to recursively process both dictionaries and lists. This ensures that every level of nested data is flattened and captured in the final DataFrame.

    Increased Flexibility: This updated approach is highly adaptable to different JSON structures. It can handle any depth or variation in the data format, ensuring that all available data is extracted.

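    Reduced Empty Fields: Because columns are created from the keys actually present in the JSON rather than from a fixed template, the final CSV no longer contains hard-coded columns that the data never fills. A field that is absent from a particular shot is still left empty in that shot's row.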