In [189]:
import os
import json
import glob
import pandas as pd
import polars as pl

In [197]:
def get_json_files(input_folder):
    """
    Returns a sorted list of paths to all JSON files in the specified folder.

    Only files directly inside the folder that match ``*.json`` are returned;
    sub-folders are not searched.

    Args:
        input_folder (str): Path to the folder to search for JSON files

    Returns:
        list: Paths to the JSON files (the normalized folder joined with each
            file name), sorted lexicographically. Empty if the folder does
            not exist or contains no JSON files.
    """
    # Make sure the path is normalized
    input_folder = os.path.normpath(input_folder)

    # Check if folder exists
    if not os.path.exists(input_folder):
        print(f"Error: Folder '{input_folder}' does not exist")
        return []

    # glob gives no ordering guarantee (it depends on the file system), so
    # sort to keep the row order of everything built from this list stable
    # between runs and machines.
    json_files = sorted(glob.glob(os.path.join(input_folder, "*.json")))

    print(f"Found {len(json_files)} JSON files in '{input_folder}'")
    return json_files

In [198]:
def load_json_file(file_path):
    """
    Loads a JSON file into a Python object.

    The file is always decoded as UTF-8 rather than with the platform's
    default encoding, so files containing non-ASCII text load the same way
    on every machine.

    Args:
        file_path (str): Path to the JSON file

    Returns:
        dict or list: The loaded JSON data
        None: If there was an error loading the file (a message describing
            the error is printed)
    """
    try:
        # Explicit encoding: without it open() falls back to the locale's
        # codec, which is not UTF-8 on every platform.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return data
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found")
        return None
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in '{file_path}': {e}")
        return None
    except Exception as e:
        # Deliberate best-effort: any other failure (permissions, decoding,
        # bad path type, ...) is reported and signalled with None.
        print(f"Error loading '{file_path}': {e}")
        return None

In [199]:

def flatten_dict(d, parent_key='', sep='_'):
    """
    Collapse a nested dictionary into a single level.

    Keys of nested dictionaries are prefixed with the keys that lead to them,
    joined by ``sep``. Values that are not dictionaries (lists included) are
    copied over unchanged.

    Args:
        d (dict): Dictionary to flatten
        parent_key (str): Prefix accumulated during recursion (empty for the
            initial call)
        sep (str): String placed between key parts, defaults to underscore

    Returns:
        dict: Single-level dictionary with joined keys. If ``d`` is not a
            dictionary it is returned as-is, or wrapped as
            ``{parent_key: d}`` when a non-empty ``parent_key`` is given.
    """
    # Non-dictionary input: nothing to descend into
    if not isinstance(d, dict):
        if parent_key:
            return {parent_key: d}
        return d

    flat = {}
    for key, value in d.items():
        # Only prefix when there is a (truthy) parent key
        if parent_key:
            full_key = f"{parent_key}{sep}{key}"
        else:
            full_key = key

        if isinstance(value, dict):
            # Nested dictionary: merge its flattened entries
            flat.update(flatten_dict(value, full_key, sep))
        else:
            flat[full_key] = value

    return flat

In [200]:
# Build one summary row (dict) per match from the "info" section of each file.
match_df_list = []

file_paths = get_json_files("ipl_json_2024_onwards")

for file_path in file_paths:
    match = load_json_file(file_path)
    if match is None:
        # load_json_file has already printed the reason; skip the file rather
        # than failing on match['info'] below.
        continue

    match_info = match['info']
    outcome = match_info.get('outcome', {})

    match_details = {}
    # basename honours the platform's path separator, unlike split("/")
    match_details['path_name'] = os.path.basename(file_path)
    # .get: assumed to be absent for some matches — TODO confirm against the data
    match_details['city'] = match_info.get('city')
    match_details['date'] = match_info['dates'][0]

    if 'winner' in outcome:
        match_details['winner'] = outcome['winner']
        margin = outcome.get('by')
        if margin is not None:
            # Raw margin mapping is kept alongside the split-out columns
            match_details['winning_margin'] = margin
            if 'runs' in margin:
                match_details['winning_margin_runs'] = margin['runs']
            elif 'wickets' in margin:
                match_details['winning_margin_wickets'] = margin['wickets']
    else:
        # No outright winner recorded: use the eliminator winner when there is
        # one, otherwise leave the winner empty. The path is printed so these
        # matches can be inspected by hand.
        match_details['winner'] = outcome.get('eliminator')
        print(file_path)

    # A match without a decided result may carry no player of the match
    players_of_match = match_info.get('player_of_match') or [None]
    match_details['player_of_match'] = players_of_match[0]
    match_details['venue'] = match_info['venue']
    match_details['team1'] = match_info['teams'][0]
    match_details['team2'] = match_info['teams'][1]
    match_details['toss_winner'] = match_info['toss']['winner']
    match_details['toss_decision'] = match_info['toss']['decision']
    match_df_list.append(match_details)

Found 104 JSON files in 'ipl_json_2024_onwards'
ipl_json_2024_onwards/1473469.json


In [201]:
# One row per match, built from the dicts collected in match_df_list.
match_df = pd.DataFrame(match_df_list)

# The pandas writers do not create missing parent directories, so make sure
# the output folder exists before exporting.
os.makedirs("data_2025", exist_ok=True)
match_df.to_csv("data_2025/match_info.csv", index=False)
match_df.to_parquet("data_2025/match_info.parquet", index=False)

# Kept as the last expression so the preview is actually rendered by the cell.
match_df.head()

In [202]:
# Build one row (dict) per delivery across every match file in the folder.
ball_df_list = []

file_paths = get_json_files("ipl_json_2025")

for file_path in file_paths:
    match = load_json_file(file_path)
    if match is None:
        # load_json_file has already printed the reason; skip the file rather
        # than failing on match['innings'] below.
        continue

    # basename honours the platform's path separator, unlike split("/")
    path_name = os.path.basename(file_path)

    for inning_sequence, inning in enumerate(match['innings'], start=1):
        for over in inning['overs']:
            for delivery_number, delivery in enumerate(over['deliveries'], start=1):
                # Match / innings / over context is written on every row, so
                # the rows no longer depend on being forward-filled in order
                # and the integer columns keep their integer dtype. A later
                # forward-fill over these columns simply finds nothing to fill.
                ball_details = {
                    'path_name': path_name,
                    'team': inning['team'],
                    'inning_sequence': inning_sequence,
                    'over': over['over'],
                    # 1-based position within the over's delivery list
                    'delivery': delivery_number,
                }

                # Only the first wicket entry of a delivery is summarised
                wickets = delivery.get('wickets')
                if wickets:
                    wicket_details = wickets[0]
                    ball_details['wicket'] = 1
                    if 'player_out' in wicket_details:
                        ball_details['player_out'] = wicket_details['player_out']
                    if 'kind' in wicket_details:
                        ball_details['kind'] = wicket_details['kind']
                    # Not every wicket record carries fielders
                    fielders = wicket_details.get('fielders')
                    if fielders and 'name' in fielders[0]:
                        ball_details['wicket_fielders'] = fielders[0]['name']

                # 'review' is left out of the flattened columns. Everything
                # else, including the raw 'wickets' list, is carried over.
                delivery_fields = {
                    key: value for key, value in delivery.items() if key != 'review'
                }
                ball_details.update(flatten_dict(delivery_fields))
                ball_df_list.append(ball_details)

Found 33 JSON files in 'ipl_json_2025'


In [203]:
# One row per delivery, built from the dicts collected in ball_df_list.
ball_df = pd.DataFrame(ball_df_list)

# Carry match / innings / over context down onto rows where it is missing.
fill_down_columns = ['path_name','team','inning_sequence','over']
ball_df[fill_down_columns] = ball_df[fill_down_columns].ffill()

# The pandas writers do not create missing parent directories, so make sure
# the output folder exists before exporting.
os.makedirs("data_2025", exist_ok=True)
ball_df.to_csv("data_2025/ball_info.csv", index=False)
ball_df.to_parquet("data_2025/ball_info.parquet", index=False)

# Kept as the last expression so the preview is actually rendered by the cell.
ball_df.head()

In [160]:
# Scratch inspection of a single delivery: first innings, entry 8 of its
# overs list, entry 2 of that over's deliveries.
file_path = "ipl_json_2025/1473446.json"
match = load_json_file(file_path)
innings = match['innings']
inning = innings[0]
over = inning['overs'][8]
delivery = over['deliveries'][2]
# wicket_details = delivery.pop('wickets')
delivery

{'batter': 'Shubman Gill',
 'bowler': 'HH Pandya',
 'non_striker': 'B Sai Sudharsan',
 'runs': {'batter': 0, 'extras': 0, 'total': 0},
 'wickets': [{'player_out': 'Shubman Gill',
   'fielders': [{'name': 'Naman Dhir'}],
   'kind': 'caught'}]}

In [129]:
# Scratch lookup: name of the first fielder on the first wicket record.
# NOTE(review): relies on kernel state — no visible cell binds `wicket_details`
# to a list, so this presumably fails after Restart & Run All; confirm or remove.
wicket_details[0]['fielders'][0]['name']

'Mohammed Siraj'

In [86]:
# Scratch inspection: first delivery of whichever over dict `over` currently holds.
over['deliveries'][0]

{'batter': 'J Fraser-McGurk',
 'bowler': 'JC Archer',
 'non_striker': 'Abishek Porel',
 'runs': {'batter': 0, 'extras': 0, 'total': 0}}

In [87]:
# NOTE(review): `output_dict` is not assigned in any visible cell. Its recorded
# output has the shape flatten_dict would produce for a delivery record, so it is
# presumably left over from trying that function out — confirm or remove.
output_dict

{'batter': 'J Fraser-McGurk',
 'bowler': 'JC Archer',
 'non_striker': 'Abishek Porel',
 'runs_batter': 0,
 'runs_extras': 0,
 'runs_total': 0}