# Cricket Match Data Processor

This notebook processes multiple cricket match JSON files to extract only the cricket_match_extended_ball data and save it as CSV files named in the format `match_id__team_id.csv`.

In [1]:
import json
import csv
import os
import glob
import pandas as pd
from typing import Dict, List, Any, Union, Optional
from tqdm.notebook import tqdm

## Helper Functions

In [2]:
def flatten_dict(d: Dict, parent_key: str = '', sep: str = '_') -> Dict:
    """
    Flatten a nested dictionary structure.
    
    Args:
        d: The dictionary to flatten
        parent_key: The parent key for the current dictionary
        sep: Separator between keys
        
    Returns:
        Flattened dictionary
    """
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        elif isinstance(v, list):
            # For lists, we'll stringify them unless they're a list of dicts
            if v and isinstance(v[0], dict):
                # If it's a list of dicts, we'll just use the first item
                # This is a simplification - you may want to handle this differently
                items.extend(flatten_dict(v[0], new_key, sep=sep).items())
            else:
                items.append((new_key, str(v)))
        else:
            items.append((new_key, v))
    return dict(items)

In [3]:
def process_and_extract_cricket_match_extended_ball(data_array: List[Dict]) -> tuple:
    """
    Process commentary_with_extended_summary and extract only cricket_match_extended_ball data.
    
    Args:
        data_array: List of dictionaries with {type, data} structure
        
    Returns:
        Tuple of (DataFrame containing only cricket_match_extended_ball data, match_id, team_id) or (None, None, None) if not found
    """
    # Extract only items of type 'cricket_match_extended_ball'
    ball_data = []
    match_id = None
    team_id = None
    
    for item in data_array:
        if item.get('type') == 'cricket_match_extended_ball':
            # Flatten the data part of the structure
            if 'data' in item and isinstance(item['data'], dict):
                flattened_data = flatten_dict(item['data'])
                ball_data.append(flattened_data)
                
                # Extract match_id and team_id from the first item
                if match_id is None and 'match_id' in flattened_data:
                    match_id = str(flattened_data['match_id'])
                if team_id is None and 'batting_team_id' in flattened_data:
                    team_id = str(flattened_data['batting_team_id'])
    
    if not ball_data:
        return None, None, None
        
    # Create DataFrame
    df = pd.DataFrame(ball_data)
    return df, match_id, team_id

In [4]:
def convert_json_to_csv(json_data: Union[str, Dict], output_dir: str = '.') -> Optional[str]:
    """
    Convert JSON data to CSV format, extracting only cricket_match_extended_ball data
    and naming the file as match_id__team_id.csv
    
    Args:
        json_data: JSON data either as a string or parsed dictionary
        output_dir: Directory to save the output CSV file
    
    Returns:
        Path to the generated CSV file or None if error
    """
    # Parse JSON if it's a string
    if isinstance(json_data, str):
        try:
            data = json.loads(json_data)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
            return None
    else:
        data = json_data
    
    # Check if the data has the expected structure
    if not data.get('status') or not data.get('data'):
        print("Invalid JSON structure. Expected format: { status: boolean, data: {...} }")
        return None
    
    # Process only commentary_with_extended_summary to extract cricket_match_extended_ball
    if 'commentary_with_extended_summary' in data['data'] and data['data']['commentary_with_extended_summary']:
        df, match_id, team_id = process_and_extract_cricket_match_extended_ball(
            data['data']['commentary_with_extended_summary']
        )
        
        if df is not None:
            # Use match_id and team_id for the filename
            if match_id and team_id:
                output_file = os.path.join(output_dir, f"{match_id}__{team_id}.csv")
            else:
                output_file = os.path.join(output_dir, "match_unknown__team_unknown.csv")
            
            # Write to CSV
            try:
                # Create output directory if it doesn't exist
                os.makedirs(output_dir, exist_ok=True)
                
                df.to_csv(output_file, index=False)
                return output_file
            except Exception as e:
                print(f"Error writing CSV file {output_file}: {e}")
    else:
        return None
    
    return None

## Process Multiple Files

### Set Input and Output Directories

In [5]:
# Set your input and output directories here
input_dir = "/Users/milangabriel/Downloads/gabscore/Scrapper/Match_Data_Extract/JSON"  # Replace with your input directory path
output_dir = "/Users/milangabriel/Downloads/gabscore/Scrapper/Match_Data_Extract/JSON/"  # Replace with your output directory path

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

### Process all JSON files in the input directory

In [6]:
# Get list of all JSON files in the input directory
json_files = glob.glob(os.path.join(input_dir, "*.json"))
print(f"Found {len(json_files)} JSON files to process")

Found 22 JSON files to process


In [7]:
# Process each JSON file
processed_files = []
failed_files = []

for json_file in tqdm(json_files, desc="Processing files"):
    try:
        # Read JSON file
        with open(json_file, 'r', encoding='utf-8') as file:
            json_data = json.load(file)
        
        # Convert to CSV
        output_file = convert_json_to_csv(json_data, output_dir)
        
        if output_file:
            processed_files.append((json_file, output_file))
        else:
            failed_files.append((json_file, "No cricket_match_extended_ball data found"))
    except Exception as e:
        failed_files.append((json_file, str(e)))
        
print(f"\nSuccessfully processed {len(processed_files)} files")
print(f"Failed to process {len(failed_files)} files")

Processing files:   0%|          | 0/22 [00:00<?, ?it/s]


Successfully processed 0 files
Failed to process 22 files


### View processing results

In [8]:
# Show successful files
print("Successfully processed files:")
for json_file, csv_file in processed_files:
    print(f"  {os.path.basename(json_file)} -> {os.path.basename(csv_file)}")

Successfully processed files:


In [9]:
# Show failed files
if failed_files:
    print("\nFailed files:")
    for json_file, error in failed_files:
        print(f"  {os.path.basename(json_file)}: {error}")


Failed files:
  20.json: No cricket_match_extended_ball data found
  16.json: No cricket_match_extended_ball data found
  6.json: No cricket_match_extended_ball data found
  7.json: No cricket_match_extended_ball data found
  17.json: No cricket_match_extended_ball data found
  21.json: No cricket_match_extended_ball data found
  10.json: No cricket_match_extended_ball data found
  1.json: No cricket_match_extended_ball data found
  11.json: No cricket_match_extended_ball data found
  2.json: No cricket_match_extended_ball data found
  12.json: No cricket_match_extended_ball data found
  13.json: No cricket_match_extended_ball data found
  3.json: No cricket_match_extended_ball data found
  8.json: No cricket_match_extended_ball data found
  22.json: No cricket_match_extended_ball data found
  18.json: No cricket_match_extended_ball data found
  4.json: No cricket_match_extended_ball data found
  14.json: No cricket_match_extended_ball data found
  15.json: No cricket_match_extended_b

## Process a Specific Directory

Use this cell if you want to process files in a specific directory other than the one configured above.

In [10]:
def process_directory(input_directory, output_directory):
    # Create output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)
    
    # Get list of all JSON files in the input directory
    json_files = glob.glob(os.path.join(input_directory, "*.json"))
    print(f"Found {len(json_files)} JSON files to process")
    
    # Process each JSON file
    processed_files = []
    failed_files = []
    
    for json_file in tqdm(json_files, desc="Processing files"):
        try:
            # Read JSON file
            with open(json_file, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
            
            # Convert to CSV
            output_file = convert_json_to_csv(json_data, output_directory)
            
            if output_file:
                processed_files.append((json_file, output_file))
            else:
                failed_files.append((json_file, "No cricket_match_extended_ball data found"))
        except Exception as e:
            failed_files.append((json_file, str(e)))
    
    print(f"\nSuccessfully processed {len(processed_files)} files")
    print(f"Failed to process {len(failed_files)} files")
    
    return processed_files, failed_files

# Example usage - uncomment and update paths to use:
# specific_input_dir = "path/to/specific/json/folder"
# specific_output_dir = "path/to/specific/output/folder"
# processed, failed = process_directory(specific_input_dir, specific_output_dir)

## Process a Single File

Use this cell if you want to process just one file.

In [11]:
def process_single_file(json_file_path, output_directory='.'):
    # Create output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)
    
    try:
        # Read JSON file
        with open(json_file_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)
        
        # Convert to CSV
        output_file = convert_json_to_csv(json_data, output_directory)
        
        if output_file:
            print(f"Successfully processed {json_file_path} -> {output_file}")
            return output_file
        else:
            print(f"No cricket_match_extended_ball data found in {json_file_path}")
            return None
    except Exception as e:
        print(f"Error processing {json_file_path}: {e}")
        return None

# Example usage - uncomment and update path to use:
# single_file = "path/to/specific/json/file.json"
# result = process_single_file(single_file, output_dir)

In [12]:
# Loop over JSON files in the input directory
json_files = glob.glob(os.path.join(input_dir, "*.json"))

for json_file in tqdm(json_files, desc="Processing JSON files"):
    with open(json_file, 'r', encoding='utf-8') as f:
        json_content = f.read()
    
    csv_path = convert_json_to_csv(json_content, output_dir=output_dir)
    if csv_path:
        print(f"✅ Converted: {json_file} → {csv_path}")
    else:
        print(f"❌ Failed to convert: {json_file}")

Processing JSON files:   0%|          | 0/22 [00:00<?, ?it/s]

❌ Failed to convert: /Users/milangabriel/Downloads/gabscore/Scrapper/Match_Data_Extract/JSON/20.json
❌ Failed to convert: /Users/milangabriel/Downloads/gabscore/Scrapper/Match_Data_Extract/JSON/16.json
❌ Failed to convert: /Users/milangabriel/Downloads/gabscore/Scrapper/Match_Data_Extract/JSON/6.json
❌ Failed to convert: /Users/milangabriel/Downloads/gabscore/Scrapper/Match_Data_Extract/JSON/7.json
❌ Failed to convert: /Users/milangabriel/Downloads/gabscore/Scrapper/Match_Data_Extract/JSON/17.json
❌ Failed to convert: /Users/milangabriel/Downloads/gabscore/Scrapper/Match_Data_Extract/JSON/21.json
❌ Failed to convert: /Users/milangabriel/Downloads/gabscore/Scrapper/Match_Data_Extract/JSON/10.json
❌ Failed to convert: /Users/milangabriel/Downloads/gabscore/Scrapper/Match_Data_Extract/JSON/1.json
❌ Failed to convert: /Users/milangabriel/Downloads/gabscore/Scrapper/Match_Data_Extract/JSON/11.json
❌ Failed to convert: /Users/milangabriel/Downloads/gabscore/Scrapper/Match_Data_Extract/JSON/2