# Initial Investigation

This notebook contains my code to process the data from JSON files into a pandas-friendly CSV format.

I started by loading a single match and inspecting the structure of the data. After I had gotten that into a format I was happy with, I continued to process all remaining matches.

In [1]:
import pandas as pd
import numpy as np
from uuid import uuid4
import os
import json
import sys

In [2]:
# Load a single sample match so its JSON structure can be explored before
# processing the whole data set.
sample_match_path = '../data/t20s_json/211028.json'
# Decode explicitly as UTF-8 instead of relying on the platform default encoding
with open(sample_match_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Top-level sections of the match file, inspected in the cells below
meta = data['meta']
info = data['info']
innings = data['innings']

In [None]:
# Show the two metadata sections of the sample match, one per line
print(f"meta: {meta}", f"info: {info}", sep="\n")

In [None]:
# Number of innings recorded for the sample match
len(innings)

In [None]:
# Fields available on the first innings
innings[0].keys()

In [None]:
# Fields available on the second innings, for comparison with the first
innings[1].keys()

In [None]:
# Tabulate the overs of the first innings to see what each over entry holds
df = pd.DataFrame(innings[0]['overs'])
df

In [None]:
# First delivery of the first over: the unit that becomes one output row
innings[0]['overs'][0]['deliveries'][0]

In [9]:
# Template for one output row (one delivery). The keys double as the column
# list when a per-match DataFrame is built; the values are only placeholders.
df_row = {
    # Match identifiers
    'game_id': None,        # Generated per processed match (not taken from the file)
    'date': None,           # First entry of the match's dates
    'venue': None,
    'location': None,       # City, when the match has one
    'gender': None,
    'match_type': None,
    'innings': None,        # 1-based innings number (1st, 2nd, ...)

    # Team info
    'batting_team': None,
    'bowling_team': None,
    'batting_team_players': None,  # Registry ids of the batting side
    'bowling_team_players': None,  # Registry ids of the bowling side

    # Over/Ball info
    'over': None,           # Over number exactly as given in the source data
    'ball_in_over': None,   # 1-based position of the delivery within its over

    # Players (stored as registry ids, not names)
    'batter': None,
    'bowler': None,
    'non_striker': None,

    # Runs breakdown
    'runs_batter': None,    # Runs scored by batter
    'extras': None,         # Extra runs
    'total': None,          # Total runs for delivery

    # Wicket details
    'is_wicket': False,     # Boolean
    'wicket_type': None,    # caught/bowled/lbw etc
    'fielder': None,        # First listed fielder, if the dismissal has one
    'player_out': None,     # Dismissed player, if a wicket fell

    # Target (for 2nd innings)
    'target_runs': None,
    'target_overs': None,

    # Match situation (running totals that already include this delivery)
    'current_runs': None,     # Team score after this ball
    'current_wickets': None,  # Wickets fallen after this ball
    'powerplay': False,       # Boolean - is it powerplay?
    'extras_details': None,   # Raw 'extras' entry of the delivery, if present
}


In [10]:
def process_match(data):
    """Flatten one match's JSON into a DataFrame with one row per delivery.

    Args:
        data: Parsed match JSON. Reads ``data['info']`` (dates, venue,
            optional city, gender, match_type, teams, players and the
            ``registry['people']`` name -> id mapping) and ``data['innings']``.

    Returns:
        pandas.DataFrame with one row per delivery. Columns follow the keys
        of ``df_row`` (with ``player_out`` appended if the template lacks
        it). Players are stored as registry ids. ``current_runs`` and
        ``current_wickets`` already include the delivery on the row.

    Raises:
        KeyError: if a required field (for example an innings' ``powerplays``
            or ``overs``) is missing, or a player name is not in the registry.
        ValueError: if a powerplay bound is not written as ``<over>.<ball>``.
    """
    def over_and_ball(value):
        # Powerplay bounds are written as <over>.<ball>, e.g. 5.6
        over_part, ball_part = str(value).split('.')
        return int(over_part), int(ball_part)

    info = data['info']
    all_innings = data['innings']
    registry = info['registry']['people']

    game_id = uuid4()  # random id, so a new one is generated on every call
    date = info['dates'][0]
    venue = info['venue']
    location = info.get('city')  # Some matches don't have a city, but keeping it may be useful later
    gender = info['gender']
    match_type = info['match_type']
    teams = info['teams']

    # Collect plain dicts and build the frame once at the end; concatenating a
    # one-row frame per delivery copies the whole frame every time.
    match_rows = []

    for innings_count, inning in enumerate(all_innings, 1):
        runs_this_innings = 0
        wickets_this_innings = 0

        batting_team = inning['team']
        bowling_team = list(set(teams) - {batting_team})[0]

        batting_team_players_ids = [registry[player] for player in info['players'][batting_team]]
        bowling_team_players_ids = [registry[player] for player in info['players'][bowling_team]]

        # While I am only looking at t20 matches, I'd like to keep it more flexible
        # for other match formats, so every listed powerplay is considered.
        powerplay_spans = [
            (over_and_ball(span['from']), over_and_ball(span['to']))
            for span in inning['powerplays']
        ]

        target = inning.get('target', {})
        target_runs = target.get('runs')
        target_overs = target.get('overs')

        for over in inning['overs']:
            over_num = over['over']
            for delivery_count, delivery in enumerate(over['deliveries'], 1):
                # Compare (over, ball) as one position. Checking the over and the
                # ball ranges independently wrongly excludes e.g. a 7th delivery
                # in an early over when the powerplay ends on ball 6 of a later one.
                position = (over_num, delivery_count)
                powerplay = any(start <= position <= end for start, end in powerplay_spans)

                runs = delivery['runs']
                total = runs['total']
                runs_this_innings += total

                if 'wickets' in delivery:
                    wickets_this_innings += 1
                    is_wicket = True
                    wicket = delivery['wickets'][0]  # only the first dismissal is recorded
                    wicket_type = wicket['kind']
                    player_out = registry[wicket['player_out']]
                    fielders = wicket.get('fielders')
                    if fielders:
                        first_fielder = fielders[0]
                        # Fielder entries are dicts carrying the name under 'name';
                        # a bare name is tolerated as well.
                        if isinstance(first_fielder, dict):
                            first_fielder = first_fielder['name']
                        fielder = registry[first_fielder]
                    else:
                        fielder = None
                else:
                    is_wicket = False
                    wicket_type = None
                    player_out = None
                    fielder = None

                match_rows.append({
                    'game_id': game_id,
                    'date': date,
                    'venue': venue,
                    'location': location,
                    'gender': gender,
                    'match_type': match_type,
                    'innings': innings_count,

                    'batting_team': batting_team,
                    'bowling_team': bowling_team,
                    'batting_team_players': batting_team_players_ids,
                    'bowling_team_players': bowling_team_players_ids,

                    'over': over_num,
                    'ball_in_over': delivery_count,

                    'batter': registry[delivery['batter']],
                    'bowler': registry[delivery['bowler']],
                    'non_striker': registry[delivery['non_striker']],

                    'runs_batter': runs['batter'],
                    'extras': runs['extras'],
                    'total': total,

                    'is_wicket': is_wicket,
                    'wicket_type': wicket_type,
                    'fielder': fielder,
                    'player_out': player_out,

                    'target_runs': target_runs,
                    'target_overs': target_overs,

                    'current_runs': runs_this_innings,
                    'current_wickets': wickets_this_innings,
                    'powerplay': powerplay,
                    'extras_details': delivery.get('extras'),
                })

    # Template order first; 'player_out' is appended only if the template lacks it
    columns = list(dict.fromkeys([*df_row, 'player_out']))
    return pd.DataFrame(match_rows, columns=columns)
    


In [11]:
def process_match(data):
    """Flatten one match's JSON into a DataFrame with one row per delivery.

    Args:
        data: Parsed match JSON. Reads ``data['info']`` (dates, venue,
            optional city, gender, match_type, teams, players and the
            ``registry['people']`` name -> id mapping) and ``data['innings']``.

    Returns:
        pandas.DataFrame with one row per delivery. Players are stored as
        registry ids. ``current_runs`` and ``current_wickets`` already include
        the delivery on the row. ``game_id`` is a fresh random UUID per call.

    Raises:
        KeyError: if a required field (for example an innings' ``powerplays``
            or ``overs``) is missing, or a player name is not in the registry.
        ValueError: if a powerplay bound is not written as ``<over>.<ball>``.
    """
    def over_and_ball(value):
        # Powerplay bounds are written as <over>.<ball>, e.g. 5.6
        over_part, ball_part = str(value).split('.')
        return int(over_part), int(ball_part)

    match_rows = []

    info = data['info']
    # Values repeated on every row of this match
    match_info = {
        'game_id': uuid4(),
        'date': info['dates'][0],
        'venue': info['venue'],
        'location': info.get('city'),  # not every match has a city
        'gender': info['gender'],
        'match_type': info['match_type']
    }

    teams = info['teams']
    registry = info['registry']['people']

    for innings_count, inning in enumerate(data['innings'], 1):
        batting_team = inning['team']
        bowling_team = list(set(teams) - {batting_team})[0]

        batting_team_players_ids = [registry[player] for player in info['players'][batting_team]]
        bowling_team_players_ids = [registry[player] for player in info['players'][bowling_team]]

        # Every listed powerplay is considered, not just the first one
        powerplay_spans = [
            (over_and_ball(span['from']), over_and_ball(span['to']))
            for span in inning['powerplays']
        ]

        target_info = inning.get('target', {})
        target_runs = target_info.get('runs')
        target_overs = target_info.get('overs')

        runs_this_innings = 0
        wickets_this_innings = 0

        for over in inning['overs']:
            over_num = over['over']
            for delivery_count, delivery in enumerate(over['deliveries'], 1):
                # Compare (over, ball) as one position. Checking the over and the
                # ball ranges independently wrongly excludes e.g. a 7th delivery
                # in an early over when the powerplay ends on ball 6 of a later one.
                position = (over_num, delivery_count)
                powerplay = any(start <= position <= end for start, end in powerplay_spans)

                runs = delivery['runs']
                total = runs['total']
                runs_this_innings += total

                # Only the first dismissal on a delivery is recorded
                wicket_info = (delivery.get('wickets') or [{}])[0]
                is_wicket = bool(wicket_info)
                if is_wicket:
                    wickets_this_innings += 1

                row = {
                    **match_info,
                    'innings': innings_count,
                    'batting_team': batting_team,
                    'bowling_team': bowling_team,
                    'batting_team_players': batting_team_players_ids,
                    'bowling_team_players': bowling_team_players_ids,
                    'over': over_num,
                    'ball_in_over': delivery_count,
                    'batter': registry[delivery['batter']],
                    'bowler': registry[delivery['bowler']],
                    'non_striker': registry[delivery['non_striker']],
                    'runs_batter': runs['batter'],
                    'extras': runs['extras'],
                    'total': total,
                    'is_wicket': is_wicket,
                    'wicket_type': wicket_info.get('kind'),
                    'fielder': registry[wicket_info['fielders'][0]['name']] if 'fielders' in wicket_info else None,
                    'player_out': registry[wicket_info['player_out']] if 'player_out' in wicket_info else None,
                    'target_runs': target_runs,
                    'target_overs': target_overs,
                    'current_runs': runs_this_innings,
                    'current_wickets': wickets_this_innings,
                    'powerplay': powerplay,
                    'extras_details': delivery.get('extras'),
                }
                match_rows.append(row)

    return pd.DataFrame(match_rows)

In [None]:
# Flatten the sample match and preview its first ten deliveries
df = process_match(data)
df.head(10)

In [None]:
# Preview the deliveries on which a wicket fell
df[df['is_wicket']].head()

In [None]:
# Check column dtypes and non-null counts for the sample match
df.info()

In [None]:
def process_all_matches(path_to_data_directory):
    """Process every match JSON file in a directory into one DataFrame.

    Files are visited in sorted name order so the row order is reproducible.
    Entries that do not end in ``.json`` are skipped. A file that fails to
    load or process is reported on stdout and left out of the result; the
    remaining files are still processed.

    Args:
        path_to_data_directory: Directory containing the match JSON files.

    Returns:
        pandas.DataFrame with the rows of all successfully processed matches
        and a fresh 0..n-1 index; an empty DataFrame if nothing was processed.
    """
    match_frames = []

    # os.listdir returns entries in arbitrary order, hence the sort
    for file in sorted(os.listdir(path_to_data_directory)):
        if not file.lower().endswith('.json'):
            continue
        try:
            with open(os.path.join(path_to_data_directory, file), 'r', encoding='utf-8') as f:
                data = json.load(f)
            match_frames.append(process_match(data))
        except Exception as e:
            # Best effort: one malformed match must not abort the whole run
            print(f"Error processing match: {file}")
            print(f"Error: {e}")
            print("\n\n")

    if not match_frames:
        # pd.concat raises on an empty list
        return pd.DataFrame()
    # Single concat at the end instead of re-copying the growing frame per file
    return pd.concat(match_frames, ignore_index=True)

# Process every match file in the T20 directory, then summarise the combined frame
all_matches_df = process_all_matches('../data/t20s_json')
all_matches_df.info()

In [None]:
# Each processed match gets its own generated game_id, so this counts the
# matches that made it into the combined frame
all_matches_df['game_id'].nunique()

3793 games out of a total of 3835 is pretty good going.

Quite a few matches have no location, but I think this is okay: I doubt we really need it, and if we want to group by location, venue would probably be a better choice anyway.

Of 858370 balls, there are 47481 wickets, so about 1 in 18. Of those, 29896 (roughly 63%) were due to a catch.
Target runs are included for about half of the balls, which looks about right.

In [17]:
# Persist the combined deliveries; index=False leaves the row index out of the file.
# NOTE(review): list- and dict-valued columns (team player lists, extras_details)
# are written as their string form, so they will not round-trip as objects.
all_matches_df.to_csv('../data/saved_data/all_matches.csv', index=False)

---

In [18]:
# Reload the saved CSV into a fresh frame
matches_df = pd.read_csv('../data/saved_data/all_matches.csv')

In [None]:
# Number of distinct batters (stored as registry ids) across all matches
matches_df['batter'].nunique()

---