In [1]:
import pandas as pd
import numpy as np
import os
import json
import ast
import collections, operator, functools
from tqdm import tqdm

In [None]:
per_ball_data = pd.read_csv('../data/saved_data/all_matches.csv')
per_ball_data.head(5)

In [None]:
per_ball_data.info()

---
# Per player statistics: Bowlers

We want to rate the bowlers based on their performances.
On average:
- How many wickets does each bowler take per ball?
- How many runs does each bowler give up per ball?
- How many bowler-related extras does each bowler give up per ball?

In [None]:
per_ball_data['wicket_type'].value_counts()

In [5]:
# The extras details are stored as strings, so parse them into dicts so we can analyse them better.
def parse_extras_details(extras_details):
    """Convert one raw ``extras_details`` cell into a dict.

    Parameters
    ----------
    extras_details : object
        The raw cell value. A string holding a Python dict literal is parsed
        with ``ast.literal_eval``; an existing dict is passed through.

    Returns
    -------
    dict
        The parsed dict, or ``{}`` for missing values (NaN/None), strings that
        are not valid literals, and literals that are not dicts.
    """
    # Already parsed: pass it through so re-running the cell is idempotent
    # (literal_eval on a dict raises ValueError, which used to wipe the value).
    if isinstance(extras_details, dict):
        return extras_details
    # NaN / None / any other non-string value carries no extras information.
    if not isinstance(extras_details, str):
        return {}
    try:
        parsed = ast.literal_eval(extras_details)
    except (ValueError, SyntaxError, TypeError):
        # Malformed literal: treat as "no extras" rather than failing the whole column.
        return {}
    # Downstream code calls .get() on the result, so only dicts are acceptable.
    return parsed if isinstance(parsed, dict) else {}
per_ball_data['extras_details'] = per_ball_data['extras_details'].apply(parse_extras_details)

In [None]:
per_ball_data['extras_details'].dtype

In [None]:
extras_details = per_ball_data[per_ball_data['extras_details'] != {}]['extras_details']

def sum_dict_series(series):
    """Add up an iterable of ``{key: count}`` dicts key by key.

    Parameters
    ----------
    series : iterable of dict
        For example a pandas Series whose values are dicts of counts.

    Returns
    -------
    dict
        Mapping of each key to the sum of its values across all dicts. An
        empty input yields ``{}``. Because ``Counter`` addition is used, keys
        whose running total is not positive are dropped.
    """
    # Seeding the reduction with an empty Counter makes an empty input valid;
    # without it functools.reduce raises TypeError on an empty sequence.
    totals = functools.reduce(
        operator.add,
        map(collections.Counter, series),
        collections.Counter(),
    )
    return dict(totals)

extras_counts = sum_dict_series(extras_details)
extras_counts


In [None]:
# Small worked example of adding dictionaries key by key.
dicts_to_add = [{'a': 1, 'b': 2}, {'a': 3, 'c': 4}]
total_counts = {}
for extras_dict in dicts_to_add:
    for key, value in extras_dict.items():
        # Accumulate per key; merging with {**x, **y} would overwrite shared
        # keys ('a' would end up as 3 instead of 1 + 3 = 4).
        total_counts[key] = total_counts.get(key, 0) + value
total_counts

In [None]:
bowlers_df = per_ball_data[['game_id', 'bowler', 'runs_batter', 'extras', 'total', 'is_wicket', 'wicket_type', 'extras_details', 'powerplay']].copy()
bowlers_df.head(5)

In [None]:
def _summarise_bowler_balls(balls, prefix):
    """Per-bowler per-ball means for one subset of deliveries, with ``prefix``-ed column names."""
    # Source column -> name used in the output column.
    stat_names = {
        'runs_batter': 'batter_runs_conceded',
        'runs_from_relevant_extras': 'runs_from_relevant_extras',
        'total': 'total_runs_conceded',
        'relevant_wicket': 'taken_from_relevant_wickets',
    }
    stats = balls.groupby('bowler')[list(stat_names)].mean()
    stats.columns = [f'{prefix}_{stat_names[col]}_mean' for col in stats.columns]
    return stats


def get_bowler_statistics(per_ball_data):
    """Compute per-ball averages for every bowler.

    Parameters
    ----------
    per_ball_data : pandas.DataFrame
        One row per delivery. Must contain the columns ``bowler``,
        ``runs_batter``, ``total``, ``wicket_type``, ``extras_details``
        (dicts of extras type -> runs) and ``powerplay``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``bowler`` with twelve columns: the four per-ball means
        (batter runs conceded, runs from no-balls/wides, total runs conceded,
        relevant wickets taken) computed over all deliveries (``total_``
        prefix), powerplay deliveries (``powerplay_``) and non-powerplay
        deliveries (``non_powerplay_``). A bowler with no deliveries in a
        subset gets NaN for that subset's columns.
    """
    bowlers_df = per_ball_data[
        ['bowler', 'runs_batter', 'total', 'wicket_type', 'extras_details', 'powerplay']
    ].copy()

    # Dismissal types counted towards the bowler's wicket rate.
    relevant_wicket_types = ['bowled', 'lbw', 'caught', 'caught and bowled', 'stumped']
    bowlers_df['relevant_wicket'] = bowlers_df['wicket_type'].isin(relevant_wicket_types)

    # Extras counted against the bowler; other extras types are ignored.
    relevant_extras_types = ['noballs', 'wides']
    bowlers_df['runs_from_relevant_extras'] = bowlers_df['extras_details'].apply(
        # Non-dict cells (e.g. unparsed/missing values) contribute no extras.
        lambda x: sum(x.get(key, 0) for key in relevant_extras_types) if isinstance(x, dict) else 0
    )

    # Explicit comparisons so rows with a missing powerplay flag fall in neither subset.
    powerplay_df = bowlers_df[bowlers_df['powerplay'] == True]
    non_powerplay_df = bowlers_df[bowlers_df['powerplay'] == False]

    bowler_stats = pd.concat([
        _summarise_bowler_balls(bowlers_df, 'total'),
        _summarise_bowler_balls(powerplay_df, 'powerplay'),
        _summarise_bowler_balls(non_powerplay_df, 'non_powerplay'),
    ], axis=1)
    return bowler_stats
bowler_stats = get_bowler_statistics(per_ball_data)
bowler_stats.head(5)

In [11]:
bowler_stats.reset_index().to_csv('../data/saved_data/bowler_stats.csv', index=False)


---
# Per player statistics: Batsmen

We want to rate the batsmen based on their performances.
On average:
- How many runs do they score per ball?
- How many high-scoring hits do they make per ball (4 or more runs)? (Used as a proxy for boundaries, as we don't have boundary data.)
    - For runs/high-scoring hits, we'll separate powerplay, and non-powerplay, as well as total.
- How often do they get out?

In [12]:
batsmen_df = per_ball_data[['game_id', 'batter', 'runs_batter', 'extras', 'total', 'is_wicket', 'powerplay']].copy()

In [None]:
def _summarise_batter_balls(balls, prefix):
    """Per-batter per-ball means for one subset of deliveries, with ``prefix``-ed column names."""
    stat_columns = ['runs_batter', 'high_scoring_hit', 'total', 'is_wicket']
    stats = balls.groupby('batter')[stat_columns].mean()
    stats.columns = [f'{prefix}_{col}_mean' for col in stats.columns]
    return stats


def get_batsman_statistics(per_ball_data):
    """Compute per-ball averages for every batter.

    Parameters
    ----------
    per_ball_data : pandas.DataFrame
        One row per delivery. Must contain the columns ``batter``,
        ``runs_batter``, ``total``, ``is_wicket`` and ``powerplay``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``batter`` with twelve columns: the per-ball means of
        ``runs_batter``, ``high_scoring_hit`` (a delivery with 4 or more
        batter runs), ``total`` and ``is_wicket``, computed over all
        deliveries (``total_`` prefix), powerplay deliveries (``powerplay_``)
        and non-powerplay deliveries (``non_powerplay_``). A batter with no
        deliveries in a subset gets NaN for that subset's columns.
    """
    batsmen_df = per_ball_data[['batter', 'runs_batter', 'total', 'is_wicket', 'powerplay']].copy()
    # No boundary flag is available here, so 4+ batter runs marks a high-scoring hit.
    batsmen_df['high_scoring_hit'] = batsmen_df['runs_batter'] >= 4

    # Explicit comparisons so rows with a missing powerplay flag fall in neither subset.
    powerplay_df = batsmen_df[batsmen_df['powerplay'] == True]
    non_powerplay_df = batsmen_df[batsmen_df['powerplay'] == False]

    batsman_stats = pd.concat([
        _summarise_batter_balls(batsmen_df, 'total'),
        _summarise_batter_balls(powerplay_df, 'powerplay'),
        _summarise_batter_balls(non_powerplay_df, 'non_powerplay'),
    ], axis=1)

    return batsman_stats
batsman_stats = get_batsman_statistics(per_ball_data)
batsman_stats.head(5)

In [14]:
batsman_stats.reset_index().to_csv('../data/saved_data/batter_stats.csv', index=False)

Looks like we have some NaN values in the stats. This should be fine though: it suggests that those players have not batted in powerplays, etc.
I will keep them as is: they shouldn't be counted in the per-team stats, so we don't want to replace them with 0.

Now for merging back onto the original dataframe.

---


In [None]:
# Attach the bowler's and the batter's per-player averages to every delivery.
# Left joins keep every delivery even when a player has no stats row.
per_game_data = (
    per_ball_data
    .loc[:, ['game_id', 'innings', 'bowler', 'batter', 'current_runs', 'current_wickets']]
    .merge(bowler_stats, how='left', on=['bowler'])
    .merge(batsman_stats, how='left', on=['batter'])
)
per_game_data.head(3)

In [None]:
per_game_data.columns

In [None]:
# Averaging the per-player stats over every delivery of an innings effectively
# weights each bowler/batsman by the number of balls they were involved in.
# The running score and wicket count are reduced with max to give the innings' final values.
per_game_aggregated = (
    per_game_data
    .groupby(['game_id', 'innings'])
    .agg({
        **{
            stat_column: 'mean'
            for stat_column in per_game_data.columns.drop(
                ['game_id', 'innings', 'bowler', 'batter', 'current_runs', 'current_wickets']
            )
        },
        'current_runs': 'max',
        'current_wickets': 'max',
    })
    .rename(columns={'current_runs': 'final_runs', 'current_wickets': 'final_wickets'})
)
per_game_aggregated.head(5)

In [None]:
per_game_aggregated.info()

In [None]:
# Standardize all columns to have mean 0 and standard deviation 1
standardized = (per_game_aggregated - per_game_aggregated.mean()) / per_game_aggregated.std()
standardized = standardized.drop(columns=['final_runs', 'final_wickets']).merge(per_game_aggregated.reset_index()[['game_id', 'innings', 'final_runs', 'final_wickets']], on=['game_id', 'innings'], how='left')
standardized.head(5)


In [None]:
standardized.info()

In [21]:
standardized.to_csv('../data/saved_data/standardized_per_game_data.csv', index=False)

---
# Possibly a glaring issue:
So I woke up this morning and realised one key thing. Each player is being rated on their performances over their entire career, not just up until the current game. 

I don't know how big an issue this is, but it's definitely notable - if it is a player's first game, they're being rated on their T20 career past that point. I believe this issue is known as 'data leakage', and I should probably look at a player's games up to the current game instead of their entire career, especially since the dataset spans several decades.

I think that even if it hurts the performance of the model, it's a more accurate representation of the players' abilities and therefore a better way to do predictions - it would allow us to predict games better in the future as well.

In order to implement this:
1. Get players' match data up to the current game
2. Use relevant number of games and use that to calculate stats
3. Otherwise same as before.

Impl:
- Groupby player and within that group by game.
- Calculate per_game stats for the player - easy using groupby and aggregate operations similar to above.
- Use a rolling window with the end being the current game and the size being the number of balls/games in order to calculate stats
    - May be more challenging as the window may spill over onto different players, but it should be fine to work around.
    - This should hopefully be somewhat efficient using the windowed approach compared to lookup tables or going row by row.

---
# Trend analysis

Before I commit to the above, I'm going to inspect how players' stats change over time. The caveat to the above is that (from my limited knowledge of cricket) players usually only play in T20 internationals for a few years during their prime, so this may not be a big issue.

If the general trend is that players' stats change very little over time, then I don't think it's a big issue using all of a player's data rather than just their data up to the current game, as the stats are unlikely to change much. Furthermore it's also significantly more efficient an implementation than needing to calculate previous games.

If however players' stats change a lot over time, then I should probably use historical data only.

For the sake of time, I am only going to look at batter runs and wickets taken. These are probably the most important stats and should give a good indication of what I need to know.

---

After suggestions from a friend who works in Biomedical research and then further independent research online, I've decided to use CUSUM analysis to detect changes in players' stats over time.

While I understand the theory behind it, I've never implemented it before, so copilot did a lot of the heavy lifting here. I can't really credit myself for the following implementation, all I did was adapt the code it generated to fit my data and needs.

In [None]:

def perform_cusum_analysis(df, player_id, metric='runs_batter', threshold=1, check = 'bowler'):
    """Run a two-sided CUSUM over one player's per-game average of ``metric``.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-ball data with the columns ``game_id``, ``date``, ``metric`` and ``check``.
    player_id : object
        Value looked up in ``df[check]``.
    metric : str
        Column averaged per game before the CUSUM is computed.
    threshold : float
        Used both as the slack subtracted from each standardised observation
        and as the level a cumulative sum must exceed to flag a change point.
    check : str
        Column holding the player identifier (e.g. ``'bowler'`` or ``'batter'``).

    Returns
    -------
    dict
        ``cusum_pos`` / ``cusum_neg`` (arrays, one entry per game),
        ``change_points`` (indices of flagged games), ``mean`` and ``std`` of
        the per-game series, and ``player_data`` (the chronological per-game
        series itself). A player with no rows yields empty arrays and NaN
        mean/std; a constant series yields all-zero sums.
    """
    # Get player's chronological data. sort=False keeps the groups in date
    # order; the default groupby sort would re-order them by game_id.
    player_rows = df[df[check] == player_id].sort_values(by='date', ascending=True, kind='stable')
    player_data = (
        player_rows.groupby('game_id', sort=False)[metric]
        .mean()
        .to_numpy(dtype=float)
    )
    n_games = len(player_data)

    # Calculate mean and standard deviation, then standardise the observations
    if n_games == 0:
        mean = std = float('nan')
        z_scores = np.zeros(0)
    else:
        mean = np.mean(player_data)
        std = np.std(player_data)
        # Zero (or undefined) spread means no deviation from the mean can be
        # measured, so treat every game as exactly average instead of dividing by 0.
        z_scores = (player_data - mean) / std if std > 0 else np.zeros(n_games)

    # Initialize CUSUM arrays
    cusum_pos = np.zeros(n_games)
    cusum_neg = np.zeros(n_games)

    # Calculate CUSUM values. Both running sums start from 0 *before* the
    # first game, so the first observation is accumulated as well.
    running_pos = 0.0
    running_neg = 0.0
    for i, z_score in enumerate(z_scores):
        running_pos = max(0.0, float(running_pos + z_score - threshold))
        running_neg = min(0.0, float(running_neg + z_score + threshold))
        cusum_pos[i] = running_pos
        cusum_neg[i] = running_neg

    # Detect change points
    change_points = np.where(
        (np.abs(cusum_pos) > threshold) |
        (np.abs(cusum_neg) > threshold)
    )[0]

    return {
        'cusum_pos': cusum_pos,
        'cusum_neg': cusum_neg,
        'change_points': change_points,
        'mean': mean,
        'std': std,
        'player_data': player_data
    }

def analyze_performance_stability(df, metrics=['runs_batter', 'is_wicket'], check = 'bowler'):
    """Classify every player's trend for each metric and summarise the proportions.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-ball data accepted by ``perform_cusum_analysis``.
    metrics : sequence of str
        Columns to analyse (the default list is never mutated).
    check : str
        Column holding the player identifier; players are taken from this
        column and it is forwarded to ``perform_cusum_analysis``.

    Returns
    -------
    (dict, dict)
        ``results`` maps player -> {metric: label}; ``summary`` maps
        metric -> {label: share of players}. Labels are ``'stable'``,
        ``'improving'``, ``'declining'`` and ``'variable'``; they describe the
        direction in which the metric's value moves ('improving' = the
        positive CUSUM exceeded 1), not whether that is good for the player.
        With no players every share is 0.0.
    """
    def categorize_trend(cusum_result):
        # If no significant changes, performance is stable
        if len(cusum_result['change_points']) == 0:
            return 'stable'

        # Look at direction of changes (1 matches the default CUSUM threshold used below)
        pos_changes = cusum_result['cusum_pos'].max() > 1
        neg_changes = cusum_result['cusum_neg'].min() < -1

        if pos_changes and neg_changes:
            return 'variable'  # Both improvement and decline
        elif pos_changes:
            return 'improving'
        elif neg_changes:
            return 'declining'
        return 'stable'

    results = {}
    # Group once by the identifier column so each player's analysis only scans
    # that player's rows; sort=False keeps first-appearance order, and rows
    # with a missing identifier are skipped by groupby.
    for player, player_rows in tqdm(df.groupby(check, sort=False)):
        results[player] = {
            metric: categorize_trend(
                perform_cusum_analysis(player_rows, player, metric, check=check)
            )
            for metric in metrics
        }

    # Summarize results
    summary = {}
    for metric in metrics:
        trends = [player_trends[metric] for player_trends in results.values()]
        n_players = len(trends)
        summary[metric] = {
            # Guard against an empty frame (no players) to avoid dividing by zero.
            label: (trends.count(label) / n_players if n_players else 0.0)
            for label in ('stable', 'improving', 'declining', 'variable')
        }

    return results, summary

# Run analysis
# Classify every bowler's trend per metric, then report the share of each label.
player_trends, trend_summary = analyze_performance_stability(per_ball_data, check='bowler')

# Print summary
print("Performance Stability Analysis: Bowlers")
for metric in trend_summary:
    stats = trend_summary[metric]
    print("{}:".format(metric))
    for trend, proportion in stats.items():
        print("{}: {:.1%}".format(trend, proportion))
    # Blank separator between metrics.
    print('\n')



In [None]:
# Same report as above, with players identified by the 'batter' column.
player_trends, trend_summary = analyze_performance_stability(per_ball_data, check='batter')

# Print summary
print("Performance Stability Analysis: Batsmen")
for metric in trend_summary:
    stats = trend_summary[metric]
    print("{}:".format(metric))
    for trend, proportion in stats.items():
        print("{}: {:.1%}".format(trend, proportion))
    # Blank separator between metrics.
    print('\n')

In [None]:
def visualize_cusum_analysis(df, player_id, metric='runs_batter', threshold=1, check='bowler'):
    """Plot one player's per-game series and its CUSUM chart, then print a summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-ball data accepted by ``perform_cusum_analysis``.
    player_id : object
        Value looked up in ``df[check]``.
    metric : str
        Column averaged per game.
    threshold : float
        Forwarded to ``perform_cusum_analysis``.
    check : str
        Column holding the player identifier (``'bowler'`` by default, which
        matches the column this function previously hard-coded).

    Returns
    -------
    None
        Shows a two-panel matplotlib figure and prints the mean, standard
        deviation and change points.
    """
    # Kept local so matplotlib is only needed when a chart is actually drawn.
    import matplotlib.pyplot as plt

    result = perform_cusum_analysis(df, player_id, metric, threshold, check=check)

    # Plot the very series the CUSUM was computed on when the analysis returns
    # it; otherwise rebuild the per-game averages here.
    player_data = result.get('player_data')
    if player_data is None:
        player_data = df[df[check] == player_id] \
                                .sort_values(by='date', ascending=True)[['game_id', metric]] \
                                .groupby('game_id').agg('mean') \
                                .values \
                                .flatten()

    # Create figure with two subplots
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

    # Plot 1: Raw performance data
    ax1.plot(player_data, 'b-', label='Performance')
    ax1.axhline(y=result['mean'], color='r', linestyle='--', label='Mean')
    ax1.axhline(y=result['mean'] + result['std'], color='g', linestyle=':', label='+1 SD')
    ax1.axhline(y=result['mean'] - result['std'], color='g', linestyle=':', label='-1 SD')
    ax1.set_title(f'Raw Performance Data{" for " + player_id if player_id else ""}')
    ax1.set_xlabel('Game index')
    ax1.set_ylabel(f'Per-game mean of {metric}')
    ax1.legend()

    # Plot 2: CUSUM chart
    ax2.plot(result['cusum_pos'], 'g-', label='Positive CUSUM')
    ax2.plot(result['cusum_neg'], 'r-', label='Negative CUSUM')
    ax2.axhline(y=0, color='b', linestyle='--')

    # Highlight change points
    for cp in result['change_points']:
        ax2.axvline(x=cp, color='purple', alpha=0.3)

    ax2.set_title('CUSUM Analysis')
    ax2.set_xlabel('Game index')
    ax2.set_ylabel('Cumulative sum (standardised units)')
    ax2.legend()

    plt.tight_layout()
    plt.show()

    # Print summary
    print("\nSummary:")
    print(f"Mean performance: {result['mean']:.2f}")
    print(f"Standard deviation: {result['std']:.2f}")
    print(f"Number of change points: {len(result['change_points'])}")
    if len(result['change_points']) > 0:
        print(f"Change points at games: {result['change_points'].tolist()}")


visualize_cusum_analysis(per_ball_data, '3a60e0b5', 'runs_batter')

Based on these results, it seems that over 2/3 of players have no significant changes in their performances over time. Furthermore, since CUSUM analysis only flags repeated improvement or decline, we can be fairly confident that blips in form of 1-2 games do not significantly affect the results. In addition, my relatively low (arbitrarily chosen) threshold of 1 SD away from the mean is fairly conservative, so there's a good chance that my results overestimate players' changes in form over time.

Therefore, I am going to keep my current approach of using all of a player's data in the dataset to predict results.