In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from mplsoccer import Pitch

In [None]:
# Load the three training tables; the paths are relative, so they resolve
# against the directory the notebook is run from.
actions, game_stats, season_stats = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train_actions.csv',
        'data/train_game_stats.csv',
        'data/train_season_stats.csv',
    )
)

In [None]:
def _count_actions(grouped):
    """
    Aggregate a grouped actions frame into action, pass, shot and goal counts.

    Parameters:
    grouped: DataFrameGroupBy over the actions data ('type' and 'outcome' are read)

    Returns:
    DataFrame with one row per group and the columns
    total_actions, total_passes, total_shots, total_goals
    """
    counts = grouped.agg({
        'type': [
            'count',
            lambda x: (x == 'Pass').sum(),
            lambda x: (x == 'Shot').sum()
        ],
        'outcome': lambda x: (x == 'Goal').sum()
    })
    # The aggregation yields (column, function) pairs; rename them for clarity
    counts.columns = ['total_actions', 'total_passes', 'total_shots', 'total_goals']
    return counts


def exploratory_analysis(actions, game_stats, season_stats,
                         output_dir='images', seasons=(2021, 2022)):
    """
    Print summary tables and draw overview plots for the actions data.

    Produces, in order: a bar chart of action types, per-match count
    statistics, per-player/per-season statistics with top-5 tables, and a
    histogram of possession lengths. Both plots are shown and saved as PNG.

    Parameters:
    actions: DataFrame of actions; the 'type', 'outcome', 'match_id', 'player',
        'season' and 'possession' columns are read
    game_stats: DataFrame of game stats (kept in the signature for existing
        callers; this function does not read it)
    season_stats: DataFrame with 'player', 'season' and 'minutes_played' columns
    output_dir: Directory the PNG files are written to; created when missing
    seasons: Season values for which the summaries and top-5 tables are printed
    """
    # savefig does not create missing directories, so make sure the target exists
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1. Distribution of Event Types
    event_type_counts = actions['type'].value_counts()
    fig, ax = plt.subplots(figsize=(12, 6))
    event_type_counts.plot(kind='bar', ax=ax)
    ax.set_title('Distribution of Action Types')
    ax.set_xlabel('Action Type')
    ax.set_ylabel('Frequency')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    # Add count labels on top of each bar
    for i, v in enumerate(event_type_counts):
        ax.text(i, v, str(v), ha='center', va='bottom')

    fig.tight_layout()
    fig.savefig(output_dir / 'event_type_distribution.png')
    plt.show()

    # 2. Average Events per Match
    match_level_stats = _count_actions(actions.groupby('match_id'))
    print(match_level_stats.describe())

    # 3. Player Performance Analysis
    # Merge with season minutes played; the default inner join keeps only
    # player/season pairs that appear in both tables
    player_season_performance = _count_actions(actions.groupby(['player', 'season'])).merge(
        season_stats[['player', 'season', 'minutes_played']],
        left_index=True,
        right_on=['player', 'season']
    )

    performance_by_season = {
        season: player_season_performance[player_season_performance['season'] == season]
        for season in seasons
    }
    for season_performance in performance_by_season.values():
        print(season_performance.describe())

    # Top Performers
    for metric in ['total_actions', 'total_shots', 'total_goals', 'total_passes']:
        for season, season_performance in performance_by_season.items():
            print(f"Top 5 Players by {metric} in {season}:")
            print(season_performance.sort_values(metric, ascending=False).head(5))

    # 4. Possession Length Distribution
    # Number of action rows per (match, possession) pair
    possession_lengths = actions[['match_id', 'possession']].value_counts()
    fig, ax = plt.subplots()
    ax.hist(possession_lengths, bins=30)
    ax.set_title('Distribution of Possession Length')
    ax.set_xlabel('Possession Length')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    fig.savefig(output_dir / 'possession_length_distribution.png')
    plt.show()

# Run the analysis on the loaded training tables: prints the summary tables
# and shows/saves the action-type and possession-length plots
exploratory_analysis(actions, game_stats, season_stats)

In [None]:
def create_goal_location_heatmap(df, title=None):
    """
    Draw a pitch heatmap of goal counts per zone, annotated with each zone's share of all goals

    Parameters:
    df: DataFrame with the actions data (the 'outcome' and 'zone' columns are read)
    title: Custom title for the plot (optional)

    Returns:
    The (fig, ax) pair produced by Pitch.draw, with the heatmap drawn on ax
    """
    # A 120 x 80 pitch split into a grid of 5 rows by 7 columns
    n_rows, n_cols = 5, 7
    pitch_length, pitch_width = 120, 80
    row_edges = np.linspace(0, pitch_width, n_rows + 1)
    col_edges = np.linspace(0, pitch_length, n_cols + 1)
    row_centers = (row_edges[:-1] + row_edges[1:]) / 2
    col_centers = (col_edges[:-1] + col_edges[1:]) / 2

    # Keep only the goals and tally them per zone
    goals = df[df['outcome'] == 'Goal']
    grid = np.zeros((n_rows, n_cols))
    for zone_label, n_goals in goals.groupby('zone').size().items():
        # Labels are assumed to follow 'Zone i-j' with 1-based row i and column j
        row_no, col_no = map(int, zone_label.replace('Zone ', '').split('-'))
        grid[row_no - 1, col_no - 1] = n_goals

    # Pitch outline (drawn above the mesh) plus the coloured zone grid
    pitch = Pitch(pitch_type='custom', pitch_length=pitch_length, pitch_width=pitch_width,
                  line_color='black', line_zorder=2)
    fig, ax = pitch.draw(figsize=(12, 8))
    grid_x, grid_y = np.meshgrid(col_edges, row_edges)
    mesh = ax.pcolormesh(grid_x, grid_y, grid, cmap=plt.cm.Purples, alpha=0.7, zorder=1,
                         vmax=grid.max())
    plt.colorbar(mesh, ax=ax).set_label('Number of Goals')

    # Percentage of all goals, written at the centre of every zone that has any
    total_goals = len(goals)
    for (i, j), n_goals in np.ndenumerate(grid):
        value = n_goals / total_goals if total_goals > 0 else 0
        if value > 0:
            ax.text(col_centers[j], row_centers[i], f"{value * 100 :.1f}%",
                    ha='center', va='center', fontsize=10, color='black',
                    fontweight='bold', zorder=3)

    # Row numbers (first number in the zone label) beside the pitch
    for i, y_pos in enumerate(row_centers):
        ax.text(-3, y_pos, f"{i+1}", ha='right', va='center', fontsize=10)

    # Column numbers (second number in the zone label) along the pitch
    for j, x_pos in enumerate(col_centers):
        ax.text(x_pos, pitch_width + 5, f"{j+1}", ha='center', va='bottom', fontsize=10)

    ax.set_title(title if title else "Goal Locations on the Field", fontsize=16)
    # Flip the y-axis; the intent is to have (0,0) at the top-left
    plt.gca().invert_yaxis()

    return fig, ax

# Draw the goal-location heatmap from the full actions table and display it
fig, ax = create_goal_location_heatmap(actions)
plt.show()

In [None]:
# Where are actions happening?
def create_zone_action_heatmap(df, possession_type, title=None):
    """
    Create a heatmap showing the count of actions by zone for specific possession types

    Each non-empty zone is annotated with its share of the selected actions.

    Parameters:
    df: DataFrame with the actions data; the 'zone', 'possession_scores' and
        'possession_concedes' columns are read
    possession_type: 'scoring' or 'conceding' to keep only the actions flagged
        that way; any other value selects the neutral actions, i.e. those
        flagged as neither scoring nor conceding
    title: Custom title for the plot (optional)

    Returns:
    The (fig, ax) pair produced by Pitch.draw, with the heatmap drawn on ax
    """
    # Set up Pitch: 120 x 80 field split into 5 rows by 7 columns
    field_length = 120
    field_width = 80
    horizontal_zones = 5
    vertical_zones = 7
    horizontal_lines = np.linspace(0, field_width, horizontal_zones + 1)
    vertical_lines = np.linspace(0, field_length, vertical_zones + 1)
    x_centers = (vertical_lines[:-1] + vertical_lines[1:]) / 2
    y_centers = (horizontal_lines[:-1] + horizontal_lines[1:]) / 2

    # Colormap and default title per possession type; anything else is neutral
    styles = {
        'scoring': (plt.cm.Greens, "Actions in Scoring Possessions"),
        'conceding': (plt.cm.Reds, "Actions in Conceding Possessions"),
    }
    cmap, default_title = styles.get(
        possession_type, (plt.cm.Blues, "Actions in Neutral Possessions")
    )

    # Filter dataset based on possession type
    scores = df['possession_scores'].eq(True)
    concedes = df['possession_concedes'].eq(True)
    if possession_type == 'scoring':
        filtered_df = df[scores]
    elif possession_type == 'conceding':
        filtered_df = df[concedes]
    else:
        # Build each flag mask on its own before combining: `&` binds tighter
        # than `!=`, so an unparenthesised `a != True & b != True` would not
        # express "neither scoring nor conceding".
        filtered_df = df[~scores & ~concedes]

    # Count actions by zone
    heatmap_data = np.zeros((horizontal_zones, vertical_zones))
    for zone, count in filtered_df.groupby('zone').size().items():
        # Zone labels are assumed to look like 'Zone i-j' (1-based row i, column j)
        row, col = (int(part) - 1 for part in zone.replace('Zone ', '').split('-'))
        heatmap_data[row, col] = count

    # Create a pitch and heatmap (pitch lines are drawn above the mesh)
    pitch = Pitch(pitch_type='custom', pitch_length=field_length, pitch_width=field_width,
                  line_color='black', line_zorder=2)
    fig, ax = pitch.draw(figsize=(12, 8))
    X, Y = np.meshgrid(vertical_lines, horizontal_lines)
    mesh = ax.pcolormesh(X, Y, heatmap_data, cmap=cmap, alpha=0.7, zorder=1,
                         vmax=heatmap_data.max())
    cbar = fig.colorbar(mesh, ax=ax)
    cbar.set_label('Number of Actions')

    # Add each zone's share of the selected actions; guard against an empty selection
    total_actions = len(filtered_df)
    for i, j in np.ndindex(heatmap_data.shape):
        value = heatmap_data[i, j] / total_actions if total_actions > 0 else 0
        if value > 0:  # Only show non-zero values
            ax.text(x_centers[j], y_centers[i], f"{value * 100 :.1f}%", ha='center', va='center',
                    fontsize=10, color='black', fontweight='bold', zorder=3)

    # Add row and column labels
    for i, y_pos in enumerate(y_centers):
        ax.text(-3, y_pos, f"{i+1}", ha='right', va='center', fontsize=10)

    for j, x_pos in enumerate(x_centers):
        ax.text(x_pos, field_width + 5, f"{j+1}", ha='center', va='bottom', fontsize=10)

    # Set title
    ax.set_title(title if title else default_title, fontsize=16)
    # Flip the y-axis on the axes we drew on rather than on pyplot's current axes
    ax.invert_yaxis()

    return fig, ax


# One heatmap per possession category: scoring, conceding, and neutral.
# Any possession_type other than 'scoring'/'conceding' (here 'neutral')
# takes the function's fallback branch.
fig1, ax1 = create_zone_action_heatmap(actions, possession_type='scoring')
plt.show()
fig2, ax2 = create_zone_action_heatmap(actions, possession_type='conceding')
plt.show()
fig3, ax3 = create_zone_action_heatmap(actions, possession_type='neutral')
plt.show()