# GVSA Team Analysis

This notebook provides a deep-dive analysis of a single team's season.

## Usage
1. Set the `target_team_name` and `target_season` variables below
2. Run all cells to see comprehensive analysis including:
   - Team performance metrics
   - Distribution comparisons (teams in the same age group vs. all teams)
   - Visualizations showing where the team ranks
   - Summary statistics

## Example
The default example analyzes "West Coast United 15B Green RP" in Fall 2025, but you can change these variables to analyze any team.


In [None]:
# Import required libraries
import sys
from pathlib import Path

# Database and ORM
from pony.orm import db_session, select, count
from models import db, Season, Division, Team, TeamSeason, Match, Club
from db_pony import GVSA_Database

# Data analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set up plotting
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Select the 'seaborn-v0_8-darkgrid' matplotlib style sheet for all figures
# (presumably requires a matplotlib version that ships this style name — confirm).
plt.style.use('seaborn-v0_8-darkgrid')
# Make seaborn's "husl" palette the default colour cycle.
sns.set_palette("husl")

# Only reached when every import and setup call above succeeded.
print("Libraries imported successfully")


## Connect to Database


In [None]:
# Connect to the database
# NOTE(review): absolute, machine-specific path — adjust for your environment.
db_path = "/projects/gvsa_scrape/gvsa_data2.db"

# Fail early with a clear message: the bind below uses create_db=False, so a
# missing file is never created implicitly.
if not Path(db_path).exists():
    raise FileNotFoundError(f"Database file not found: {db_path}")

# Bind only once per kernel. Pony refuses to bind an already-bound Database
# object, so an unconditional bind makes this cell fail when it is re-run.
if getattr(db, 'provider', None) is None:
    db.bind(provider='sqlite', filename=db_path, create_db=False)
else:
    print("Database object already bound; reusing the existing binding")

# Likewise, the entity mapping can only be generated once per Database object.
if not getattr(db, 'schema', None):
    db.generate_mapping(create_tables=False)

# Initialize database connection
gvsa_db = GVSA_Database(db_path)

print(f"Connected to database: {db_path}")
print(f"Database file exists: {Path(db_path).exists()}")

# Verify connection with a trivial query
with db_session:
    season_count = count(s for s in Season)
    print(f"Seasons in database: {season_count}")


## Configure Target Team

**Modify these variables to analyze a different team:**


In [None]:
# Configuration: the two knobs every later cell reads.
# Edit both values to point the notebook at a different team / season.
target_season = "Fall 2025"
target_team_name = "West Coast United 15B Green RP"

# Echo the active configuration so the rendered notebook records it.
print(f"Target team: '{target_team_name}'")
print(f"Target season: {target_season}")


## Find and Load Target Team


In [None]:
# Programmatically search for the target team
print(f"Searching for team: '{target_team_name}'")
print(f"Target season: {target_season}")
print("=" * 80)

with db_session:
    name_lower = target_team_name.lower()

    # Candidate substrings, most specific first. dict.fromkeys drops duplicates
    # (e.g. when the name has no hyphen) while preserving order, so the same
    # query is not issued twice.
    search_terms = list(dict.fromkeys([
        name_lower,
        name_lower.replace(" ", ""),
        name_lower.replace("-", " "),
    ]))

    target_team_season = None
    for search_term in search_terms:
        team_seasons = list(select(
            ts for ts in TeamSeason
            if search_term in ts.team_name.lower()
            and target_season in ts.division.season.season_name
        ))
        if team_seasons:
            # Prefer an exact (case-insensitive) name match over whichever
            # substring hit the database happens to return first.
            exact_matches = [
                ts for ts in team_seasons
                if ts.team_name.lower() == name_lower
            ]
            target_team_season = (exact_matches or team_seasons)[0]
            if len(team_seasons) > 1:
                print(f"Note: {len(team_seasons)} teams matched '{search_term}'; "
                      f"using '{target_team_season.team_name}'")
            break

    if not target_team_season:
        print(f"Warning: Could not find exact match for '{target_team_name}'")
        print("\nSearching for similar teams...")
        # Broader search: any of the first three words of the requested name.
        # The word test runs in Python over the season's rows instead of
        # inside the ORM query, because a nested any(...) over a plain Python
        # list is not something the query translator can be relied on to
        # turn into SQL.
        name_words = name_lower.split()[:3]
        season_team_seasons = select(
            ts for ts in TeamSeason
            if target_season in ts.division.season.season_name
        )
        similar_teams = [
            ts for ts in season_team_seasons
            if any(word in (ts.team_name or "").lower() for word in name_words)
        ]
        if similar_teams:
            print(f"Found {len(similar_teams)} similar teams:")
            for ts in similar_teams:
                print(f"  - {ts.team_name} ({ts.division.division_name})")
            target_team_season = similar_teams[0]  # Use first match
        else:
            print("No similar teams found. Please verify the team name and season.")

    if target_team_season:
        print(f"\nFound team: {target_team_season.team_name}")
        print(f"  Division: {target_team_season.division.division_name}")
        print(f"  Season: {target_team_season.division.season.season_name}")
        print(f"  Record: {target_team_season.wins}W-{target_team_season.losses}L-{target_team_season.ties}T")
        print(f"  Points: {target_team_season.points}")
        print(f"  Goals For: {target_team_season.goals_for}")
        print(f"  Goals Against: {target_team_season.goals_against}")
        print(f"  Goal Differential: {target_team_season.goal_differential}")


## Load Season Data

Load all teams for the target season for comparison purposes.


In [None]:
# Load all teams for target season
# Start from empty frames so later cells can test len(...) even when the
# target team was not found.
df_season = pd.DataFrame()
df_age_group = pd.DataFrame()
df_club_teams = pd.DataFrame()

if target_team_season:
    with db_session:
        # Get the target season
        # NOTE(review): target_team_season was loaded in an earlier db_session;
        # this relies on the ORM still allowing access to it here — confirm.
        season_obj = target_team_season.division.season
        print(f"Loading data for season: {season_obj.season_name}")

        # Load all team seasons for this season as plain dict records
        season_query = select(ts for ts in TeamSeason if ts.division.season == season_obj)
        all_season_data = [
            {
                'team_name': ts.team_name,
                'division': ts.division.division_name,
                'wins': ts.wins,
                'losses': ts.losses,
                'ties': ts.ties,
                'points': ts.points,
                'goals_for': ts.goals_for,
                'goals_against': ts.goals_against,
                'goal_differential': ts.goal_differential,
            }
            for ts in season_query
        ]

        df_season = pd.DataFrame(all_season_data)
        print(f"Loaded {len(df_season)} teams for {season_obj.season_name}")

        if len(df_season) > 0:
            # Extract age group from target team's division (e.g., "U11" from "U11 Boys 5th Division")
            target_division = target_team_season.division.division_name
            age_group_keywords = ['U9', 'U10', 'U11', 'U12', 'U13', 'U14', 'U15', 'U16', 'U17', 'U18', 'U19']
            age_group = next(
                (keyword for keyword in age_group_keywords if keyword in target_division),
                None,
            )

            if age_group:
                # Filter for same age group teams
                df_age_group = df_season[df_season['division'].str.contains(age_group, case=False, na=False)].copy()
                print(f"Found {len(df_age_group)} {age_group} teams in {season_obj.season_name}")

            # Identify teams from same club (if club info available).
            # Teams are matched on the first word of the club name.
            if target_team_season.team and target_team_season.team.club:
                club_name = target_team_season.team.club.name
                club_words = club_name.split() if club_name else []
                if club_words:
                    # regex=False: the club word is literal text, so names
                    # containing regex metacharacters ('.', '(', '+', ...)
                    # neither mis-match nor raise.
                    df_club_teams = df_season[df_season['team_name'].str.contains(
                        club_words[0], case=False, na=False, regex=False
                    )].copy()
                if len(df_club_teams) > 0:
                    print(f"Found {len(df_club_teams)} teams from same club")
                    print("\nClub teams:")
                    for club_team, club_division in zip(df_club_teams['team_name'], df_club_teams['division']):
                        marker = " <-- TARGET" if club_team == target_team_season.team_name else ""
                        print(f"  - {club_team} ({club_division}){marker}")
        else:
            print("Warning: No teams found for this season")
else:
    print("Target team not found. Please check the team name and season.")


In [None]:
def _locate_team_row(df: pd.DataFrame, team_name: str) -> pd.DataFrame:
    """Return the rows of ``df`` for ``team_name``: exact match first, else literal substring match."""
    exact = df[df['team_name'] == team_name]
    if len(exact) > 0:
        return exact
    # regex=False: team names are literal text and may contain characters
    # such as '(' or '+' that would otherwise be interpreted as a pattern.
    return df[df['team_name'].str.contains(team_name, case=False, na=False, regex=False)]


def plot_distribution_with_highlights(
    df: pd.DataFrame,
    metric: str,
    title: str,
    xlabel: str,
    target_team_name: str = None,
    highlight_teams: pd.DataFrame = None,
    subset_label: str = "All Teams",
    bins: int = 30,
) -> tuple:
    """
    Plot a histogram of one metric and mark where specific teams fall on it.

    Parameters
    ----------
    df : pd.DataFrame
        Teams to plot; must contain ``metric`` and, for target highlighting,
        a ``team_name`` column.
    metric : str
        Column name to plot.
    title : str
        Plot title.
    xlabel : str
        X-axis label.
    target_team_name : str, optional
        Team to mark with a solid red line. Matched exactly first, then as a
        case-insensitive literal substring; the first matching row is used.
    highlight_teams : pd.DataFrame, optional
        Additional teams (e.g. same club) marked with dashed orange lines.
        Ignored when empty or when it lacks the ``metric`` column.
    subset_label : str
        Legend label for the histogram.
    bins : int
        Number of histogram bins (default 30).

    Returns
    -------
    tuple
        ``(fig, ax)`` matplotlib objects, or ``(None, None)`` when ``df`` is
        None/empty or lacks the ``metric`` column (a warning is printed).
    """
    # Nothing to plot: report and bail out before creating a figure.
    if df is None or len(df) == 0:
        print(f"Warning: No data available for {title}")
        return None, None

    if metric not in df.columns:
        print(f"Warning: Column '{metric}' not found in DataFrame")
        return None, None

    fig, ax = plt.subplots(figsize=(12, 6))

    # Main distribution
    sns.histplot(data=df, x=metric, bins=bins, alpha=0.6, color='steelblue',
                 edgecolor='black', kde=False, ax=ax, label=f'{subset_label} (n={len(df)})')

    # Highlight additional teams (e.g., same club)
    if highlight_teams is not None and len(highlight_teams) > 0 and metric in highlight_teams.columns:
        # Missing values have no position on the axis, so skip them.
        highlight_values = highlight_teams[metric].dropna()
        for position, val in enumerate(highlight_values):
            # Only the first line gets a legend entry to avoid duplicates.
            ax.axvline(val, color='orange', linestyle='--', linewidth=2, alpha=0.7,
                       label='Highlighted teams' if position == 0 else '_nolegend_')
        ax.text(0.02, 0.98, f'Highlighted teams: {len(highlight_teams)}',
                transform=ax.transAxes, fontsize=10, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='orange', alpha=0.3))

    # Highlight target team
    if target_team_name is not None and 'team_name' in df.columns:
        target_row = _locate_team_row(df, target_team_name)

        if len(target_row) > 0:
            target_value = target_row[metric].iloc[0]
            actual_team_name = target_row['team_name'].iloc[0]

            if pd.isna(target_value):
                # A missing value cannot be drawn or ranked meaningfully.
                print(f"Warning: '{actual_team_name}' has no value for '{metric}'")
            else:
                ax.axvline(target_value, color='red', linestyle='-', linewidth=3,
                           label=f'Target: {actual_team_name}')
                # Truncate very long names so the annotation box stays readable.
                display_name = actual_team_name[:40] + "..." if len(actual_team_name) > 40 else actual_team_name
                ax.text(0.02, 0.92, f'Target team ({display_name}): {target_value}',
                        transform=ax.transAxes, fontsize=10, verticalalignment='top',
                        bbox=dict(boxstyle='round', facecolor='red', alpha=0.3))

                # Share of plotted teams whose value is <= the target's value.
                percentile = (df[metric] <= target_value).sum() / len(df) * 100
                ax.text(0.02, 0.86, f'Percentile: {percentile:.1f}%',
                        transform=ax.transAxes, fontsize=10, verticalalignment='top',
                        bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.3))

    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel('Number of Teams', fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    return fig, ax


## Distribution Analysis: Age Group Comparison

Compare the target team to all teams in the same age group.


In [None]:
# Display name of the target team; stays None when no team was found
target_name_viz = target_team_season.team_name if target_team_season else None

if not (len(df_age_group) > 0 and target_name_viz):
    print("Age group data not available for comparison")
else:
    print(f"Comparing {target_name_viz} to {len(df_age_group)} age group teams")

    # Goals scored, compared against the same age group
    plot_distribution_with_highlights(
        df=df_age_group,
        metric='goals_for',
        title=f'Goals For Distribution - Age Group Teams ({target_season})',
        xlabel='Goals For',
        target_team_name=target_name_viz,
        highlight_teams=None if len(df_club_teams) == 0 else df_club_teams,
        subset_label="Age Group Teams",
    )
    plt.show()


In [None]:
# Goals conceded, compared against the same age group
if len(df_age_group) > 0 and target_name_viz:
    plot_distribution_with_highlights(
        df=df_age_group,
        metric='goals_against',
        title=f'Goals Against Distribution - Age Group Teams ({target_season})',
        xlabel='Goals Against',
        target_team_name=target_name_viz,
        highlight_teams=None if len(df_club_teams) == 0 else df_club_teams,
        subset_label="Age Group Teams",
    )
    plt.show()


In [None]:
# Goal differential, compared against the same age group
if len(df_age_group) > 0 and target_name_viz:
    plot_distribution_with_highlights(
        df=df_age_group,
        metric='goal_differential',
        title=f'Goal Differential Distribution - Age Group Teams ({target_season})',
        xlabel='Goal Differential',
        target_team_name=target_name_viz,
        highlight_teams=None if len(df_club_teams) == 0 else df_club_teams,
        subset_label="Age Group Teams",
    )
    plt.show()


In [None]:
# League points, compared against the same age group
if len(df_age_group) > 0 and target_name_viz:
    plot_distribution_with_highlights(
        df=df_age_group,
        metric='points',
        title=f'Points Distribution - Age Group Teams ({target_season})',
        xlabel='Points',
        target_team_name=target_name_viz,
        highlight_teams=None if len(df_club_teams) == 0 else df_club_teams,
        subset_label="Age Group Teams",
    )
    plt.show()


In [None]:
# Goals scored, compared against every team in the season
if len(df_season) > 0 and target_name_viz:
    print(f"Comparing {target_name_viz} to {len(df_season)} total teams in season")

    plot_distribution_with_highlights(
        df=df_season,
        metric='goals_for',
        title=f'Goals For Distribution - ALL Teams ({target_season})',
        xlabel='Goals For',
        target_team_name=target_name_viz,
        highlight_teams=None if len(df_club_teams) == 0 else df_club_teams,
        subset_label="All Teams",
    )
    plt.show()


In [None]:
# Goals conceded, compared against every team in the season
if len(df_season) > 0 and target_name_viz:
    plot_distribution_with_highlights(
        df=df_season,
        metric='goals_against',
        title=f'Goals Against Distribution - ALL Teams ({target_season})',
        xlabel='Goals Against',
        target_team_name=target_name_viz,
        highlight_teams=None if len(df_club_teams) == 0 else df_club_teams,
        subset_label="All Teams",
    )
    plt.show()


In [None]:
# Goal differential, compared against every team in the season
if len(df_season) > 0 and target_name_viz:
    plot_distribution_with_highlights(
        df=df_season,
        metric='goal_differential',
        title=f'Goal Differential Distribution - ALL Teams ({target_season})',
        xlabel='Goal Differential',
        target_team_name=target_name_viz,
        highlight_teams=None if len(df_club_teams) == 0 else df_club_teams,
        subset_label="All Teams",
    )
    plt.show()


In [None]:
# League points, compared against every team in the season
if len(df_season) > 0 and target_name_viz:
    plot_distribution_with_highlights(
        df=df_season,
        metric='points',
        title=f'Points Distribution - ALL Teams ({target_season})',
        xlabel='Points',
        target_team_name=target_name_viz,
        highlight_teams=None if len(df_club_teams) == 0 else df_club_teams,
        subset_label="All Teams",
    )
    plt.show()


## Summary Statistics

Compare the target team's performance to age group and all teams.


In [None]:
# Create summary comparison
if target_team_season and target_name_viz:
    # Look the target up in the age-group frame first, then in the full
    # season frame. Frames without a 'team_name' column (the empty
    # placeholders) are skipped so the lookup cannot raise KeyError.
    target_matches = pd.DataFrame()
    for candidate_df in (df_age_group, df_season):
        if 'team_name' not in candidate_df.columns:
            continue
        target_matches = candidate_df[candidate_df['team_name'] == target_name_viz]
        if len(target_matches) > 0:
            break

    if len(target_matches) > 0:
        target_row = target_matches.iloc[0]

        print("=" * 80)
        print(f"TARGET TEAM: {target_row['team_name']}")
        print(f"Division: {target_row['division']}")
        print("=" * 80)

        metrics = ['goals_for', 'goals_against', 'goal_differential', 'points']
        # Column-label prefix -> frame the target is compared against.
        comparison_frames = {'Age Group': df_age_group, 'All Teams': df_season}

        comparison_data = []
        for metric in metrics:
            if metric not in target_row.index:
                continue

            target_val = target_row[metric]
            summary_row = {
                'Metric': metric.replace('_', ' ').title(),
                'Target Value': target_val,
            }

            for label, frame in comparison_frames.items():
                if len(frame) > 0 and metric in frame.columns:
                    frame_mean = frame[metric].mean()
                    # Share of teams in the frame whose value is <= the target's.
                    frame_percentile = (frame[metric] <= target_val).sum() / len(frame) * 100
                    summary_row[f'{label} Mean'] = f"{frame_mean:.1f}"
                    summary_row[f'{label} Percentile'] = f"{frame_percentile:.1f}%"
                else:
                    summary_row[f'{label} Mean'] = "N/A"
                    summary_row[f'{label} Percentile'] = "N/A"

            comparison_data.append(summary_row)

        if comparison_data:
            df_comparison = pd.DataFrame(comparison_data)
            print("\nComparison Summary:")
            print(df_comparison.to_string(index=False))

        if len(df_club_teams) > 0:
            print("\n" + "=" * 80)
            print("Club Teams Summary:")
            print("=" * 80)
            required_cols = ['team_name', 'division', 'goals_for', 'goals_against', 'goal_differential', 'points']
            available_cols = [col for col in required_cols if col in df_club_teams.columns]
            if available_cols:
                print(df_club_teams[available_cols].to_string(index=False))
    else:
        # Previously this case produced no output at all.
        print(f"Target team '{target_name_viz}' was not found in the loaded season data.")
else:
    print("Target team data not available. Please verify team name and season.")
