# 1. Import & Setup

In [1]:
# Import the libraries used throughout the notebook:
#   - pandas for data manipulation
#   - matplotlib / seaborn for static plot styling
#   - plotly (express, graph_objects, subplots) for the interactive charts
#   - os to list the input files, warnings to silence warning messages
import os
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots

# Silence every warning so library notices do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# Set the plot visualization style using seaborn.
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and later
# releases dropped the plain "seaborn" alias, so use whichever name is available.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)
sns.set(font_scale=1.1)

# List every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/all-football-players-stats-in-top-5-leagues-2425/top5-players24-25.xlsx


## 💡 Background

In the modern football industry, young players are not just squad reinforcements - they are long-term investments for clubs. Increasingly congested fixture schedules, commercial pressures and the risk of injury to first-team players make the need for regeneration more urgent.

On the other hand, clubs now have access to very detailed performance data. This opens up the opportunity to objectively evaluate young players' contributions - not only based on goals and assists but also through advanced metrics such as expected goals (xG), expected assisted goals (xAG) and other progressive actions.

This analysis aims to examine how young players perform in Europe's top leagues. To what extent are they contributing? Are they ready to become mainstays or do they still need time to develop? These findings are expected to provide insights for clubs in designing rotation strategies and future investments.

## 🎯 Analysis Objectives
This notebook aims to explore the performance of young players (aged <= 21 years) from Europe's top 5 leagues (Premier League, La Liga, Bundesliga, Serie A, Ligue 1). The main focus includes:
- Frequency and pattern of young players' playing minutes
- Productivity in scoring goals and making assists
- Player effectiveness in converting xG (Expected Goals) and xAG (Expected Assisted Goals)
- Distribution of young players' positions
- Identification of leagues and clubs that best support young talent development

## 🔍 Methodology
- Dataset: Player statistics from the top five European leagues for the 2024-25 season.

    - Young Players: Defined as players aged less than or equal to 21 years (<=21).

- Key Stats:

    - Finishing: Goals, Non-Penalty Goals, xG, Gls/90, Gls/xG

    - Playmaking: Assists, xAG, Progressive Passes, Ast/90

    - Defending: Minutes, Position, Cards, and non-goal contributions

- Tools:

    - Python (Pandas, Plotly, Seaborn, Scikit-learn)

    - Clustering: Players are grouped based on dominant roles (Finisher, Playmaker, Defender) through KMeans & radar profiling approaches.

# 2. Load Data

In [2]:
# Location of the Excel workbook with the player statistics.
PATH = '/kaggle/input/all-football-players-stats-in-top-5-leagues-2425/top5-players24-25.xlsx'

# Read the workbook into a DataFrame.
df = pd.read_excel(PATH)

# Preview the first five rows (head() defaults to n=5) to see the data structure.
df.head()

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Gls_90,Ast_90,G+A_90,G-PK_90,G+A-PK_90,xG_90,xAG_90,xG+xAG_90,npxG_90,npxG+xAG_90
0,1,Max Aarons,eng ENG,DF,Bournemouth,eng Premier League,24.0,2000.0,3,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Max Aarons,eng ENG,"DF,MF",Valencia,es La Liga,24.0,2000.0,4,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.02
2,3,Rodrigo Abajas,es ESP,DF,Valencia,es La Liga,21.0,2003.0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.1,0.1,0.1
3,4,James Abankwah,ie IRL,"DF,MF",Udinese,it Serie A,20.0,2004.0,6,0,...,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.06,0.06,0.06
4,5,Keyliane Abdallah,fr FRA,FW,Marseille,fr Ligue 1,18.0,2006.0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 📊 Initial Data Check and Cleaning
This section aims to check data quality, such as missing values and data duplication, and perform basic cleaning if needed.

In [3]:
# Count the missing entries of every column once and reuse the result below.
null_counts = df.isna().sum()

# Overall number of missing cells in the dataset.
print("\n--- Total Missing Values on All Dataset ---")
total_missing = null_counts.sum()
print(
    f"Total missing values: {total_missing}"
    if total_missing > 0
    else "No missing values in dataset."
)

# Per-column breakdown, restricted to the columns that actually have gaps.
print("\n--- Missing Values per columns (if any) ---")
missing_per_column = null_counts.loc[null_counts > 0]
if missing_per_column.empty:
    print("No missing values per columns.")
else:
    print(missing_per_column)


--- Total Missing Values on All Dataset ---
Total missing values: 23

--- Missing Values per columns (if any) ---
Nation    7
Age       8
Born      8
dtype: int64


In [4]:
# --- Missing values: overall total ---
print("\n--- Total Missing Values Across the Entire Dataset ---")
null_counts = df.isna().sum()
total_missing = null_counts.sum()
if total_missing > 0:
    print(f"Total missing values: {total_missing}")
else:
    print("There are no missing values in the dataset.")

# --- Missing values: per column, only columns that have gaps ---
print("\n--- Missing Values per Column (if any) ---")
missing_per_column = null_counts[null_counts > 0]
if missing_per_column.empty:
    print("There are no missing values per column.")
else:
    print(missing_per_column)

# --- Fully duplicated rows ---
print("\n--- Total Duplicate Values Across Entire Dataset ---")
total_duplicate = df.duplicated().sum()
if total_duplicate > 0:
    print(f"Total duplicate values: {total_duplicate}")
else:
    print("There are no duplicate values in the dataset.")

# Show the rows that are missing 'Nation', 'Age', or 'Born'.
has_gap = df[['Nation', 'Age', 'Born']].isna().any(axis=1)
missing_rows = df.loc[has_gap, ['Player', 'Squad', 'Nation', 'Age', 'Born']]
print(missing_rows)

# Drop every row with a missing value in ANY column. Note that this also
# removes rows where only 'Nation' is missing while 'Age' is known.
df_cleaned = df.dropna()

# Verify that no missing values remain after cleaning.
print("\n--- Missing Values per Column (after cleaning) ---")
missing_per_column = df_cleaned.isna().sum()
missing_per_column = missing_per_column[missing_per_column > 0]
if missing_per_column.empty:
    print("No missing values per column.")
else:
    print(missing_per_column)


--- Total Missing Values Across the Entire Dataset ---
Total missing values: 23

--- Missing Values per Column (if any) ---
Nation    7
Age       8
Born      8
dtype: int64

--- Total Duplicate Values Across Entire Dataset ---
There are no duplicate values in the dataset.
                   Player           Squad  Nation   Age    Born
100         Olabade Aluko  Leicester City     NaN   NaN     NaN
273        Hannes Behrens      Hoffenheim  de GER   NaN     NaN
663   Pape Daouda Diongue      Strasbourg  sn SEN   NaN     NaN
862            Jake Evans  Leicester City     NaN   NaN     NaN
1320       Atakan Karazor       Stuttgart     NaN  27.0  1996.0
1518            Fer López      Celta Vigo     NaN   NaN     NaN
1603          Mateus Mane          Wolves     NaN   NaN     NaN
1769        Max Moerstedt      Hoffenheim  de GER   NaN     NaN
1778         Jeremy Monga  Leicester City     NaN   NaN     NaN
1917       Plamedi Nsingi          Nantes     NaN  23.0  2000.0

--- Missing Values pe

In [5]:
# Display the first 5 rows of the cleaned DataFrame. A notebook only renders
# the LAST expression of a cell, so this preview must be printed explicitly;
# a bare `df_cleaned.head()` here would be evaluated and silently discarded.
print(df_cleaned.head())

# Summary information about the DataFrame (data types, non-null counts).
# info() writes its report directly to the cell output.
df_cleaned.info()

# Descriptive statistics for the numeric columns (mean, standard deviation,
# quartiles, ...). As the last expression, this is rendered as the cell result.
df_cleaned.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 2844 entries, 0 to 2853
Data columns (total 37 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Rk           2844 non-null   int64  
 1   Player       2844 non-null   object 
 2   Nation       2844 non-null   object 
 3   Pos          2844 non-null   object 
 4   Squad        2844 non-null   object 
 5   Comp         2844 non-null   object 
 6   Age          2844 non-null   float64
 7   Born         2844 non-null   float64
 8   MP           2844 non-null   int64  
 9   Starts       2844 non-null   int64  
 10  Min          2844 non-null   int64  
 11  90s          2844 non-null   float64
 12  Gls          2844 non-null   int64  
 13  Ast          2844 non-null   int64  
 14  G+A          2844 non-null   int64  
 15  G-PK         2844 non-null   int64  
 16  PK           2844 non-null   int64  
 17  PKatt        2844 non-null   int64  
 18  CrdY         2844 non-null   int64  
 19  CrdR       

Unnamed: 0,Rk,Age,Born,MP,Starts,Min,90s,Gls,Ast,G+A,...,Gls_90,Ast_90,G+A_90,G-PK_90,G+A-PK_90,xG_90,xAG_90,xG+xAG_90,npxG_90,npxG+xAG_90
count,2844.0,2844.0,2844.0,2844.0,2844.0,2844.0,2844.0,2844.0,2844.0,2844.0,...,2844.0,2844.0,2844.0,2844.0,2844.0,2844.0,2844.0,2844.0,2844.0,2844.0
mean,1428.365682,25.018987,1998.637482,19.061181,13.537271,1215.077707,13.500141,1.688115,1.20429,2.892405,...,0.112032,0.076909,0.188949,0.104434,0.181357,0.135591,0.088878,0.224272,0.12769,0.216428
std,824.506208,4.49289,4.499159,11.499917,11.324974,965.234586,10.725007,3.160629,1.948538,4.538927,...,0.196378,0.136747,0.264578,0.186588,0.253648,0.218226,0.124636,0.27276,0.209537,0.263716
min,1.0,15.0,1982.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,714.75,22.0,1996.0,9.0,3.0,320.75,3.6,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0475,0.01,0.04
50%,1427.5,25.0,1999.0,20.0,11.0,1059.0,11.8,0.0,0.0,1.0,...,0.0,0.0,0.08,0.0,0.08,0.06,0.06,0.14,0.06,0.14
75%,2143.25,28.0,2002.0,30.0,23.0,1999.5,22.2,2.0,2.0,4.0,...,0.16,0.12,0.29,0.1425,0.28,0.18,0.13,0.33,0.17,0.32
max,2854.0,41.0,2008.0,38.0,38.0,3420.0,38.0,31.0,18.0,47.0,...,2.43,2.65,2.65,2.43,2.65,5.06,2.47,5.06,5.06,5.06


# 3. Exploratory Dataset Analysis

## 📈 Player Age Distribution
This section analyzes the overall age distribution of players per league. The histogram visualization will show the age distribution, with vertical lines indicating the mean and median age to show the data center.

In [6]:
# Calculate the mean and median values of player ages.
mean_age = df_cleaned['Age'].mean()
median_age = df_cleaned['Age'].median()

# Create a histogram of the distribution of player ages across leagues.
fig = px.histogram(
    df_cleaned,
    x="Age",
    nbins=20,  # Number of 'bins' or age groups
    marginal="rug",  # Display a small tick for each data point
    opacity=0.75,
    color_discrete_sequence=['dodgerblue'],  # Color of histogram bars
    title="Player Age Distribution (Across Leagues)",
)

# Add a vertical line at the mean age. Age lives on the x-axis, so the marker
# must be a vertical line (add_vline with x=...); a horizontal line at
# y=mean_age would be drawn against the player-count axis instead.
fig.add_vline(
    x=mean_age,
    line_dash="dash",
    line_color="green",
    annotation_text=f"Mean: {mean_age:.2f}",
    annotation_position="top left"
)

# Add a vertical line at the median age (same reasoning as above).
fig.add_vline(
    x=median_age,
    line_dash="dot",
    line_color="red",
    annotation_text=f"Median: {median_age:.2f}",
    annotation_position="top right"
)

# Center the plot title.
fig.update_layout(title_x=0.5)
fig.show()

The age distribution shows that most professional players are in their prime (22-28 years). The similar mean and median indicate a roughly symmetric distribution. Young players (≤21 years) remain a minority, which underlines the importance of analyzing their contributions.

In [7]:
# 1. Pick the (up to) 6 competitions with the most player rows.
leagues = list(df_cleaned['Comp'].value_counts().index[:6])

# 2. Build a 2 x 3 grid of subplots, one panel per league, titled by league name.
fig = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=leagues,
    horizontal_spacing=0.1,
    vertical_spacing=0.15,
)

# 3. Fill each panel with that league's age histogram plus mean/median markers.
for idx, liga in enumerate(leagues):
    # Map the running index to a 1-based (row, col) position in the grid.
    row, col = (position + 1 for position in divmod(idx, 3))

    data = df_cleaned[df_cleaned['Comp'] == liga]
    mean_age = data['Age'].mean()
    median_age = data['Age'].median()

    # Age histogram for the current league.
    league_histogram = go.Histogram(
        x=data['Age'],
        nbinsx=20,
        marker_color='dodgerblue',
        opacity=0.75,
        name=liga,
        showlegend=False,
    )
    fig.add_trace(league_histogram, row=row, col=col)

    # Vertical reference lines: dashed green for the mean, dotted red for the median.
    reference_lines = (
        (mean_age, 'dash', 'green', 'Average', 'top left'),
        (median_age, 'dot', 'red', 'Median', 'top right'),
    )
    for line_x, line_dash, line_color, line_label, label_corner in reference_lines:
        fig.add_vline(
            x=line_x,
            line_dash=line_dash,
            line_color=line_color,
            row=row,
            col=col,
            annotation_text=f"{line_label}: {line_x:.1f}",
            annotation_position=label_corner,
            annotation_font_size=10,
        )

# 4. Overall figure layout: size, centered title, gap between bars.
fig.update_layout(
    title="Player Age Distribution per League (with Mean & Median)",
    title_x=0.5,
    height=650,
    width=1000,
    bargap=0.05,
)

# Axis labels shared by every panel.
fig.update_xaxes(title_text="Age")
fig.update_yaxes(title_text="Number of Players")

fig.show()

**📊 Player Age Distribution by League**

The age distribution across Europe's top five leagues shows interesting patterns reflecting each league's squad management strategy.

The average and median player age across leagues hover around **24-25**, indicating most players are in their prime.

**🧠 Insights by League:**
- 🇫🇷 Ligue 1
Lowest average age (~24.1).
Distribution concentrated at younger ages, i.e. right-skewed (many young players).
Insight: Ligue 1 is known as a youth development league. Many clubs give opportunities to players aged <23.

- 🇮🇹 Serie A
Average and median around 25, but distribution spreads to older ages.
Insight: Serie A accommodates experienced senior players. A more tactical style allows veteran players to remain competitive.

- 🇩🇪 Bundesliga and 🇪🇸 La Liga
Symmetrical and balanced distribution between young and senior players.
Insight: Both leagues demonstrate good age management in squads, allowing space for regeneration while retaining experienced players.

- 🏴 Premier League
Dense distribution at 25-27 years.
Insight: The EPL tends to maximize mature players, aligning with the high physical and competitive demands.

## 🏃‍♂️ Youth Players Statistics
This section focuses on young players (age <= 21) to analyze their distribution across leagues and clubs, and identify the youngest active players.

In [8]:
# Calculate and display the number of young players (age <= 21) per league.
age_threshold = 21  # Age threshold (inclusive) for 'young player'
young_players = df_cleaned[df_cleaned['Age'] <= age_threshold]
young_players_per_liga = young_players[['Comp']].value_counts().head(5)

print(young_players_per_liga)

# Same counts as a two-column DataFrame, which is the shape px.bar expects.
young_players_per_comp = young_players['Comp'].value_counts().head(5).reset_index()
young_players_per_comp.columns = ['Comp', 'Number of Young Players']

# Create a bar chart showing young players per league.
# The title is derived from age_threshold so it always matches the inclusive
# (<=) filter applied above.
fig = px.bar(
    young_players_per_comp,
    x='Number of Young Players',
    y='Comp',
    orientation='h',  # Horizontal orientation
    color='Number of Young Players',
    color_continuous_scale='YlGnBu',  # Color scale
    title=f'Number of Young Players per League (≤ {age_threshold} Years)'
)

fig.update_layout(
    xaxis_title="Number of Young Players",
    yaxis_title="League",
    yaxis=dict(autorange='reversed'),  # Reverse Y-axis order so the largest bar is on top
    title_x=0.5
)

fig.show()

Comp              
fr Ligue 1            183
es La Liga            152
it Serie A            132
eng Premier League    127
de Bundesliga          98
Name: count, dtype: int64


In [9]:
# Calculate and display the 10 clubs with the most young players.
top_n_clubs = 10

# Printed overview keyed by (club, league) so each club's league is visible.
young_players_per_club_comp = young_players[['Squad', 'Comp']].value_counts().head(top_n_clubs)
print(young_players_per_club_comp)

# Two-column DataFrame (club, count) used as the chart input.
young_players_per_club = young_players['Squad'].value_counts().head(top_n_clubs).reset_index()
young_players_per_club.columns = ['Squad', 'Number of Young Players']

# Create a bar chart showing clubs with most young players.
# The title is built from top_n_clubs and age_threshold so it matches the
# number of bars shown and the inclusive (<=) age filter.
fig = px.bar(
    young_players_per_club,
    x='Number of Young Players',
    y='Squad',
    orientation='h',
    color='Number of Young Players',
    color_continuous_scale='viridis',
    title=f'Top {top_n_clubs} Clubs with Most Young Players (≤ {age_threshold} Years)'
)

fig.update_layout(
    xaxis_title="Number of Young Players",
    yaxis_title="Club",
    yaxis=dict(autorange='reversed'),  # Largest bar on top
    title_x=0.5
)

fig.show()

Squad           Comp              
Reims           fr Ligue 1            18
Strasbourg      fr Ligue 1            18
Montpellier     fr Ligue 1            14
Valladolid      es La Liga            13
Barcelona       es La Liga            13
Real Madrid     es La Liga            12
Sevilla         es La Liga            11
Tottenham       eng Premier League    11
Lens            fr Ligue 1            11
Eint Frankfurt  de Bundesliga         11
Name: count, dtype: int64


In [10]:
# Share of young players in each club's squad.
young_players = df_cleaned.loc[df_cleaned['Age'] <= age_threshold]

# Squad size per club (all ages).
total_per_club = (
    df_cleaned['Squad']
    .value_counts()
    .reset_index()
    .set_axis(['Squad', 'Total Players'], axis=1)
)

# Number of young players per club.
young_per_club = (
    young_players['Squad']
    .value_counts()
    .reset_index()
    .set_axis(['Squad', 'Young Players'], axis=1)
)

# Left join keeps clubs without any young player; their count becomes 0.
proportion_df = total_per_club.merge(young_per_club, on='Squad', how='left')
proportion_df = proportion_df.assign(
    **{'Young Players': lambda clubs: clubs['Young Players'].fillna(0)}
)
proportion_df = proportion_df.assign(
    **{'Young Proportion (%)': lambda clubs: (100 * clubs['Young Players']) / clubs['Total Players']}
)

# Ten clubs with the highest share of young players.
top_proportion = proportion_df.sort_values(by='Young Proportion (%)', ascending=False).head(10)

# Horizontal bar chart of those ten clubs.
fig = px.bar(
    top_proportion,
    y='Squad',
    x='Young Proportion (%)',
    color='Young Proportion (%)',
    color_continuous_scale='sunsetdark',
    orientation='h',
    title='Top 10 Clubs with Highest Proportion of Young Players',
)

fig.update_layout(
    title_x=0.5,
    xaxis_title="Young Player Proportion (%)",
    yaxis_title="Club",
    yaxis=dict(autorange='reversed'),  # Highest proportion on top
)

fig.show()

**Young Players in Leagues and Clubs**

1. **French League (Ligue 1) Dominates in Number of Young Players:**
    - The chart shows Ligue 1 has the most young players (~180) compared to other major leagues, indicating it's the league that develops/gives most opportunities to U21 players.

2. **French Clubs Lead in Proportion and Number of Young Players:**
    - Strasbourg and Reims (both from Ligue 1) have the most young players (18 each), reinforcing Ligue 1's position as fertile ground for youth development.
    - These clubs also have the highest proportions (>50%), showing young players are a significant part of their squads.

3. **Barcelona Shows Strong Commitment to Young Players:**
    - While not topping in quantity, Barcelona appears in the Top 5 for young players (13) and ranks third in proportion (~50%), consistent with their La Masia academy philosophy.

4. **Varied Approaches Across Leagues and Clubs:**
    - While Ligue 1 leads overall, other leagues like La Liga have clubs strongly invested in youth (Barcelona, Real Madrid).
    - Premier League and Bundesliga have fewer young players overall, but some Bundesliga clubs appear in Top 10 proportions.

5. **Potential for Youth Recruitment:**
    - For clubs/scouts seeking young talent, Ligue 1 (especially Strasbourg, Reims) should be primary targets.
    - Clubs like Barcelona and Real Madrid also merit attention for their youth focus.

Here are the 10 youngest players who appeared in the top 5 European leagues in this dataset.

In [11]:
# The ten youngest players in the cleaned dataset, shown with a compact set of columns.
youngest_player = df_cleaned.sort_values(by='Age').reset_index().iloc[:10]
youngest_columns = ['Player', 'Age', 'Squad', 'Comp', 'Pos', 'Min']
print(youngest_player.loc[:, youngest_columns])

              Player   Age          Squad                Comp    Pos   Min
0       Harry Howell  15.0       Brighton  eng Premier League     FW     7
1  Djylian N'Guessan  15.0  Saint-Étienne          fr Ligue 1     FW   181
2        Mikey Moore  16.0      Tottenham  eng Premier League     FW   366
3     Ayyoub Bouaddi  16.0          Lille          fr Ligue 1     MF  1151
4      Mohamed Meité  16.0         Rennes          fr Ligue 1     FW   544
5         Faik Sakar  16.0     RB Leipzig       de Bundesliga     MF     1
6    Kyllian Antonio  16.0           Lens          fr Ligue 1  DF,MF   139
7       David Otorbi  16.0       Valencia          es La Liga     MF    14
8        Viggo Gebel  16.0     RB Leipzig       de Bundesliga     MF    14
9      Ibrahim Mbaye  16.0      Paris S-G          fr Ligue 1  FW,MF   340


In [12]:
# Three youngest players of every league: sort by age, then keep the first
# three rows encountered within each 'Comp' group.
players_by_age = df_cleaned.sort_values(by='Age')
youngest_players_per_liga = players_by_age.groupby('Comp').head(3).reset_index(drop=True)

# Print a compact view ordered by league name.
league_view = youngest_players_per_liga.loc[:, ['Player', 'Age', 'Squad', 'Comp', 'Pos', 'Min']]
print(league_view.sort_values(by='Comp').reset_index(drop=True))

                Player   Age          Squad                Comp    Pos   Min
0           Faik Sakar  16.0     RB Leipzig       de Bundesliga     MF     1
1          Viggo Gebel  16.0     RB Leipzig       de Bundesliga     MF    14
2    Kacper Koscierski  17.0         Bochum       de Bundesliga     DF     7
3         Harry Howell  15.0       Brighton  eng Premier League     FW     7
4          Mikey Moore  16.0      Tottenham  eng Premier League     FW   366
5      Shumaira Mheuka  16.0        Chelsea  eng Premier League     MF     1
6         David Otorbi  16.0       Valencia          es La Liga     MF    14
7          Héctor Fort  17.0      Barcelona          es La Liga     DF   585
8     Arturo Rodríguez  17.0     Las Palmas          es La Liga     FW    21
9    Djylian N'Guessan  15.0  Saint-Étienne          fr Ligue 1     FW   181
10      Ayyoub Bouaddi  16.0          Lille          fr Ligue 1     MF  1151
11       Mohamed Meité  16.0         Rennes          fr Ligue 1     FW   544

## ⏱️ Playing Minutes and Starter Rate Analysis
This section analyzes how many minutes young players get and how often they start, showing coaches' trust in young players and their potential as future key players.

In [13]:
# Mean playing minutes over all young players.
avg_minutes_young = young_players['Min'].mean()
print(f"Average playing minutes for young players (≤ {age_threshold:.1f} years): {avg_minutes_young:.2f} minutes")

# Mean playing minutes per league, rounded to 2 decimals, highest first.
minutes_by_league = young_players.groupby('Comp')['Min']
avg_min_per_league = minutes_by_league.mean().round(2).sort_values(ascending=False)
print(avg_min_per_league)

# Horizontal bar chart of the per-league averages, labelled with their values.
fig = px.bar(
    avg_min_per_league.reset_index(),
    y='Comp',
    x='Min',
    text='Min',
    color='Min',
    color_continuous_scale='Emrld',
    orientation='h',
    title='Average Playing Minutes for Young Players per League',
)

fig.update_layout(
    title_x=0.5,
    xaxis=dict(title='Average Minutes'),
    yaxis=dict(title='League', autorange='reversed'),  # Highest average on top
)

fig.show()

Average playing minutes for young players (≤ 21.0 years): 688.67 minutes
Comp
eng Premier League    746.83
de Bundesliga         702.61
es La Liga            692.19
fr Ligue 1            684.65
it Serie A            623.90
Name: Min, dtype: float64


In [14]:
# Mean playing minutes of young players per club, rounded to 2 decimals, highest first.
minutes_by_club = young_players.groupby('Squad')['Min']
avg_min_per_club = minutes_by_club.mean().round(2).sort_values(ascending=False)

# Horizontal bar chart of the ten clubs with the highest average.
fig = px.bar(
    avg_min_per_club.iloc[:10].reset_index(),
    y='Squad',
    x='Min',
    text='Min',
    color='Min',
    color_continuous_scale='inferno_r',
    orientation='h',
    title='Top 10 Clubs with Highest Average Minutes for Young Players',
)

fig.update_layout(
    title_x=0.5,
    xaxis=dict(title='Average Minutes'),
    yaxis=dict(title='Club', autorange='reversed'),  # Highest average on top
)

fig.show()

1. **Playing Minutes Don't Always Align with Quantity/Proportion:**
    - While Ligue 1 leads in quantity/proportion, its average minutes (684.65) are below overall average (688.67) and below Premier League, Bundesliga, and La Liga.
    - However, Strasbourg and PSG from Ligue 1 appear in Top 10 for minutes, showing some French clubs do give significant playing time.

2. **Premier League Leads in Average Minutes at League Level, But Specific Clubs Stand Out:**
    - Premier League has highest average minutes (746.83), showing clubs tend to give more playing time to young players.
    - Ipswich Town, Nott'ham Forest, Newcastle Utd, and Brighton from Premier League all appear in Top 10.

3. **La Liga and Bundesliga Provide Consistent Playing Opportunities:**
    - La Liga (692.19) and Bundesliga (702.61) have above-average minutes, showing good opportunities for young players.
    - Celta Vigo, Atlético Madrid, and Barcelona from La Liga appear in Top 10.

4. **Serie A Provides Fewest Minutes on Average:**
    - Serie A has lowest average minutes (623.90), suggesting young players face more challenges getting consistent playing time.

5. **Elite Clubs Provide Significant Playing Time:**
    - Presence of clubs like Barcelona, Atlético Madrid, and PSG in Top 10 shows commitment to integrating young talents even in high-ambition clubs.

In [15]:
# Ten young players with the most minutes played.
minutes_columns = ['Player', 'Min', 'Age', 'Squad', 'Comp']
top_10_min_play_young_player = (
    young_players.loc[:, minutes_columns]
    .sort_values(by='Min', ascending=False)
    .iloc[:10]
    .reset_index(drop=True)
)

print(top_10_min_play_young_player)

# Horizontal bar chart of those ten players, labelled with their minutes.
fig = px.bar(
    top_10_min_play_young_player,
    y='Player',
    x='Min',
    text='Min',
    color='Min',
    color_continuous_scale='bupu',
    orientation='h',
    title='Top 10 Young Players with Most Minutes Played',
)
fig.update_layout(title_x=0.5, yaxis_autorange='reversed')  # Most minutes on top
fig.show()

               Player   Min   Age          Squad                Comp
0        Milos Kerkez  3336  20.0    Bournemouth  eng Premier League
1  Cristhian Mosquera  3319  20.0       Valencia          es La Liga
2         Zion Suzuki  3314  21.0          Parma          it Serie A
3     Bart Verbruggen  3240  21.0       Brighton  eng Premier League
4      Omar El Hilali  3153  20.0       Espanyol          es La Liga
5        Levi Colwill  3149  21.0        Chelsea  eng Premier League
6      Illia Zabarnyi  3109  21.0    Bournemouth  eng Premier League
7       Diego Coppola  2927  20.0  Hellas Verona          it Serie A
8    Mateus Fernandes  2909  20.0    Southampton  eng Premier League
9               Pedri  2879  21.0      Barcelona          es La Liga


In [16]:
# Compare recorded minutes with the '90s' column converted back to minutes
# (90s x 90) for the ten young players with the most minutes.
compare_minutes = (
    young_players.loc[:, ['Player', 'Min', '90s', 'Squad', 'Comp']]
    .sort_values(by='Min', ascending=False)
    .iloc[:10]
    .reset_index(drop=True)
)

# First series: total minutes as recorded.
total_minutes_bar = go.Bar(
    x=compare_minutes['Player'],
    y=compare_minutes['Min'],
    name='Total Minutes',
    marker_color='#97B067',
)

# Second series: minutes implied by the '90s' column.
implied_minutes_bar = go.Bar(
    x=compare_minutes['Player'],
    y=compare_minutes['90s'] * 90,
    name='(90s × 90 minutes)',
    marker_color='#E3DE61',
)

fig = go.Figure(data=[total_minutes_bar, implied_minutes_bar])

fig.update_layout(
    title='Comparison of Total Minutes vs 90s (Young Players)',
    title_x=0.5,
    barmode='group',  # Bars side by side per player
    xaxis_title='Player',
    yaxis_title='Minutes Played',
    legend=dict(orientation='h', x=0.5, xanchor='center', y=1.1),
)

fig.show()

In [17]:
# Young players ordered by matches played and by starts (highest first).
young_players_played = young_players.sort_values(by='MP', ascending=False)
young_players_start = young_players.sort_values(by='Starts', ascending=False)

# Ten young players with the most appearances.
top_young = young_players_played.head(10)

# Grouped bar chart comparing matches played (MP) with starts for those players.
appearance_bars = [
    go.Bar(x=top_young['Player'], y=top_young['MP'], name='Total Appearances', marker_color='#00879E'),
    go.Bar(x=top_young['Player'], y=top_young['Starts'], name='Starts', marker_color='#FFAB5B'),
]
fig = go.Figure(data=appearance_bars)

fig.update_layout(
    title='Young Players with Most Appearances (MP vs Starts)',
    title_x=0.5,
    barmode='group',
    xaxis_title='Player',
    yaxis_title='Number of Matches',
)
fig.show()

**1. Milos Kerkez Dominates in Minutes and Appearances:**
* Milos Kerkez leads with the most minutes (3336) and appearances (~38 matches with ~33-34 starts), showing high consistency and trust from his team.

**2. High Coach Trust in Certain Young Players:**
* Several young players consistently get very high minutes (approaching/exceeding 3000), indicating key roles in their teams.
* The minutes comparison shows most players have minutes close to full 90-minute matches, indicating they often play full or nearly full matches.

**3. Importance of Consistency and Starting Role:**
* Players like Kerkez, Cristhian Mosquera, and Zion Suzuki not only play often but start regularly, showing they're first-choice players.

**4. Diversity of Positions and Leagues:**
* The ten players with the most minutes come from the Premier League, La Liga and Serie A, showing that some young talents are already trusted at the highest professional level across several leagues.

In [18]:
# Calculate starter ratio (Starts/MP) for young players with at least 20 appearances.
# `young_players` is a filtered slice of `df_cleaned`, so the new column is added
# with assign(), which returns a new DataFrame, instead of writing into the slice
# (that would raise SettingWithCopyWarning and is ambiguous about what is mutated).
young_players = young_players.assign(
    starter_ratio=young_players['Starts'] / young_players['MP']
)

# Keep only players with enough appearances for the ratio to be meaningful.
min_appearances = 20
young_players_filtered = young_players[young_players['MP'] >= min_appearances]

# Highest ratio first; ties are broken by the number of appearances.
top_starter = young_players_filtered.sort_values(['starter_ratio', 'MP'], ascending=False).head(10)

# Create bar chart for young players' starter ratio.
fig = px.bar(
    top_starter,
    x='Player',
    y='starter_ratio',
    color='starter_ratio',
    color_continuous_scale='viridis',
    title=f'Young Players Starter Ratio (Minimum {min_appearances} Appearances)',
    labels={'starter_ratio': 'Starter Ratio'},
    hover_data=['Squad', 'MP', 'Starts', '90s']
)
# Print each bar's ratio (2 decimals) above the bar.
fig.update_traces(texttemplate='%{y:.2f}', textposition='outside')
fig.update_layout(title_x=0.5)
fig.show()

In [19]:
# Starter ratio per league: total starts divided by total appearances of all
# young players in that league (a pooled ratio), sorted from highest to lowest.
starter_by_league = (
    young_players.groupby('Comp')[['MP', 'Starts']]
    .sum()
    .assign(**{'Starter Ratio': lambda totals: totals['Starts'] / totals['MP']})
    .sort_values(by='Starter Ratio', ascending=False)
    .reset_index()
)

# Bar chart of the starter ratio per league.
league_bar_options = dict(
    x='Comp',
    y='Starter Ratio',
    color='Starter Ratio',
    color_continuous_scale='plasma',
    title='Young Players Starter Ratio per League',
    labels={'Starter Ratio': 'Starter Ratio'},
)
px.bar(starter_by_league, **league_bar_options).show()

1. **"Anti-Rotation" Young Players with Absolute Key Roles:**
    * The starter ratio chart shows 10 young players with perfect 1.00 ratios (always starting when they play), indicating they're not just promising talents but core pillars of their teams.

2. **La Liga Leads in Giving Starter Roles to Young Players:**
    * La Liga has the highest starter ratio (~0.60 or 60%), showing clubs are more likely to start young players.

3. **Consistency in Premier League and Ligue 1:**
    * These leagues rank second and third with ~0.59 ratios, showing proactive approaches to starting young players.

4. **Bundesliga and Serie A Below Average in Starter Ratio:**
    * These leagues have lower ratios (~0.56-0.57), suggesting young players more often come off the bench or are part of rotations.

**Interim Conclusion:**
- La Liga is most consistent in giving starter roles to young players.
- Ligue 1 stands out in quantity and proportion, showing boldness in introducing new talent.
- Premier League gives the highest average minutes, showing a competitive yet trusting environment.
- There's an "elite" group of young players (like Kerkez, Suzuki, Mosquera) who have become indispensable starters at very young ages.

## ⚽ Young Players' Performance: Goals and Assists
This section analyzes young players' goal and assist contributions, comparing actual goals with Expected Goals (xG) and actual assists with Expected Assisted Goals (xAG) to measure their efficiency.

In [20]:
# Calculate total goals, assists, and G+A (Goals + Assists) for all players.
total_goals = df_cleaned['Gls'].sum()
total_assists = df_cleaned['Ast'].sum()
total_ga = df_cleaned['Gls'].add(df_cleaned['Ast']).sum()

# Calculate totals for young players.
young_goals = young_players['Gls'].sum()
young_assists = young_players['Ast'].sum()
young_ga = young_players['Gls'].add(young_players['Ast']).sum()

# Calculate young players' contribution percentages.
percent_goals = 100 * young_goals / total_goals
percent_assists = 100 * young_assists / total_assists
percent_ga = 100 * young_ga / total_ga

# Print young players' contributions.
print(f"Young players age threshold: ≤ {age_threshold:.2f} years")
print(f"Goal contribution by young players: {young_goals} of {total_goals} ({percent_goals:.2f}%)")
print(f"Assist contribution by young players: {young_assists} of {total_assists} ({percent_assists:.2f}%)")
print(f"Total G+A contribution by young players: {young_ga} of {total_ga} ({percent_ga:.2f}%)")

# Data for pie charts: [young players, everyone else].
goals_data = [young_goals, total_goals - young_goals]
assists_data = [young_assists, total_assists - young_assists]
ga_data = [young_ga, total_ga - young_ga]

labels = ['Young Players', 'Other Players']
colors = ['#F05A7E', '#125B9A']

# Create subplot with 3 pie charts for contribution visualization.
# (make_subplots is already imported in the notebook's import cell.)
fig = make_subplots(
    rows=1, cols=3,
    specs=[[{'type': 'domain'} for _ in range(3)]],
    subplot_titles=["Goal Contribution", "Assist Contribution", "G+A Contribution"]
)

# One donut per metric, each with a short label in the middle of its hole.
donut_specs = [
    ("Goals", goals_data),
    ("Assists", assists_data),
    ("G+A", ga_data),
]
for donut_col, (donut_name, donut_values) in enumerate(donut_specs, start=1):
    fig.add_trace(go.Pie(
        labels=labels,
        values=donut_values,
        name=donut_name,
        marker=dict(colors=colors),
        hole=0.5,  # Donut chart
        pull=[0.1, 0]  # Slightly separate 'Young Players' slice
    ), 1, donut_col)

    # Center the label on this subplot's domain. add_annotation APPENDS to
    # layout.annotations; assigning a fresh annotations list through
    # update_layout would replace the list that also holds the subplot titles
    # and make them disappear.
    donut_domain = fig.get_subplot(1, donut_col)
    fig.add_annotation(
        text=donut_name,
        x=sum(donut_domain.x) / 2,
        y=sum(donut_domain.y) / 2,
        xref='paper',
        yref='paper',
        xanchor='center',
        yanchor='middle',
        font=dict(size=14),
        showarrow=False
    )

# Set plot layout.
fig.update_layout(
    title_text="Goal & Assist Contribution by Young Players (Age ≤ 21 Years)"
)

fig.show()

Young players age threshold: ≤ 21.00 years
Goal contribution by young players: 672 of 4801 (14.00%)
Assist contribution by young players: 534 of 3425 (15.59%)
Total G+A contribution by young players: 1206 of 8226 (14.66%)


1. **Significant Offensive Contribution from Young Players:**
    - Young players (≤21) contribute 14% of total goals, 15.6% of assists, and 14.7% of G+A - significant given their age.

2. **Young Players More Prominent in Assists:**
    - Their assist contribution (15.6%) is slightly higher than goals (14%), suggesting many young players are playmakers, wingers delivering crosses, or creative midfielders focused on chance creation rather than primary scorers.

3. **Great Potential for Future Growth:**
    - These contributions are promising considering they're still developing. Their ability to collectively contribute nearly 15% of total offense shows their potential.

4. **Team Reliance on Senior/Established Players:**
    - Most contributions (~85%) still come from older players (>21), which is natural given experience, physical maturity, and key positions often filled by seniors.

5. **Importance of Investing in Youth Development:**
    - These contributions show youth investments are valuable. These players provide real added value and are future assets.

In [21]:
# Top 10 young goal scorers, with each total split into non-penalty (G-PK) and penalty (PK) goals.
# Note: this rebinds `young_players` to the young cohort sorted by goals; later cells reuse it.
young_players = df_cleaned[df_cleaned['Age'] <= age_threshold].sort_values('Gls', ascending=False)
data = young_players[['Player', 'Gls', 'G-PK', 'PK', 'Age', 'Squad', 'Min']].head(10).reset_index(drop=True)

# Hover details for the total-goals bars. Lines are separated with <br>; the trailing
# escaped newline from the earlier version was dropped because it rendered as literal
# characters in the tooltip.
hover_details = [
    f"Age: {age}<br>Club: {squad}<br>Minutes: {minutes}"
    for age, squad, minutes in zip(data['Age'], data['Squad'], data['Min'])
]

fig = go.Figure()

# Bar for total goals (its own offset group, drawn next to the stacked pair).
fig.add_trace(go.Bar(
    x=data['Player'],
    y=data['Gls'],
    name='Total Goals',
    marker_color='#2B3467',
    offsetgroup=0,
    hovertext=hover_details
))

# Non-penalty goals: bottom segment of the second offset group.
fig.add_trace(go.Bar(
    x=data['Player'],
    y=data['G-PK'],
    name='Non-Penalty Goals',
    marker_color='#BAD7E9',
    offsetgroup=1,
    base=0
))

# Penalty goals: same offset group, based on G-PK so it sits on top of the non-penalty segment.
fig.add_trace(go.Bar(
    x=data['Player'],
    y=data['PK'],
    name='Penalty Goals',
    marker_color='#EB455F',
    offsetgroup=1,
    base=data['G-PK']
))

fig.update_layout(
    barmode='group', # Grouped bars
    title='Top 10 Young Players: Total Goals vs (Non-Penalty + Penalty Goals)',
    xaxis_title='Player',
    yaxis_title='Number of Goals',
    title_x=0.5,
    legend=dict(x=0.5, xanchor='center', y=1.1, orientation='h')
)

fig.show()

In [22]:
# Top 5 young goal scorers in each league, one horizontal bar panel per league.
# Leagues are ordered by how many young players they have (at most 6 fit the 2x3 grid).
leagues = list(young_players['Comp'].value_counts().index[:6])

fig = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=[str(league) for league in leagues],
    horizontal_spacing=0.1,
    vertical_spacing=0.15,
)

for position, league in enumerate(leagues):
    # Fill the grid row by row; plotly rows/cols are 1-based.
    grid_row, grid_col = divmod(position, 3)
    cell = dict(row=grid_row + 1, col=grid_col + 1)

    in_league = young_players['Comp'] == league
    data = young_players[in_league].sort_values('Gls', ascending=False).head(5)

    scorer_bars = go.Bar(
        x=data['Gls'],
        y=data['Player'],
        orientation='h',
        name=league,
        marker_color='#387ADF',
        text=data['Gls'],
        textposition='auto',
    )
    fig.add_trace(scorer_bars, **cell)

    # Reverse the axis so the best scorer is listed at the top of the panel.
    fig.update_yaxes(autorange='reversed', **cell)

fig.update_layout(
    title_text=f"Top 5 Young Goal Scorers (≤ {age_threshold:.1f} Years) per League",
    title_x=0.5,
    showlegend=False,
    width=1000,
    height=600,
)

fig.show()

1. **Diverse Distribution of Top Young Scorers per League:**
    - Ligue 1 stands out with two top scorers (Emanuel Emegha and Bradley Barcola) both scoring 14 goals.
    - Bundesliga also has highly productive young scorers, especially Benjamin Šeško (13) and Jamal Musiala (12).
    - Premier League and La Liga top scorers have 12 (Liam Delap) and 11 (Thierno Barry) goals respectively.
    - Serie A has the lowest top scorers (8 goals each for Assane Diao and Santiago Castro), consistent with fewer minutes.

2. **High Effectiveness of Non-Penalty Goals:**
    - Most goals by young players are from open play. Emanuel Emegha and Bradley Barcola scored all 14 goals without penalties.
    - Players like Benjamin Šeško, Mika Biereth, and Liam Delap have significant penalty contributions but still high open-play goals.

3. **Young Players Already Key Goal Threats:**
    - Scoring 12-14 goals in a season shows these young players are already significant threats.

4. **Offensive Quality in Ligue 1 and Bundesliga:**
    - These leagues, especially Ligue 1, feature the most prolific young scorers reaching double digits.

5. **Importance of Goal Contributions:**
    - Given young players contribute ~14% of total goals, these top scorers drive that contribution, showing real impact.

In [23]:
# Top 5 young assist providers in each league, one horizontal bar panel per league.
# Leagues are ordered by how many young players they have (at most 6 fit the 2x3 grid).
leagues = list(young_players['Comp'].value_counts().index[:6])

fig = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=[str(league) for league in leagues],
    horizontal_spacing=0.1,
    vertical_spacing=0.15,
)

for position, league in enumerate(leagues):
    # Fill the grid row by row; plotly rows/cols are 1-based.
    grid_row, grid_col = divmod(position, 3)
    cell = dict(row=grid_row + 1, col=grid_col + 1)

    in_league = young_players['Comp'] == league
    data = young_players[in_league].sort_values('Ast', ascending=False).head(5)

    assist_bars = go.Bar(
        x=data['Ast'],
        y=data['Player'],
        orientation='h',
        name=league,
        marker_color='#FBA834',
        text=data['Ast'],
        textposition='auto',
    )
    fig.add_trace(assist_bars, **cell)

    # Reverse the axis so the top assist provider is listed first.
    fig.update_yaxes(autorange='reversed', **cell)

fig.update_layout(
    title_text=f"Top 5 Young Assist Makers (≤ {age_threshold:.1f} Years) per League",
    title_x=0.5,
    showlegend=False,
    width=1000,
    height=600,
)

fig.show()

In [24]:
# Top 5 young players by combined goals and assists (G+A) in each league.
# Rebuilds `young_players` from `df_cleaned` and adds the 'G+A' column to it.
is_young = df_cleaned['Age'] <= age_threshold
young_players = df_cleaned[is_young].assign(
    **{'G+A': lambda frame: frame['Gls'] + frame['Ast']}
)

leagues = list(young_players['Comp'].value_counts().index[:6])

fig = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=[str(league) for league in leagues],
    horizontal_spacing=0.1,
    vertical_spacing=0.15,
)

for position, league in enumerate(leagues):
    # Fill the grid row by row; plotly rows/cols are 1-based.
    grid_row, grid_col = divmod(position, 3)
    cell = dict(row=grid_row + 1, col=grid_col + 1)

    league_rows = young_players[young_players['Comp'] == league]
    data = league_rows.sort_values('G+A', ascending=False).head(5)

    # Tooltip replaces the default hover with the goal/assist breakdown and club.
    breakdown = [
        f"Goals: {g} | Assists: {a} | Club: {s}"
        for g, a, s in zip(data['Gls'], data['Ast'], data['Squad'])
    ]

    fig.add_trace(
        go.Bar(
            x=data['G+A'],
            y=data['Player'],
            orientation='h',
            name=league,
            marker_color='#512B81',
            text=data['G+A'],
            textposition='auto',
            hovertext=breakdown,
            hoverinfo='text',
        ),
        **cell,
    )

    # Reverse the axis so the highest G+A is listed first.
    fig.update_yaxes(autorange='reversed', **cell)

fig.update_layout(
    title_text=f"Top 5 Young Players (≤ {age_threshold:.1f} Years) by G+A per League",
    title_x=0.5,
    showlegend=False,
    width=1000,
    height=650,
)

fig.show()

In [25]:
# Interactive leaderboard: the 30 young players with the highest G+A.
# G+A is recomputed here so the cell does not depend on the column already existing.
young_players['G+A'] = young_players['Gls'].add(young_players['Ast'])

young_leaderboard = young_players.sort_values('G+A', ascending=False)
top_thirty = young_leaderboard.iloc[:30]

# Every listed column is shown in the hover tooltip.
hover_columns = dict.fromkeys(
    ['Player', 'Gls', 'Ast', 'G+A', 'Age', 'Squad', 'Comp'], True
)

fig = px.bar(
    top_thirty,
    x='G+A',
    y='Player',
    orientation='h',
    text='G+A',
    color='G+A',
    color_continuous_scale='viridis',
    hover_data=hover_columns,
    title=f"Young Players Leaderboard (≤ {age_threshold:.1f} Years) by G+A",
)

# Reverse the y-axis so the leader is drawn at the top.
fig.update_layout(
    height=800,
    title_x=0.5,
    xaxis_title="Goals + Assists",
    yaxis_title="Player",
    yaxis=dict(autorange='reversed'),
)

fig.show()

**1. Bradley Barcola and Florian Wirtz are Europe's Most Influential Young Players in G+A:**
* Barcola (24 G+A) and Wirtz (22 G+A) lead as young players with highest combined offensive contributions.

**2. Most Creative Young Talents (Assists) Across Leagues:**
* Lamine Yamal (La Liga) leads with 13 assists.
* Florian Wirtz (Bundesliga) follows with 12 assists.
* Rayan Cherki (Ligue 1) is also highly productive with 11 assists.

**3. Concentration of High G+A Talent in Ligue 1, La Liga, Bundesliga:**
* Ligue 1: Barcola (24 G+A) and Cherki (19 G+A) lead.
* La Liga: Yamal (22 G+A) and Jude Bellingham (17 G+A) show big influence.
* Bundesliga: Wirtz (22 G+A) and Šeško (18 G+A) are standout young stars.

**4. All-Rounder Roles of Bellingham and Xavi Simons:**
* Both appear in minutes, starter ratio, and now G+A stats (17 and 16 respectively), confirming them as complete midfielders.

**5. Proof of Direct Impact on Team Performance:**
* High G+A numbers show young players aren't just squad players but directly influence match outcomes.

## ♟ Young Players' Position Analysis
This section analyzes the distribution of young players' positions and average minutes played per position.

In [26]:
# Number of young players listed under each position label.
young_pos_counts = (
    young_players['Pos']
    .value_counts()
    .rename_axis('Position')
    .reset_index(name='Number of Players')
)

print(young_pos_counts)

# Horizontal bars, coloured by the same count they display.
fig = px.bar(
    young_pos_counts,
    y='Position',
    x='Number of Players',
    orientation='h',
    color='Number of Players',
    color_continuous_scale='matter',
    text='Number of Players',
    title='Number of Young Players by Position',
)

# 'total ascending' orders the categories by bar length.
fig.update_layout(
    title_x=0.5,
    yaxis=dict(categoryorder='total ascending'),
)

fig.show()

  Position  Number of Players
0       DF                185
1       MF                159
2       FW                 96
3    FW,MF                 84
4    MF,FW                 74
5    DF,MF                 30
6    MF,DF                 27
7    DF,FW                 15
8       GK                 14
9    FW,DF                  8


In [27]:
# Mean minutes played by young players for each position label, highest first.
minutes_by_position = young_players.groupby('Pos')['Min'].mean().round(2)
avg_min_per_pos = (
    minutes_by_position
    .sort_values(ascending=False)
    .rename_axis('Position')
    .reset_index(name='Average Minutes')
)
print(avg_min_per_pos)

# Horizontal bars, coloured by the same average they display.
fig = px.bar(
    avg_min_per_pos,
    y='Position',
    x='Average Minutes',
    orientation='h',
    color='Average Minutes',
    color_continuous_scale='viridis',
    text='Average Minutes',
    title='Average Minutes Played by Young Players per Position',
)

# 'total ascending' orders the categories by bar length.
fig.update_layout(
    title_x=0.5,
    yaxis=dict(categoryorder='total ascending'),
)

fig.show()

  Position  Average Minutes
0       GK           885.86
1    DF,MF           791.57
2    MF,DF           784.93
3    MF,FW           752.74
4       MF           744.36
5       DF           742.39
6    FW,MF           647.99
7    DF,FW           509.80
8       FW           472.33
9    FW,DF            50.00


In [28]:
# Compare average minutes (left axis) with the number of young players (right axis)
# for each position label.
avg_minutes = young_players.groupby('Pos')['Min'].mean().round(2)
count_players = young_players['Pos'].value_counts().sort_index()

# Both Series are indexed by position; name the index explicitly so the column is
# called 'Pos' regardless of how pandas names the value_counts index.
pos_stats = pd.DataFrame({
    'Average Minutes': avg_minutes,
    'Number of Players': count_players
}).rename_axis('Pos').reset_index()
print(pos_stats)

fig = go.Figure()

# Bars for average minutes (primary y-axis).
fig.add_trace(go.Bar(
    x=pos_stats['Pos'],
    y=pos_stats['Average Minutes'],
    name='Average Minutes',
    marker_color='#C95792',
    yaxis='y',
    offsetgroup=1
))

# Bars for number of players (secondary y-axis).
fig.add_trace(go.Bar(
    x=pos_stats['Pos'],
    y=pos_stats['Number of Players'],
    name='Number of Players',
    marker_color='#F8B55F',
    yaxis='y2',
    offsetgroup=2
))

# Dual y-axes. Traces bound to different y-axes are not placed side by side by
# barmode='group' alone; distinct offsetgroups keep the two bars from overlapping.
# Axis title fonts use `title.font` because the legacy `titlefont` attribute is
# deprecated and rejected by newer plotly releases.
fig.update_layout(
    title='Average Minutes & Number of Young Players per Position',
    xaxis_title='Position',
    yaxis=dict(
        title=dict(text='Average Minutes', font=dict(color='#3D365C')),
        tickfont=dict(color='#3D365C'),
        side='left'
    ),
    yaxis2=dict(
        title=dict(text='Number of Players', font=dict(color='#3D365C')),
        tickfont=dict(color='#3D365C'),
        overlaying='y',
        side='right'
    ),
    barmode='group',
    title_x=0.5,
    legend=dict(x=0.5, xanchor='center', y=1.1, orientation='h')
)

fig.show()

     Pos  Average Minutes  Number of Players
0     DF           742.39                185
1  DF,FW           509.80                 15
2  DF,MF           791.57                 30
3     FW           472.33                 96
4  FW,DF            50.00                  8
5  FW,MF           647.99                 84
6     GK           885.86                 14
7     MF           744.36                159
8  MF,DF           784.93                 27
9  MF,FW           752.74                 74


**1. Goalkeepers (GK) Get Highest Average Minutes, But Are Few in Number:**
* GK position has the highest average minutes (~885.86) but very few players (only 14; only the hybrid "FW,DF" group, with 8, is smaller), showing clubs fully trust young GKs when they break through.

**2. Defenders (DF) & Midfielders (MF) Most Numerous, Varied Minutes:**
* DF and MF have most young players (185 and 159 respectively).
* They get good average minutes (~742-744), showing regular playing time.
* Flexible positions like "DF,MF" get even higher minutes (~791), showing versatility is valued.

**3. Forwards (FW) and Offensive Flexibility Get Fewer Minutes:**
* FW position has the lowest average minutes among single-position groups (~472.33) despite decent numbers (96 players).
* "FW,DF" positions get very few minutes (50) with few players (8).
* "FW,MF" players (84) get lower minutes (~647.99) than DF/MF, suggesting more competition up front.

**4. Importance of Positional Flexibility:**
* Players who can play multiple positions (especially between midfield and defense) get more minutes.

**5. Young Player Distribution in Teams:**
* Most young players concentrate in defense and midfield, possibly because these positions require energy and aggression suited to young players, or have more rotation/development space.

## 🧿 Young Players' Efficiency (Goals vs xG & Assists vs xAG)
This section measures how efficient players are at converting chances (for scorers) or creating quality chances (for playmakers) using Expected Goals (xG) and Expected Assisted Goals (xAG) metrics.

In [29]:
# 1. Young players with 10+ goals, ranked by goals per 90 minutes (total goals break ties).
#    Note: this rebinds `young_players` to that filtered subset.
young_players = (
    df_cleaned
    .loc[(df_cleaned['Gls'] >= 10) & (df_cleaned['Age'] <= age_threshold)]
    .sort_values(['Gls_90', 'Gls'], ascending=False)
)
scorer_columns = ['Player', 'Gls', 'Age', 'Squad', 'Gls_90', 'Min']
data = young_players[scorer_columns].head(10).reset_index(drop=True)

# 2. Bar chart of the ten highest scoring rates.
fig = px.bar(
    data,
    x='Player',
    y='Gls_90',
    text='Gls_90',
    color='Gls_90',
    color_continuous_scale='viridis',
    labels={'Gls_90': 'Goals per 90 Minutes'},
    hover_data=['Age', 'Squad', 'Min', 'Gls'],
    title='Top 10 Most Effective Young Goal Scorers',
    height=400,
)

# Value labels sit above the bars, rounded to two decimals.
fig.update_traces(textposition='outside', texttemplate='%{text:.2f}')

fig.update_layout(
    yaxis_title='Goals per 90 Minutes',
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    title_x=0.5,
)

fig.show()

In [30]:
# 1. Young players with 10+ goals, ranked by how far their goals exceed expected goals (Gls / xG).
#    Note: this rebinds `young_players` to that filtered subset.
young_players = df_cleaned[(df_cleaned['Age'] <= age_threshold) & (df_cleaned['Gls'] >= 10)]
data = young_players[['Player', 'Gls', 'Age', 'Squad', 'xG']].reset_index(drop=True)
data['conv_Gls_xG'] = data['Gls'] / data['xG'] # Goals to Expected Goals ratio
data = data.sort_values('conv_Gls_xG', ascending=False).head(10)

# 2. Grouped bars for Goals and xG (left axis) plus a line for the ratio (right axis).
fig = go.Figure()

# Bar chart for actual Goals.
fig.add_trace(go.Bar(
    x=data['Player'],
    y=data['Gls'],
    name='Goals',
    marker_color='#EF9651',
    yaxis='y'
))

# Bar chart for Expected Goals (xG).
fig.add_trace(go.Bar(
    x=data['Player'],
    y=data['xG'],
    name='xG',
    marker_color='#EC5228',
    yaxis='y'
))

# Line for the Goals/xG conversion ratio, labelled with the value at each point.
fig.add_trace(go.Scatter(
    x=data['Player'],
    y=data['conv_Gls_xG'],
    name='Conversion Rate (Goals/xG)',
    marker_color='#3F7D58',
    mode='lines+markers+text',
    text=[f'{val:.2f}' for val in data['conv_Gls_xG']],
    textposition='top center',
    yaxis='y2' # Use secondary Y-axis
))

# Axis title fonts use `title.font`: the legacy `titlefont` attribute is deprecated
# and rejected by newer plotly releases.
fig.update_layout(
    title='Goals vs xG and Young Players Conversion Rate',
    xaxis=dict(title='Player'),
    yaxis=dict(
        title=dict(text='Number of Goals & xG', font=dict(color='#440154')),
        tickfont=dict(color='#440154')
    ),
    yaxis2=dict(
        title=dict(text='Conversion Rate (Goals/xG)', font=dict(color='#440154')),
        tickfont=dict(color='#440154'),
        overlaying='y', # Draw the secondary axis on top of the primary one
        side='right'
    ),
    barmode='group',
    title_x=0.5,
    legend=dict(x=0.5, xanchor='center', y=1.15, orientation='h'),
    height=500
)

fig.show()

In [31]:
# 1. Young players whose assist count is at or above the 95th percentile of ALL players
#    in df_cleaned (i.e. the top 5%), ranked by assists per 90 minutes (total assists break ties).
#    Note: this rebinds `young_players` to that filtered subset.
young_players = (
    df_cleaned
    .loc[
        (df_cleaned['Ast'] >= df_cleaned['Ast'].quantile(0.95))
        & (df_cleaned['Age'] <= age_threshold)
    ]
    .sort_values(['Ast_90', 'Ast'], ascending=False)
)
creator_columns = ['Player', 'Ast', 'Age', 'Squad', 'Ast_90', 'Min']
data = young_players[creator_columns].head(10).reset_index(drop=True)

# 2. Bar chart of the ten highest assist rates.
fig = px.bar(
    data,
    x='Player',
    y='Ast_90',
    text='Ast_90',
    color='Ast_90',
    color_continuous_scale='magma',
    labels={'Ast_90': 'Assists per 90 Minutes'},
    hover_data=['Age', 'Squad', 'Min', 'Ast'],
    title='Top 10 Young Players with Highest Assists per 90 Minutes',
    height=400,
)

# Value labels sit above the bars, rounded to two decimals.
fig.update_traces(textposition='outside', texttemplate='%{text:.2f}')

fig.update_layout(
    yaxis_title='Assists per 90 Minutes',
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    title_x=0.5,
)

fig.show()

In [32]:
# 1. Young players whose assist count is at or above the 95th percentile of all players
#    in df_cleaned (the top 5%), ranked by assists relative to expected assisted goals (Ast / xAG).
#    Note: this rebinds `young_players` to that filtered subset.
young_players = df_cleaned[(df_cleaned['Age'] <= age_threshold) & (df_cleaned['Ast'] >= df_cleaned['Ast'].quantile(0.95))]
data = young_players[['Player', 'Ast', 'Age', 'Squad', 'xAG']].reset_index(drop=True)
data['conv_Ast_xAG'] = data['Ast'] / data['xAG'] # Assists to Expected Assisted Goals ratio
data = data.sort_values('conv_Ast_xAG', ascending=False).head(10)

# 2. Grouped bars for Assists and xAG (left axis) plus a line for the ratio (right axis).
fig = go.Figure()

# Bar chart for actual Assists.
fig.add_trace(go.Bar(
    x=data['Player'],
    y=data['Ast'],
    name='Assists',
    marker_color='#0d0887',
    yaxis='y'
))

# Bar chart for Expected Assisted Goals (xAG).
fig.add_trace(go.Bar(
    x=data['Player'],
    y=data['xAG'],
    name='xAG',
    marker_color='#cc4678',
    yaxis='y'
))

# Line for the Assist/xAG conversion ratio, labelled with the value at each point.
fig.add_trace(go.Scatter(
    x=data['Player'],
    y=data['conv_Ast_xAG'],
    name='Conversion Rate (Assist/xAG)',
    marker_color='#f0f951',
    mode='lines+markers+text',
    text=[f'{val:.2f}' for val in data['conv_Ast_xAG']],
    textposition='top center',
    yaxis='y2'
))

# Axis title fonts use `title.font`: the legacy `titlefont` attribute is deprecated
# and rejected by newer plotly releases.
fig.update_layout(
    title='Assists vs xAG and Young Players Conversion Rate',
    xaxis=dict(title='Player'),
    yaxis=dict(
        title=dict(text='Number of Assists & xAG', font=dict(color='#0d0887')),
        tickfont=dict(color='#0d0887')
    ),
    yaxis2=dict(
        title=dict(text='Conversion Rate (Assist/xAG)', font=dict(color='#0d0887')),
        tickfont=dict(color='#0d0887'),
        overlaying='y',
        side='right'
    ),
    barmode='group',
    title_x=0.5,
    legend=dict(x=0.5, xanchor='center', y=1.15, orientation='h'),
    height=500
)

fig.show()

**1. Mika Biereth is Most Effective Young Scorer per 90 Minutes:**
* Leads with 0.93 goals/90, showing exceptional efficiency when on the pitch.
* Others like Jamal Musiala (0.60), Bradley Barcola (0.58) also show high effectiveness.

**2. Rayan Cherki Leads in Assist Effectiveness per 90 Minutes:**
* Tops with 0.48 assists/90, indicating highly creative playmaking.
* Florian Wirtz (0.46) and Lamine Yamal (0.41) also show high assist efficiency.

**3. Goal Conversion Rate (Goals vs xG) Highlights Finishing Quality:**
* Andrey Santos and Xavi Simons have highest conversion rates (1.96), scoring nearly double the expected goals, showing clinical finishing.
* Lucas Stassin (1.72) and Paul Nebel (1.71) also excellent converters.
* Players like Wirtz and Barcola have ~1.00 ratios, scoring as expected from chances.

**4. Assist Conversion Rate (Assists vs xAG) Highlights Vision and Key Pass Quality:**
* Benjamin Šeško leads with 2.38 ratio, his key passes often becoming assists beyond expectation.
* Christantus Uche (2.07) and Jude Bellingham (2.05) also show high conversion, indicating quality passes and vision.

## 🕹 Young Players' Contribution: Progressive Passes
This section analyzes young players' contributions in making progressive passes (PrgP) and their relationship with Expected Assisted Goals (xAG), showing their ability to advance play and create chances.

In [33]:
# 1. Rebuild the full young cohort before ranking. The preceding efficiency cells
#    rebind `young_players` to a small high-assist subset, which would otherwise
#    restrict this "top 10" to players who already rank among the top assisters.
young_players = df_cleaned[df_cleaned['Age'] <= age_threshold].copy()

# 2. Top 10 young players by progressive passes (PrgP).
top_passes_progressive = (
    young_players[['Player', 'Squad', 'Comp', 'Age', 'PrgP', 'xAG', 'Ast']]
    .sort_values('PrgP', ascending=False)
    .head(10)
    .reset_index(drop=True)
)

# 3. Bar chart for the top 10 young players by progressive passes.
px.bar(
    top_passes_progressive,
    x='Player',
    y='PrgP',
    hover_data=['Squad', 'Age', 'Ast', 'PrgP', 'xAG'],
    title='Top 10 Young Players by Progressive Passes (PrgP)',
    color='PrgP',
    color_continuous_scale='tropic'
).update_layout(title_x=0.5).show()

In [34]:
# Combined progression score: progressive passes (PrgP) plus Expected Assisted Goals (xAG).
# The cohort is rebuilt from df_cleaned as an independent copy so that (a) the ranking
# covers every young player rather than a filtered subset left over from an earlier
# cell, and (b) adding a column does not write into a slice of df_cleaned.
young_players = df_cleaned[df_cleaned['Age'] <= age_threshold].copy()
young_players['PrgP_xAG'] = young_players['PrgP'] + young_players['xAG']
# NOTE(review): this adds a pass count to an expected-goals value; the cluster means
# shown later suggest PrgP is far larger than xAG, so PrgP probably dominates — confirm
# this is the intended score.
top_progressors = young_players.sort_values('PrgP_xAG', ascending=False).head(10)

# Bar chart for the top 10 young players by PrgP + xAG.
px.bar(
    top_progressors,
    x='Player',
    y='PrgP_xAG',
    hover_data=['Squad', 'Age', 'Ast', 'PrgP', 'xAG'],
    title='Top 10 Young Players (Progressive Passes + xAG)',
    color='PrgP_xAG',
    color_continuous_scale='Plasma'
).update_layout(title_x=0.5).show()

In [35]:
# Scatter of progressive passes (PrgP) against xAG for the players in `young_players`;
# marker size and colour both encode assists (Ast). Expects the 'PrgP_xAG' column
# added by the previous cell.
ranked_by_progression = young_players.sort_values('PrgP_xAG', ascending=False)
top_progressors_players = ranked_by_progression['Player'].head(10).tolist()

fig = px.scatter(
    young_players,
    x='PrgP',
    y='xAG',
    size='Ast',
    color='Ast',
    color_continuous_scale='Viridis',
    hover_data=['Player', 'Squad', 'Age'],
    title='Relationship Between Progressive Passes and xAG (Size & Color: Assists)',
)

# Name labels for the ten players with the highest PrgP + xAG, placed just above their markers.
label_style = dict(
    showarrow=False,                  # Plain text, no arrow
    yshift=10,                        # Nudge the label upwards
    xshift=0,                         # No horizontal offset
    font=dict(size=8, color="black"),
)
labelled = young_players[young_players['Player'].isin(top_progressors_players)]
for player_name, passes, expected_assists in zip(
    labelled['Player'], labelled['PrgP'], labelled['xAG']
):
    fig.add_annotation(x=passes, y=expected_assists, text=player_name, **label_style)

fig.update_layout(title_x=0.5)
fig.show()

## 📕 Summary

**Comprehensive Analysis and Insights on Young Players' Potential in Top 5 Leagues:**

**I. League Volume and Commitment to Young Players:**

1.  **Ligue 1 (France): Largest Talent Producer & Club Incubator:**
    * Most young players (~180), with clubs like Strasbourg and Reims having high proportions (>50%).

2.  **La Liga (Spain): Best for Starter Roles and Main Creators:**
    * Highest starter ratio (~60%), producing world-class young playmakers like Pedri.

3.  **Premier League (England): Competitive Yet Trusting Environment:**
    * Highest average minutes (746.83), with clubs like Ipswich Town and Brighton giving significant playing time.

4.  **Bundesliga (Germany): High Offensive Efficiency and Focused Talent:**
    * Productive young scorers (Šeško, Musiala) and creative assist makers (Wirtz).

5.  **Serie A (Italy): Challenging for Minutes, But Individual Effectiveness:**
    * Lowest average minutes (623.90), but players like Mika Biereth show exceptional efficiency.

**II. Young Players' Roles and Contributions:**

1.  **Significant Direct Offensive Impact:**
    * Contribute 14% of goals, 15.6% of assists, and 14.7% of G+A - substantial for their age.

2.  **Young Players as Core Pillars (Not Just Squad Players):**
    * Players like Milos Kerkez show perfect starter ratios (1.00 in 20+ matches), becoming indispensable.

3.  **Position Diversity and Key Roles:**
    * Goalkeepers: Highest average minutes but fewest players.
    * Defenders & Midfielders: Most numerous with regular minutes.
    * Forwards: Fewer minutes but some highly efficient scorers.
    * Positional flexibility increases playing time.

4.  **Efficiency in Offensive Production (Per 90 Minutes):**
    * Mika Biereth most effective scorer (0.93 goals/90).
    * Rayan Cherki most effective creator (0.48 assists/90).

5.  **Superior Finishing and Key Pass Vision:**
    * Andrey Santos and Xavi Simons have highest Goals/xG (1.96), showing clinical finishing.
    * Benjamin Šeško has highest Assists/xAG (2.38), showing exceptional pass quality.

**Key Standout Players Across Metrics:**

* **Milos Kerkez & Zion Suzuki:** Most minutes and perfect starter ratios, showing reliability.
* **Bradley Barcola & Florian Wirtz:** Absolute leaders in G+A, showing massive direct impact.
* **Rayan Cherki & Lamine Yamal:** Highly effective playmakers.
* **Mika Biereth:** Most efficient scorer per 90 minutes.
* **Andrey Santos & Xavi Simons:** Most clinical finishers.
* **Jude Bellingham:** Complete all-rounder excelling in minutes, G+A, and assist conversion.

Collectively, this data provides a rich picture of young players in European football, highlighting the leagues most invested in youth, the clubs most active in development, and the individuals showing exceptional potential across various aspects of the game.

# Clustering & Classification

In [36]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Features used to group players into roles.
features = [
    'Gls', 'G-PK', 'xG', 'Ast', 'xAG', 'PrgP', 'CrdY', 'Min'
]

# Fresh copy of the young cohort so new columns do not write into df_cleaned.
young_players = df_cleaned[df_cleaned['Age'] <= age_threshold].copy()

# Missing feature values are filled with 0 (rows are kept, not dropped).
X = young_players[features].fillna(0)

# Standardize so features on large scales (e.g. minutes) do not dominate the distances.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# n_init is pinned explicitly: its default differs between scikit-learn releases,
# and the cluster numbering feeds a hard-coded role mapping in the next cell.
# NOTE(review): 10 is the historical default — confirm it reproduces the recorded clusters.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
young_players['RoleCluster'] = kmeans.fit_predict(X_scaled)

# Project the standardized features onto two principal components for plotting.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X_scaled)
young_players['PC1'] = pca_result[:, 0]
young_players['PC2'] = pca_result[:, 1]

fig = px.scatter(
    young_players,
    x='PC1', y='PC2',
    color='RoleCluster',
    hover_data=['Player', 'Squad', 'Age'],
    title='Young Players Role Clustering Based on Performance Stats'
)
fig.show()

In [37]:
# Mean of every clustering feature per cluster; the role names below were chosen
# by reading this table.
cluster_summary = (
    young_players[features]
    .groupby(young_players['RoleCluster'])
    .mean()
    .round(2)
)
display(cluster_summary)

# Role name per cluster index, in order: 0 → Finisher, 1 → Defender, 2 → Playmaker.
# NOTE(review): the indices are hard-coded, so the mapping is only valid while KMeans
# assigns the same numbering — re-check against cluster_summary after any re-fit.
# NOTE(review): in the recorded summary cluster 2 has the highest goal and assist means
# and cluster 1 the lowest minutes — confirm these role names match the intent.
role_map = dict(enumerate(("Finisher", "Defender", "Playmaker")))

young_players['Role'] = young_players['RoleCluster'].map(role_map)

# Same PCA projection as before, now coloured by the named role.
fig = px.scatter(
    young_players,
    x='PC1',
    y='PC2',
    color='Role',
    hover_data=['Player', 'Squad', 'Age', 'Gls', 'Ast'],
    title='Young Players Role Classification Based on Performance Stats',
)
fig.show()

Unnamed: 0_level_0,Gls,G-PK,xG,Ast,xAG,PrgP,CrdY,Min
RoleCluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1.75,1.71,1.82,1.86,1.74,71.97,4.18,1753.38
1,0.26,0.26,0.31,0.17,0.22,8.57,0.63,276.67
2,8.42,8.0,7.36,5.15,5.08,99.52,3.97,2203.97


In [38]:
# Number of young players assigned to each role.
young_players.loc[:, 'Role'].value_counts()

Role
Defender     509
Finisher     150
Playmaker     33
Name: count, dtype: int64

In [39]:
# Bar chart of how many young players fall into each role.
role_count = (
    young_players['Role']
    .value_counts()
    .rename_axis('Role')
    .reset_index(name='Number of Players')
)

fig = px.bar(
    role_count,
    x='Role',
    y='Number of Players',
    text='Number of Players',
    color='Role',
    color_discrete_sequence=px.colors.qualitative.Set2,
    title='Young Players Role Distribution',
)
# Counts are printed above the bars.
fig.update_traces(textposition='outside')
fig.update_layout(title_x=0.5)
fig.show()

In [40]:
# Average goals, assists, xG and xAG for each role, drawn as grouped bars.
key_stats = ['Gls', 'Ast', 'xG', 'xAG']
avg_stats = young_players.groupby('Role')[key_stats].mean().round(2).reset_index()

# Long format: one row per (Role, statistic) pair, as px.bar needs for grouping by colour.
avg_stats_long = avg_stats.melt(id_vars='Role')

fig = px.bar(
    avg_stats_long,
    x='Role',
    y='value',
    color='variable',
    barmode='group',
    labels={'value': 'Average', 'variable': 'Statistic'},
    color_discrete_sequence=px.colors.sequential.Viridis,
    title='Average Key Stats per Role',
)
fig.update_layout(title_x=0.5)
fig.show()