This notebook explores the previously preprocessed data. It is organized into the following sections:
- structure and consistency analysis
- goals scored analysis
- clubs analysis
- focus on specific clubs

In [None]:
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
# Add the parent of the current working directory to sys.path so that the
# `src` package imported below can be resolved from this notebook
# (presumably the parent directory is the repository root).
root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(root_path)

import pandas as pd

from src.config import load_config
import src.utils
from src.data_analysis import ClubAnalysis

# Load the project configuration from the config.yaml file located under root_path
config_file = 'config.yaml'
config_path = os.path.join(root_path, config_file)
config = load_config(config_path)

# Loading the preprocessed training data

In [None]:
# Build the path of the preprocessed training CSV from the config entries,
# load it into a DataFrame and preview the first rows.
train_csv_name = f"{config['preprocessed_train_df_name']}.csv"
preprocessed_data_path = os.path.join(root_path, config['preprocessed_dir'])
df_train_path = os.path.join(preprocessed_data_path, train_csv_name)

df = pd.read_csv(df_train_path)
df.head()

In [None]:
# List the column names of the loaded training DataFrame
df.columns

# Structure and consistency

In [None]:
# Print a concise summary of the DataFrame (column dtypes and non-null counts)
df.info()

In [None]:
# Descriptive statistics of the DataFrame columns
df.describe()

In [None]:
# Share of missing values per column, highest first
df.isna().mean().sort_values(ascending=False)

In [None]:
# Relative frequency of each value of the final-result column
df[config['final_result_column']].value_counts(normalize=True)

# Goals scored analysis

In [None]:
# Total goals per match = home goals + away goals
df["total_goals"] = df[config['nb_goals_home_column']] + df[config['nb_goals_away_column']]

plt.figure(figsize=(10, 6))
# Goal counts are assumed to be integer-valued: discrete=True draws one
# unit-wide bar centred on each value. A fixed bins=10 would merge neighbouring
# counts or leave empty gaps depending on the observed min/max.
sns.histplot(df["total_goals"], discrete=True)
plt.title("Number of goals per match repartition")
plt.xlabel("Number of goals scored per match")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Overlay the distributions of goals scored by home and by away teams.
# discrete=True gives both histograms the same unit-wide bins centred on each
# goal count; without it each call picks its own bin edges and the two
# overlaid histograms are not directly comparable.
plt.figure(figsize=(10, 6))
sns.histplot(df[config['nb_goals_home_column']], label="Goals scored by home teams", fill=True, discrete=True)
sns.histplot(df[config['nb_goals_away_column']], label="Goals scored by away teams", fill=True, discrete=True)
plt.title("Goals scored by home teams vs goals scored by away teams")
plt.xlabel("Number of goals scored")
plt.legend()
plt.show()

In [None]:
# Average home, away and total goals per match for every season
home_goals_col = config['nb_goals_home_column']
away_goals_col = config['nb_goals_away_column']

goals_by_season = df.groupby(config['season_column'])[[home_goals_col, away_goals_col]].mean()
goals_by_season['total'] = goals_by_season[home_goals_col] + goals_by_season[away_goals_col]

# One curve per averaged column, drawn in the same order as the legend entries
season_curves = [
    (home_goals_col, 'Goals scored at home'),
    (away_goals_col, 'Goals scored away'),
    ('total', 'Total goals'),
]

plt.figure(figsize=(10, 6))
for curve_column, curve_label in season_curves:
    sns.lineplot(data=goals_by_season[curve_column], marker="o", label=curve_label)
plt.title("Evolution of the goals scored average per season")
plt.ylabel("Goals average")
plt.xlabel("Season")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Flag matches with more than 2.5 goals (relies on the "total_goals" column
# created in an earlier cell) and report their overall share.
df["over_2_5"] = df["total_goals"].gt(2.5)
prop_over_2_5 = df["over_2_5"].mean() * 100

print(f"{prop_over_2_5:.2f}% of the matches had more than 2.5 goals")

# Same share, computed season by season and expressed as a percentage
over25_by_season = df.groupby(config['season_column'])["over_2_5"].mean().mul(100)

plt.figure(figsize=(10, 6))
sns.barplot(x=over25_by_season.index, y=over25_by_season.values)
plt.title("Percentage of matches with +2.5 goals per season")
plt.ylabel("Percentage of matches with +2.5 goals")
plt.xlabel("Season")
plt.xticks(rotation=45)
plt.show()

# Club performance analysis

In [None]:
# Points earned by each side from the final result: win 3, draw 1, loss 0
final_results = df[config['final_result_column']]
df['home_points'] = final_results.map({'home': 3, 'draw': 1, 'away': 0})
df['away_points'] = final_results.map({'home': 0, 'draw': 1, 'away': 3})

# Season totals per club, first as home side then as away side
points_home = df.groupby([config['season_column'], config['home_column']])['home_points'].sum()
points_away = df.groupby([config['season_column'], config['away_column']])['away_points'].sum()

# Home + away totals; a club missing on one side contributes 0 for that side.
# The summed series is unnamed, so reset_index() exposes it as column 0.
points_totaux = (
    points_home.add(points_away, fill_value=0)
    .reset_index()
    .rename(columns={0: 'points_final'})
)

# Best total of each season = points of the champion
champion_points = points_totaux.groupby(config['season_column'])['points_final'].max()

# Fourth-lowest total of each season: the 3 last teams are relegated to L2
relegation_cutoffs = [
    season_table['points_final'].sort_values(ascending=False).iloc[-4]
    for _, season_table in points_totaux.groupby(config['season_column'])
]
points_to_stay_l1 = pd.Series(relegation_cutoffs, index=champion_points.index)

plt.figure(figsize=(12, 6))
for points_series, curve_label in (
    (champion_points, 'Number of points to be champion'),
    (points_to_stay_l1, 'Number of points to stay in L1'),
):
    sns.lineplot(x=points_series.index, y=points_series.values, marker='o', label=curve_label)
plt.xticks(rotation=45)
plt.title("Final points to be champion and to stay in L1")
plt.ylabel("Points")
plt.xlabel("Season")
plt.legend()
plt.show()

In [None]:
# One boxplot per odd column, each split by the final result of the match
odd_boxplots = [
    ('odd_home', "Home odd according to the result", "Home odd"),
    ('odd_away', "Away odd according to the match result", "Away odd"),
]

for odd_column, plot_title, y_label in odd_boxplots:
    plt.figure(figsize=(12, 5))
    sns.boxplot(data=df, x=config['final_result_column'], y=odd_column)
    plt.title(plot_title)
    plt.xlabel("Result")
    plt.ylabel(y_label)
    plt.show()

In [None]:
# Average goals scored / conceded by each club in its home matches
home_goal_means = df.groupby(config['home_column'])[
    [config['nb_goals_home_column'], config['nb_goals_away_column']]
].mean()
club_perf = home_goal_means.rename(columns={
    config['nb_goals_home_column']: "avg_goals_scored_home",
    config['nb_goals_away_column']: "avg_goals_conceded_home",
})

plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=club_perf,
    x="avg_goals_conceded_home",
    y="avg_goals_scored_home",
    s=100,
    color="cornflowerblue",
    alpha=0.8,
)

# One label per club, placed on its point; adjust_text then moves the labels
label_style = {'fontsize': 9, 'weight': 'bold'}
texts = [
    plt.text(conceded, scored, club, **label_style)
    for club, conceded, scored in zip(
        club_perf.index,
        club_perf["avg_goals_conceded_home"],
        club_perf["avg_goals_scored_home"],
    )
]

adjust_text(texts, arrowprops={'arrowstyle': '-', 'color': 'gray', 'lw': 0.5})

plt.title("Attack / defense profile of the clubs (at home)")
plt.xlabel("Average goals conceded (defense)")
plt.ylabel("Average goals scored (attack)")
# Diagonal y = x: above it a club scores more than it concedes
plt.axline((0, 0), slope=1, color='gray', linestyle='--', alpha=0.5)
plt.show()

In [None]:
# Average goals scored / conceded by each club in its away matches
away_goal_means = df.groupby(config['away_column'])[
    [config['nb_goals_home_column'], config['nb_goals_away_column']]
].mean()
club_perf = away_goal_means.rename(columns={
    config['nb_goals_home_column']: "avg_goals_conceded_away",
    config['nb_goals_away_column']: "avg_goals_scored_away",
})

plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=club_perf,
    x="avg_goals_conceded_away",
    y="avg_goals_scored_away",
    s=100,
    color="cornflowerblue",
    alpha=0.8,
)

# One label per club, placed on its point; adjust_text then moves the labels
label_style = {'fontsize': 9, 'weight': 'bold'}
texts = [
    plt.text(conceded, scored, club, **label_style)
    for club, conceded, scored in zip(
        club_perf.index,
        club_perf["avg_goals_conceded_away"],
        club_perf["avg_goals_scored_away"],
    )
]

adjust_text(texts, arrowprops={'arrowstyle': '-', 'color': 'gray', 'lw': 0.5})

plt.title("Attack / defense profile of the clubs (away)")
plt.xlabel("Average goals conceded (defense)")
plt.ylabel("Average goals scored (attack)")
# Diagonal y = x: above it a club scores more than it concedes
plt.axline((0, 0), slope=1, color='gray', linestyle='--', alpha=0.5)
plt.show()

In [None]:
# Points taken at home against each opponent: total points and number of
# matches in a single groupby pass (named aggregation), then the average.
# Relies on the 'home_points' column created in an earlier cell.
home_vs_opponent = (
    df.groupby([config['home_column'], config['away_column']])['home_points']
    .agg(home_points='sum', n_matches='count')
    .reset_index()
    .rename(columns={config['home_column']: 'club', config['away_column']: 'opponent'})
)
home_vs_opponent['avg_points'] = home_vs_opponent['home_points'] / home_vs_opponent['n_matches']

# Bogey team = opponent against which the club averages the fewest home points
beasts_home = home_vs_opponent.loc[home_vs_opponent.groupby('club')['avg_points'].idxmin()]
beasts_home = beasts_home.sort_values('avg_points')

# Plot: one bar per club, annotated with "<opponent> (<matches played>)"
plt.figure(figsize=(12, 8))
bars = plt.barh(beasts_home['club'], beasts_home['avg_points'], color='skyblue', alpha=0.8)
for bar, opponent, n in zip(bars, beasts_home['opponent'], beasts_home['n_matches']):
    # Label just to the right of the bar end, vertically centred on the bar
    plt.text(
        bar.get_width() + 0.05,
        bar.get_y() + bar.get_height() / 2,
        f"{opponent} ({n})",
        va='center',
        fontsize=9,
        weight='bold',
    )

plt.xlabel("Average points won against the opponent")
plt.ylabel("Club")
plt.title("Home bogey teams (matches played)")
plt.xlim(0, 3)
plt.show()

In [None]:
# Points taken away against each opponent: total points and number of
# matches in a single groupby pass (named aggregation), then the average.
# Relies on the 'away_points' column created in an earlier cell.
away_vs_opponent = (
    df.groupby([config['away_column'], config['home_column']])['away_points']
    .agg(away_points='sum', n_matches='count')
    .reset_index()
    .rename(columns={config['away_column']: 'club', config['home_column']: 'opponent'})
)
away_vs_opponent['avg_points'] = away_vs_opponent['away_points'] / away_vs_opponent['n_matches']

# Bogey team = opponent against which the club averages the fewest away points
beasts_away = away_vs_opponent.loc[away_vs_opponent.groupby('club')['avg_points'].idxmin()]
beasts_away = beasts_away.sort_values('avg_points')

# Plot: one bar per club, annotated with "<opponent> (<matches played>)"
plt.figure(figsize=(12, 8))
bars = plt.barh(beasts_away['club'], beasts_away['avg_points'], color='tomato', alpha=0.8)
for bar, opponent, n in zip(bars, beasts_away['opponent'], beasts_away['n_matches']):
    # Label just to the right of the bar end, vertically centred on the bar
    plt.text(
        bar.get_width() + 0.05,
        bar.get_y() + bar.get_height() / 2,
        f"{opponent} ({n})",
        va='center',
        fontsize=9,
        weight='bold',
    )

plt.xlabel("Average points won against the opponent")
plt.ylabel("Club")
plt.title("Away bogey teams (matches played)")
plt.xlim(0, 3)
plt.show()

In [None]:
# Keep only the home match-ups played at least 5 times
# (home_vs_opponent comes from an earlier cell)
home_vs_opponent_5 = home_vs_opponent.loc[home_vs_opponent['n_matches'].ge(5)]

# For each club, the opponent with the lowest average of home points
worst_home_rows = home_vs_opponent_5.groupby('club')['avg_points'].idxmin()
beasts_home_5 = home_vs_opponent_5.loc[worst_home_rows].sort_values('avg_points')

plt.figure(figsize=(12, 8))
bars = plt.barh(beasts_home_5['club'], beasts_home_5['avg_points'], color='skyblue', alpha=0.8)
for bar, opponent, n in zip(bars, beasts_home_5['opponent'], beasts_home_5['n_matches']):
    # Label just to the right of the bar end, vertically centred on the bar
    label_x = bar.get_width() + 0.05
    label_y = bar.get_y() + bar.get_height() / 2
    plt.text(label_x, label_y, f"{opponent} ({n})", va='center', fontsize=9, weight='bold')

plt.xlabel("Average points won against the opponent")
plt.ylabel("Club")
plt.title("Home bogey teams among teams faced +5 times")
plt.xlim(0, 3)
plt.show()

In [None]:
# Keep only the away match-ups played at least 5 times
# (away_vs_opponent comes from an earlier cell)
away_vs_opponent_5 = away_vs_opponent.loc[away_vs_opponent['n_matches'].ge(5)]

# For each club, the opponent with the lowest average of away points
worst_away_rows = away_vs_opponent_5.groupby('club')['avg_points'].idxmin()
beasts_away_5 = away_vs_opponent_5.loc[worst_away_rows].sort_values('avg_points')

plt.figure(figsize=(12, 8))
bars = plt.barh(beasts_away_5['club'], beasts_away_5['avg_points'], color='tomato', alpha=0.8)
for bar, opponent, n in zip(bars, beasts_away_5['opponent'], beasts_away_5['n_matches']):
    # Label just to the right of the bar end, vertically centred on the bar
    label_x = bar.get_width() + 0.05
    label_y = bar.get_y() + bar.get_height() / 2
    plt.text(label_x, label_y, f"{opponent} ({n})", va='center', fontsize=9, weight='bold')

plt.xlabel("Average points won against the opponent")
plt.ylabel("Club")
plt.title("Away bogey teams among teams faced +5 times")
plt.xlim(0, 3)
plt.show()

In [None]:
# Column names come from the config, like in the other cells of this notebook,
# instead of hard-coded 'season' / 'home' / 'away'.
season_col = config['season_column']
home_col = config['home_column']
away_col = config['away_column']

# Points per season and club, earned as the home side
# (relies on the 'home_points' / 'away_points' columns created in an earlier cell)
club_season_points = (
    df.groupby([season_col, home_col])['home_points'].sum()
    .reset_index()
    .rename(columns={home_col: 'club', 'home_points': 'points_home'})
)
# Points per season and club, earned as the away side
club_season_points_away = (
    df.groupby([season_col, away_col])['away_points'].sum()
    .reset_index()
    .rename(columns={away_col: 'club', 'away_points': 'points_away'})
)

# Stack both tables and sum per (season, club); the missing side of each
# stacked row is NaN and is skipped by sum().
club_season_points_total = (
    pd.concat([club_season_points, club_season_points_away])
    .groupby([season_col, 'club'])[['points_home', 'points_away']].sum()
    .reset_index()
)
club_season_points_total['total_points'] = (
    club_season_points_total['points_home'] + club_season_points_total['points_away']
)

# Average per club over the seasons it appears in
club_avg_points = club_season_points_total.groupby('club')['total_points'].mean().sort_values(ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(x=club_avg_points.values, y=club_avg_points.index, color="green")
plt.xlabel("Average points per season")
plt.ylabel("Club")
plt.title("Average points per season for each club")
plt.show()

In [None]:
# Column names come from the config, like in the other cells of this notebook,
# instead of hard-coded 'season' / 'home' / 'away'.
season_col = config['season_column']
home_col = config['home_column']
away_col = config['away_column']

# Points per season and club, as home side and as away side
# (relies on the 'home_points' / 'away_points' columns created in an earlier cell)
home_points = (
    df.groupby([season_col, home_col])['home_points'].sum()
    .reset_index()
    .rename(columns={home_col: 'club', 'home_points': 'points_home'})
)
away_points = (
    df.groupby([season_col, away_col])['away_points'].sum()
    .reset_index()
    .rename(columns={away_col: 'club', 'away_points': 'points_away'})
)

# Outer merge so that a club with only home or only away matches in a season
# is kept (an inner merge would silently drop it); the missing side counts
# as 0 points, as in the other per-season point computations of this notebook.
season_points = pd.merge(home_points, away_points, on=[season_col, 'club'], how='outer')
season_points[['points_home', 'points_away']] = season_points[['points_home', 'points_away']].fillna(0)
season_points['total_points'] = season_points['points_home'] + season_points['points_away']

# Rank within each season, 1 = most points; tied clubs share the best rank
season_points['rank'] = season_points.groupby(season_col)['total_points'].rank(method='min', ascending=False)

club_avg_rank = season_points.groupby('club')['rank'].mean().sort_values()
plt.figure(figsize=(12, 6))
sns.barplot(x=club_avg_rank.values, y=club_avg_rank.index, color="green")
plt.xlabel("Average ranking per season")
plt.ylabel("Club")
plt.title("Average rankings per season for each club")
plt.show()

# Club-specific performance analysis

In [None]:
# Name of the club passed to ClubAnalysis in the next cell
club_to_focus = 'Metz'

In [None]:
# Build the project's ClubAnalysis helper for the selected club, then call its
# plot_all() method (presumably renders every plot it provides — see
# src.data_analysis to confirm).
focus_club_perf = ClubAnalysis(df=df, club_name=club_to_focus, config=config)
focus_club_perf.plot_all()