In [12]:
import pandas as pd
import numpy as np

In [2]:
# Load the season-2425 match results CSV into the working DataFrame.
df = pd.read_csv(
    'season-2425.csv',
    encoding='utf-8',
)

In [None]:
# Show a labelled preview of the first rows of the loaded data.
preview_rows = df.head()
print("Initial Data Preview:", preview_rows, sep="\n")

In [None]:
# Strip surrounding whitespace from the column headers.
df.columns = df.columns.str.strip()

# Parse the 'Date' strings (day/month/two-digit year) into datetime values.
parsed_dates = pd.to_datetime(df['Date'], format='%d/%m/%y')
df['Date'] = parsed_dates

# Report how many missing values each column contains.
missing_counts = df.isnull().sum()
print("\nMissing Values per Column:")
print(missing_counts)

In [5]:
# Remove the 'Referee' column (it is not referenced by the cells visible below).
# A non-inplace drop with errors='ignore' keeps this cell idempotent: running it
# again after the column is already gone no longer raises KeyError.
df = df.drop(columns=['Referee'], errors='ignore')

In [6]:
# Per-match goal difference from each side's point of view:
# full-time goals for that side minus full-time goals for the opponent.
df['home_goal_diff'] = df['FTHG'].sub(df['FTAG'])
df['away_goal_diff'] = df['FTAG'].sub(df['FTHG'])

In [7]:
# Map each 0/1 indicator column to the 'FTR' code that switches it on.
# A home win ('H') is an away loss and vice versa; a draw ('D') counts for both.
# Dict order fixes the order in which the columns are added to the frame.
result_flag_codes = {
    'home_win': 'H',
    'home_draw': 'D',
    'home_loss': 'A',
    'away_win': 'A',
    'away_draw': 'D',
    'away_loss': 'H',
}
for flag_column, result_code in result_flag_codes.items():
    df[flag_column] = np.where(df['FTR'] == result_code, 1, 0)


In [8]:
def _venue_summary(matches, venue, team_col, scored_col, conceded_col):
    """Aggregate per-team match metrics for one venue ('home' or 'away').

    Groups `matches` by `team_col` and returns one row per team with the
    `<venue>_*` columns: match count, goals scored/conceded (sum and mean),
    summed goal difference, and win/draw/loss counts. The grouping column is
    renamed to 'Team'.
    """
    named_aggregations = {
        f'{venue}_matches': (scored_col, 'count'),
        f'{venue}_goals_scored': (scored_col, 'sum'),
        f'{venue}_goals_conceded': (conceded_col, 'sum'),
        f'{venue}_goal_difference': (f'{venue}_goal_diff', 'sum'),
        f'{venue}_wins': (f'{venue}_win', 'sum'),
        f'{venue}_draws': (f'{venue}_draw', 'sum'),
        f'{venue}_losses': (f'{venue}_loss', 'sum'),
        f'{venue}_avg_goals_scored': (scored_col, 'mean'),
        f'{venue}_avg_goals_conceded': (conceded_col, 'mean'),
    }
    per_team = matches.groupby(team_col).agg(**named_aggregations)
    return per_team.reset_index().rename(columns={team_col: 'Team'})


# Home performance: the team scores FTHG and concedes FTAG.
home_stats = _venue_summary(df, 'home', 'HomeTeam', 'FTHG', 'FTAG')

# Away performance: the team scores FTAG and concedes FTHG.
away_stats = _venue_summary(df, 'away', 'AwayTeam', 'FTAG', 'FTHG')

In [9]:
# Outer-join the home and away summaries on 'Team', then replace any values
# left missing by the join with 0.
combined_stats = home_stats.merge(away_stats, how='outer', on='Team')
team_stats = combined_stats.fillna(0)

In [10]:
# Overall figures: each new column is the home-side value plus the away-side
# value of the same metric. Dict order fixes the column creation order.
combined_columns = {
    'total_matches': 'matches',
    'total_goals_scored': 'goals_scored',
    'total_goals_conceded': 'goals_conceded',
    'total_goal_difference': 'goal_difference',
    'wins': 'wins',
    'draws': 'draws',
    'losses': 'losses',
}
for overall_name, stem in combined_columns.items():
    team_stats[overall_name] = team_stats['home_' + stem] + team_stats['away_' + stem]

# Points: 3 per win, 1 per draw, 0 per loss.
team_stats['points'] = 3 * team_stats['wins'] + team_stats['draws']

# Goals scored / conceded per match across all matches played.
matches_played = team_stats['total_matches']
team_stats['avg_goals_scored'] = team_stats['total_goals_scored'] / matches_played
team_stats['avg_goals_conceded'] = team_stats['total_goals_conceded'] / matches_played

In [None]:
# Print the derived team metrics ordered from most to fewest points.
standings = team_stats.sort_values(by='points', ascending=False)
print("\nTeam Performance Metrics:", standings, sep="\n")

In [20]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Read the CSV files for season 2425 and season 2324
df_2425 = pd.read_csv('season-2425.csv', encoding='utf-8')
df_2324 = pd.read_csv('season-2324.csv', encoding='utf-8')

# Clean column names and convert the 'Date' column for both datasets.
# The loop variable is deliberately not named `df`: that name is the working
# frame of the earlier cells, and rebinding it here would leave it pointing at
# the season-2324 frame after this cell runs.
for season_df in (df_2425, df_2324):
    season_df.columns = season_df.columns.str.strip()  # remove extra whitespace
    season_df['Date'] = pd.to_datetime(season_df['Date'], format='%d/%m/%y')  # convert dates

# Season 2324 is used for fitting, season 2425 for evaluation. Copies are taken
# so later changes to the train/test frames do not touch the loaded frames.
df_train = df_2324.copy()
df_test = df_2425.copy()

# Training set: one indicator column per home team and per away team, plus an
# intercept; the target is full-time home goals (FTHG).
# NOTE(review): presumably every row has exactly one Home_* and one Away_*
# indicator set, which makes the full indicator sets linearly dependent on the
# constant — confirm whether drop_first=True is wanted before interpreting
# individual coefficients.
train_home_dummies = pd.get_dummies(df_train['HomeTeam'], prefix='Home')
train_away_dummies = pd.get_dummies(df_train['AwayTeam'], prefix='Away')
X_train = pd.concat([train_home_dummies, train_away_dummies], axis=1)
X_train = sm.add_constant(X_train)  # adds an intercept term
y_train = df_train['FTHG']  # using full-time home goals as the target

# Test set: built the same way from the test-season matches.
test_home_dummies = pd.get_dummies(df_test['HomeTeam'], prefix='Home')
test_away_dummies = pd.get_dummies(df_test['AwayTeam'], prefix='Away')
X_test = pd.concat([test_home_dummies, test_away_dummies], axis=1)
X_test = sm.add_constant(X_test)
y_test = df_test['FTHG']

# The reindex below keeps only the training columns, so indicators for teams
# that appear in the test season but not in training are discarded. Report
# them instead of dropping them silently.
unseen_team_columns = X_test.columns.difference(X_train.columns)
if len(unseen_team_columns) > 0:
    print(
        "Warning: test-set team indicators missing from the training design "
        "matrix (dropped by the reindex):",
        list(unseen_team_columns),
    )

# Align the test columns to the training layout; training columns that are
# absent from the test set are filled with 0.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Convert everything to numeric types; non-numeric targets become NaN.
X_train = X_train.astype(float)
X_test = X_test.astype(float)
y_train = pd.to_numeric(y_train, errors='coerce')
y_test = pd.to_numeric(y_test, errors='coerce')

if X_train.isnull().values.any() or y_train.isnull().values.any():
    raise ValueError("Found NaN values in training data after conversion. Check your inputs.")

# Apply the same check to the test data so a bad value is reported here rather
# than surfacing later when the metrics are computed.
if X_test.isnull().values.any() or y_test.isnull().values.any():
    raise ValueError("Found NaN values in test data after conversion. Check your inputs.")

# Fit a Poisson-family GLM of the training target on the training design
# matrix and print the fit summary.
poisson_model = sm.GLM(endog=y_train, exog=X_train, family=sm.families.Poisson())
poisson_results = poisson_model.fit()
fit_summary = poisson_results.summary()
print(fit_summary)

# Predict for the test design matrix and score against the actual values.
y_pred = poisson_results.predict(X_test)
mse, mae = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error)
)
print("\nTest MSE:", mse)
print("Test MAE:", mae)

# Table of test matches with the actual target next to the prediction
# (rounded to 2 decimals); print its first ten rows.
comparison = df_test[['Date', 'HomeTeam']].assign(**{
    'Actual Home Goals': y_test,
    'Predicted Home Goals': np.round(y_pred, 2),
})
print("\nComparison of Actual vs. Predicted Home Goals:")
first_ten = comparison.head(10)
print(first_ten)




                 Generalized Linear Model Regression Results                  
Dep. Variable:                   FTHG   No. Observations:                  380
Model:                            GLM   Df Residuals:                      341
Model Family:                 Poisson   Df Model:                           38
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -542.31
Date:                Wed, 05 Feb 2025   Deviance:                       376.13
Time:                        19:40:24   Pearson chi2:                     328.
No. Iterations:                    71   Pseudo R-squ. (CS):             0.2609
Covariance Type:            nonrobust                                         
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const                0.2686      0.042  

In [21]:
# Load the remaining-fixtures CSV and drop the 'Location', 'Round Number' and
# 'Match Number' columns in a single chained step.
unused_fixture_columns = ['Location', 'Round Number', 'Match Number']
remaining_matches = pd.read_csv('remaining-fixtures.csv', encoding='utf-8').drop(
    columns=unused_fixture_columns
)

In [22]:
# Preview the first rows of the trimmed fixtures table.
fixtures_preview = remaining_matches.head()
print(fixtures_preview)

               Date       Home Team           Away Team  Result
0  07/02/2025 20:00  Rayo Vallecano  Real Valladolid CF     NaN
1  08/02/2025 13:00        RC Celta          Real Betis     NaN
2  08/02/2025 15:15   Athletic Club           Girona FC     NaN
3  08/02/2025 17:30   UD Las Palmas       Villarreal CF     NaN
4  08/02/2025 20:00     Real Madrid  Atlético de Madrid     NaN
