Importaciones

In [57]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import requests
from sklearn.dummy import DummyClassifier
from plotnine import *
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import gradio as gr
import pickle

API Token

In [58]:
api = "d073f379a0e94969beb4956bda52ef0f"
headers = {"X-Auth-Token": api}

Obtengo todos los datos

In [59]:
df = pd.DataFrame()
for year in range(2022, 2025):
    url = f'https://api.football-data.org/v4/competitions/PD/matches?season={year}'

    response = requests.get(url, headers=headers)
    data = response.json()

    matches = data["matches"]
    df_season = pd.DataFrame(matches)
    df = pd.concat([df, df_season], ignore_index=True)

Formateando columna 'season'

In [60]:
df['startDate'] = df['season'].apply(lambda x: x['startDate'])
df['endDate'] = df['season'].apply(lambda x: x['endDate'])

df['startDate'] = pd.to_datetime(df['startDate'])
df['endDate'] = pd.to_datetime(df['endDate'])
df['season'] = df['startDate'].dt.year.astype(str) + '/' + df['endDate'].dt.year.astype(str).str[-2:]

Formateando columnas 'awayTeam' y 'homeTeam'

In [61]:
df['homeTeam'] = df['homeTeam'].apply(lambda x: x['name'])
df['awayTeam'] = df['awayTeam'].apply(lambda x: x['name'])

Formateando columna 'score' y añadiendo columnas de goles de local y visitante, resultado media parte y resultado final

In [62]:
df['homeGoalsFullTime'] = df['score'].apply(lambda x: x['fullTime']).apply(lambda x: x["home"])
df['awayGoalsFullTime'] = df['score'].apply(lambda x: x['fullTime']).apply(lambda x: x["away"])

df['homeGoalsHalfTime'] = df['score'].apply(lambda x: x['halfTime']).apply(lambda x: x["home"])
df['awayGoalsHalfTime'] = df['score'].apply(lambda x: x['halfTime']).apply(lambda x: x["away"])

Formateando columna 'referees'

In [63]:
df["referees"] = df['referees'].apply(lambda x: x[0]['name'] if isinstance(x, list) and len(x) > 0 else None)

Eliminando columnas irrelevantes

In [64]:
df = df.drop(["startDate", "endDate", "competition", "area", "odds", "id", "utcDate", "stage", "group", "lastUpdated", "score"], axis=1)

Feature Engineering

In [65]:
teams = df["homeTeam"].unique()
seasons = df["season"].unique()

for team in teams:
    total_matches = df.loc[(df["awayTeam"] == team) | (df["homeTeam"] == team)]
    total_matches_finished = total_matches.loc[total_matches["status"] == "FINISHED"]
    total_matches_home = total_matches_finished.loc[total_matches_finished["homeTeam"] == team]
    total_matches_away = total_matches_finished.loc[total_matches_finished["awayTeam"] == team]
    total_matches_finished_len = len(total_matches_finished)
    wins = ((total_matches_finished["homeTeam"] == team) & (total_matches_finished["homeGoalsFullTime"] > total_matches_finished["awayGoalsFullTime"])).sum() + \
           ((total_matches_finished["awayTeam"] == team) & (total_matches_finished["awayGoalsFullTime"] > total_matches_finished["homeGoalsFullTime"])).sum()

    draws = (total_matches_finished["homeGoalsFullTime"] == total_matches_finished["awayGoalsFullTime"]).sum()

    losses = ((total_matches_finished["homeTeam"] == team) & (total_matches_finished["homeGoalsFullTime"] < total_matches_finished["awayGoalsFullTime"])).sum() + \
             ((total_matches_finished["awayTeam"] == team) & (total_matches_finished["awayGoalsFullTime"] < total_matches_finished["homeGoalsFullTime"])).sum()

    df.loc[df["homeTeam"] == team, "homeTeamWins"] = wins
    df.loc[df["homeTeam"] == team, "homeTeamDraws"] = draws
    df.loc[df["homeTeam"] == team, "homeTeamLosses"] = losses

    df.loc[df["awayTeam"] == team, "awayTeamWins"] = wins
    df.loc[df["awayTeam"] == team, "awayTeamDraws"] = draws
    df.loc[df["awayTeam"] == team, "awayTeamLosses"] = losses
    
    total_away_team_goals = df.loc[(df["awayTeam"] == team)]["awayGoalsFullTime"].sum()
    total_home_team_goals = df.loc[(df["homeTeam"] == team)]["homeGoalsFullTime"].sum()
    total_goals = total_away_team_goals + total_home_team_goals
    df.loc[df["homeTeam"] == team, "totalHomeTeamGoals"] = total_goals
    df.loc[df["awayTeam"] == team, "totalAwayTeamGoals"] = total_goals
    df.loc[df["homeTeam"] == team, "homeTeamGoalRatio"] = total_goals / total_matches_finished_len
    df.loc[df["awayTeam"] == team, "awayTeamGoalRatio"] = total_goals / total_matches_finished_len
    df.loc[df["homeTeam"] == team, "totalHomeTeamGoalsAtHome"] = total_home_team_goals
    df.loc[df["awayTeam"] == team, "totalAwayTeamGoalsAway"] = total_away_team_goals
    df.loc[df["homeTeam"] == team, "homeTeamGoalRatioAtHome"] = total_home_team_goals / len(total_matches_home)
    df.loc[df["awayTeam"] == team, "awayTeamGoalRatioAway"] = total_away_team_goals / len(total_matches_away)
    total_home_team_goals_conceded = df.loc[df["homeTeam"] == team]["awayGoalsFullTime"].sum()
    total_away_team_goals_conceded = df.loc[df["awayTeam"] == team]["homeGoalsFullTime"].sum()
    total_goals_conceded = total_away_team_goals_conceded + total_home_team_goals_conceded
    df.loc[df["homeTeam"] == team, "totalHomeTeamGoalsConceded"] = total_goals_conceded
    df.loc[df["awayTeam"] == team, "totalAwayTeamGoalsConceded"] = total_goals_conceded
    df.loc[df["homeTeam"] == team, "homeTeamGoalConcededRatio"] = total_goals_conceded / total_matches_finished_len
    df.loc[df["awayTeam"] == team, "awayTeamGoalConcededRatio"] = total_goals_conceded / total_matches_finished_len
    df.loc[df["homeTeam"] == team, "totalHomeTeamGoalsConcededAtHome"] = total_home_team_goals_conceded
    df.loc[df["awayTeam"] == team, "totalAwayTeamGoalsConcededAway"] = total_away_team_goals_conceded
    df.loc[df["homeTeam"] == team, "homeTeamGoalConcededRatioAtHome"] = total_home_team_goals_conceded / len(total_matches_home)
    df.loc[df["awayTeam"] == team, "awayTeamGoalConcededRatioAway"] = total_away_team_goals_conceded / len(total_matches_away)

    for season in seasons:
        total_matches_finished_per_season = total_matches_finished.loc[total_matches_finished["season"] == season]
        total_matches_finished_per_season_len = len(total_matches_finished_per_season)
        total_matches_home_per_season = total_matches_finished_per_season.loc[total_matches_finished_per_season["homeTeam"] == team]
        total_matches_away_per_season = total_matches_finished_per_season.loc[total_matches_finished_per_season["awayTeam"] == team]
        total_home_team_goals_per_season = df.loc[(df["homeTeam"] == team) & (df["season"] == season)]["homeGoalsFullTime"].sum()
        total_away_team_goals_per_season = df.loc[(df["awayTeam"] == team) & (df["season"] == season)]["awayGoalsFullTime"].sum()
        total_home_team_goals_conceded_per_season = df.loc[(df["homeTeam"] == team) & (df["season"] == season)]["awayGoalsFullTime"].sum()
        total_away_team_goals_conceded_per_season = df.loc[(df["awayTeam"] == team) & (df["season"] == season)]["homeGoalsFullTime"].sum()
        total_goals_per_season = total_home_team_goals_per_season + total_away_team_goals_per_season
        total_goals_conceded_per_season = total_home_team_goals_conceded_per_season + total_away_team_goals_conceded_per_season
        df.loc[(df["homeTeam"] == team) & (df["season"] == season), "totalHomeTeamGoalsperSeason"] = total_goals_per_season
        df.loc[(df["awayTeam"] == team) & (df["season"] == season), "totalAwayTeamGoalsperSeason"] = total_goals_per_season
        df.loc[(df["homeTeam"] == team) & (df["season"] == season), "homeTeamGoalRatioperSeason"] = total_goals_per_season / total_matches_finished_per_season_len
        df.loc[(df["awayTeam"] == team) & (df["season"] == season), "awayTeamGoalRatioperSeason"] = total_goals_per_season / total_matches_finished_per_season_len
        df.loc[(df["homeTeam"] == team) & (df["season"] == season), "totalHomeTeamGoalsAtHomeperSeason"] = total_home_team_goals_per_season
        df.loc[(df["awayTeam"] == team) & (df["season"] == season), "totalAwayTeamGoalsAwayperSeason"] = total_away_team_goals_per_season
        df.loc[(df["homeTeam"] == team) & (df["season"] == season), "homeTeamGoalRatioAtHome"] = total_home_team_goals / len(total_matches_home_per_season)
        df.loc[(df["awayTeam"] == team) & (df["season"] == season), "awayTeamGoalRatioAway"] = total_away_team_goals / len(total_matches_away_per_season)
        df.loc[(df["homeTeam"] == team) & (df["season"] == season), "totalHomeTeamGoalsConcededperSeason"] = total_goals_conceded_per_season
        df.loc[(df["awayTeam"] == team) & (df["season"] == season), "totalAwayTeamGoalsConcededperSeason"] = total_goals_conceded_per_season
        df.loc[(df["homeTeam"] == team) & (df["season"] == season), "homeTeamGoalConcededRatioperSeason"] = total_goals_conceded_per_season / total_matches_finished_per_season_len
        df.loc[(df["awayTeam"] == team) & (df["season"] == season), "awayTeamGoalConcededRatioperSeason"] = total_goals_conceded_per_season / total_matches_finished_per_season_len
        df.loc[(df["homeTeam"] == team) & (df["season"] == season), "totalHomeTeamGoalsConcededAtHomeperSeason"] = total_home_team_goals_per_season
        df.loc[(df["awayTeam"] == team) & (df["season"] == season), "totalAwayTeamGoalsConcededAwayperSeason"] = total_away_team_goals_per_season
        df.loc[(df["homeTeam"] == team) & (df["season"] == season), "homeTeamGoalConcededRatioAtHome"] = total_home_team_goals_conceded_per_season / len(total_matches_home_per_season)
        df.loc[(df["awayTeam"] == team) & (df["season"] == season), "awayTeamGoalConcededRatioAway"] = total_away_team_goals_conceded_per_season / len(total_matches_away_per_season)



Preprocesando

In [66]:
df_train = df.loc[df["status"] == "FINISHED"].loc[:, df.columns.drop(["awayGoalsHalfTime", "homeGoalsHalfTime", "season", "matchday", "status", "referees", "homeTeam", "awayTeam"])]
df_train.loc[df_train["homeGoalsFullTime"] > df_train["awayGoalsFullTime"], "winner"] = "WINNER_HOME"
df_train.loc[df_train["homeGoalsFullTime"] < df_train["awayGoalsFullTime"], "winner"] = "WINNER_AWAY"
df_train.loc[df_train["homeGoalsFullTime"] == df_train["awayGoalsFullTime"], "winner"] = "DRAW"
df_train = df_train.drop(columns=["homeGoalsFullTime", "awayGoalsFullTime"])
df_test = df.loc[df["status"] != "FINISHED"].loc[:, df.columns.drop(["awayGoalsHalfTime", "homeGoalsHalfTime", "homeGoalsFullTime", "awayGoalsFullTime", "season", "matchday", "status", "referees", "awayTeam", "homeTeam"])]

In [67]:
y = df_train["winner"]
x = df_train.drop(columns=["winner"])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

Entrenando el modelo

In [68]:
clf = LogisticRegression()
clf.fit(x_train, y_train)
y_train_pred = clf.predict(x_train)
y_test_pred = clf.predict(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [69]:
print("Confusion Matrix Training:\n", confusion_matrix(y_train, y_train_pred))
print("Confusion Matrix Test:\n", confusion_matrix(y_test, y_test_pred))

Confusion Matrix Training:
 [[ 37  36  82]
 [ 24  97  56]
 [ 21  41 215]]
Confusion Matrix Test:
 [[ 6 23 42]
 [10 41 18]
 [ 7 19 95]]


In [70]:
print("Classification Report Training:\n", classification_report(y_train, y_train_pred))
print("Classification Report Test:\n", classification_report(y_test, y_test_pred))

Classification Report Training:
               precision    recall  f1-score   support

        DRAW       0.45      0.24      0.31       155
 WINNER_AWAY       0.56      0.55      0.55       177
 WINNER_HOME       0.61      0.78      0.68       277

    accuracy                           0.57       609
   macro avg       0.54      0.52      0.52       609
weighted avg       0.55      0.57      0.55       609

Classification Report Test:
               precision    recall  f1-score   support

        DRAW       0.26      0.08      0.13        71
 WINNER_AWAY       0.49      0.59      0.54        69
 WINNER_HOME       0.61      0.79      0.69       121

    accuracy                           0.54       261
   macro avg       0.46      0.49      0.45       261
weighted avg       0.49      0.54      0.50       261



In [71]:
y_test_prediction = clf.predict(df_test)

In [72]:
home_teams_test = df.loc[df["status"] != "FINISHED"]["homeTeam"]
away_teams_test = df.loc[df["status"] != "FINISHED"]["awayTeam"]

home_teams_train = df.loc[df["status"] == "FINISHED"]["homeTeam"]
away_teams_train = df.loc[df["status"] == "FINISHED"]["awayTeam"]

overview_test = pd.DataFrame()
overview_test["homeTeam"] = home_teams_test
overview_test["awayTeam"] = away_teams_test
overview_test["winner"] = y_test_prediction

Guardando el modelo

In [73]:
with open('modelo.clf', 'wb') as file:
    pickle.dump(clf, file)