### Este notebook ayuda a visualizar y modularizar el proceso completo de extracción, carga y transformación de los datos

In [1]:
# Módulo con los import necesarios para el proyecto
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Cargar el archivo CSV en un DataFrame de pandas
results_df = pd.read_csv('../data/results.csv')

# Mostrar las primeras filas del DataFrame
results_df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,competition,stadium,city,country,neutral,world_cup
0,1871-03-27,Scotland,England,1,0,1871 Scotland v England International,Raeburn Place,Edinburgh,Scotland,False,False
1,1872-02-05,England,Scotland,2,1,1871-72 Home Nations International,The Oval,London,England,False,False
2,1873-03-03,Scotland,England,0,0,1872-73 Home Nations International,West of Scotland F.C.,Glasgow,Scotland,False,False
3,1874-02-23,England,Scotland,1,0,1873-74 Home Nations International,The Oval,London,England,False,False
4,1875-02-15,England,Ireland,2,0,1874-75 Home Nations rugby union matches,The Oval,London,England,False,False


In [3]:
# Mostrar información general del dataframe
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2783 entries, 0 to 2782
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date         2783 non-null   object
 1   home_team    2783 non-null   object
 2   away_team    2783 non-null   object
 3   home_score   2783 non-null   int64 
 4   away_score   2783 non-null   int64 
 5   competition  2760 non-null   object
 6   stadium      2783 non-null   object
 7   city         2783 non-null   object
 8   country      2783 non-null   object
 9   neutral      2783 non-null   bool  
 10  world_cup    2783 non-null   bool  
dtypes: bool(2), int64(2), object(7)
memory usage: 201.2+ KB


In [4]:
# El rugby profesional comenzó en 1995, por lo que debemos filtrar los datos para incluir solo los partidos desde 1995
results_df = results_df[results_df['date'] > '1995-01-01']

results_df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,competition,stadium,city,country,neutral,world_cup
1387,1995-01-21,France,Wales,21,9,1995 Five Nations Championship,Parc des Princes,Paris,France,False,False
1388,1995-01-21,Ireland,England,8,20,1995 Five Nations Championship,Lansdowne Road,Dublin,Ireland,False,False
1389,1995-02-04,England,France,31,10,1995 Five Nations Championship,Twickenham,London,England,False,False
1390,1995-02-04,Scotland,Ireland,26,13,1995 Five Nations Championship,Murrayfield Stadium,Edinburgh,Scotland,False,False
1391,1995-02-18,Wales,England,9,23,1995 Five Nations Championship,Cardiff Arms Park,Cardiff,Wales,False,False


In [5]:
# Función para indicar si el partido fue ganado por el equipo local o visitante o si fue un empate
def get_winner_loser(row): 
    if row['home_score'] > row['away_score']:
        winner = row['home_team']
        loser = row['away_team']
    elif row['home_score'] < row['away_score']:
        winner = row['away_team']
        loser = row['home_team']
    else:
        winner = 'draw'
        loser = 'draw'
    return winner, loser

In [6]:
# Función para medir el rendimiento de un equipo en función de los partidos ganados, perdidos y empatados
def get_performance(results_df, team_name, current_row_index, n_games):
    team_performance = []
    games = results_df.copy()
    games_sliced = games.iloc[:current_row_index]

    for index in range(len(games_sliced)-1, -1, -1):
        current_row = games_sliced.iloc[index]

        if current_row['home_team'] == team_name or current_row['away_team'] == team_name:
            if current_row['winner'] == team_name:
                team_performance.append(1)
            elif current_row['winner'] == 'draw':
                team_performance.append(0.5)
            else:
                team_performance.append(0)
    return sum(team_performance[:n_games])      

In [7]:
# Aplicamos la función get_winner_loser a cada fila del DataFrame
results_df[['winner', 'loser']] = results_df.apply(get_winner_loser, axis=1).apply(pd.Series)

# Inicializamos las columnas para los puntos tanto de local como de visitante
results_df['ranking_home_points'] = 0
results_df['ranking_away_points'] = 0

# Añadimos una columna para el margen de victoria del equipo local
results_df['margin'] = results_df['home_score'] - results_df['away_score']

# Añadimos una columna para el resultado del partido definiendo si gana el equipo local o visitante
results_df['result'] = results_df['margin'].apply(lambda x: 'home_win' if x > 0 else ('away_win' if x == 0 else 'draw'))

# Inicializamos el ranking de los equipos empezando en 80 todos 
ranking_points = { 'Scotland': 80, 'England': 80, 'Ireland': 80, 'Wales': 80, 'France': 80, 'Italy': 80, 'South Africa': 80, 'New Zealand': 80, 'Australia': 80, 'Argentina': 80}

for i, row in results_df.iterrows():
    # Obtenemos el nombre del equipo local y visitante
    home_team = row['home_team']
    away_team = row['away_team']

    # Actualizamos los puntos de ranking de los equipos local y visitante
    results_df.at[i, 'ranking_home_points'] = ranking_points[home_team]
    results_df.at[i, 'ranking_away_points'] = ranking_points[away_team]
    if row['neutral'] == True: 
        home_points = ranking_points[home_team]
    else: 
        home_points = ranking_points[home_team] + 3
    away_points = ranking_points[away_team]

    gap = home_points - away_points
    if gap < -10: 
        gap = -10
    elif gap > 10:
        gap = 10
    
    if row['winner'] == home_team: 
        core = 1 - (gap*0.1)
    elif row['winner'] == 'draw': 
        core = gap*0.1
    else: 
        core = 1 + (gap*0.1)

    if np.abs(row['home_score'] - row['away_score']) > 15:
        core = core*1.5

    if row['world_cup'] == True:
        core = core*2
    
    if row['winner'] != 'draw': 
        ranking_points[row['winner']] += core
        ranking_points[row['loser']] -= core
    else:
        ranking_points[home_team] -= core
        ranking_points[away_team] += core

results_df.reset_index(drop=True, inplace=True)

for index, row in results_df.iterrows():
    results_df.at[index, 'home_performance'] = get_performance(results_df, row['home_team'], index, 5)
    results_df.at[index, 'away_performance'] = get_performance(results_df, row['away_team'], index, 5)

  results_df.at[i, 'ranking_home_points'] = ranking_points[home_team]
  results_df.at[i, 'ranking_away_points'] = ranking_points[away_team]


### A continuación podemos ver el proceso de predicción del margen de puntos entre dos equipos

In [8]:
# Módulo con los import necesarios para el proyecto
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Se va a aconsiderar a partir de 1996 para que haya datos de ranking
data = results_df.copy()
data = data[data['date'] > '1996-01-01']

# Convertir las columnas 'neutral' y 'world_cup' a tipo entero
data['neutral'] = data['neutral'].astype(int)
data['world_cup'] = data['world_cup'].astype(int)

# Dividir los datos en conjuntos de entrenamiento(70%) y prueba(30%)
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

X_train = train_data[['neutral', 'world_cup', 'ranking_home_points', 'ranking_away_points', 'home_performance', 'away_performance']]
y_train = train_data['margin']

X_test = test_data[['neutral', 'world_cup', 'ranking_home_points', 'ranking_away_points', 'home_performance', 'away_performance']]
y_test = test_data['margin']

# Inicializar el escalador
scaler = StandardScaler()

# Escalar las características de entrada
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Support Vector Regression': SVR(),
}

for model_name, model in models.items():
    # Entrenar el modelo con el conjunto de entrenamiento
    model.fit(X_train_scaled, y_train)

    # Realizar predicciones sobre el conjunto de prueba
    y_pred = model.predict(X_test_scaled)
    
    # Calcular el error cuadrático medio y el coeficiente de determinación R^2
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Model: {model_name}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R^2 Score: {r2:.2f}")
    print("=" * 40)

Model: Linear Regression
Mean Squared Error: 212.81
R^2 Score: 0.45
Model: Random Forest
Mean Squared Error: 230.72
R^2 Score: 0.40
Model: Support Vector Regression
Mean Squared Error: 262.35
R^2 Score: 0.32


### A continuación veremos la predicción del resultado de un partido con varios modelos

In [10]:
# Cambiar la columna de predicción
y_train = train_data['result']
y_test = test_data['result']

### Proceso de predicción con un Clasificador Random Forest

In [11]:
# Inicializar el modelo Random Forest Classifier
rfc = RandomForestClassifier(random_state=42)
# Entrenar el modelo con el conjunto de entrenamiento
rfc.fit(X_train_scaled, y_train)

# Realizar predicciones sobre el conjunto de prueba
y_pred_rfc = rfc.predict(X_test_scaled)

# Calcular la precisión y el informe de clasificación
accuracy_rfc = accuracy_score(y_test, y_pred_rfc)
classification_rep_rfc = classification_report(y_test, y_pred_rfc)

print("Random Forest Classifier Results")
print(f"Accuracy: {accuracy_rfc:.2f}")
print("Classification Report:")
print(classification_rep_rfc)

Random Forest Classifier Results
Accuracy: 0.71
Classification Report:
              precision    recall  f1-score   support

    away_win       0.00      0.00      0.00         6
        draw       0.64      0.63      0.64       157
    home_win       0.75      0.78      0.77       245

    accuracy                           0.71       408
   macro avg       0.46      0.47      0.47       408
weighted avg       0.70      0.71      0.70       408



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Proceso de predicción con una Regresión Logística

In [12]:
# Inicializar el modelo Logistic Regression
lr = LogisticRegression(random_state=42)
# Entrenar el modelo con el conjunto de entrenamiento
lr.fit(X_train_scaled, y_train)

# Realizar predicciones sobre el conjunto de prueba
y_pred_lr = lr.predict(X_test_scaled)

# Calcular la precisión y el informe de clasificación
accuracy_lr = accuracy_score(y_test, y_pred_lr)
classification_rep_lr = classification_report(y_test, y_pred_lr)

print("Logistic Regression Results")
print(f"Accuracy: {accuracy_lr:.2f}")
print("Classification Report:")
print(classification_rep_lr)

Logistic Regression Results
Accuracy: 0.75
Classification Report:
              precision    recall  f1-score   support

    away_win       0.00      0.00      0.00         6
        draw       0.68      0.69      0.69       157
    home_win       0.79      0.80      0.79       245

    accuracy                           0.75       408
   macro avg       0.49      0.50      0.49       408
weighted avg       0.73      0.75      0.74       408



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Proceso de predicción con un Clasificador Decission Tree

In [13]:
# Inicializar el modelo Decission Tree Classifier
dtc = DecisionTreeClassifier(random_state=42)
# Entrenar el modelo con el conjunto de entrenamiento
dtc.fit(X_train_scaled, y_train)

# Realizar predicciones sobre el conjunto de prueba
y_pred_dtc = dtc.predict(X_test_scaled)

# Calcular la precisión y el informe de clasificación
accuracy_dtc = accuracy_score(y_test, y_pred_dtc)
classification_rep_dtc = classification_report(y_test, y_pred_dtc)

print("Decision Tree Classifier Results")
print(f"Accuracy: {accuracy_dtc:.2f}")
print("Classification Report:")
print(classification_rep_dtc)

Decision Tree Classifier Results
Accuracy: 0.63
Classification Report:
              precision    recall  f1-score   support

    away_win       0.00      0.00      0.00         6
        draw       0.56      0.60      0.58       157
    home_win       0.72      0.67      0.70       245

    accuracy                           0.63       408
   macro avg       0.43      0.42      0.42       408
weighted avg       0.65      0.63      0.64       408

