# **Desarrollo algoritmo de predicción de los torneos de baloncesto de la NCAA de 2025 Liga Femenina**

**Primer paso:** Procedo a cargar los datos del archivo

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
import itertools

# Cargar datos de temporada regular
df = pd.read_csv("WRegularSeasonDetailedResults.csv")

**Ahora procedo a desarrollar la visualización de los parámetros que tengo en el archivo csv teniendo:**

In [2]:
df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2010,11,3103,63,3237,49,H,0,23,54,...,13,6,10,11,27,11,23,7,6,19
1,2010,11,3104,73,3399,68,N,0,26,62,...,21,14,27,14,26,7,20,4,2,27
2,2010,11,3110,71,3224,59,A,0,29,62,...,14,19,23,17,23,8,15,6,0,15
3,2010,11,3111,63,3267,58,A,0,27,52,...,26,16,25,22,22,15,11,14,5,14
4,2010,11,3119,74,3447,70,H,1,30,74,...,17,11,21,21,32,12,14,4,2,14


**Posteriormente calculo las estadísticas por equipo y desarrollo el siguiente paso:**

In [3]:
winner_stats = df.groupby('WTeamID').agg({
    'WScore': ['sum', 'count'],
    'LScore': 'sum',
    'WFGM': 'sum', 'WFGA': 'sum',
    'WFGM3': 'sum', 'WFGA3': 'sum',
    'WFTM': 'sum', 'WFTA': 'sum',
    'WOR': 'sum', 'WDR': 'sum',  # Corregido WOR -> WOR, WDR -> WDR
    'WAst': 'sum', 'WTO': 'sum',
    'WStl': 'sum', 'WBlk': 'sum',
    'WPF': 'sum'
}).reset_index()
winner_stats.columns = ['TeamID', 'WScore_sum', 'games_won', 'LScore_sum_w',
                        'WFGM_sum', 'WFGA_sum', 'WFGM3_sum', 'WFGA3_sum',
                        'WFTM_sum', 'WFTA_sum', 'WOR_sum', 'WDR_sum',
                        'WAst_sum', 'WTO_sum', 'WStl_sum', 'WBlk_sum', 'WPF_sum']

loser_stats = df.groupby('LTeamID').agg({
    'LScore': ['sum', 'count'],
    'WScore': 'sum',
    'LFGM': 'sum', 'LFGA': 'sum',
    'LFGM3': 'sum', 'LFGA3': 'sum',
    'LFTM': 'sum', 'LFTA': 'sum',
    'LOR': 'sum', 'LDR': 'sum',  # Corregido LOR -> LOR, LDR -> LDR
    'LAst': 'sum', 'LTO': 'sum',
    'LStl': 'sum', 'LBlk': 'sum',
    'LPF': 'sum'
}).reset_index()
loser_stats.columns = ['TeamID', 'LScore_sum', 'games_lost', 'WScore_sum_l',
                       'LFGM_sum', 'LFGA_sum', 'LFGM3_sum', 'LFGA3_sum',
                       'LFTM_sum', 'LFTA_sum', 'LOR_sum', 'LDR_sum',
                       'LAst_sum', 'LTO_sum', 'LStl_sum', 'LBlk_sum', 'LPF_sum']

# Combinar estadísticas
team_stats = pd.merge(winner_stats, loser_stats, on='TeamID', how='outer').fillna(0)

# Calcular estadísticas totales
team_stats['games'] = team_stats['games_won'] + team_stats['games_lost']
team_stats['points_scored'] = team_stats['WScore_sum'] + team_stats['LScore_sum']
team_stats['points_allowed'] = team_stats['LScore_sum_w'] + team_stats['WScore_sum_l']

# Calcular estadísticas avanzadas
team_stats['FGM'] = team_stats['WFGM_sum'] + team_stats['LFGM_sum']
team_stats['FGA'] = team_stats['WFGA_sum'] + team_stats['LFGA_sum']
team_stats['FGM3'] = team_stats['WFGM3_sum'] + team_stats['LFGM3_sum']
team_stats['FGA3'] = team_stats['WFGA3_sum'] + team_stats['LFGA3_sum']
team_stats['FTM'] = team_stats['WFTM_sum'] + team_stats['LFTM_sum']
team_stats['FTA'] = team_stats['WFTA_sum'] + team_stats['LFTA_sum']
team_stats['ORB'] = team_stats['WOR_sum'] + team_stats['LOR_sum']  # Rebotes ofensivos
team_stats['DRB'] = team_stats['WDR_sum'] + team_stats['LDR_sum']  # Rebotes defensivos
team_stats['Ast'] = team_stats['WAst_sum'] + team_stats['LAst_sum']
team_stats['TO'] = team_stats['WTO_sum'] + team_stats['LTO_sum']
team_stats['Stl'] = team_stats['WStl_sum'] + team_stats['LStl_sum']
team_stats['Blk'] = team_stats['WBlk_sum'] + team_stats['LBlk_sum']
team_stats['PF'] = team_stats['WPF_sum'] + team_stats['LPF_sum']

# Calcular promedios y porcentajes
team_stats['PPG'] = team_stats['points_scored'] / team_stats['games']
team_stats['PPG_allowed'] = team_stats['points_allowed'] / team_stats['games']
team_stats['FG%'] = team_stats['FGM'] / team_stats['FGA']
team_stats['3P%'] = team_stats['FGM3'] / team_stats['FGA3']
team_stats['FT%'] = team_stats['FTM'] / team_stats['FTA']

**Ahora genero los diferentes promedios teniendo:**

In [4]:
# Promedios por juego
for stat in ['ORB', 'DRB', 'Ast', 'TO', 'Stl', 'Blk', 'PF']:
    team_stats[f'{stat}_pg'] = team_stats[stat] / team_stats['games']

**Y procedo a preparar los datos de entrenamiento, teniendo:**

In [5]:
X_train = []
y_train = []
stats = ['PPG', 'PPG_allowed', 'FG%', '3P%', 'FT%', 'ORB_pg', 'DRB_pg',
         'Ast_pg', 'TO_pg', 'Stl_pg', 'Blk_pg', 'PF_pg']

team_dict = team_stats.set_index('TeamID').to_dict('index')

for _, row in df.iterrows():
    wteam = row['WTeamID']
    lteam = row['LTeamID']
    wloc = row['WLoc']

    # Mapeo de ubicación
    loc_map = {'H': 1, 'A': -1, 'N': 0}
    loc_val = loc_map.get(wloc, 0)

    # Características para el ganador como equipo A
    features = [team_dict[wteam][s] for s in stats] + [team_dict[lteam][s] for s in stats] + [loc_val]
    X_train.append(features)
    y_train.append(1)  # Equipo A (ganador) gana

    # Características para el perdedor como equipo A
    features = [team_dict[lteam][s] for s in stats] + [team_dict[wteam][s] for s in stats] + [-loc_val if loc_val != 0 else 0]
    X_train.append(features)
    y_train.append(0)  # Equipo A (perdedor) pierde


**Ahora procedo a entrenar el modelo, cargar las semillas y procesar las semillas teniendo:**

In [10]:
# Entrenar modelo
model = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Cargar semillas del torneo
seeds_df = pd.read_csv("WNCAATourneySeeds.csv")
seeds_2025 = seeds_df[seeds_df['Season'] == 2025].copy()

# Procesar semillas
def parse_seed(seed):
    seed_num = seed[1:]
    if seed_num[-1].isalpha():
        return int(seed_num[:-1]), seed_num[-1]
    return int(seed_num), None

seeds_2025['seed_num'] = seeds_2025['Seed'].apply(lambda x: parse_seed(x)[0])
seeds_2025['playin'] = seeds_2025['Seed'].apply(lambda x: parse_seed(x)[1])

**Desarrollo ahora una función para la predicción de partidos**

In [11]:
# Función para predecir partidos
def predict_game(team1, team2, location=0):
    features = [team_dict[team1][s] for s in stats] + [team_dict[team2][s] for s in stats] + [location]
    prob = model.predict_proba([features])[0][1]
    return (team1 if prob >= 0.5 else team2, prob)

**Ahora genero la simulación del play-in**

In [13]:
# Simular First Four (Play-in)
playin_games = seeds_2025[seeds_2025['playin'].notnull()]
playin_winners = {}

for _, row in playin_games.iterrows():
    region = row['Seed'][0]
    seed_num = row['seed_num']
    key = (region, seed_num)

    if key not in playin_winners:
        # Encontrar el otro equipo del play-in
        opponent = seeds_2025[
            (seeds_2025['seed_num'] == seed_num) &
            (seeds_2025['playin'].notnull()) &
            (seeds_2025['TeamID'] != row['TeamID'])
        ]

        if not opponent.empty:
            opponent_id = opponent.iloc[0]['TeamID']
            winner, prob = predict_game(row['TeamID'], opponent_id)
            playin_winners[key] = winner
            print(f"Play-in {region}{seed_num}: {row['TeamID']} vs {opponent_id} -> Ganador: {winner} ({prob:.2%})")


Play-in W11: 3162 vs 3449 -> Ganador: 3449 (37.12%)
Play-in X11: 3235 vs 3162 -> Ganador: 3235 (78.70%)
Play-in X16: 3219 vs 3456 -> Ganador: 3219 (62.84%)
Play-in Y16: 3380 vs 3219 -> Ganador: 3219 (31.97%)


**Procedo a construir el bracket principal**

In [14]:
bracket = {}
regions = seeds_2025['Seed'].str[0].unique()

for region in regions:
    region_teams = {}

    # Agregar equipos principales (no play-in)
    main_teams = seeds_2025[
        (seeds_2025['Seed'].str.startswith(region)) &
        (seeds_2025['playin'].isnull())
    ]
    for _, row in main_teams.iterrows():
        region_teams[row['seed_num']] = row['TeamID']

    # Agregar ganadores de play-in
    for (r, seed_num), winner in playin_winners.items():
        if r == region:
            region_teams[seed_num] = winner

    bracket[region] = region_teams

**Función para simular región**

In [15]:
def simulate_region(teams_dict):
    seeds = sorted(teams_dict.keys())
    teams = [teams_dict[s] for s in seeds]

    # Función para simular ronda
    def simulate_round(matchups):
        winners = []
        for i, j in matchups:
            winner, prob = predict_game(teams[i], teams[j])
            winners.append(winner)
        return winners

    # Round of 64
    matchups = [(0, 15), (7, 8), (4, 11), (3, 12),
                (5, 10), (2, 13), (6, 9), (1, 14)]
    winners = simulate_round(matchups)

    # Sweet 16
    matchups = [(0, 1), (2, 3), (4, 5), (6, 7)]
    sweet16 = simulate_round(matchups)

    # Elite Eight
    matchups = [(0, 1), (2, 3)]
    elite8 = simulate_round(matchups)

    # Final Four de región
    champion, prob = predict_game(elite8[0], elite8[1])
    return champion

**Desarrollo de la Simulación de Todas las Regiones**

In [16]:
region_champions = {}
for region, teams_dict in bracket.items():
    champion = simulate_region(teams_dict)
    region_champions[region] = champion
    print(f"\n🏆 Campeón de la región {region}: {champion}")


🏆 Campeón de la región W: 3376

🏆 Campeón de la región X: 3323

🏆 Campeón de la región Y: 3124

🏆 Campeón de la región Z: 3163


**Simulación Semifinales**

In [17]:
print("\n=== FINAL FOUR ===")
semifinal_winners = []
ff_matchups = list(itertools.combinations(region_champions.keys(), 2))[:2]  # Tomar 2 matchups

for i, (reg1, reg2) in enumerate(ff_matchups):
    team1 = region_champions[reg1]
    team2 = region_champions[reg2]
    winner, prob = predict_game(team1, team2)
    semifinal_winners.append(winner)
    print(f"Semifinal {i+1}: {reg1} ({team1}) vs {reg2} ({team2}) -> Ganador: {winner} ({prob:.2%})")


=== FINAL FOUR ===
Semifinal 1: W (3376) vs X (3323) -> Ganador: 3376 (53.20%)
Semifinal 2: W (3376) vs Y (3124) -> Ganador: 3124 (32.39%)


**Simulación Final Nacional**

In [18]:
champion, prob = predict_game(semifinal_winners[0], semifinal_winners[1])
print("\n=== CAMPEONATO NACIONAL ===")
print(f"Finalista 1: {semifinal_winners[0]}")
print(f"Finalista 2: {semifinal_winners[1]}")
print(f"🎉 CAMPEÓN NACIONAL: {champion} (probabilidad: {prob:.2%})")


=== CAMPEONATO NACIONAL ===
Finalista 1: 3376
Finalista 2: 3124
🎉 CAMPEÓN NACIONAL: 3124 (probabilidad: 32.39%)


**Generación Archivo de Predicción Campeonato Masculino**

In [19]:
pred_df = pd.DataFrame({
    'Season': 2025,
    'Champion': [champion],
    'Probability': [prob]
})
pred_df.to_csv('tourney_predictions_2025.csv', index=False)
print("\nPredicciones guardadas en tourney_predictions_2025.csv")


Predicciones guardadas en tourney_predictions_2025.csv
