In [10]:
import pandas as pd
import numpy as np
from itertools import permutations
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
stats = pd.read_csv("../../submissions/unique_season_stats_and_tourney_wins.csv") 
stats.sample(10)

Unnamed: 0,Season,TeamID,wins,losses,total_games,win_percentage,OrdinalRank,Score,FGM,FGA,FTM,FTA,PF,tournament_wins,FG%,DR_per_gm,OR_per_gm,Stl_per_gm,Blk_per_gm,FT%
5438,2003,1138,5.0,23.0,28.0,0.178571,300.0,1815.0,652.0,1622.0,360.0,536.0,636.0,0.0,0.401973,22.642857,13.571429,6.428571,2.071429,0.671642
11480,2020,1350,24.0,7.0,31.0,0.774194,232.0,2335.0,847.0,1818.0,392.0,505.0,481.0,0.0,0.465897,27.096774,7.548387,8.096774,2.483871,0.776238
7247,2008,1304,19.0,12.0,31.0,0.612903,85.0,2072.0,742.0,1622.0,412.0,618.0,508.0,0.0,0.45746,23.741935,10.322581,8.387097,2.806452,0.666667
6165,2005,1218,16.0,13.0,29.0,0.551724,66.0,1990.0,748.0,1708.0,316.0,478.0,486.0,0.0,0.437939,21.172414,13.034483,6.724138,4.068966,0.661088
1735,1991,1109,2.0,23.0,25.0,0.08,,,,,,,,0.0,,,,,,
1448,1990,1114,20.0,9.0,29.0,0.689655,,,,,,,,0.0,,,,,,
11757,2021,1278,14.0,15.0,29.0,0.482759,12.0,2103.0,704.0,1794.0,488.0,659.0,538.0,0.0,0.392419,23.689655,9.482759,6.310345,4.62069,0.740516
9127,2013,1459,12.0,19.0,31.0,0.387097,254.0,1771.0,632.0,1628.0,333.0,455.0,536.0,0.0,0.388206,22.193548,8.806452,7.225806,2.225806,0.731868
4893,2001,1245,22.0,9.0,31.0,0.709677,,,,,,,,1.0,,,,,,
6167,2005,1220,21.0,8.0,29.0,0.724138,138.0,2113.0,746.0,1647.0,406.0,622.0,564.0,0.0,0.452945,24.172414,12.517241,7.862069,4.655172,0.652733


In [8]:
def calculate_win_probability(team1, team2, teams_df):

    team1_stats = teams_df.loc[teams_df['TeamID'] == team1].iloc[0]
    team2_stats = teams_df.loc[teams_df['TeamID'] == team2].iloc[0]
    
    # Feature vector: difference in key stats between team1 and team2
    stats_diff = [
        team1_stats['win_percentage'] - team2_stats['win_percentage'],
        team2_stats['OrdinalRank'] - team1_stats['OrdinalRank'],
        team1_stats['FG%'] - team2_stats['FG%'],
        team1_stats['DR_per_gm'] - team2_stats['DR_per_gm'],
        team1_stats['OR_per_gm'] - team2_stats['OR_per_gm'],
        team1_stats['Stl_per_gm'] - team2_stats['Stl_per_gm'],
        team1_stats['Blk_per_gm'] - team2_stats['Blk_per_gm'],
        team1_stats['FT%'] - team2_stats['FT%'],
    ]
    
    # Simple logistic regression model to predict the probability
    model = LogisticRegression()
    model.coef_ = np.array([[0.5, 0.3, 0.2, 0.1, 0.3, 0.4, 0.3, 0.2, 0.2]])  # Example coefficients (my own belief --> need to find actual)
    model.intercept_ = np.array([0.0])  # Simplified intercept
    

    prob_team1_wins = model.predict_proba([stats_diff])[0][1]
    
    # Return the predicted probability that team1 wins
    return prob_team1_wins

In [11]:
# Generate all matchups (every possible combination of teams)
matchups = []
for i in range(len(stats)):
    for j in range(i + 1, len(stats)):
        team1 = stats.iloc[i]['TeamID']
        team2 = stats.iloc[j]['TeamID']

        if team1 < team2:
            # Probability that team1 will win
            prob_team1_wins = calculate_win_probability(team1, team2, stats)
            matchups.append({
                'ID': f"2025_{team1}_{team2}",
                'Pred': prob_team1_wins
            })
        else:
            # Calculate the probability that team2 will win (because team2 has a higher TeamID)
            prob_team2_wins = calculate_win_probability(team2, team1, stats)
            matchups.append({
                'ID': f"2025_{team2}_{team1}",
                'Pred': 1 - prob_team2_wins  # Predict that the second team wins
            })
submission_df = pd.DataFrame(matchups)

print(submission_df.head())


AttributeError: 'LogisticRegression' object has no attribute 'classes_'

In [14]:
stats['tournament_wins'] = stats['tournament_wins'].fillna(0)  # Replace NaN values with 0
stats['tournament_wins'] = stats['tournament_wins'].replace(0, 1e-5)  # Replace 0 wins with a small value to avoid divide by zero

# Normalize tournament wins
stats['normalized_tournament_wins'] = stats['tournament_wins'] / stats['tournament_wins'].sum()

# Create a dictionary of teams and their normalized win probabilities
team_probs = dict(zip(stats['TeamID'], stats['normalized_tournament_wins']))

# Initialize a transition matrix (size num_teams x num_teams)
num_teams = len(stats)
transition_matrix = np.zeros((num_teams, num_teams))

# Fill the transition matrix
for i, row in stats.iterrows():
    team_id = row['TeamID']
    for j in stats['TeamID']:
        # Higher transition probability for a team with a higher historical win percentage
        transition_matrix[i, j] = team_probs[j]  # Higher probabilities for teams with more wins

# Normalize the matrix to ensure the rows sum to 1 (transition probabilities)
# Check to prevent division by zero
row_sums = transition_matrix.sum(axis=1, keepdims=True)
row_sums[row_sums == 0] = 1  # Prevent division by zero

transition_matrix /= row_sums  # Now normalize the matrix

# Print the transition matrix to verify (it should be normalized and each row should sum to 1)
print(transition_matrix)


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [17]:
def simulate_matchup(team1, team2, transition_matrix):
    # Randomly select the winner based on transition probabilities
    team1_idx = stats[stats['TeamID'] == team1].index[0]  # Get the index for team1
    team2_idx = stats[stats['TeamID'] == team2].index[0]  # Get the index for team2

    # Probability of team1 winning
    p_team1_wins = transition_matrix[team1_idx, team2_idx]
    
    # Randomly select the winner based on the probabilities
    return team1 if np.random.rand() < p_team1_wins else team2

def simulate_tournament(teams, transition_matrix):
    while len(teams) > 1:
        next_round = []
        
        # If odd number of teams, the last team automatically advances
        if len(teams) % 2 != 0:
            next_round.append(teams[-1])
            teams = teams[:-1]  # Remove the last team for pairing
        
        for i in range(0, len(teams), 2):
            winner = simulate_matchup(teams[i], teams[i + 1], transition_matrix)
            next_round.append(winner)
        
        teams = next_round

    return teams[0]  # Return the final winner

# Simulate the 2025 tournament
teams = stats['TeamID'].tolist()  # List of all teams
champion = simulate_tournament(teams, transition_matrix)
print(f"The predicted champion for the 2025 tournament is Team {champion}")


The predicted champion for the 2025 tournament is Team 1315


In [19]:
# Step 1: Ensure we use only the latest entry for each team
# Assuming you have a 'Year' column, if not, you can filter by the most recent rows
latest_teams_data = stats.loc[stats.groupby('TeamID')['Season'].idxmax()]  # Get the most recent entry for each team

# Now we extract the TeamIDs from the most recent data
teams = latest_teams_data['TeamID'].tolist()

# Step 2: Modify the simulate_matchup function to correctly reference the team's most recent data
def simulate_matchup(team1, team2, transition_matrix):
    # Get the most recent data for each team
    team1_idx = latest_teams_data[latest_teams_data['TeamID'] == team1].index[0]  # Get the index for team1
    team2_idx = latest_teams_data[latest_teams_data['TeamID'] == team2].index[0]  # Get the index for team2

    # Probability of team1 winning
    p_team1_wins = transition_matrix[team1_idx, team2_idx]
    
    # Randomly select the winner based on the probabilities
    return team1 if np.random.rand() < p_team1_wins else team2

# Step 3: Simulate the tournament
def simulate_tournament(teams, transition_matrix):
    while len(teams) > 1:
        next_round = []
        
        # If odd number of teams, the last team automatically advances
        if len(teams) % 2 != 0:
            next_round.append(teams[-1])
            teams = teams[:-1]  # Remove the last team for pairing
        
        for i in range(0, len(teams), 2):
            winner = simulate_matchup(teams[i], teams[i + 1], transition_matrix)
            next_round.append(winner)
        
        teams = next_round

    return teams[0]  # Return the final winner

# Simulate the 2025 tournament
champion = simulate_tournament(teams, transition_matrix)
print(f"The predicted champion for the 2025 tournament is Team {champion}")


The predicted champion for the 2025 tournament is Team 1348


In [20]:


# Step 1: Generate all possible matchups
team_ids = latest_teams_data['TeamID'].tolist()

# List to store predictions
predictions = []

# Step 2: Simulate each matchup
for i in range(len(team_ids)):
    for j in range(i + 1, len(team_ids)):  # Ensure unique matchups (team i vs team j where i < j)
        team1 = team_ids[i]
        team2 = team_ids[j]
        
        # Get the indexes for team1 and team2
        team1_idx = latest_teams_data[latest_teams_data['TeamID'] == team1].index[0]
        team2_idx = latest_teams_data[latest_teams_data['TeamID'] == team2].index[0]
        
        # Get the probability of the lower TeamID winning
        if team1 < team2:
            matchup_id = f"2025_{team1}_{team2}"
            pred = transition_matrix[team1_idx, team2_idx]
        else:
            matchup_id = f"2025_{team2}_{team1}"
            pred = transition_matrix[team2_idx, team1_idx]
        
        # Append the prediction to the list
        predictions.append([matchup_id, pred])

# Step 3: Create a DataFrame to store the predictions
predictions_df = pd.DataFrame(predictions, columns=["ID", "Pred"])

# Step 4: Save predictions to a CSV file
predictions_df.to_csv("2025_tournament_predictions.csv", index=False)

# Output preview of the generated CSV
print(predictions_df.head())


               ID  Pred
0  2025_1101_1102   0.0
1  2025_1101_1103   0.0
2  2025_1101_1104   0.0
3  2025_1101_1105   0.0
4  2025_1101_1106   0.0


In [26]:

season_features = ['win_percentage', 'FG%', 'DR_per_gm', 'OR_per_gm', 'Stl_per_gm','Blk_per_gm', 'FT%']  

# Normalize season stats by dividing each feature by the max value
for feature in season_features:
    stats[f'{feature}_norm'] = stats[feature] / stats[feature].max()

# Calculate correlation between each team's normalized season statistics and tournament wins
corrs = {}
for feature in season_features:
    corr = stats[f'{feature}_norm'].corr(stats['tournament_wins'])
    corrs[feature] = corr

# Create a transition matrix based on season statistics and tournament wins
num_teams = len(stats)
transition_matrix = np.zeros((num_teams, num_teams))

# Populate the transition matrix with probabilities based on the correlation of each team's features
for i, row_i in stats.iterrows():
    team_i_id = row_i['TeamID']
    team_i_strength = 0
    for feature in season_features:
        feature_corr = corrs[feature]
        team_i_strength += feature_corr * row_i[f'{feature}_norm']
    
    for j, row_j in stats.iterrows():
        if i != j:
            team_j_strength = 0
            for feature in season_features:
                feature_corr = corrs[feature]
                team_j_strength += feature_corr * row_j[f'{feature}_norm']
            
            # Set transition probability based on team strength (higher strength means higher probability of winning)
            transition_matrix[i, j] = team_j_strength / (team_i_strength + team_j_strength)

# Normalize the matrix so rows sum to 1 (probabilities)
transition_matrix = transition_matrix / transition_matrix.sum(axis=1, keepdims=True)

# Print the transition matrix (for verification)
print("Transition Matrix:")
print(transition_matrix)

# Now simulate the hypothetical matchups
def simulate_matchup(team1_id, team2_id, transition_matrix, teams):
    """ Simulate the probability of team1 winning against team2 """
    team1_index = teams.index(team1_id)
    team2_index = teams.index(team2_id)
    
    # Probability that team1 wins
    prob_team1_wins = transition_matrix[team1_index, team2_index]
    
    # Probability that team2 wins (complement of team1 winning)
    prob_team2_wins = 1 - prob_team1_wins
    
    return prob_team1_wins, prob_team2_wins

# Example list of all teams to simulate matchups
teams = stats['TeamID'].tolist()

# Simulate all possible matchups
matchups_predictions = []

for i in range(num_teams):
    for j in range(i + 1, num_teams):
        team1 = teams[i]
        team2 = teams[j]
        
        prob_team1_wins, prob_team2_wins = simulate_matchup(team1, team2, transition_matrix, teams)
        
        # Save the result as per the required format: ID, Pred
        matchup_id = f"2025_{team1}_{team2}"
        matchups_predictions.append([matchup_id, prob_team1_wins])

# Convert predictions to DataFrame for submission
predictions_df = pd.DataFrame(matchups_predictions, columns=["ID", "Pred"])

# Output predictions (replace this with saving to a file if needed)
print(predictions_df.head())

# Example of saving to CSV
# predictions_df.to_csv("predictions_2025.csv", index=False)


KeyboardInterrupt: 