In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import warnings

In [5]:
# Load data
games = pd.read_csv("data/WRegularSeasonCompactResults.csv")  
seeds = pd.read_csv("data/WNCAATourneySeeds.csv")
teams = pd.read_csv("data/WTeams.csv")
game_details = pd.read_csv("data/WRegularSeasonDetailedResults.csv")
seasons = pd.read_csv("data/WSeasons.csv")

In [11]:
# Define seasons for training and prediction
TRAIN_SEASONS = [2022, 2023, 2024]  
PREDICT_SEASON = 2025

# Get current day number from the latest game in 2025
CURRENT_DAY = games[games['Season'] == PREDICT_SEASON]['DayNum'].max()

# Create region mapping dictionary for each season
region_maps = {}
for season in TRAIN_SEASONS + [PREDICT_SEASON]:
    region_maps[season] = {
        'W': seasons[seasons['Season'] == season]['RegionW'].iloc[0],
        'X': seasons[seasons['Season'] == season]['RegionX'].iloc[0],
        'Y': seasons[seasons['Season'] == season]['RegionY'].iloc[0],
        'Z': seasons[seasons['Season'] == season]['RegionZ'].iloc[0]
    }

# Process seeds with region information for training seasons
train_seeds_processed = pd.DataFrame()
for season in TRAIN_SEASONS:
    season_seeds = seeds[seeds['Season'] == season].copy()
    season_seeds['SeedValue'] = season_seeds['Seed'].str.extract('(\\d+)').astype(float)
    season_seeds['Region'] = season_seeds['Seed'].str[0]
    season_seeds['RegionName'] = season_seeds['Region'].map(region_maps[season])
    # Ensure TeamID is kept as its own column
    season_seeds = season_seeds[['Season', 'TeamID', 'SeedValue', 'Region', 'RegionName']]
    train_seeds_processed = pd.concat([train_seeds_processed, season_seeds])

# Process seeds with region information for prediction season
predict_seeds_processed = seeds[seeds['Season'] == PREDICT_SEASON].copy()
predict_seeds_processed['SeedValue'] = predict_seeds_processed['Seed'].str.extract('(\\d+)').astype(float)
predict_seeds_processed['Region'] = predict_seeds_processed['Seed'].str[0]
predict_seeds_processed['RegionName'] = predict_seeds_processed['Region'].map(region_maps[PREDICT_SEASON])
# Ensure TeamID is kept as its own column
predict_seeds_processed = predict_seeds_processed[['TeamID', 'SeedValue', 'Region', 'RegionName']]

# Print verification
print("\nTraining seeds sample:")
print(train_seeds_processed.head())
print("\nPrediction seeds sample:")
print(predict_seeds_processed.head())


Training seeds sample:
      Season  TeamID  SeedValue Region RegionName
1608    2024    3376        1.0      W    Albany1
1609    2024    3323        2.0      W    Albany1
1610    2024    3333        3.0      W    Albany1
1611    2024    3231        4.0      W    Albany1
1612    2024    3328        5.0      W    Albany1

Prediction seeds sample:
Empty DataFrame
Columns: [TeamID, SeedValue, Region, RegionName]
Index: []


In [12]:
# Verify the filtering worked
print("Number of games for Train:", len(games))
print("Number of seeds for Predict:", len(seeds))

Number of games for Train: 137028
Number of seeds for Predict: 68


In [13]:
# Function to get last n games' win rate for a specific season
def get_last_n_games_win_rate(team_id, season, n=10):
    season_games = games[games["Season"] == season]
    recent_games = season_games[
        ((season_games["WTeamID"] == team_id) | (season_games["LTeamID"] == team_id)) & 
        (season_games["DayNum"] <= CURRENT_DAY)
    ]
    recent_games = recent_games.tail(n)
    wins = (recent_games["WTeamID"] == team_id).sum()
    return wins / n if len(recent_games) > 0 else 0.5  # Default to 50% if no games

# Function to calculate away and neutral win rates
def calculate_location_win_rates(team_id, season):
    team_games = games[
        (games['Season'] == season) &
        (games['DayNum'] <= CURRENT_DAY) &
        ((games['WTeamID'] == team_id) | (games['LTeamID'] == team_id))
    ]
    
    away_wins = away_games = neutral_wins = neutral_games = 0
    
    for _, game in team_games.iterrows():
        if game['WLoc'] == 'A':  # Away game
            away_games += 1
            if game['WTeamID'] == team_id:
                away_wins += 1
        elif game['WLoc'] == 'N':  # Neutral game
            neutral_games += 1
            if game['WTeamID'] == team_id:
                neutral_wins += 1
    
    return pd.Series({
        'AwayWinRate': away_wins / away_games if away_games > 0 else 0.5,
        'NeutralWinRate': neutral_wins / neutral_games if neutral_games > 0 else 0.5
    })

# Process each training season separately
train_data_list = []
for season in TRAIN_SEASONS:
    # Get teams for this season
    season_teams = pd.DataFrame({'TeamID': games[games['Season'] == season]['WTeamID'].unique()})
    
    # Add Last10WinRate
    season_teams['Last10WinRate'] = season_teams['TeamID'].apply(
        lambda team: get_last_n_games_win_rate(team, season)
    )
    
    # Add Away and Neutral win rates
    location_rates = pd.DataFrame([
        calculate_location_win_rates(team_id, season)
        for team_id in season_teams['TeamID']
    ])
    location_rates['TeamID'] = season_teams['TeamID']
    season_teams = season_teams.merge(location_rates, on='TeamID', how='left')
    
    # Add season column
    season_teams['Season'] = season
    
    train_data_list.append(season_teams)

# Combine all training seasons
train_teams = pd.concat(train_data_list, ignore_index=True)

# Add seed information
train_teams = train_teams.merge(
    train_seeds_processed[['Season', 'TeamID', 'SeedValue', 'RegionName']],
    on=['Season', 'TeamID'],
    how='left'
)

# Process prediction data similarly
predict_teams = pd.DataFrame({'TeamID': games[games['Season'] == PREDICT_SEASON]['WTeamID'].unique()})

# Add Last10WinRate
predict_teams['Last10WinRate'] = predict_teams['TeamID'].apply(
    lambda team: get_last_n_games_win_rate(team, PREDICT_SEASON)
)

# Add Away and Neutral win rates
predict_location_rates = pd.DataFrame([
    calculate_location_win_rates(team_id, PREDICT_SEASON)
    for team_id in predict_teams['TeamID']
])
predict_location_rates['TeamID'] = predict_teams['TeamID']
predict_teams = predict_teams.merge(predict_location_rates, on='TeamID', how='left')

# Add seed information
predict_teams = predict_teams.merge(
    predict_seeds_processed[['TeamID', 'SeedValue', 'RegionName']],
    on='TeamID',
    how='left'
)

print("Training data summary:")
print(f"\nShape: {train_teams.shape}")
print("\nSample of training data:")
print(train_teams.head())
print("\nMissing values in training data:")
print(train_teams.isnull().sum())

print("\nPrediction data summary:")
print(f"\nShape: {predict_teams.shape}")
print("\nSample of prediction data:")
print(predict_teams.head())
print("\nMissing values in prediction data:")
print(predict_teams.isnull().sum())

Training data summary:

Shape: (1072, 7)

Sample of training data:
   TeamID  Last10WinRate  AwayWinRate  NeutralWinRate  Season  SeedValue  \
0    3102            0.7     0.600000            1.00    2022        NaN   
1    3104            0.6     0.400000            0.50    2022        NaN   
2    3112            0.5     0.800000            0.80    2022        NaN   
3    3113            0.3     0.200000            0.25    2022        NaN   
4    3123            0.6     0.615385            0.50    2022        NaN   

  RegionName  
0        NaN  
1        NaN  
2        NaN  
3        NaN  
4        NaN  

Missing values in training data:
TeamID               0
Last10WinRate        0
AwayWinRate          0
NeutralWinRate       0
Season               0
SeedValue         1004
RegionName        1004
dtype: int64

Prediction data summary:

Shape: (362, 6)

Sample of prediction data:
   TeamID  Last10WinRate  AwayWinRate  NeutralWinRate  SeedValue RegionName
0    3103            0.1     0.

In [15]:
# Create training data from games before CURRENT_DAY for all training seasons
train_games_before = games[
    (games['Season'].isin(TRAIN_SEASONS)) & 
    (games['DayNum'] <= CURRENT_DAY)
].copy()

# Create function to format game ID string
def create_game_id(row):
    team1_id = min(row['WTeamID'], row['LTeamID'])
    team2_id = max(row['WTeamID'], row['LTeamID'])
    return f"{row['Season']:04d}_{team1_id:04d}_{team2_id:04d}"

# Add formatted game ID
train_games_before['GameID'] = train_games_before.apply(create_game_id, axis=1)

# Create new DataFrame with lower TeamID
train_games_formatted = pd.DataFrame()
train_games_formatted['GameID'] = train_games_before['GameID']
train_games_formatted['Season'] = train_games_before['Season']
train_games_formatted['TeamID'] = train_games_before.apply(lambda x: min(x['WTeamID'], x['LTeamID']), axis=1)
train_games_formatted['OpponentID'] = train_games_before.apply(lambda x: max(x['WTeamID'], x['LTeamID']), axis=1)
train_games_formatted['Outcome'] = train_games_before.apply(
    lambda x: 1 if x['WTeamID'] == min(x['WTeamID'], x['LTeamID']) else 0, 
    axis=1
)
train_games_formatted['Points_Diff'] = train_games_before.apply(
    lambda x: x['WScore'] - x['LScore'] if x['WTeamID'] == min(x['WTeamID'], x['LTeamID']) 
    else x['LScore'] - x['WScore'],
    axis=1
)

# Add win rates and seed info for both teams
for season in TRAIN_SEASONS:
    season_mask = train_games_formatted['Season'] == season
    
    # Add win rates for TeamID
    train_games_formatted.loc[season_mask, 'Last10WinRate'] = train_games_formatted[season_mask]['TeamID'].apply(
        lambda x: get_last_n_games_win_rate(x, season)
    )
    
    # Add location win rates for TeamID
    location_rates = pd.DataFrame([
        calculate_location_win_rates(team_id, season)
        for team_id in train_games_formatted[season_mask]['TeamID']
    ])
    train_games_formatted.loc[season_mask, 'AwayWinRate'] = location_rates['AwayWinRate'].values
    train_games_formatted.loc[season_mask, 'NeutralWinRate'] = location_rates['NeutralWinRate'].values
    
    # Add win rates for OpponentID
    train_games_formatted.loc[season_mask, 'Last10WinRate_Opponent'] = train_games_formatted[season_mask]['OpponentID'].apply(
        lambda x: get_last_n_games_win_rate(x, season)
    )
    
    # Add location win rates for OpponentID
    opponent_location_rates = pd.DataFrame([
        calculate_location_win_rates(team_id, season)
        for team_id in train_games_formatted[season_mask]['OpponentID']
    ])
    train_games_formatted.loc[season_mask, 'AwayWinRate_Opponent'] = opponent_location_rates['AwayWinRate'].values
    train_games_formatted.loc[season_mask, 'NeutralWinRate_Opponent'] = opponent_location_rates['NeutralWinRate'].values

# Merge with seeds for both teams
train_data = train_games_formatted.merge(
    train_seeds_processed[['Season', 'TeamID', 'SeedValue']],
    on=['Season', 'TeamID'],
    how='left'
)

train_data = train_data.merge(
    train_seeds_processed[['Season', 'TeamID', 'SeedValue']],
    left_on=['Season', 'OpponentID'],
    right_on=['Season', 'TeamID'],
    how='left',
    suffixes=('', '_Opponent')
).drop('TeamID_Opponent', axis=1)

# Calculate differences
diff_columns = ['SeedValue', 'Last10WinRate', 'AwayWinRate', 'NeutralWinRate']
for col in diff_columns:
    diff_col_name = f'{col}Diff'
    train_data[diff_col_name] = train_data[col] - train_data[f'{col}_Opponent']

print("Training data summary:")
print(f"\nShape: {train_data.shape}")
print("\nColumns:", train_data.columns.tolist())
print("\nSample data:")
print(train_data.head())
print("\nMissing values:")
print(train_data.isnull().sum())

Training data summary:

Shape: (15848, 18)

Columns: ['GameID', 'Season', 'TeamID', 'OpponentID', 'Outcome', 'Points_Diff', 'Last10WinRate', 'AwayWinRate', 'NeutralWinRate', 'Last10WinRate_Opponent', 'AwayWinRate_Opponent', 'NeutralWinRate_Opponent', 'SeedValue', 'SeedValue_Opponent', 'SeedValueDiff', 'Last10WinRateDiff', 'AwayWinRateDiff', 'NeutralWinRateDiff']

Sample data:
           GameID  Season  TeamID  OpponentID  Outcome  Points_Diff  \
0  2022_3102_3176    2022    3102        3176        1           17   
1  2022_3104_3149    2022    3104        3149        1           77   
2  2022_3112_3169    2022    3112        3169        1           43   
3  2022_3113_3294    2022    3113        3294        1           30   
4  2022_3123_3454    2022    3123        3454        1            9   

   Last10WinRate  AwayWinRate  NeutralWinRate  Last10WinRate_Opponent  \
0            0.7     0.600000            1.00                     0.4   
1            0.6     0.400000            0.50   

In [16]:
# Import required libraries
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
import numpy as np

# Define feature columns
feature_cols = [col for col in train_data.columns 
                if col not in ['GameID', 'Season', 'TeamID', 'OpponentID', 'Outcome', 'RegionName']]

# Create X (features) and y (target)
X = train_data[feature_cols]
y = train_data['Outcome']

# First, handle missing values with SimpleImputer
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
X_imputed = pd.DataFrame(X_imputed, columns=X.columns)

# Then scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Create and train the logistic regression model
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_scaled, y)

# Get cross-validation scores
cv_scores = cross_val_score(lr_model, X_scaled, y, cv=5, scoring='roc_auc')

# Print model performance metrics
print("Model Performance Summary:")
print(f"\nCross-validation ROC-AUC scores:")
print(f"Mean: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
print(f"Individual scores: {cv_scores}")

# Print feature importance
feature_importance = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': np.abs(lr_model.coef_[0])
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

print("\nFeatures used in model:")
print(feature_cols)

print("\nFeature Importance Rankings:")
print(feature_importance)

# Print missing value summary before imputation
print("\nMissing values before imputation:")
print(X.isnull().sum()[X.isnull().sum() > 0])

# Save important model components for later use
model_components = {
    'model': lr_model,
    'scaler': scaler,
    'imputer': imputer,
    'feature_cols': feature_cols,
    'feature_importance': feature_importance
}

Model Performance Summary:

Cross-validation ROC-AUC scores:
Mean: 1.000 (+/- 0.000)
Individual scores: [1.         1.         1.         1.         0.99999801]

Features used in model:
['Points_Diff', 'Last10WinRate', 'AwayWinRate', 'NeutralWinRate', 'Last10WinRate_Opponent', 'AwayWinRate_Opponent', 'NeutralWinRate_Opponent', 'SeedValue', 'SeedValue_Opponent', 'SeedValueDiff', 'Last10WinRateDiff', 'AwayWinRateDiff', 'NeutralWinRateDiff']

Feature Importance Rankings:
                    Feature  Importance
0               Points_Diff   19.892380
11          AwayWinRateDiff    0.121229
4    Last10WinRate_Opponent    0.091739
2               AwayWinRate    0.087843
5      AwayWinRate_Opponent    0.085250
3            NeutralWinRate    0.063206
10        Last10WinRateDiff    0.046870
12       NeutralWinRateDiff    0.038626
1             Last10WinRate    0.026544
9             SeedValueDiff    0.019844
8        SeedValue_Opponent    0.013512
6   NeutralWinRate_Opponent    0.008869
7      

In [18]:
# Get tournament teams (teams with seeds in predict_seeds_processed)
# Since we don't have 2025 seeds yet, let's use all teams from 2024 seeds as a placeholder
tourney_teams = train_seeds_processed[train_seeds_processed['Season'] == 2024]['TeamID'].unique()

# Create all possible matchups between tournament teams
predict_games = []
for team1 in tourney_teams:
    for team2 in tourney_teams:
        if team1 < team2:  # Ensure each matchup is only added once
            matchup = {
                'ID': f"{PREDICT_SEASON}_{team1:04d}_{team2:04d}",
                'TeamID': team1,
                'OpponentID': team2,
                'Points_Diff': 0  # Add Points_Diff with 0 to match feature_cols
            }
            predict_games.append(matchup)

# Create prediction DataFrame
predict_games_formatted = pd.DataFrame(predict_games)

# Add win rates for both teams
predict_games_formatted['Last10WinRate'] = predict_games_formatted['TeamID'].apply(
    lambda x: get_last_n_games_win_rate(x, PREDICT_SEASON)
)

predict_games_formatted['Last10WinRate_Opponent'] = predict_games_formatted['OpponentID'].apply(
    lambda x: get_last_n_games_win_rate(x, PREDICT_SEASON)
)

# Add location win rates for both teams
location_rates = pd.DataFrame([
    calculate_location_win_rates(team_id, PREDICT_SEASON)
    for team_id in predict_games_formatted['TeamID']
])
predict_games_formatted['AwayWinRate'] = location_rates['AwayWinRate'].values
predict_games_formatted['NeutralWinRate'] = location_rates['NeutralWinRate'].values

opponent_location_rates = pd.DataFrame([
    calculate_location_win_rates(team_id, PREDICT_SEASON)
    for team_id in predict_games_formatted['OpponentID']
])
predict_games_formatted['AwayWinRate_Opponent'] = opponent_location_rates['AwayWinRate'].values
predict_games_formatted['NeutralWinRate_Opponent'] = opponent_location_rates['NeutralWinRate'].values

# Add seed values (will be NaN since we don't have 2025 seeds yet)
predict_games_formatted['SeedValue'] = np.nan
predict_games_formatted['SeedValue_Opponent'] = np.nan
predict_games_formatted['SeedValueDiff'] = np.nan

# Calculate differences for win rates
for col in ['Last10WinRate', 'AwayWinRate', 'NeutralWinRate']:
    diff_col_name = f'{col}Diff'
    predict_games_formatted[diff_col_name] = predict_games_formatted[col] - predict_games_formatted[f'{col}_Opponent']

# Ensure columns match training data exactly
X_pred = predict_games_formatted[feature_cols].copy()

# Apply the same preprocessing steps as in training
X_pred_imputed = imputer.transform(X_pred)
X_pred_scaled = scaler.transform(X_pred_imputed)

# Get predictions
predict_games_formatted['Pred'] = lr_model.predict_proba(X_pred_scaled)[:, 1]

# Create submission DataFrame
submission = pd.DataFrame({
    'ID': predict_games_formatted['ID'],
    'Pred': predict_games_formatted['Pred']
})

# Print verification
print(f"Total tournament teams: {len(tourney_teams)}")
print(f"Total possible matchups: {len(submission)}")
print("\nSample of predictions:")
print(submission.head(10))

print("\nPrediction distribution:")
print(submission['Pred'].describe())

# Save submission file
submission.to_csv('WomensSubmissionStage1.csv', index=False)
print("\nSubmission saved to 'WomensSubmissionStage1.csv'")

Total tournament teams: 68
Total possible matchups: 2278

Sample of predictions:
               ID      Pred
0  2025_3376_3401  0.739412
1  2025_3376_3400  0.468814
2  2025_3376_3390  0.630762
3  2025_3376_3428  0.539284
4  2025_3376_3397  0.582284
5  2025_3376_3453  0.483670
6  2025_3376_3414  0.549528
7  2025_3376_3417  0.494655
8  2025_3376_3452  0.497287
9  2025_3376_3424  0.520721

Prediction distribution:
count    2278.000000
mean        0.493997
std         0.084028
min         0.283896
25%         0.431901
50%         0.486231
75%         0.554021
max         0.746864
Name: Pred, dtype: float64

Submission saved to 'WomensSubmissionStage1.csv'




In [19]:
# Read both submission files
mens_submission = pd.read_csv('MensSubmissionStage1.csv')
womens_submission = pd.read_csv('WomensSubmissionStage1.csv')

# Combine the submissions
combined_submission = pd.concat([mens_submission, womens_submission], ignore_index=True)

# Print verification
print("Submissions summary:")
print(f"Men's predictions: {len(mens_submission)}")
print(f"Women's predictions: {len(womens_submission)}")
print(f"Combined predictions: {len(combined_submission)}")

print("\nSample of combined predictions:")
print(combined_submission.head(10))

print("\nPrediction distribution:")
print(combined_submission['Pred'].describe())

# Save combined submission file
combined_submission.to_csv('SubmissionStage1.csv', index=False)
print("\nCombined submission saved to 'SubmissionStage1.csv'")

Submissions summary:
Men's predictions: 2278
Women's predictions: 2278
Combined predictions: 4556

Sample of combined predictions:
               ID      Pred
0  2025_1103_1104  0.927970
1  2025_1103_1106  0.761448
2  2025_1103_1110  0.735936
3  2025_1103_1112  0.979956
4  2025_1103_1116  0.921475
5  2025_1103_1120  0.907974
6  2025_1103_1124  0.943842
7  2025_1103_1136  0.911280
8  2025_1103_1140  0.909888
9  2025_1103_1155  0.841687

Prediction distribution:
count    4556.000000
mean        0.576753
std         0.187001
min         0.061411
25%         0.444661
50%         0.538898
75%         0.714800
max         0.979956
Name: Pred, dtype: float64

Combined submission saved to 'SubmissionStage1.csv'
