In [1]:
import pandas as pd
import numpy as np
from itertools import permutations
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [64]:
stats = pd.read_csv("../../submissions/unique_season_stats_and_tourney_wins.csv") 
stats = stats.sample(200)

In [9]:
stats.shape

(200, 20)

In [67]:
stats.columns

Index(['Season', 'TeamID', 'wins', 'losses', 'total_games', 'win_percentage',
       'OrdinalRank', 'Score', 'FGM', 'FGA', 'FTM', 'FTA', 'PF',
       'tournament_wins', 'FG%', 'DR_per_gm', 'OR_per_gm', 'Stl_per_gm',
       'Blk_per_gm', 'FT%'],
      dtype='object')

In [65]:
stats['OrdinalRank'] = stats['OrdinalRank'].max() - stats['OrdinalRank'] + 1

In [69]:
stats[features] = stats[features].fillna(stats[features].mean())

In [71]:
import numpy as np
import pandas as pd

# Sample correlation dictionary (adjusted for better differentiation)
corrs = {
    'win_percentage_norm': 1.0,  # Strongest correlation for overall strength
    'FG%_norm': 0.8,  # Strong correlation for shooting accuracy
    'Blk_per_gm_norm': 0.5,  # Medium correlation for defense
    'DR_per_gm_norm': 0.6,  # Defense rebounds
    'OR_per_gm_norm': 0.6,  # Offensive rebounds
    'Stl_per_gm_norm': 0.4,  # Steals correlation
    'FT%_norm': 0.5,  # Free throw efficiency
    'OrdinalRank': 0.9  # Very strong correlation to rank
}

# List of features to calculate strength for
features = ['win_percentage', 'FG%','DR_per_gm', 'OR_per_gm', 'Stl_per_gm','Blk_per_gm', 'FT%', 'OrdinalRank']

# Adjust the scaling factor to better amplify differences
scaling_factor = 30  # Increased scaling factor for stronger team strength differentiation
print(f"Scaling factor: {scaling_factor}")

# Apply the logistic function with clamping to avoid overflow
max_strength = 50  # Maximum strength cap for the logistic function

# Total number of teams
num_teams = len(stats)  # Assuming 'stats' is your DataFrame with team data

# Mapping from TeamID to index for quick access
team_id_to_index = {team_id: idx for idx, team_id in enumerate(stats['TeamID'])}

# Initialize transition matrix
transition_matrix = np.zeros((num_teams, num_teams))

# Iterate over all teams to calculate their strengths and the transition matrix
for i, row_i in stats.iterrows():
    team_i_id = row_i['TeamID']
    team_i_index = team_id_to_index[team_i_id]
    team_i_strength = 0

    # Calculate strength for team i
    for feature in features:
        if feature in corrs:  # Make sure the feature is in corrs
            feature_corr = corrs[feature] * scaling_factor
            team_i_strength += feature_corr * row_i[feature]
    
    # Print the calculated strength for debugging
    print(f"Team {team_i_id} Strength: {team_i_strength}")

    for j, row_j in stats.iterrows():
        if i != j:  # Skip comparing the team to itself
            team_j_id = row_j['TeamID']
            team_j_index = team_id_to_index[team_j_id]
            team_j_strength = 0

            # Calculate strength for team j
            for feature in features:
                if feature in corrs:  # Make sure the feature is in corrs
                    feature_corr = corrs[feature] * scaling_factor
                    team_j_strength += feature_corr * row_j[feature]
            
            # Print the calculated strength for debugging
            print(f"Team {team_j_id} Strength: {team_j_strength}")
            
            # Calculate total strength and clamp it to avoid overflow
            total_strength = team_i_strength + team_j_strength
            total_strength = np.clip(total_strength, -max_strength, max_strength)  # Clamp the value

            # Print total strength before logistic function for debugging
            print(f"Total Strength (clamped): {total_strength}")

            # Calculate transition probability using the logistic function
            transition_matrix[team_i_index, team_j_index] = 1 / (1 + np.exp(-total_strength))

# Softmax normalization to improve the transition matrix
def softmax(x):
    e_x = np.exp(x - np.max(x, axis=1, keepdims=True))  # Subtract max for numerical stability
    return e_x / e_x.sum(axis=1, keepdims=True)

# Normalize the transition matrix using softmax
transition_matrix = softmax(transition_matrix)

# Display the updated transition matrix
print("Updated Transition Matrix (Softmax normalized):")
print(transition_matrix)

# Generate predictions
# Simulate matchups
matchups_predictions = []

# Iterate through each pair of teams (compare their TeamIDs)
for i in range(num_teams):
    for j in range(i + 1, num_teams):
        # Use .iloc[] for positional access
        team1_id = stats.iloc[i]['TeamID']  # TeamID for team 1
        team2_id = stats.iloc[j]['TeamID']  # TeamID for team 2
        
        # Now use team_id_to_index to get the respective indices in transition_matrix
        team1_index = team_id_to_index[team1_id]
        team2_index = team_id_to_index[team2_id]

        # Simulate the matchup between team1 and team2
        prob_team1_wins = transition_matrix[team1_index, team2_index]
        prob_team2_wins = 1 - prob_team1_wins
        
        # Generate a unique matchup ID (this will now use the TeamIDs)
        matchup_id = f"2025_{team1_id}_{team2_id}"
        matchups_predictions.append([matchup_id, prob_team1_wins])

# Convert predictions to a DataFrame
predictions_df = pd.DataFrame(matchups_predictions, columns=["ID", "Pred"])

# Display the top 5 predictions
print(predictions_df.head())


Scaling factor: 30
Team 1119.0 Strength: 6885.0
Team 1260.0 Strength: 5439.9327731092435
Total Strength (clamped): 50.0
Team 1315.0 Strength: 1350.0
Total Strength (clamped): 50.0
Team 1348.0 Strength: 5439.9327731092435
Total Strength (clamped): 50.0
Team 1148.0 Strength: 5439.9327731092435
Total Strength (clamped): 50.0
Team 1377.0 Strength: 2430.0
Total Strength (clamped): 50.0
Team 1253.0 Strength: 4509.0
Total Strength (clamped): 50.0
Team 1377.0 Strength: 2997.0
Total Strength (clamped): 50.0
Team 1293.0 Strength: 5439.9327731092435
Total Strength (clamped): 50.0
Team 1166.0 Strength: 5439.9327731092435
Total Strength (clamped): 50.0
Team 1237.0 Strength: 5643.0
Total Strength (clamped): 50.0
Team 1409.0 Strength: 7749.0
Total Strength (clamped): 50.0
Team 1418.0 Strength: 4320.0
Total Strength (clamped): 50.0
Team 1245.0 Strength: 5439.9327731092435
Total Strength (clamped): 50.0
Team 1309.0 Strength: 4050.0
Total Strength (clamped): 50.0
Team 1448.0 Strength: 5439.9327731092435

In [68]:
# Check for NaN values in relevant columns
print(stats[features].isnull().sum())


win_percentage     0
FG%               81
DR_per_gm         81
OR_per_gm         81
Stl_per_gm        81
Blk_per_gm        81
FT%               81
OrdinalRank       81
dtype: int64


In [59]:
predictions_df.sample(50)


Unnamed: 0,ID,Pred
17553,2025_1139.0_1390.0,0.00904
3404,2025_1277.0_1299.0,0.006956
10624,2025_1241.0_1349.0,0.001041
17003,2025_1208.0_1355.0,0.010371
520,2025_1256.0_1293.0,0.008614
309,2025_1461.0_1274.0,0.00881
10727,2025_1412.0_1267.0,0.007633
4024,2025_1258.0_1243.0,0.003239
7791,2025_1234.0_1156.0,0.006372
2140,2025_1441.0_1442.0,0.001777


In [43]:
# Assuming 'predictions_df' is the DataFrame with the 'ID' column
# Split the 'ID' column into two new columns 'Team1_ID' and 'Team2_ID'
predictions_df[['Year', 'Team1_ID', 'Team2_ID']] = predictions_df['ID'].str.split('_', expand=True)

# Convert 'Team1_ID' and 'Team2_ID' columns to numeric if necessary
predictions_df['Team1_ID'] = pd.to_numeric(predictions_df['Team1_ID'])
predictions_df['Team2_ID'] = pd.to_numeric(predictions_df['Team2_ID'])

# Display the updated DataFrame
print(predictions_df.head())


               ID      Pred  Year  Team1_ID  Team2_ID
0  2025_1183_1461  0.012510  2025      1183      1461
1  2025_1183_1256  0.001165  2025      1183      1256
2  2025_1183_1425  0.012652  2025      1183      1425
3  2025_1183_1157  0.012651  2025      1183      1157
4  2025_1183_1251  0.010990  2025      1183      1251


In [44]:
teams = pd.read_csv('../../data/Mteams.csv')
teams.head()

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2025
1,1102,Air Force,1985,2025
2,1103,Akron,1985,2025
3,1104,Alabama,1985,2025
4,1105,Alabama A&M,2000,2025


In [47]:
merged = predictions_df.merge(teams, left_on = 'Team1_ID', right_on = 'TeamID', how = 'right')
merged.sample(20)

Unnamed: 0,ID,Pred,Year,Team1_ID,Team2_ID,TeamID,TeamName,FirstD1Season,LastD1Season
451,2025_1114_1337,0.009262,2025.0,1114.0,1337.0,1114,Ark Little Rock,1985,2025
16591,2025_1422_1267,0.008455,2025.0,1422.0,1267.0,1422,UNC Greensboro,1992,2025
5263,2025_1214_1454,0.0017,2025.0,1214.0,1454.0,1214,Hampton,1996,2025
2694,2025_1156_1395,0.009104,2025.0,1156.0,1395.0,1156,Cleveland St,1985,2025
8511,2025_1258_1243,0.001834,2025.0,1258.0,1243.0,1258,Loy Marymount,1985,2025
10106,2025_1290_1103,0.003153,2025.0,1290.0,1103.0,1290,MS Valley St,1985,2025
18555,2025_1442_1247,0.007159,2025.0,1442.0,1247.0,1442,W Illinois,1985,2025
4743,2025_1191_1131,0.003179,2025.0,1191.0,1131.0,1191,Evansville,1985,2025
6727,,,,,,1237,IUPUI,1999,2025
11824,2025_1343_1201,0.009735,2025.0,1343.0,1201.0,1343,Princeton,1985,2025


In [48]:
merged2 = merged.merge(teams, left_on = 'Team2_ID', right_on = 'TeamID', how = 'right')
merged2.sample(20)

Unnamed: 0,ID,Pred,Year,Team1_ID,Team2_ID,TeamID_x,TeamName_x,FirstD1Season_x,LastD1Season_x,TeamID_y,TeamName_y,FirstD1Season_y,LastD1Season_y
16032,,,,,,,,,,1396,Temple,1985,2025
4628,2025_1132_1210,0.001994,2025.0,1132.0,1210.0,1132.0,Bowling Green,1985.0,2025.0,1210,Georgia Tech,1985,2025
1504,2025_1221_1136,0.010294,2025.0,1221.0,1136.0,1221.0,Holy Cross,1985.0,2025.0,1136,Bryant,2009,2025
6443,2025_1148_1239,0.007498,2025.0,1148.0,1239.0,1148.0,Central Conn,1987.0,2025.0,1239,Jacksonville,1985,2025
17282,2025_1344_1412,0.006806,2025.0,1344.0,1412.0,1344.0,Providence,1985.0,2025.0,1412,UAB,1985,2025
11886,2025_1407_1328,0.010276,2025.0,1407.0,1328.0,1407.0,Troy,1994.0,2025.0,1328,Oklahoma,1985,2025
3207,2025_1149_1171,0.001548,2025.0,1149.0,1171.0,1149.0,Charleston So,1985.0,2025.0,1171,Dartmouth,1985,2025
13493,2025_1116_1349,0.000803,2025.0,1116.0,1349.0,1116.0,Arkansas,1985.0,2025.0,1349,Rice,1985,2025
2969,2025_1229_1167,0.002209,2025.0,1229.0,1167.0,1229.0,Illinois St,1985.0,2025.0,1167,CS Bakersfield,2008,2025
14405,2025_1156_1360,0.008779,2025.0,1156.0,1360.0,1156.0,Cleveland St,1985.0,2025.0,1360,San Diego,1985,2025


In [42]:
predictions_df.sample(50)

Unnamed: 0,ID,Pred
6930,2025_1462_1258,0.006988
4289,2025_1398_1459,0.004395
12876,2025_1274_1218,0.004343
1991,2025_1156_1191,0.000829
18176,2025_1335_1355,0.02762
5103,2025_1441_1337,0.002821
18084,2025_1269_1168,0.00442
15229,2025_1398_1224,0.004395
16184,2025_1334_1269,0.020277
15984,2025_1182_1274,0.006898


In [None]:
predictions_df['WTeamID'] = predictions_df['ID'].apply(lambda x: int(x.split('_')[1]))

In [34]:
results = pd.read_csv('../../data/MRegularSeasonCompactResults.csv')  
results.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


In [39]:
# Identify missing team IDs
missing_teams = []
for team_id in results['WTeamID']:
    if team_id not in team_id_to_index:
        missing_teams.append(team_id)

for team_id in results['LTeamID']:
    if team_id not in team_id_to_index:
        missing_teams.append(team_id)

# Print missing team IDs
if missing_teams:
    print("Missing team IDs:", set(missing_teams))
else:
    print("No missing team IDs")


Missing team IDs: {1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1115, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1133, 1134, 1137, 1140, 1141, 1142, 1145, 1146, 1147, 1151, 1153, 1154, 1158, 1159, 1160, 1162, 1163, 1164, 1165, 1169, 1170, 1172, 1173, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1184, 1186, 1188, 1189, 1190, 1192, 1193, 1195, 1196, 1197, 1198, 1202, 1203, 1204, 1205, 1206, 1207, 1209, 1211, 1212, 1213, 1215, 1216, 1217, 1219, 1222, 1223, 1225, 1227, 1228, 1230, 1231, 1232, 1233, 1235, 1236, 1237, 1240, 1242, 1244, 1245, 1246, 1249, 1250, 1252, 1253, 1254, 1257, 1259, 1260, 1262, 1263, 1264, 1268, 1270, 1271, 1275, 1276, 1278, 1279, 1280, 1283, 1284, 1285, 1286, 1287, 1289, 1291, 1292, 1294, 1295, 1296, 1297, 1298, 1300, 1303, 1305, 1307, 1308, 1309, 1312, 1314, 1315, 1316, 1318, 1319, 1321, 1323, 1324, 1325, 1326, 1327, 1329, 1330, 1331, 1332, 1336, 1338, 1340, 1341, 1342, 1346, 1350, 1352, 1353, 1357, 1358, 1359, 1361, 1364, 1365, 136

In [41]:
# Create a list to store the predictions and actual results
prediction_comparison = []

# Loop through the historical matchups
for _, row in results.iterrows():
    wteam_id = row['WTeamID']
    lteam_id = row['LTeamID']
    
    # Skip if either team is missing in the team_id_to_index
    if wteam_id not in team_id_to_index or lteam_id not in team_id_to_index:
        continue
    
    # Get the team indexes using team_id_to_index
    wteam_index = team_id_to_index[wteam_id]
    lteam_index = team_id_to_index[lteam_id]
    
    # Get the transition probabilities from the transition matrix
    prob_wteam_wins = transition_matrix[wteam_index, lteam_index]
    prob_lteam_wins = 1 - prob_wteam_wins
    
    # The actual result is a binary outcome: 1 for the winning team (WTeamID) and 0 for the losing team (LTeamID)
    actual_result = 1  # since WTeamID won, the result is 1
    
    # Append to comparison list
    prediction_comparison.append({
        'WTeamID': wteam_id,
        'LTeamID': lteam_id,
        'Predicted_Prob_Win': prob_wteam_wins,
        'Actual_Result': actual_result
    })

# Convert to a DataFrame for easy analysis
comparison_df = pd.DataFrame(prediction_comparison)

# Show the first few rows
print(comparison_df.head())

# Calculate accuracy (example metric)
accuracy = (comparison_df['Predicted_Prob_Win'] > 0.5) == comparison_df['Actual_Result']
accuracy_rate = accuracy.mean()
print(f'Accuracy: {accuracy_rate:.4f}')


   WTeamID  LTeamID  Predicted_Prob_Win  Actual_Result
0     1218     1337            0.009581              1
1     1344     1438            0.006806              1
2     1135     1306            0.002133              1
3     1210     1149            0.003787              1
4     1234     1114            0.002652              1
Accuracy: 0.0000
