In [None]:
import pandas as pd
import numpy as np
import random

In [None]:
# Load the processed team-season feature table and preview its first four rows.
# Later cells join this table on ('Season', 'TeamID'), so both columns must be present.
team_seasons = pd.read_csv('./data/processed/team_season_features.csv')
print(team_seasons.head(4))

In [None]:
# Load the raw tournament results. Later cells read the columns
# Season, DayNum, WTeamID, LTeamID, WScore, LScore, WLoc and NumOT from this frame.
previous_tourney_matchups = pd.read_csv('./data/raw/mens/MNCAATourneyCompactResults.csv')

In [None]:
# Preview the team-season features again (same output as the preview in the load cell).
print(team_seasons.head(4))

In [None]:
# Step 1: Filter tournament games from 2003 onwards.
# .copy() detaches the filtered rows from previous_tourney_matchups so that later
# modifications of tourney_games cannot affect (or warn about) the raw frame.
tourney_games = previous_tourney_matchups[previous_tourney_matchups['Season'] >= 2003].copy()
print(f"Number of tournament games from 2003 onwards: {len(tourney_games)}")

# Step 2: Randomize team assignment
def randomize_teams(df, seed=None):
    """
    Randomly assign each game's winner and loser to the TeamA / TeamB slots.

    The input stores every game as winner-vs-loser. A fair coin flip per game
    decides whether the winner becomes TeamA (``TeamA_Won == 1``) or TeamB
    (``TeamA_Won == 0``), so the target is not constant.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Season, DayNum, WTeamID, LTeamID, WScore,
        LScore, WLoc and NumOT. It is not modified.
    seed : int or None, optional
        When given, the coin flips come from a private ``random.Random(seed)``,
        so the same seed always yields the same assignment and the global
        generator is left untouched. When None (the default) the module-level
        ``random`` generator is used, one draw per row in row order.

    Returns
    -------
    pandas.DataFrame
        A new frame, indexed like ``df``, with the columns Season, DayNum,
        TeamAID, TeamBID, TeamAScore, TeamBScore, TeamA_Won, WLoc, NumOT.

    Notes
    -----
    WLoc is carried over unchanged.
    NOTE(review): WLoc presumably describes the *winner's* location; it is not
    re-oriented to TeamA here - confirm before using it as a model feature.
    """
    coin = random.Random(seed) if seed is not None else random

    # Exactly one draw per game, in row order.
    winner_is_a = np.array([coin.random() < 0.5 for _ in range(len(df))], dtype=bool)

    winner_ids = df['WTeamID'].to_numpy()
    loser_ids = df['LTeamID'].to_numpy()
    winner_scores = df['WScore'].to_numpy()
    loser_scores = df['LScore'].to_numpy()

    result_df = df.copy()
    # Arrays are assigned positionally, so a non-default index on df is fine.
    result_df['TeamAID'] = np.where(winner_is_a, winner_ids, loser_ids)
    result_df['TeamBID'] = np.where(winner_is_a, loser_ids, winner_ids)
    result_df['TeamAScore'] = np.where(winner_is_a, winner_scores, loser_scores)
    result_df['TeamBScore'] = np.where(winner_is_a, loser_scores, winner_scores)
    result_df['TeamA_Won'] = winner_is_a.astype('int64')

    # Keep only necessary columns
    columns_to_keep = ['Season', 'DayNum', 'TeamAID', 'TeamBID',
                       'TeamAScore', 'TeamBScore', 'TeamA_Won', 'WLoc', 'NumOT']
    return result_df[columns_to_keep]

# Apply the randomization.
# Seed the module-level generator first: the coin flips are drawn from `random`,
# so without a fixed seed every kernel restart produces a different TeamA/TeamB
# assignment (and a different TeamA_Won column) for the very same games.
RANDOMIZATION_SEED = 42
random.seed(RANDOMIZATION_SEED)
randomized_games = randomize_teams(tourney_games)
print("\nFirst few rows of randomized games:")
print(randomized_games.head())

In [None]:
# Step 3: Merge team season statistics for both teams
def add_team_features(games_df, stats_df):
    """
    Attach season statistics for TeamA and TeamB to every game row.

    ``stats_df`` is left-joined twice, first on (Season, TeamAID) and then on
    (Season, TeamBID), each time against its (Season, TeamID) columns. Columns
    brought in by a join are prefixed with ``TeamA_`` / ``TeamB_``. A stats
    column whose name clashes with a column already present receives the
    ``_drop`` suffix from the merge and is discarded.

    Note that the join key of the first merge survives as ``TeamA_TeamID``,
    while the key of the second merge is removed, so the result has no
    ``TeamB_TeamID`` column.
    """
    original_cols = games_df.columns
    merge_suffixes = ('', '_drop')

    def is_collision(name):
        # Columns the merge had to disambiguate are marked with this suffix.
        return name.endswith('_drop')

    # ---- TeamA side ----
    with_a = games_df.merge(
        stats_df,
        how='left',
        left_on=['Season', 'TeamAID'],
        right_on=['Season', 'TeamID'],
        suffixes=merge_suffixes,
    )
    a_renames = {
        name: f'TeamA_{name}'
        for name in with_a.columns
        if not (name in original_cols or is_collision(name))
    }
    with_a = with_a.rename(columns=a_renames)
    with_a = with_a.drop(columns=[name for name in with_a.columns if is_collision(name)])

    # ---- TeamB side ----
    with_both = with_a.merge(
        stats_df,
        how='left',
        left_on=['Season', 'TeamBID'],
        right_on=['Season', 'TeamID'],
        suffixes=merge_suffixes,
    )
    b_renames = {}
    for name in with_both.columns:
        if name in original_cols or is_collision(name):
            continue
        # Leave TeamA's already-prefixed columns and the fresh join key alone.
        if name.startswith('TeamA_') or name == 'TeamID':
            continue
        b_renames[name] = f'TeamB_{name}'
    with_both = with_both.rename(columns=b_renames)

    leftovers = [name for name in with_both.columns
                 if is_collision(name) or name == 'TeamID']
    return with_both.drop(columns=leftovers)

# Apply the team feature merging: one row per game with TeamA_* and TeamB_* statistics.
# Both joins are left joins, so a team without a matching (Season, TeamID) row in
# team_seasons keeps its game row but gets NaN in that team's feature columns.
final_dataset = add_team_features(randomized_games, team_seasons)

In [None]:
# Display the shape and a sample of the final dataset
print(f"\nFinal dataset shape: {final_dataset.shape}")
print("\nSample of final dataset columns:")
# First ten and last ten column names; the two slices overlap when there are fewer than 20 columns.
print(final_dataset.columns[:10].tolist() + ['...'] + final_dataset.columns[-10:].tolist())
print("\nFirst 3 rows of the final dataset (first few columns):")
# Only the game identifiers, the target and the scores, so the preview stays narrow.
print(final_dataset[['Season', 'DayNum', 'TeamAID', 'TeamBID', 'TeamA_Won', 'TeamAScore', 'TeamBScore']].head(3))

In [None]:
# Preview the first three rows with every column (plain-text output; wide frames get truncated).
print(final_dataset.head(3))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Identify winning team features for each game
def get_winning_team_features(dataset):
    """
    Build one row per game holding the features of the team that won.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain 'Season', 'TeamA_Won' and the feature columns prefixed
        with 'TeamA_' / 'TeamB_'.

    Returns
    -------
    pandas.DataFrame
        One row per game. Feature columns carry the un-prefixed feature name;
        'Season' is copied from the game and 'GameID' is the row's index label
        in ``dataset``. A feature that exists for only one side is NaN in the
        rows where the other side won.
    """
    a_prefix, b_prefix = 'TeamA_', 'TeamB_'

    # The column -> feature-name mappings do not depend on the row, so build them once.
    # Only the leading prefix is stripped; a later occurrence of the prefix text
    # inside a feature name is left intact so both sides map to the same name.
    team_a_map = {col: col[len(a_prefix):] for col in dataset.columns
                  if col.startswith(a_prefix) and col != 'TeamA_Won'}
    team_b_map = {col: col[len(b_prefix):] for col in dataset.columns
                  if col.startswith(b_prefix)}

    winning_features = []
    for idx, row in dataset.iterrows():
        # Pick the mapping of whichever side won this game.
        feature_map = team_a_map if row['TeamA_Won'] == 1 else team_b_map
        winning_data = {feature_name: row[col] for col, feature_name in feature_map.items()}

        # Add season and game ID for reference
        winning_data['Season'] = row['Season']
        winning_data['GameID'] = idx

        winning_features.append(winning_data)

    return pd.DataFrame(winning_features)

# Get winning team features
winning_teams_df = get_winning_team_features(final_dataset)

# Display basic statistics for the winning teams
print(f"Shape of winning teams dataset: {winning_teams_df.shape}")
print("\nSummary statistics for winning teams:")
print(winning_teams_df.describe().T[['mean', 'std', 'min', 'max']].head(10))

# Select numerical features for correlation analysis.
# Identifiers are numeric as well, but correlating them is meaningless, so exclude them.
numerical_features = winning_teams_df.select_dtypes(include=['float64', 'int64']).columns.tolist()
numerical_features = [col for col in numerical_features if col not in ['Season', 'GameID', 'TeamID']]

# Calculate correlation matrix
correlation_matrix = winning_teams_df[numerical_features].corr()

# Create a more readable subset by masking weak correlations.
# The mask is computed on |r| but the *signed* value is kept: the heatmap uses a
# diverging colormap centred at 0, so taking abs() first would paint strong
# negative correlations as if they were positive.
threshold = 0.3
filtered_corr = correlation_matrix.where(correlation_matrix.abs() > threshold)

# Create heatmap figure
plt.figure(figsize=(20, 16))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', center=0,
            xticklabels=True, yticklabels=True)
plt.title('Correlation Heatmap for Winning Teams\' Features', fontsize=16)
plt.xticks(rotation=90, fontsize=8)
plt.yticks(fontsize=8)
plt.tight_layout()
plt.show()

# Create a filtered heatmap for stronger correlations
plt.figure(figsize=(20, 16))
sns.heatmap(filtered_corr, annot=False, cmap='coolwarm', center=0,
            xticklabels=True, yticklabels=True)
# The title is derived from `threshold` so it cannot drift from the mask above.
plt.title(f"Filtered Correlation Heatmap (|r| > {threshold}) for Winning Teams' Features", fontsize=16)
plt.xticks(rotation=90, fontsize=8)
plt.yticks(fontsize=8)
plt.tight_layout()
plt.show()

# Let's also identify the top pairs of highly correlated features.
# Only the upper triangle is scanned so every unordered pair is visited once.
strong_corr_threshold = 0.7
corr_pairs = []
for i, feature1 in enumerate(numerical_features):
    for feature2 in numerical_features[i + 1:]:
        corr_value = correlation_matrix.loc[feature1, feature2]
        # NaN correlations (e.g. constant columns) compare False and are skipped.
        if abs(corr_value) > strong_corr_threshold:
            corr_pairs.append((feature1, feature2, corr_value))

# Sort by absolute correlation value
corr_pairs.sort(key=lambda x: abs(x[2]), reverse=True)

# Display top correlated pairs
print("\nTop 15 strongly correlated feature pairs:")
for feature1, feature2, corr in corr_pairs[:15]:
    print(f"{feature1} and {feature2}: {corr:.3f}")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Create a dataset that pairs winning and losing team features for direct comparison
def create_comparison_dataset(dataset):
    """
    Pair the winning and losing team's features of every game for direct comparison.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain 'Season', 'DayNum', 'TeamA_Won' and feature columns
        prefixed with 'TeamA_' / 'TeamB_'.

    Returns
    -------
    pandas.DataFrame
        One row per game with 'Season', 'DayNum', 'GameID' (the row's index
        label in ``dataset``), then for every numeric feature present on both
        sides 'Winner_<f>', 'Loser_<f>' and 'Diff_<f>' (winner minus loser),
        and for every non-numeric shared feature 'Winner_<f>' and 'Loser_<f>'.
        Features appear in the order of the TeamA_ columns of ``dataset``, so
        the column layout is identical on every run.

    Side effects
    ------------
    Prints the number of shared, numeric and non-numeric features.
    """
    a_prefix, b_prefix = 'TeamA_', 'TeamB_'

    # Strip only the leading prefix so both sides always map to the same feature name.
    teamA_features = [col[len(a_prefix):] for col in dataset.columns
                      if col.startswith(a_prefix) and col != 'TeamA_Won']
    teamB_features = {col[len(b_prefix):] for col in dataset.columns
                      if col.startswith(b_prefix)}

    # Shared features as an ordered, de-duplicated list: iterating a set of strings
    # would give a different column order after each kernel restart.
    common_features = [feature for feature in dict.fromkeys(teamA_features)
                       if feature in teamB_features]

    print(f"Found {len(common_features)} common features to compare")

    # Identify which features are numeric
    numeric_features = []
    non_numeric_features = []

    for feature in common_features:
        dtype = dataset[f'TeamA_{feature}'].dtype
        # is_numeric_dtype also understands pandas extension dtypes (nullable ints,
        # string dtype), which np.issubdtype rejects with a TypeError. Booleans are
        # excluded to keep the meaning of "numeric" the same as np.number.
        if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
            numeric_features.append(feature)
        else:
            non_numeric_features.append(feature)

    print(f"Numeric features: {len(numeric_features)}")
    print(f"Non-numeric features: {len(non_numeric_features)}")

    comparisons = []
    for idx, row in dataset.iterrows():
        # Determine which team won
        if row['TeamA_Won'] == 1:
            winning_prefix, losing_prefix = a_prefix, b_prefix
        else:
            winning_prefix, losing_prefix = b_prefix, a_prefix

        # Create a record for this game
        game_data = {
            'Season': row['Season'],
            'DayNum': row['DayNum'],
            'GameID': idx
        }

        # Numeric features: winner value, loser value and their difference
        for feature in numeric_features:
            winner_val = row[f'{winning_prefix}{feature}']
            loser_val = row[f'{losing_prefix}{feature}']

            game_data[f'Winner_{feature}'] = winner_val
            game_data[f'Loser_{feature}'] = loser_val
            game_data[f'Diff_{feature}'] = winner_val - loser_val

        # Non-numeric features are stored as-is; a difference is not defined for them
        for feature in non_numeric_features:
            game_data[f'Winner_{feature}'] = row[f'{winning_prefix}{feature}']
            game_data[f'Loser_{feature}'] = row[f'{losing_prefix}{feature}']

        comparisons.append(game_data)

    return pd.DataFrame(comparisons)

# Create the comparison dataset
comparison_df = create_comparison_dataset(final_dataset)

# Identify numerical features to analyze (only numeric features get a Diff_ column)
diff_columns = [col for col in comparison_df.columns if col.startswith('Diff_')]
feature_names = [col.replace('Diff_', '') for col in diff_columns]

# Calculate statistical significance and effect size for each feature
analysis_results = []

for feature in feature_names:
    winner_values = comparison_df[f'Winner_{feature}']
    loser_values = comparison_df[f'Loser_{feature}']

    # Skip any remaining non-numeric features
    if not np.issubdtype(winner_values.dtype, np.number):
        continue

    # Means are reused several times below, so compute them once
    winner_mean = winner_values.mean()
    loser_mean = loser_values.mean()
    mean_diff = winner_mean - loser_mean

    # Welch's t-test (unequal variances) on the non-missing values of each group
    t_stat, p_value = stats.ttest_ind(
        winner_values.dropna(),
        loser_values.dropna(),
        equal_var=False
    )

    # Effect size (Cohen's d) using the average of the two group variances
    pooled_std = np.sqrt((winner_values.std()**2 + loser_values.std()**2) / 2)
    effect_size = mean_diff / pooled_std if pooled_std != 0 else 0

    # Store results
    analysis_results.append({
        'Feature': feature,
        'Winner_Mean': winner_mean,
        'Loser_Mean': loser_mean,
        'Mean_Difference': mean_diff,
        'Percent_Difference': (mean_diff / loser_mean * 100) if loser_mean != 0 else np.nan,
        'T_Statistic': t_stat,
        'P_Value': p_value,
        'Effect_Size': effect_size,
        'Is_Significant': p_value < 0.05
    })

# Convert to DataFrame and rank by the *magnitude* of the effect size.
# A strongly negative d (winners have clearly less of something) separates winners
# from losers just as well as a strongly positive one; sorting on the signed value
# would push those features to the very bottom of the "most indicative" list.
results_df = pd.DataFrame(analysis_results)
results_df = results_df.sort_values('Effect_Size', key=lambda col: col.abs(), ascending=False)

# Print the top 15 most indicative features (based on effect size)
print("Top 15 Statistics Most Indicative of Winning Teams (largest effect sizes):")
pd.set_option('display.max_columns', None)
print(results_df.head(15))

# Features used for the boxplots below (only the first five are drawn)
top_features = results_df.head(10)['Feature'].tolist()

# Create a bar chart of effect sizes for the top 15 features
plt.figure(figsize=(12, 8))
top_results = results_df.head(15).copy()
# Clean up feature names for display
top_results['Feature'] = top_results['Feature'].str.replace('_', ' ')

sns.barplot(x='Effect_Size', y='Feature', data=top_results, palette='viridis')
plt.title('Effect Size of Top 15 Features (Winners vs. Losers)', fontsize=14)
plt.xlabel('Effect Size (Cohen\'s d)', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.tight_layout()
plt.show()

# Create boxplots for the top 5 features
plt.figure(figsize=(15, 10))
for i, feature in enumerate(top_features[:5]):
    plt.subplot(2, 3, i+1)

    # Long-format frame for the boxplot: winner values first, then loser values,
    # with a matching group label for every value
    data = pd.DataFrame({
        'value': pd.concat([comparison_df[f'Winner_{feature}'], comparison_df[f'Loser_{feature}']]),
        'group': ['Winners'] * len(comparison_df) + ['Losers'] * len(comparison_df)
    })

    sns.boxplot(x='group', y='value', data=data)
    plt.title(f"{feature.replace('_', ' ')}")
    plt.xlabel('')

plt.tight_layout()
plt.suptitle('Comparison of Winners vs. Losers: Top 5 Most Indicative Features', fontsize=16, y=1.02)
plt.show()