In [37]:
%%writefile ../src/salary_predict/updated/data_loader_preprocessor.py

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns

def load_data(file_path):
    """Read a CSV into a DataFrame and print its shape.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    frame = pd.read_csv(file_path)
    print("Data loaded. Shape:", frame.shape)
    return frame

def format_season(data):
    """Replace the ``Season`` column with its integer starting year, in place.

    Each value is split on ``'-'`` and the leading piece is passed to ``int``
    (for example ``'2022-23'`` becomes ``2022``). The distinct resulting
    seasons are printed.

    Args:
        data: DataFrame with a string ``Season`` column.

    Returns:
        The same DataFrame object that was passed in.
    """
    def _start_year(label):
        return int(label.split('-')[0])

    data['Season'] = data['Season'].apply(_start_year)
    seasons_seen = data['Season'].unique()
    print("Seasons in data:", seasons_seen)
    return data

def clean_data(data):
    """Return a cleaned copy of ``data``; the input frame is left untouched.

    Steps, in order:
      1. Drop the columns 'Injury_Periods', '2nd Apron', 'Wins' and 'Losses'
         when they are present.
      2. Fill missing values in '3P%', '2P%', 'FT%' and 'TS%' (when present)
         with that column's mean.
      3. Drop every row that still contains a NaN.

    Args:
        data: Source DataFrame.

    Returns:
        A new DataFrame with the steps above applied.
    """
    unwanted = ['Injury_Periods', '2nd Apron', 'Wins', 'Losses']
    # drop() without inplace returns a new frame, so the caller's data is untouched.
    cleaned = data.drop(columns=unwanted, errors='ignore')

    for pct_col in ('3P%', '2P%', 'FT%', 'TS%'):
        if pct_col not in cleaned.columns:
            continue
        column_mean = cleaned[pct_col].mean()
        cleaned[pct_col] = cleaned[pct_col].fillna(column_mean)

    cleaned = cleaned.dropna()
    print("Data cleaned. Remaining shape:", cleaned.shape)
    return cleaned

def engineer_features(data):
    """Add per-game and derived features to ``data`` in place, then drop raw columns.

    Per-game columns are created from an explicit mapping. The previous
    implementation built the name from the first letter of the source column
    (``f'{col[0]}PG'``), which mapped both 'TRB' and 'TOV' to 'TPG'; rebounds
    per game was therefore overwritten by turnovers per game and no 'RPG' or
    'TOPG' column existed.

    Columns created:
        PPG, APG, RPG, SPG, BPG, TOPG  - totals divided by 'GP'.
        TPG                            - kept as an alias of TOPG, which is the
                                         value the old code ended up storing.
        Availability, SalaryPct, Efficiency, ValueOverReplacement,
        ExperienceSquared, Days_Injured_Percentage, WSPG, DWSPG, OWSPG,
        PFPG, ORPG, DRPG.

    Args:
        data: DataFrame containing the raw columns read below ('GP', 'PTS',
            'Salary', 'Salary_Cap_Inflated', ...). It is modified in place.

    Returns:
        The same DataFrame object, after the raw/auxiliary columns listed in
        ``columns_to_drop`` have been removed (missing ones are ignored).
    """
    games_played = data['GP']

    # Explicit names avoid the TRB/TOV collision on a shared first letter.
    per_game_names = {
        'PTS': 'PPG',
        'AST': 'APG',
        'TRB': 'RPG',
        'STL': 'SPG',
        'BLK': 'BPG',
        'TOV': 'TOPG',
    }
    for total_col, per_game_col in per_game_names.items():
        data[per_game_col] = data[total_col] / games_played
    # Backward-compatible alias: the old code left turnovers per game in 'TPG'.
    data['TPG'] = data['TOPG']

    # Derive additional features to capture important aspects of a player's performance
    data['Availability'] = games_played / 82
    data['SalaryPct'] = data['Salary'] / data['Salary_Cap_Inflated']
    # The +1 in the denominators guards against division by zero.
    data['Efficiency'] = (
        (data['PTS'] + data['TRB'] + data['AST'] + data['STL'] + data['BLK'])
        / (data['FGA'] + data['FTA'] + data['TOV'] + 1)
    )
    data['ValueOverReplacement'] = data['VORP'] / (data['Salary'] + 1)
    data['ExperienceSquared'] = data['Years of Service'] ** 2
    data['Days_Injured_Percentage'] = data['Total_Days_Injured'] / games_played
    data['WSPG'] = data['WS'] / games_played
    data['DWSPG'] = data['DWS'] / games_played
    data['OWSPG'] = data['OWS'] / games_played
    data['PFPG'] = data['PF'] / games_played
    data['ORPG'] = data['ORB'] / games_played
    data['DRPG'] = data['DRB'] / games_played

    # Drop columns used in feature creation or deemed less relevant
    columns_to_drop = ['GP', '2PA', 'OBPM', 'BPM', 'DBPM', '2P', 'GS', 'PTS', 'AST', 'TRB', 'STL', 'BLK',
                       'TOV', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '2P', '2PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB',
                       'TS%', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'Luxury Tax', '1st Apron', 'BAE',
                       'Standard /Non-Taxpayer', 'Taxpayer', 'Team Room /Under Cap', 'WS', 'DWS', 'WS/48', 'PF', 'OWS', 'Injured']
    data.drop(columns_to_drop, axis=1, errors='ignore', inplace=True)
    print("New features added.")
    return data

def encode_injury_risk(data):
    """Convert the ``Injury_Risk`` labels to numbers, in place.

    'Low', 'Medium' and 'High' become 0, 1 and 2. Any other value (including
    missing ones) is filled with 1, i.e. treated as 'Medium'.

    Args:
        data: DataFrame with an ``Injury_Risk`` column.

    Returns:
        Tuple ``(data, risk_mapping)``: the same DataFrame object and the
        label-to-number dictionary that was applied.
    """
    risk_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
    numeric_risk = data['Injury_Risk'].map(risk_mapping)
    # Labels outside the mapping come back as NaN; default them to Medium.
    data['Injury_Risk'] = numeric_risk.fillna(1)
    return data, risk_mapping

def encode_categorical(data, columns):
    """One-hot encode each column in ``columns`` and return the fitted encoders.

    Every listed column is replaced by its one-hot columns (named by the
    encoder's ``get_feature_names_out``), appended at the right of the frame.

    Args:
        data: DataFrame containing the categorical columns.
        columns: Iterable of column names to encode.

    Returns:
        Tuple ``(data, encoders)`` where ``data`` is a new DataFrame and
        ``encoders`` maps each column name to its fitted ``OneHotEncoder``.
    """
    def _new_encoder():
        # scikit-learn 1.2 renamed `sparse` to `sparse_output` and later
        # removed the old keyword; fall back for older installations.
        try:
            return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            return OneHotEncoder(sparse=False, handle_unknown='ignore')

    encoders = {}
    for col in columns:
        encoder = _new_encoder()
        encoded = encoder.fit_transform(data[[col]])
        encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out([col]), index=data.index)
        data = pd.concat([data.drop(col, axis=1), encoded_df], axis=1)
        encoders[col] = encoder
    return data, encoders

def encode_data(data, encoders=None, player_encoder=None):
    """Encode and scale a feature frame for modeling.

    Steps:
      1. Map ``Injury_Risk`` labels to 0/1/2 (unknown values become 1).
      2. If a ``Player`` column exists, label-encode it into
         ``Player_Encoded`` and drop the original column.
      3. One-hot encode 'Position' and 'Team'. When ``encoders`` is None new
         encoders are fitted; otherwise the supplied ones are only applied.
      4. Standardize the numeric columns that existed before one-hot
         encoding, excluding 'Season', 'Injury_Risk' and 'Player_Encoded'.

    Note that steps 1 and 2 modify the caller's DataFrame in place; from
    step 3 on a new frame is built.

    NOTE(review): the ``StandardScaler`` is always fitted on the data passed
    in, even when pre-fitted ``encoders`` are supplied, so prediction-time
    data is scaled with its own statistics. Confirm whether a training-time
    scaler should be reused instead.

    Args:
        data: Feature DataFrame with 'Injury_Risk', 'Position' and 'Team'
            columns (and optionally 'Player').
        encoders: Optional dict of fitted ``OneHotEncoder`` objects keyed by
            'Position' and 'Team'.
        player_encoder: Optional fitted ``LabelEncoder`` for 'Player'.

    Returns:
        Tuple ``(data, risk_mapping, encoders, scaler, numeric_cols,
        player_encoder)``.
    """
    def _new_encoder():
        # scikit-learn 1.2 renamed `sparse` to `sparse_output` and later
        # removed the old keyword; fall back for older installations.
        try:
            return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            return OneHotEncoder(sparse=False, handle_unknown='ignore')

    print("Columns before encoding:", data.columns)

    # Encode Injury_Risk
    risk_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
    data['Injury_Risk'] = data['Injury_Risk'].map(risk_mapping).fillna(1)  # Default to Medium if unknown

    # Encode Player column if it's present
    if 'Player' in data.columns:
        if player_encoder is None:
            player_encoder = LabelEncoder()
            data['Player_Encoded'] = player_encoder.fit_transform(data['Player'])
        else:
            data['Player_Encoded'] = player_encoder.transform(data['Player'])
        data.drop('Player', axis=1, inplace=True)  # Drop original Player column after encoding

    # Numeric columns are captured before one-hot encoding so the 0/1
    # indicator columns are never scaled.
    initial_numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # Encode categorical variables (excluding Season)
    categorical_cols = ['Position', 'Team']
    fit_new_encoders = encoders is None
    if fit_new_encoders:
        encoders = {}
    for col in categorical_cols:
        if fit_new_encoders:
            encoders[col] = _new_encoder().fit(data[[col]])
        encoder = encoders[col]
        encoded = encoder.transform(data[[col]])
        encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out([col]), index=data.index)
        data = pd.concat([data.drop(col, axis=1), encoded_df], axis=1)

    # Identify final numeric columns (excluding one-hot encoded columns and 'Season')
    numeric_cols = [col for col in initial_numeric_cols if col not in ['Season', 'Injury_Risk', 'Player_Encoded']]

    # Scale numeric features (excluding 'Player_Encoded')
    scaler = StandardScaler()
    data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

    print("Encoded data shape:", data.shape)
    print("Columns after encoding:", data.columns)

    return data, risk_mapping, encoders, scaler, numeric_cols, player_encoder




def scale_features(data, numeric_cols):
    """Standardize ``numeric_cols`` of ``data`` in place with a new scaler.

    Args:
        data: DataFrame holding the columns to scale.
        numeric_cols: Column names to pass through ``StandardScaler``.

    Returns:
        Tuple ``(data, scaler)``: the same DataFrame object and the fitted
        ``StandardScaler``.
    """
    scaler = StandardScaler()
    standardized = scaler.fit_transform(data[numeric_cols])
    data[numeric_cols] = standardized
    return data, scaler

def decode_data(encoded_data, injury_risk_mapping, encoders, scaler, numeric_cols, player_encoder):
    """Reverse the transformations applied by ``encode_data`` on a copy.

    The one-hot columns of each encoder are taken from
    ``encoder.get_feature_names_out([col])`` rather than by matching a
    ``"<col>_"`` prefix. That guarantees the columns are handed to
    ``inverse_transform`` in the encoder's own category order and cannot
    accidentally pick up unrelated columns that share the prefix.

    Args:
        encoded_data: DataFrame produced by ``encode_data``.
        injury_risk_mapping: Label-to-number dict used for ``Injury_Risk``.
        encoders: Dict mapping a categorical column name to its fitted
            ``OneHotEncoder``.
        scaler: Fitted scaler for ``numeric_cols``.
        numeric_cols: Names of the scaled numeric columns.
        player_encoder: Fitted ``LabelEncoder``; only used when a
            ``Player_Encoded`` column is present.

    Returns:
        A new DataFrame with labels, categories and numeric scales restored.
    """
    decoded_data = encoded_data.copy()

    # Decode Injury_Risk
    inv_injury_risk_mapping = {v: k for k, v in injury_risk_mapping.items()}
    decoded_data['Injury_Risk'] = decoded_data['Injury_Risk'].map(inv_injury_risk_mapping)

    # Decode Player column
    if 'Player_Encoded' in decoded_data.columns:
        decoded_data['Player'] = player_encoder.inverse_transform(decoded_data['Player_Encoded'])
        decoded_data.drop('Player_Encoded', axis=1, inplace=True)

    # Decode categorical variables
    for col, encoder in encoders.items():
        # Exact names, in the encoder's category order.
        encoded_cols = list(encoder.get_feature_names_out([col]))
        decoded_col = encoder.inverse_transform(decoded_data[encoded_cols])
        decoded_data[col] = decoded_col.ravel()  # Flatten the 2D array to 1D
        decoded_data.drop(encoded_cols, axis=1, inplace=True)

    # Inverse transform scaled features
    decoded_data[numeric_cols] = scaler.inverse_transform(decoded_data[numeric_cols])

    return decoded_data

def select_top_features(X, y, k=10):
    """Pick the ``k`` columns of ``X`` with the best ``f_regression`` scores.

    Args:
        X: Feature DataFrame.
        y: Target values aligned with ``X``.
        k: Number of features to keep (passed to ``SelectKBest``).

    Returns:
        List of the selected column names, in their original column order.
    """
    selector = SelectKBest(score_func=f_regression, k=k).fit(X, y)
    chosen_mask = selector.get_support()
    top_features = X.columns[chosen_mask].tolist()
    print(f"Top {k} features:", top_features)
    return top_features

def calculate_tree_feature_importance(X, y):
    """Rank features by random-forest importance and plot the top 20.

    A ``RandomForestRegressor`` (100 trees, ``random_state=42``) is fitted on
    ``X``/``y``; its ``feature_importances_`` are tabulated, sorted in
    descending order, and the first 20 rows are shown as a bar plot.

    Args:
        X: Feature DataFrame.
        y: Target values aligned with ``X``.

    Returns:
        DataFrame with 'Feature' and 'Importance' columns, most important first.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    forest.fit(X, y)

    ranking = pd.DataFrame({
        'Feature': X.columns,
        'Importance': forest.feature_importances_
    })
    feature_importances = ranking.sort_values(by='Importance', ascending=False)

    top_twenty = feature_importances.head(20)
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=top_twenty)
    plt.title('Top 20 Feature Importances from Random Forest')
    plt.show()

    return feature_importances

# Script entry point: runs the full load -> clean -> engineer -> encode ->
# feature-ranking -> decode pipeline on the processed CSV as a smoke test.
if __name__ == "__main__":
    file_path = '../data/processed/nba_player_data_final_inflated.csv'
    data = load_data(file_path)
    data = format_season(data)
    data = clean_data(data)
    data = engineer_features(data)

    # Separate features and target
    # NOTE(review): X still contains 'ValueOverReplacement', which
    # engineer_features derives from 'Salary'; confirm that feeding it to a
    # model that predicts 'SalaryPct' is intended.
    X = data.drop(['SalaryPct', 'Salary'], axis=1)
    y = data['SalaryPct']

    # Encode data (fits new encoders and a scaler because none are passed in)
    encoded_data, injury_risk_mapping, encoders, scaler, numeric_cols, player_encoder = encode_data(X)

    print("\nInjury Risk Mapping:", injury_risk_mapping)
    print("Encoded Injury Risk range:", encoded_data['Injury_Risk'].min(), "-", encoded_data['Injury_Risk'].max())
    print("\nNumeric columns for scaling:", numeric_cols)

    # Calculate feature importance
    feature_importances = calculate_tree_feature_importance(encoded_data, y)
    print("\nTree-based feature importances:")
    print(feature_importances.head(20))

    # Select top features
    top_features = select_top_features(encoded_data, y)
    print("\nTop features selected using statistical methods:", top_features)

    # Decoding example
    print("\nDecoding Example:")
    decoded_data = decode_data(encoded_data, injury_risk_mapping, encoders, scaler, numeric_cols, player_encoder)
    
    print("\nFirst few rows of decoded data:")
    # NOTE(review): top_features are names from the *encoded* frame; if any of
    # them is a one-hot column or 'Player_Encoded', it no longer exists after
    # decoding and this lookup would raise a KeyError - verify.
    print(decoded_data[['Player', 'Injury_Risk', 'Position', 'Team', 'Season'] + top_features].head())

    print("\nData types after decoding:")
    print(decoded_data.dtypes)

    print("\nData preprocessing completed. Ready for model training.")


Overwriting ../src/salary_predict/updated/data_loader_preprocessor.py


In [46]:
%%writefile ../src/salary_predict/updated/model_trainer.py
import joblib
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
import numpy as np

def inspect_data_types(X):
    """Print the dtypes of ``X`` and list any object-typed columns.

    Args:
        X: Feature DataFrame to inspect.

    Returns:
        None. The report is written to standard output.
    """
    print("Data types of features:")
    print(X.dtypes)

    object_columns = X.select_dtypes(include=['object']).columns
    if object_columns.empty:
        print("No columns with object data types.")
        return
    print("Columns with object data types:", object_columns.tolist())

def perform_grid_search(model, param_grid, X_train, y_train):
    """Tune ``model`` with a 5-fold grid search and return the best estimator.

    The search scores candidates with negative mean squared error and uses
    all available cores (``n_jobs=-1``). The best parameters and the best
    score (sign flipped back to a positive MSE) are printed.

    Args:
        model: Estimator to tune.
        param_grid: Parameter grid passed to ``GridSearchCV``.
        X_train: Training features.
        y_train: Training target.

    Returns:
        ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    model_name = model.__class__.__name__
    print(f"Best parameters for {model_name}: {search.best_params_}")
    print(f"Best score for {model_name}: {-search.best_score_}")
    return search.best_estimator_

def train_and_save_models(X_train, y_train, model_save_path, scaler, feature_names, encoders, player_encoder, numeric_cols, injury_risk_mapping=None):
    """Grid-search a random forest and an XGBoost regressor, then persist everything.

    Fixes relative to the previous version:
      * ``injury_risk_mapping`` used to be read from an undeclared global,
        which raised ``NameError`` only after the whole grid search had run
        whenever this function was imported. It is now an optional parameter;
        when omitted, a module-level ``injury_risk_mapping`` is still used for
        backward compatibility, and its absence is reported before training.
      * The best estimators are no longer fitted a second time:
        ``GridSearchCV`` (default ``refit=True``) already returns them fitted
        on the full training set.
      * The output directory is created when missing.

    Args:
        X_train: Training features.
        y_train: Training target.
        model_save_path: Directory that receives the ``.pkl`` artifacts.
        scaler: Scaler object; it is re-fitted on ``X_train`` before saving.
        feature_names: Feature names to store for prediction time.
        encoders: Fitted categorical encoders to store.
        player_encoder: Player label encoder to store (may be None).
        numeric_cols: Names of the scaled numeric columns to store.
        injury_risk_mapping: Label-to-number mapping for ``Injury_Risk``.

    Raises:
        ValueError: If no injury-risk mapping is passed and none exists at
            module level.
    """
    import os  # local import keeps this block self-contained

    if injury_risk_mapping is None:
        # Backward compatibility: older callers relied on a module-level name.
        injury_risk_mapping = globals().get('injury_risk_mapping')
        if injury_risk_mapping is None:
            raise ValueError(
                "injury_risk_mapping must be passed to train_and_save_models "
                "or defined at module level."
            )

    # Inspect data types before training
    inspect_data_types(X_train)

    # Initialize models with default parameters
    rf_model = RandomForestRegressor(random_state=42)
    xgb_model = xgb.XGBRegressor(random_state=42, enable_categorical=True)

    # Define parameter grids for grid search
    rf_param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    }
    xgb_param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2]
    }

    # Grid search returns estimators already refit on all of X_train.
    best_rf_model = perform_grid_search(rf_model, rf_param_grid, X_train, y_train)
    best_xgb_model = perform_grid_search(xgb_model, xgb_param_grid, X_train, y_train)

    # The saved scaler is (re)fitted on every training column, as before.
    # NOTE(review): the models above are trained on X_train as given, not on
    # the output of this scaler, while the predictor applies the saved scaler
    # before predicting - confirm which of the two is intended.
    scaler.fit(X_train)

    # Save models, scaler, feature names, encoders, and other artifacts
    os.makedirs(model_save_path, exist_ok=True)
    artifacts = {
        "best_rf_model.pkl": best_rf_model,
        "best_xgb_model.pkl": best_xgb_model,
        "scaler.pkl": scaler,
        "feature_names.pkl": feature_names,
        "encoders.pkl": encoders,
        "injury_risk_mapping.pkl": injury_risk_mapping,
        "numeric_cols.pkl": numeric_cols,
        "player_encoder.pkl": player_encoder,
    }
    for file_name, artifact in artifacts.items():
        joblib.dump(artifact, os.path.join(model_save_path, file_name))
    print("Models, scaler, feature names, encoders, and other artifacts trained and saved successfully.")

def evaluate_models(X_test, y_test, model_save_path):
    """Load the saved models and print MSE, RMSE, MAE and R-squared for each.

    Args:
        X_test: Test features.
        y_test: True target values for ``X_test``.
        model_save_path: Directory containing ``best_rf_model.pkl`` and
            ``best_xgb_model.pkl``.
    """
    # Load both models first, then predict, in the same order as reported.
    rf_model = joblib.load(f"{model_save_path}/best_rf_model.pkl")
    xgb_model = joblib.load(f"{model_save_path}/best_xgb_model.pkl")

    predictions_by_model = {
        'Random Forest': rf_model.predict(X_test),
        'XGBoost': xgb_model.predict(X_test),
    }

    for model_name in predictions_by_model:
        predicted = predictions_by_model[model_name]
        mse = mean_squared_error(y_test, predicted)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, predicted)
        r2 = r2_score(y_test, predicted)

        print(f"\n{model_name} Evaluation:")
        print(f"MSE: {mse}")
        print(f"RMSE: {rmse}")
        print(f"MAE: {mae}")
        print(f"R-squared: {r2}")

# Data preprocessing
def load_and_preprocess_data(file_path, predict_season):
    """Load the CSV and return cleaned, feature-engineered prior-season data.

    The second element returned by ``filter_seasons(data, predict_season)``
    is used as the training data.

    NOTE(review): ``load_data``, ``format_season``, ``filter_seasons``,
    ``clean_data`` and ``engineer_features`` are not imported in the visible
    part of this module - confirm they are brought into scope elsewhere.

    Args:
        file_path: Path of the CSV to load.
        predict_season: Season value handed to ``filter_seasons``.

    Returns:
        The preprocessed DataFrame.
    """
    all_seasons = format_season(load_data(file_path))
    prior_seasons_data = filter_seasons(all_seasons, predict_season)[1]
    cleaned = clean_data(prior_seasons_data)
    return engineer_features(cleaned)

# Feature selection
def select_features(data, target_column, additional_features=None):
    """Split ``data`` into the modeling feature frame and the target series.

    The base feature list is fixed; 'Injury_Risk', 'Position' and 'Team' are
    always requested so they are available for encoding. Requested names that
    are missing from ``data`` are skipped, and repeated names are kept only
    once so the result never has duplicate columns.

    Args:
        data: Preprocessed DataFrame.
        target_column: Name of the target column.
        additional_features: Optional iterable of extra column names.

    Returns:
        Tuple ``(X, y)``. ``X`` is an independent copy, so later in-place
        edits (for example by ``encode_data``) do not write into a slice of
        ``data``.
    """
    top_features = ['PPG', 'APG', 'RPG', 'SPG', 'TOPG', 'Years of Service', 'PER', 'VORP', 'WSPG', 'OWSPG']

    # Add 'Injury_Risk', 'Position', and 'Team' to ensure they're included for encoding
    top_features += ['Injury_Risk', 'Position', 'Team']

    # Add any additional features (None replaces the former mutable default)
    if additional_features is not None:
        top_features += list(additional_features)

    # Drop repeated names while preserving the first-seen order.
    top_features = list(dict.fromkeys(top_features))

    # Ensure all selected features are in the dataset
    available_features = [col for col in top_features if col in data.columns]

    print("Available features for modeling:", available_features)  # Debug statement

    X = data[available_features].copy()
    y = data[target_column]
    return X, y

# Main execution
# Main execution: preprocess prior seasons, encode, split, train and evaluate.
if __name__ == "__main__":
    file_path = '../data/processed/nba_player_data_final_inflated.csv'
    predict_season = 2023
    target_column = 'SalaryPct'

    # Load and preprocess data
    preprocessed_data = load_and_preprocess_data(file_path, predict_season)
    print("Columns after preprocessing:", preprocessed_data.columns)

    # Select features
    X, y = select_features(preprocessed_data, target_column)
    print("Columns after feature selection:", X.columns)

    # Encode data
    encoded_data, injury_risk_mapping, encoders, scaler, numeric_cols, player_encoder = encode_data(X)
    print("Columns after encoding:", encoded_data.columns)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(encoded_data, y, test_size=0.2, random_state=42)

    # Train and evaluate models
    model_save_path = '../data/models'
    # The 7th positional argument is `player_encoder`. The previous call
    # passed `injury_risk_mapping` there, so player_encoder.pkl ended up
    # holding the mapping dict. The mapping itself is picked up by
    # train_and_save_models from the module-level `injury_risk_mapping`
    # assigned above.
    train_and_save_models(
        X_train, y_train, model_save_path, scaler, encoded_data.columns,
        encoders, player_encoder, numeric_cols,
    )
    evaluate_models(X_test, y_test, model_save_path)


Overwriting ../src/salary_predict/updated/model_trainer.py


In [47]:
%%writefile ../src/salary_predict/updated/model_predictor.py

import joblib
import pandas as pd

def load_models_and_utils(model_save_path):
    """Load every saved artifact from ``model_save_path``.

    Args:
        model_save_path: Directory that holds the ``.pkl`` files.

    Returns:
        Tuple ``(rf_model, xgb_model, scaler, feature_names, encoders,
        injury_risk_mapping, numeric_cols, player_encoder)``.
    """
    # File names are listed in the order of the returned tuple.
    artifact_files = (
        "best_rf_model.pkl",
        "best_xgb_model.pkl",
        "scaler.pkl",
        "feature_names.pkl",
        "encoders.pkl",
        "injury_risk_mapping.pkl",
        "numeric_cols.pkl",
        "player_encoder.pkl",
    )
    return tuple(joblib.load(f"{model_save_path}/{file_name}") for file_name in artifact_files)

def predict(data, model_save_path):
    """Predict with the saved random forest and XGBoost models and average them.

    The 'Player' column (if any) is set aside, the remaining data is encoded
    with the saved encoders, aligned to the saved feature names (missing
    columns are added as zeros), passed through the saved scaler and fed to
    both models.

    Args:
        data: DataFrame of players to score.
        model_save_path: Directory holding the saved artifacts.

    Returns:
        DataFrame made of the input (without 'Player', index reset) joined
        column-wise with 'RF_Predictions', 'XGB_Predictions',
        'Predicted_Salary' (mean of the two) and, when available, 'Player'.
    """
    artifacts = load_models_and_utils(model_save_path)
    rf_model, xgb_model, scaler, feature_names, encoders = artifacts[:5]
    player_encoder = artifacts[7]

    print("Original data shape:", data.shape)
    print("Original data columns:", data.columns.tolist())

    # Keep the names aside; the column is removed before encoding.
    has_player_column = 'Player' in data.columns
    player_names = data['Player'] if has_player_column else None
    data = data.drop(columns=['Player'], errors='ignore')

    # Only the encoded frame (first element) is needed here.
    encoded_data = encode_data(data, encoders, player_encoder)[0]

    print("Encoded data shape:", encoded_data.shape)
    print("Encoded data columns:", encoded_data.columns.tolist())

    # Features seen at training time but absent now are added as all-zero columns.
    absent = [name for name in feature_names if name not in encoded_data.columns]
    for name in absent:
        encoded_data[name] = 0

    # Keep exactly the training features, in training order.
    encoded_data = encoded_data[feature_names]

    print("Selected features shape:", encoded_data.shape)
    print("Selected features:", encoded_data.columns.tolist())
    print("Expected features:", feature_names)

    encoded_data_scaled = scaler.transform(encoded_data)

    rf_predictions = rf_model.predict(encoded_data_scaled)
    xgb_predictions = xgb_model.predict(encoded_data_scaled)
    ensemble = (rf_predictions + xgb_predictions) / 2

    predictions_df = pd.DataFrame({
        'RF_Predictions': rf_predictions,
        'XGB_Predictions': xgb_predictions,
        'Predicted_Salary': ensemble
    })

    if player_names is not None:
        predictions_df['Player'] = player_names.values

    # Reset the index so rows line up positionally with the predictions.
    original_rows = data.reset_index(drop=True)
    result = pd.concat([original_rows, predictions_df], axis=1)

    return result


# Script entry point: score the prediction season and store the results as CSV.
if __name__ == "__main__":
    file_path = '../data/processed/nba_player_data_final_inflated.csv'
    predict_season = 2023
    model_save_path = '../data/models'

    data = format_season(load_data(file_path))
    # The first element returned by filter_seasons is the season to predict.
    current_season_data = filter_seasons(data, predict_season)[0]
    current_season_data = clean_data(current_season_data)
    current_season_data = engineer_features(current_season_data)

    predictions_df = predict(current_season_data, model_save_path)
    print(predictions_df.head())

    # Persist the predictions so later steps can read them back.
    predictions_df.to_csv('../data/processed/predictions_df.csv', index=False)


Overwriting ../src/salary_predict/updated/model_predictor.py


Reference for the salary-matching trade rules (2023-24 season):
https://www.hoopsrumors.com/2023/09/salary-matching-rules-for-trades-during-2023-24-season.html


FIRST_TAX_APRON = 172_346_000


def check_salary_matching_rules(outgoing_salary, incoming_salary, team_salary_before_trade):
    """Return True when ``incoming_salary`` fits the salary-matching limit.

    For a team whose pre-trade salary is below ``FIRST_TAX_APRON`` the limit
    depends on the outgoing salary:
      * up to 7,500,000:   200% of outgoing + 250,000
      * up to 29,000,000:  outgoing + 7,500,000
      * above 29,000,000:  125% of outgoing + 250,000
    Otherwise the limit is 110% of the outgoing salary.

    Args:
        outgoing_salary: Salary sent out in the trade.
        incoming_salary: Salary taken back in the trade.
        team_salary_before_trade: Team salary before the trade.

    Returns:
        bool: whether the incoming salary is within the computed limit.
    """
    under_apron = team_salary_before_trade < FIRST_TAX_APRON

    if not under_apron:
        max_incoming_salary = 1.10 * outgoing_salary
    elif outgoing_salary <= 7_500_000:
        max_incoming_salary = 2 * outgoing_salary + 250_000
    elif outgoing_salary <= 29_000_000:
        max_incoming_salary = outgoing_salary + 7_500_000
    else:
        max_incoming_salary = 1.25 * outgoing_salary + 250_000

    return incoming_salary <= max_incoming_salary

In [40]:
import pandas as pd
import numpy as np
from nba_api.stats.endpoints import leaguegamefinder, playergamelogs
import time

RELEVANT_STATS = ['PTS', 'AST', 'TOV', 'STL', 'BLK', 'OREB', 'DREB', 'FGM', 'FG3M', 'FGA']
PERCENTILE_THRESHOLDS = [99, 98, 97, 96, 95, 90, 75, 50]

def get_champion(season):
    """Return ``(TEAM_ID, TEAM_NAME)`` of the winner of the season's last playoff game.

    Playoff games for ``season`` are fetched through ``LeagueGameFinder``,
    sorted by date, and the first row marked ``'W'`` among the final two rows
    is taken as the champion.

    Args:
        season: Season string passed as ``season_nullable`` (e.g. "2022-23").
    """
    finder = leaguegamefinder.LeagueGameFinder(season_nullable=season, season_type_nullable='Playoffs')
    games = finder.get_data_frames()[0]
    games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

    # presumably the final two rows are the two team rows of the last game - verify
    closing_rows = games.sort_values('GAME_DATE').tail(2)
    winning_rows = closing_rows[closing_rows['WL'] == 'W']
    champion_row = winning_rows.iloc[0]
    return champion_row['TEAM_ID'], champion_row['TEAM_NAME']

def get_champions(start_year, end_year):
    """Look up the champion of every season from ``start_year`` to ``end_year``.

    Args:
        start_year: First season's starting year (int), inclusive.
        end_year: Last season's starting year (int), inclusive.

    Returns:
        Dict keyed by season string ("YYYY-YY") whose values are dicts with
        'ChampionTeamID' and 'ChampionTeamName'.
    """
    champions = {}
    for first_year in range(start_year, end_year + 1):
        season_label = f"{first_year}-{str(first_year + 1)[-2:]}"
        team_id, team_name = get_champion(season_label)
        champions[season_label] = dict(ChampionTeamID=team_id, ChampionTeamName=team_name)
        time.sleep(1)  # pause between requests to avoid overwhelming the API
    return champions

def get_season_from_date(date):
    """Map a "YYYY-MM-..." date string to its season label "YYYY-YY".

    Months 10-12 belong to the season starting that year; earlier months
    belong to the season that started the previous year.

    Args:
        date: String whose first four characters are the year and whose
            characters 5-6 are the month.

    Returns:
        Season label such as "2023-24".
    """
    calendar_year = int(date[:4])
    month = int(date[5:7])
    season_start = calendar_year if month >= 10 else calendar_year - 1
    return f"{season_start}-{str(season_start + 1)[2:]}"

def analyze_leaguegamefinder_endpoint(start_season, end_season):
    """Download regular-season game rows for every season in the range.

    Args:
        start_season: First season string; its first four characters are the
            starting year.
        end_season: Last season string (inclusive), same format.

    Returns:
        One DataFrame with the rows of all seasons and an added 'SEASON'
        column derived from each row's 'GAME_DATE'.
    """
    first_year = int(start_season[:4])
    last_year = int(end_season[:4])

    season_frames = []
    for year in range(first_year, last_year + 1):
        season_str = f"{year}-{str(year + 1)[2:]}"
        print(f"Fetching data for season {season_str}")

        finder = leaguegamefinder.LeagueGameFinder(
            season_nullable=season_str,
            season_type_nullable='Regular Season'
        )
        games = finder.get_data_frames()[0]
        games['SEASON'] = games['GAME_DATE'].apply(get_season_from_date)
        season_frames.append(games)

        time.sleep(1)  # pause between requests to avoid overwhelming the API

    return pd.concat(season_frames, ignore_index=True)

def calculate_per_game_stats(games_df):
    """Average ``RELEVANT_STATS`` per (SEASON, TEAM_ID, TEAM_NAME) and add eFG%.

    eFG% is computed from the averaged columns as
    ``(FGM + 0.5 * FG3M) / FGA``.

    Args:
        games_df: Game-level rows with the grouping and stat columns.

    Returns:
        DataFrame with one row per season/team.
    """
    team_groups = games_df.groupby(['SEASON', 'TEAM_ID', 'TEAM_NAME'])
    per_game_stats = team_groups[RELEVANT_STATS].mean().reset_index()

    weighted_makes = per_game_stats['FGM'] + 0.5 * per_game_stats['FG3M']
    per_game_stats['eFG%'] = weighted_makes / per_game_stats['FGA']

    return per_game_stats

def calculate_percentiles(stats_df):
    """Add a within-season percentile-rank column for every tracked stat.

    For each stat in ``RELEVANT_STATS`` plus 'eFG%', a ``<stat>_percentile``
    column is written into ``stats_df`` (in place) using
    ``rank(pct=True)`` inside each 'SEASON' group.

    Returns:
        The same DataFrame object that was passed in.
    """
    for stat in [*RELEVANT_STATS, 'eFG%']:
        season_rank = stats_df.groupby('SEASON')[stat].rank(pct=True)
        stats_df[f'{stat}_percentile'] = season_rank

    return stats_df

def get_current_season_stats(all_seasons_data, current_season):
    """Return per-team stats for ``current_season`` plus a league-average row.

    The league-average row is built as a one-row DataFrame from a plain
    dict. The earlier approach added string labels to the float Series and
    transposed it, which turned every stat into object dtype and, after the
    concat, degraded the stat columns of the combined frame as well.

    Args:
        all_seasons_data: Game-level rows with a 'SEASON' column.
        current_season: Season label to select.

    Returns:
        DataFrame with the per-team rows (including percentile columns)
        followed by one row whose TEAM_NAME is 'League Average' and whose
        TEAM_ID is 'AVG'. Percentile columns are NaN for that row.
    """
    stat_cols = RELEVANT_STATS + ['eFG%']

    current_season_data = all_seasons_data[all_seasons_data['SEASON'] == current_season]
    per_game_stats = calculate_per_game_stats(current_season_data)
    percentile_stats = calculate_percentiles(per_game_stats)

    # Calculate league average as a typed one-row frame
    league_row = per_game_stats[stat_cols].mean().to_dict()
    league_row['TEAM_NAME'] = 'League Average'
    league_row['SEASON'] = current_season
    league_row['TEAM_ID'] = 'AVG'
    league_avg = pd.DataFrame([league_row])

    # Combine team stats with league average
    combined_stats = pd.concat([percentile_stats, league_avg], ignore_index=True)
    return combined_stats

def get_champions_stats(all_seasons_data, start_season, end_season):
    """Compute per-game stats (with percentiles) for each season's champion.

    Champions are looked up through ``get_champions`` so the request pacing
    used there also applies here, and the champion rows are selected with a
    vectorized season-to-team-id lookup instead of a row-wise ``apply``.

    Args:
        all_seasons_data: Game-level rows with 'SEASON' and 'TEAM_ID'.
        start_season: First season string (first four characters = year).
        end_season: Last season string, inclusive.

    Returns:
        Output of ``calculate_percentiles`` on the champions' per-game stats.
    """
    champions = get_champions(int(start_season[:4]), int(end_season[:4]))
    champion_ids = {season: info['ChampionTeamID'] for season, info in champions.items()}

    # Seasons without a known champion map to NaN and therefore never match.
    is_champion = all_seasons_data['SEASON'].map(champion_ids) == all_seasons_data['TEAM_ID']
    champions_data = all_seasons_data[is_champion]

    champions_stats = calculate_per_game_stats(champions_data)
    return calculate_percentiles(champions_stats)

def compare_stats(current_stats, champions_stats, league_avg):
    """Add difference-to-champions and difference-to-league columns.

    For every stat in ``RELEVANT_STATS`` plus 'eFG%', two columns are added
    to a copy of ``current_stats``: ``<stat>_vs_champs`` (value minus the
    mean of ``champions_stats``) and ``<stat>_vs_league`` (value minus
    ``league_avg[stat]``).

    Returns:
        The augmented copy; ``current_stats`` itself is not modified.
    """
    tracked_stats = RELEVANT_STATS + ['eFG%']
    champs_avg = champions_stats[tracked_stats].mean()

    comparison = current_stats.copy()
    for stat in tracked_stats:
        current_values = comparison[stat]
        comparison[f'{stat}_vs_champs'] = current_values - champs_avg[stat]
        comparison[f'{stat}_vs_league'] = current_values - league_avg[stat]

    return comparison

def get_team_data(all_seasons_data, team_names, current_season):
    """Return per-game stats for ``team_names`` in ``current_season``.

    Args:
        all_seasons_data: Game-level rows with 'SEASON' and 'TEAM_NAME'.
        team_names: Collection of team names to keep.
        current_season: Season label to keep.
    """
    in_season = all_seasons_data['SEASON'] == current_season
    is_requested_team = all_seasons_data['TEAM_NAME'].isin(team_names)
    team_data = all_seasons_data[in_season & is_requested_team]
    return calculate_per_game_stats(team_data)

def simulate_trade(all_seasons_data, team_from, team_to, trade_impact, current_season):
    """Shift team-level stats between two teams according to ``trade_impact``.

    Each numeric stat that appears in ``trade_impact`` is subtracted from
    ``team_from`` and added to ``team_to``; eFG% is then recomputed for both
    teams from the adjusted FGM, FG3M and FGA.

    Args:
        all_seasons_data: Game-level rows for all seasons.
        team_from: Name of the team giving up production.
        team_to: Name of the team receiving it.
        trade_impact: Mapping of stat name to the amount moved.
        current_season: Season label to evaluate.

    Returns:
        Tuple ``(before_trade, after_trade)`` of per-game team stats.
    """
    before_trade = get_team_data(all_seasons_data, [team_from, team_to], current_season)
    after_trade = before_trade.copy()

    # TEAM_NAME is never modified below, so the row masks can be built once.
    from_rows = after_trade['TEAM_NAME'] == team_from
    to_rows = after_trade['TEAM_NAME'] == team_to

    for stat in before_trade.select_dtypes(include=[np.number]).columns:
        if stat not in trade_impact:
            continue
        after_trade.loc[from_rows, stat] -= trade_impact[stat]
        after_trade.loc[to_rows, stat] += trade_impact[stat]

    # Recalculate eFG% for both teams from the adjusted shooting totals.
    for team_rows in (from_rows, to_rows):
        team_data = after_trade[team_rows]
        adjusted_efg = (team_data['FGM'] + 0.5 * team_data['FG3M']) / team_data['FGA']
        after_trade.loc[team_rows, 'eFG%'] = adjusted_efg.values[0]

    return before_trade, after_trade

def get_player_game_logs(team_id, season):
    """Fetch the player game logs of one team for one season.

    Args:
        team_id: Value passed as ``team_id_nullable``.
        season: Value passed as ``season_nullable``.

    Returns:
        The first DataFrame returned by the ``PlayerGameLogs`` endpoint.
    """
    endpoint = playergamelogs.PlayerGameLogs(team_id_nullable=team_id, season_nullable=season)
    return endpoint.get_data_frames()[0]

def process_player_data(player_logs):
    """Average ``RELEVANT_STATS`` per player/team and add eFG%.

    Rows are grouped by PLAYER_ID, PLAYER_NAME, TEAM_ID and TEAM_NAME; eFG%
    is ``(FGM + 0.5 * FG3M) / FGA`` on the averaged values.

    Returns:
        DataFrame with one row per player/team combination.
    """
    group_keys = ['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_NAME']
    player_stats = player_logs.groupby(group_keys)[RELEVANT_STATS].mean().reset_index()

    weighted_makes = player_stats['FGM'] + 0.5 * player_stats['FG3M']
    player_stats['eFG%'] = weighted_makes / player_stats['FGA']
    return player_stats

def get_champions_player_data(champions, start_season, end_season):
    """Collect per-player averages for the champion of every season in range.

    Args:
        champions: Dict keyed by season string with a 'ChampionTeamID' entry.
        start_season: First season string (first four characters = year).
        end_season: Last season string, inclusive.

    Returns:
        Concatenated per-player stats with an added 'SEASON' column.
    """
    first_year = int(start_season[:4])
    last_year = int(end_season[:4])

    season_frames = []
    for year in range(first_year, last_year + 1):
        season_str = f"{year}-{str(year + 1)[2:]}"
        champion_team_id = champions[season_str]['ChampionTeamID']

        logs = get_player_game_logs(champion_team_id, season_str)
        player_stats = process_player_data(logs)
        player_stats['SEASON'] = season_str
        season_frames.append(player_stats)

        time.sleep(1)  # pause between requests to avoid overwhelming the API
    return pd.concat(season_frames, ignore_index=True)

def get_current_season_player_data(all_seasons_data, current_season):
    """Collect per-player averages for every team that played in ``current_season``.

    Teams whose player logs come back empty are skipped, so empty frames
    are never handed to ``pd.concat``. When nothing is collected at all, an
    empty DataFrame with the expected columns is returned instead of letting
    ``pd.concat([])`` raise.

    NOTE(review): empty logs presumably come from TEAM_IDs in
    ``all_seasons_data`` that the player-log endpoint has no data for -
    confirm against the data source.

    Args:
        all_seasons_data: Game-level rows with 'SEASON' and 'TEAM_ID'.
        current_season: Season label to collect.

    Returns:
        DataFrame of per-player stats with a 'SEASON' column.
    """
    in_season = all_seasons_data['SEASON'] == current_season
    current_teams = all_seasons_data.loc[in_season, 'TEAM_ID'].unique()

    all_current_players = []
    for team_id in current_teams:
        player_logs = get_player_game_logs(team_id, current_season)
        time.sleep(1)  # To avoid overwhelming the API
        if player_logs.empty:
            continue
        player_stats = process_player_data(player_logs)
        player_stats['SEASON'] = current_season
        all_current_players.append(player_stats)

    if not all_current_players:
        expected_columns = ['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_NAME',
                            *RELEVANT_STATS, 'eFG%', 'SEASON']
        return pd.DataFrame(columns=expected_columns)
    return pd.concat(all_current_players, ignore_index=True)

def simulate_trade_with_players(team_from_data, team_to_data, traded_players):
    """Swap the named players between two rosters.

    Players of ``team_from_data`` listed in ``traded_players`` move to the
    other roster and vice versa. Moved rows get the TEAM_ID / TEAM_NAME of
    their new team (taken from the first row of the destination roster), so
    grouping the result by team reflects the trade.

    Previously the players leaving ``team_to_data`` were dropped from the
    result entirely, and moved players kept their old team labels.

    Args:
        team_from_data: Per-player rows of the first team ('PLAYER_NAME'
            required; 'TEAM_ID' / 'TEAM_NAME' are relabeled when present).
        team_to_data: Per-player rows of the second team.
        traded_players: Collection of player names involved in the trade.

    Returns:
        Tuple ``(before_trade, after_trade)``; both contain the rows of both
        teams. The input frames are not modified.
    """
    def _move_to(players, destination_roster):
        # Copy so the caller's frames keep their original team labels.
        moved = players.copy()
        if not destination_roster.empty:
            for col in ('TEAM_ID', 'TEAM_NAME'):
                if col in moved.columns and col in destination_roster.columns:
                    moved[col] = destination_roster[col].iloc[0]
        return moved

    def _combine(kept, arrivals):
        # Skip empty pieces so pd.concat never sees an empty frame.
        if arrivals.empty:
            return kept
        if kept.empty:
            return arrivals
        return pd.concat([kept, arrivals])

    before_trade = pd.concat([team_from_data, team_to_data])

    leaving_from = team_from_data['PLAYER_NAME'].isin(traded_players)
    leaving_to = team_to_data['PLAYER_NAME'].isin(traded_players)

    # Move traded players between teams, in both directions
    traded_from = _move_to(team_from_data[leaving_from], team_to_data)
    traded_to = _move_to(team_to_data[leaving_to], team_from_data)

    team_from_after = _combine(team_from_data[~leaving_from], traded_to)
    team_to_after = _combine(team_to_data[~leaving_to], traded_from)

    after_trade = pd.concat([team_from_after, team_to_after])

    return before_trade, after_trade

def analyze_trade_impact(before_trade, after_trade):
    """Return the per-team change in summed player stats caused by a trade.

    Counting stats in ``RELEVANT_STATS`` are summed per 'TEAM_NAME'. eFG% is
    a ratio, so it is recomputed from the summed FGM, FG3M and FGA rather
    than adding up the players' individual percentages, which is what the
    previous version did.

    Args:
        before_trade: Per-player rows before the trade.
        after_trade: Per-player rows after the trade.

    Returns:
        DataFrame with 'TEAM_NAME', one column per stat in
        ``RELEVANT_STATS`` and 'eFG%', holding after-minus-before values.
    """
    def _team_totals(players):
        totals = players.groupby('TEAM_NAME')[RELEVANT_STATS].sum()
        totals['eFG%'] = (totals['FGM'] + 0.5 * totals['FG3M']) / totals['FGA']
        return totals

    # Both frames are indexed by TEAM_NAME, so subtraction aligns by team.
    difference = _team_totals(after_trade).subtract(_team_totals(before_trade))
    trade_impact = difference.reset_index()
    return trade_impact

def main():
    """Run the end-to-end trade analysis demo and print each intermediate result.

    Downloads game data for the configured season range, gathers champion and
    current-season player averages, loads the saved salary predictions,
    simulates an example trade and prints its impact.
    """
    start_season = "2022-23"
    end_season = "2023-24"
    current_season = end_season
    all_seasons_data = analyze_leaguegamefinder_endpoint(start_season, end_season)
    
    # 1. Get champions for every season from start_season to end_season
    #    (the "Past 10 Seasons" wording in the printed headings below does not
    #    match this two-season range)
    champions = get_champions(int(start_season[:4]), int(end_season[:4]))
    
    # 2. Get player-level data for champions
    champions_player_data = get_champions_player_data(champions, start_season, end_season)
    print("Champions Player Data (Past 10 Seasons):")
    print(champions_player_data)
    
    # 3. Get current season player-level data
    current_season_player_data = get_current_season_player_data(all_seasons_data, current_season)
    print("\nCurrent Season Player Data:")
    print(current_season_player_data)
    
    # 4. Load predictions dataframe
    predictions_df = pd.read_csv('../data/processed/predictions_df.csv')
    print("\nPredictions DataFrame (first few rows):")
    print(predictions_df.head())
    
    # 5. Simulate trade
    team_from = "Los Angeles Lakers"
    team_to = "Boston Celtics"
    traded_players = ["Anthony Davis", "Jayson Tatum"]  # Example players
    
    # Rosters are selected by exact TEAM_NAME match
    team_from_data = current_season_player_data[current_season_player_data['TEAM_NAME'] == team_from]
    team_to_data = current_season_player_data[current_season_player_data['TEAM_NAME'] == team_to]
    
    print("\nTeam Data Before Trade:")
    print(pd.concat([team_from_data, team_to_data]))
    
    before_trade, after_trade = simulate_trade_with_players(team_from_data, team_to_data, traded_players)
    
    print("\nTeam Data After Trade:")
    print(after_trade)
    
    # 6. Analyze trade impact
    trade_impact = analyze_trade_impact(before_trade, after_trade)
    print("\nTrade Impact (Difference in Team Stats):")
    print(trade_impact)
    
    # 7. Compare traded players to champions
    traded_player_stats = before_trade[before_trade['PLAYER_NAME'].isin(traded_players)]
    # Mean of the per-season means, so every season carries equal weight
    champion_avg = champions_player_data.groupby('SEASON')[RELEVANT_STATS + ['eFG%']].mean().mean()
    
    print("\nTraded Players vs. Champions Average:")
    for _, player in traded_player_stats.iterrows():
        print(f"\n{player['PLAYER_NAME']}:")
        for stat in RELEVANT_STATS + ['eFG%']:
            diff = player[stat] - champion_avg[stat]
            print(f"{stat}: {player[stat]:.2f} (Diff from Champs Avg: {diff:.2f})")
    
    # 8. Analyze salary based on predictions
    # NOTE(review): assumes predictions_df.csv has 'Player', 'Salary' and
    # 'Predicted_Salary' columns - confirm against the predictor output.
    traded_players_salary = predictions_df[predictions_df['Player'].isin(traded_players)]
    print("\nSalary Analysis for Traded Players:")
    print(traded_players_salary[['Player', 'Salary', 'Predicted_Salary']])

# Run the demo only when executed as a script.
if __name__ == "__main__":
    main()

Fetching data for season 2022-23
Fetching data for season 2023-24
Champions Player Data (Past 10 Seasons):
    PLAYER_ID               PLAYER_NAME     TEAM_ID       TEAM_NAME  \
0      201145                Jeff Green  1610612743  Denver Nuggets   
1      201599            DeAndre Jordan  1610612743  Denver Nuggets   
2      202397                 Ish Smith  1610612743  Denver Nuggets   
3      202704            Reggie Jackson  1610612743  Denver Nuggets   
4      203484  Kentavious Caldwell-Pope  1610612743  Denver Nuggets   
5      203932              Aaron Gordon  1610612743  Denver Nuggets   
6      203999              Nikola Jokic  1610612743  Denver Nuggets   
7     1627750              Jamal Murray  1610612743  Denver Nuggets   
8     1628418             Thomas Bryant  1610612743  Denver Nuggets   
9     1628427             Vlatko Cancar  1610612743  Denver Nuggets   
10    1628432                Davon Reed  1610612743  Denver Nuggets   
11    1628971               Bruce Brown  

  return pd.concat(all_current_players, ignore_index=True)


In [44]:
%%writefile ../src/salary_predict/updated/trade_utils.py


import numpy as np
import pandas as pd
from nba_api.stats.endpoints import leaguegamefinder
import time
from scipy import stats

# Stat names that the team, league and champion summaries in this module are keyed by.
RELEVANT_STATS = ['PPG', 'APG', 'TPG', 'SPG', 'BPG', 'ORPG', 'DRPG', 'eFG%']
# Percentile cut-offs; each value p is reported under the label "Top {100-p}%".
PERCENTILE_THRESHOLDS = [99, 98, 97, 96, 95, 90, 75, 50]

def get_champion(season):
    """Return ``(TEAM_ID, TEAM_NAME)`` for the winner of the season's last playoff game.

    Args:
        season: Season label passed straight to ``LeagueGameFinder``
            (e.g. ``"2022-23"``).

    Returns:
        Tuple of the winning row's ``TEAM_ID`` and ``TEAM_NAME``.
    """
    finder = leaguegamefinder.LeagueGameFinder(
        season_nullable=season,
        season_type_nullable='Playoffs',
    )
    playoff_games = finder.get_data_frames()[0]
    playoff_games['GAME_DATE'] = pd.to_datetime(playoff_games['GAME_DATE'])

    # Presumably the feed has one row per team per game, so the two most
    # recent rows are the two sides of the final game -- TODO confirm.
    final_rows = playoff_games.sort_values('GAME_DATE').tail(2)
    champion_row = final_rows.loc[final_rows['WL'] == 'W'].iloc[0]
    return champion_row['TEAM_ID'], champion_row['TEAM_NAME']

def get_champions(start_year, end_year):
    """Look up the champion of every season from ``start_year`` to ``end_year`` inclusive.

    Returns:
        Dict keyed by season label (``"YYYY-YY"``) whose values hold
        ``'ChampionTeamID'`` and ``'ChampionTeamName'``.
    """
    champions_by_season = {}
    for first_year in range(start_year, end_year + 1):
        # Season label is the starting year plus the last two digits of the next.
        season_label = f"{first_year}-{str(first_year + 1)[-2:]}"
        team_id, team_name = get_champion(season_label)
        champions_by_season[season_label] = dict(
            ChampionTeamID=team_id,
            ChampionTeamName=team_name,
        )
        # Pause between requests so the API is not overwhelmed.
        time.sleep(1)
    return champions_by_season

def calculate_team_stats(team_players, all_players):
    """Summarise each relevant stat for a group of players.

    Args:
        team_players: DataFrame of the players to summarise.
        all_players: DataFrame whose stat columns define the percentile
            cut-offs the group is measured against.

    Returns:
        Dict keyed by stat (only stats present in ``team_players``) with
        ``'mean'``, ``'median'``, ``'max'``, ``'total'`` (the raw values as a
        list) and ``'percentile_counts'`` (how many group values reach each
        cut-off of ``all_players``).
    """
    summary = {}
    for stat in RELEVANT_STATS:
        if stat not in team_players.columns:
            continue

        team_values = team_players[stat].values
        cutoffs = np.percentile(all_players[stat].values, PERCENTILE_THRESHOLDS)
        has_values = len(team_values) > 0

        summary[stat] = {
            # An empty group reports 0 instead of calling numpy on nothing.
            'mean': np.mean(team_values) if has_values else 0,
            'median': np.median(team_values) if has_values else 0,
            'max': np.max(team_values) if has_values else 0,
            'total': team_values.tolist(),
            'percentile_counts': {
                f'Top {100-threshold}%': np.sum(team_values >= cutoff)
                for threshold, cutoff in zip(PERCENTILE_THRESHOLDS, cutoffs)
            },
        }
    return summary

def _empty_champ_stats():
    """Build the all-zero summary returned when there is no champion data."""
    return {
        stat: {
            'mean': 0,
            'median': 0,
            'max': 0,
            'percentile_counts': {f'Top {100-p}%': 0 for p in PERCENTILE_THRESHOLDS},
        }
        for stat in RELEVANT_STATS
    }


def calculate_champ_stats(champions, num_years=10):
    """Summarise the regular-season team stats of the most recent champions.

    Args:
        champions: Mapping of season label (e.g. ``"2022-23"``) to a dict
            holding at least ``'ChampionTeamID'``.
        num_years: How many seasons, counted back from the latest season key,
            to include.

    Returns:
        Dict keyed by stat with ``'mean'``, ``'median'``, ``'max'`` and
        ``'percentile_counts'``. Each percentile count is the share of the
        included champion seasons whose value reaches that percentile of the
        champions' own distribution. When there is no champion data every
        value is 0.
    """
    if not champions:
        # max() below raises ValueError on an empty mapping, which previously
        # made the zero-filled fallback unreachable.
        return _empty_champ_stats()

    current_year = max(int(season.split('-')[0]) for season in champions)
    start_year = current_year - num_years + 1
    recent_champions = {
        season: info
        for season, info in champions.items()
        if int(season.split('-')[0]) >= start_year
    }
    num_champ_seasons = len(recent_champions)

    # Collect one row per season and build the frame once, rather than
    # concatenating onto an initially empty DataFrame inside the loop.
    season_rows = []
    for season, champ_info in recent_champions.items():
        games = leaguegamefinder.LeagueGameFinder(
            season_nullable=season,
            team_id_nullable=champ_info['ChampionTeamID'],
            season_type_nullable='Regular Season'
        ).get_data_frames()[0]

        season_rows.append({
            'PPG': games['PTS'].mean(),
            'APG': games['AST'].mean(),
            'TPG': games['TOV'].mean(),
            'SPG': games['STL'].mean(),
            'BPG': games['BLK'].mean(),
            'ORPG': games['OREB'].mean(),
            'DRPG': games['DREB'].mean(),
            'eFG%': (games['FGM'].sum() + 0.5 * games['FG3M'].sum()) / games['FGA'].sum(),
        })

    champ_stats = pd.DataFrame(season_rows)
    if champ_stats.empty:
        return _empty_champ_stats()

    champ_percentiles = {}
    for stat in RELEVANT_STATS:
        if stat not in champ_stats.columns:
            continue
        values = champ_stats[stat]
        champ_percentiles[stat] = {
            'mean': np.mean(values),
            'median': np.median(values),
            'max': np.max(values),
            'percentile_counts': {
                f'Top {100-p}%': np.sum(values >= np.percentile(values, p)) / num_champ_seasons
                for p in PERCENTILE_THRESHOLDS
            }
        }

    return champ_percentiles

def compare_stats(current_stats, simulated_stats, league_stats, champ_stats):
    """Line up a team's before/after-trade means against league and champion figures.

    Args:
        current_stats: Per-stat summary of the team before the trade.
        simulated_stats: Per-stat summary of the team after the trade.
        league_stats: Per-stat league summary; its ``'total'`` values are the
            distribution used for percentile ranks.
        champ_stats: Per-stat champion summary.

    Returns:
        Dict keyed by stat (only stats found in both ``current_stats`` and
        ``league_stats``) holding the raw means, their percentile ranks, the
        differences versus league and champion means, and the three sets of
        percentile counts.
    """
    report = {}
    for stat in RELEVANT_STATS:
        if stat not in current_stats or stat not in league_stats:
            continue

        before = current_stats[stat]['mean']
        after = simulated_stats[stat]['mean']
        league_mean = league_stats[stat]['mean']
        champ_mean = champ_stats[stat]['mean']

        league_distribution = league_stats[stat]['total']
        before_rank = stats.percentileofscore(league_distribution, before)
        after_rank = stats.percentileofscore(league_distribution, after)

        report[stat] = {
            'Current': before,
            'Current Percentile': before_rank,
            'After Trade': after,
            'After Trade Percentile': after_rank,
            'League Average': league_mean,
            'Champ Average': champ_mean,
            'Current vs League': before - league_mean,
            'After Trade vs League': after - league_mean,
            'Current vs Champ': before - champ_mean,
            'After Trade vs Champ': after - champ_mean,
            'Current Percentile Counts': current_stats[stat]['percentile_counts'],
            'After Trade Percentile Counts': simulated_stats[stat]['percentile_counts'],
            'Champ Percentile Counts': champ_stats[stat]['percentile_counts'],
        }
    return report

def simulate_trade(team_players, players_leaving, players_joining):
    """Return the roster after removing departing players and appending arrivals.

    Args:
        team_players: DataFrame of the current roster with a ``'Player'`` column.
        players_leaving: Names to drop from ``team_players``.
        players_joining: DataFrame of rows to append.

    Returns:
        New DataFrame with a fresh 0..n-1 index; the inputs are not modified.
    """
    is_leaving = team_players['Player'].isin(players_leaving)
    remaining_roster = team_players.loc[~is_leaving].copy()
    return pd.concat([remaining_roster, players_joining], ignore_index=True)

# Team salary at or above which the stricter 110% matching rule is applied.
FIRST_TAX_APRON = 172_346_000

# Upper bounds of the two lower outgoing-salary brackets. They are the points
# where neighbouring formulas meet: 2x + 250k == x + 7.5M at x = 7.25M, and
# x + 7.5M == 1.25x + 250k at x = 29M.
_LOW_BRACKET_MAX = 7_250_000
_MID_BRACKET_MAX = 29_000_000


def check_salary_matching_rules(outgoing_salary, incoming_salary, team_salary_before_trade):
    """Return whether a team may take back ``incoming_salary`` in a trade.

    Args:
        outgoing_salary: Total salary the team sends out.
        incoming_salary: Total salary the team receives.
        team_salary_before_trade: Team payroll before the trade; compared
            with ``FIRST_TAX_APRON`` to pick the rule set.

    Returns:
        True when ``incoming_salary`` does not exceed the allowed maximum.
    """
    if team_salary_before_trade >= FIRST_TAX_APRON:
        max_incoming_salary = 1.10 * outgoing_salary
    elif outgoing_salary <= _LOW_BRACKET_MAX:
        # The cut-off was previously 7,500,000, which let outgoing salaries
        # between 7.25M and 7.5M take back more than outgoing + 7.5M.
        max_incoming_salary = 2 * outgoing_salary + 250_000
    elif outgoing_salary <= _MID_BRACKET_MAX:
        max_incoming_salary = outgoing_salary + 7_500_000
    else:
        max_incoming_salary = 1.25 * outgoing_salary + 250_000

    return incoming_salary <= max_incoming_salary

def analyze_two_team_trade(team1_abbr, team2_abbr, players_leaving_team1, players_leaving_team2, predictions_df, champions):
    """Evaluate a two-team player swap on stats and salary.

    Args:
        team1_abbr, team2_abbr: Values of the ``'Team'`` column identifying
            the two rosters inside ``predictions_df``.
        players_leaving_team1, players_leaving_team2: Player names each team
            sends to the other.
        predictions_df: League-wide player DataFrame (needs ``'Team'``,
            ``'Player'``, ``'Salary'`` and the stat columns).
        champions: Champion mapping forwarded to ``calculate_champ_stats``.

    Returns:
        Dict keyed by the two team abbreviations, each with ``'comparison'``,
        ``'current_salary'``, ``'new_salary'`` and ``'salary_match'``.
        ``None`` when either side's players cannot be found or when any
        exception occurs (the error is printed).
    """
    try:
        roster1 = predictions_df[predictions_df['Team'] == team1_abbr]
        roster2 = predictions_df[predictions_df['Team'] == team2_abbr]

        # What one team sends out is exactly what the other receives.
        arrivals1 = roster2[roster2['Player'].isin(players_leaving_team2)]
        arrivals2 = roster1[roster1['Player'].isin(players_leaving_team1)]

        if arrivals1.empty or arrivals2.empty:
            print("Could not find one or more of the specified players' stats.")
            return None

        stats_before1 = calculate_team_stats(roster1, predictions_df)
        stats_before2 = calculate_team_stats(roster2, predictions_df)

        roster1_after = simulate_trade(roster1, players_leaving_team1, arrivals1)
        roster2_after = simulate_trade(roster2, players_leaving_team2, arrivals2)

        stats_after1 = calculate_team_stats(roster1_after, predictions_df)
        stats_after2 = calculate_team_stats(roster2_after, predictions_df)

        league_stats = calculate_team_stats(predictions_df, predictions_df)
        champ_stats = calculate_champ_stats(champions)

        payroll1_before = roster1['Salary'].sum()
        payroll2_before = roster2['Salary'].sum()
        payroll1_after = roster1_after['Salary'].sum()
        payroll2_after = roster2_after['Salary'].sum()

        outgoing1 = roster1[roster1['Player'].isin(players_leaving_team1)]['Salary'].sum()
        incoming1 = arrivals1['Salary'].sum()
        outgoing2 = roster2[roster2['Player'].isin(players_leaving_team2)]['Salary'].sum()
        incoming2 = arrivals2['Salary'].sum()

        match1 = check_salary_matching_rules(outgoing1, incoming1, payroll1_before)
        match2 = check_salary_matching_rules(outgoing2, incoming2, payroll2_before)

        comparison1 = compare_stats(stats_before1, stats_after1, league_stats, champ_stats)
        comparison2 = compare_stats(stats_before2, stats_after2, league_stats, champ_stats)

        analysis = {}
        analysis[team1_abbr] = {
            'comparison': comparison1,
            'current_salary': payroll1_before,
            'new_salary': payroll1_after,
            'salary_match': match1,
        }
        analysis[team2_abbr] = {
            'comparison': comparison2,
            'current_salary': payroll2_before,
            'new_salary': payroll2_after,
            'salary_match': match2,
        }
        return analysis

    except Exception as e:
        print(f"Error in analyze_two_team_trade: {str(e)}")
        return None

def calculate_league_stats_from_api(start_year, end_year):
    """Summarise per-team regular-season stats across a range of seasons.

    For every season from ``start_year`` to ``end_year`` inclusive, regular
    season games are fetched, aggregated per ``TEAM_ID``, and each team's
    values are pooled per stat.

    Returns:
        Dict keyed by stat with ``'mean'``, ``'median'``, ``'max'``,
        ``'total'`` (the pooled values) and ``'percentile_counts'``.
    """
    mean_columns = ('PTS', 'AST', 'TOV', 'STL', 'BLK', 'OREB', 'DREB')
    sum_columns = ('FGM', 'FG3M', 'FGA')
    aggregations = {column: 'mean' for column in mean_columns}
    aggregations.update({column: 'sum' for column in sum_columns})
    # Source column for each entry of RELEVANT_STATS, paired by position.
    source_columns = mean_columns + ('eFG%',)

    pooled = {stat: [] for stat in RELEVANT_STATS}

    for first_year in range(start_year, end_year + 1):
        season_label = f"{first_year}-{str(first_year + 1)[-2:]}"
        finder = leaguegamefinder.LeagueGameFinder(
            season_nullable=season_label,
            season_type_nullable='Regular Season',
        )
        per_team = finder.get_data_frames()[0].groupby('TEAM_ID').agg(aggregations)
        per_team['eFG%'] = (per_team['FGM'] + 0.5 * per_team['FG3M']) / per_team['FGA']

        for stat, column in zip(RELEVANT_STATS, source_columns):
            pooled[stat].extend(per_team[column].tolist())

        # Pause between requests so the API is not overwhelmed.
        time.sleep(1)

    summary = {}
    for stat in RELEVANT_STATS:
        observed = pooled[stat]
        cutoffs = np.percentile(observed, PERCENTILE_THRESHOLDS)
        summary[stat] = {
            'mean': np.mean(observed),
            'median': np.median(observed),
            'max': np.max(observed),
            'total': observed,
            'percentile_counts': {
                f'Top {100-threshold}%': np.sum(np.array(observed) >= cutoff)
                for threshold, cutoff in zip(PERCENTILE_THRESHOLDS, cutoffs)
            },
        }

    return summary

def identify_overpaid_underpaid(predictions_df):
    """Return the ten most overpaid and ten most underpaid players.

    ``Predicted_Salary`` is multiplied by ``Salary_Cap_Inflated`` before it is
    compared with ``Salary``.

    The work is done on a copy, so the caller's frame is left untouched.
    Rescaling in place meant a second call on the same frame multiplied
    ``Predicted_Salary`` by the cap again.

    Args:
        predictions_df: DataFrame with ``'Salary'``, ``'Predicted_Salary'``
            and ``'Salary_Cap_Inflated'`` columns.

    Returns:
        Tuple ``(overpaid, underpaid)`` of DataFrames with at most ten rows
        each, carrying the added ``'Salary_Difference'``, ``'Overpaid'`` and
        ``'Underpaid'`` columns. ``overpaid`` is sorted by largest positive
        difference first, ``underpaid`` by most negative difference first.
    """
    scored = predictions_df.copy()
    scored['Predicted_Salary'] = scored['Predicted_Salary'] * scored['Salary_Cap_Inflated']

    scored['Salary_Difference'] = scored['Salary'] - scored['Predicted_Salary']
    scored['Overpaid'] = scored['Salary_Difference'] > 0
    scored['Underpaid'] = scored['Salary_Difference'] < 0

    overpaid = scored[scored['Overpaid']].sort_values('Salary_Difference', ascending=False)
    underpaid = scored[scored['Underpaid']].sort_values('Salary_Difference')

    return overpaid.head(10), underpaid.head(10)

# Script entry point: print league and champion summaries, the overpaid /
# underpaid lists, and one example two-team trade analysis.
if __name__ == "__main__":
    predictions_df = pd.read_csv('../data/processed/predictions_df.csv')
    # Keep only the listed columns, in this order.
    predictions_df = predictions_df[['Season', 'Position', 'Age', 'Team', 'TeamID', 'Years of Service', '3P%', '2P%', 'eFG%', 'FT%', 'PER', 'VORP', 'Salary', 'Total_Days_Injured', 'Injury_Risk', 'Salary Cap', 'Salary_Cap_Inflated', 'PPG', 'APG', 'TPG', 'SPG', 'BPG', 'Availability', 'SalaryPct', 'Efficiency', 'ValueOverReplacement', 'ExperienceSquared', 'Days_Injured_Percentage', 'WSPG', 'DWSPG', 'OWSPG', 'PFPG', 'ORPG', 'DRPG', 'RF_Predictions', 'XGB_Predictions', 'Predicted_Salary', 'Player']]
    
    # Seasons starting in start_year .. current_year - 1 are analysed below.
    current_year = 2023
    start_year = current_year - 10
    
    # Calculate league stats from API
    league_stats = calculate_league_stats_from_api(start_year, current_year - 1)
    
    print("League Stats:")
    for stat, values in league_stats.items():
        print(f"\n{stat}:")
        print(f"  Mean: {values['mean']:.2f}")
        print(f"  Median: {values['median']:.2f}")
        print(f"  Max: {values['max']:.2f}")
        print("  Percentile Counts:")
        for percentile, count in values['percentile_counts'].items():
            print(f"    {percentile}: {count}")

    # Champion of each season in the same range, then their summary stats.
    champions = get_champions(start_year, current_year - 1)
    champ_stats = calculate_champ_stats(champions)
    
    print("\nChampion Stats:")
    for stat, values in champ_stats.items():
        print(f"\n{stat}:")
        print(f"  Mean: {values['mean']:.2f}")
        print(f"  Median: {values['median']:.2f}")
        print(f"  Max: {values['max']:.2f}")
        print("  Percentile Counts:")
        for percentile, count in values['percentile_counts'].items():
            # Champion counts are fractions of seasons, hence the 2-decimal format.
            print(f"    {percentile}: {count:.2f}")
    
    # Identify overpaid and underpaid players with corrected Predicted_Salary
    overpaid, underpaid = identify_overpaid_underpaid(predictions_df)
    
    print("\nTop 10 Overpaid Players:")
    print(overpaid[['Player', 'Team', 'Salary', 'Predicted_Salary', 'Salary_Difference']])
    
    print("\nTop 10 Underpaid Players:")
    print(underpaid[['Player', 'Team', 'Salary', 'Predicted_Salary', 'Salary_Difference']])
    
    # Example trade analysis
    team1_abbr = 'LAL'
    team2_abbr = 'BOS'
    players_leaving_team1 = ['Anthony Davis', 'D\'Angelo Russell']
    players_leaving_team2 = ['Jayson Tatum', 'Jaylen Brown']
    
    result = analyze_two_team_trade(team1_abbr, team2_abbr, players_leaving_team1, players_leaving_team2, predictions_df, champions)
    
    # analyze_two_team_trade returns None on failure, so only report on success.
    if result:
        for team_abbr, team_data in result.items():
            print(f"\n{team_abbr} Trade Analysis:")
            print(f"Current Salary: ${team_data['current_salary']:,.2f}")
            print(f"Salary After Trade: ${team_data['new_salary']:,.2f}")
            print(f"Salary Difference: ${team_data['new_salary'] - team_data['current_salary']:,.2f}")
            print(f"Salary Match: {'Yes' if team_data['salary_match'] else 'No'}")
            
            print("\nStat Comparisons:")
            # NOTE(review): indexes every RELEVANT_STATS entry, while
            # compare_stats omits stats missing from its inputs -- a missing
            # stat would raise KeyError here.
            for stat in RELEVANT_STATS:
                values = team_data['comparison'][stat]
                print(f"{stat}:")
                print(f"  Current: {values['Current']:.2f} ({values['Current Percentile']:.1f}%ile)")
                print(f"  After Trade: {values['After Trade']:.2f} ({values['After Trade Percentile']:.1f}%ile)")
                print(f"  Change vs League: {values['After Trade vs League'] - values['Current vs League']:.2f}")
                print(f"  Change vs Champ: {values['After Trade vs Champ'] - values['Current vs Champ']:.2f}")
                print("  Percentile Counts (Current / After Trade / Champ Average):")
                for percentile in PERCENTILE_THRESHOLDS:
                    current_count = values['Current Percentile Counts'][f'Top {100-percentile}%']
                    after_trade_count = values['After Trade Percentile Counts'][f'Top {100-percentile}%']
                    champ_count = values['Champ Percentile Counts'][f'Top {100-percentile}%']
                    print(f"    Top {100-percentile}%: {current_count:.1f} / {after_trade_count:.1f} / {champ_count:.1f}")
                    
                    



Overwriting ../src/salary_predict/updated/trade_utils.py


In [42]:
%%writefile ../src/salary_predict/updated/app.py

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# Import functions from other modules
from data_loader_preprocessor import load_data, format_season, clean_data, engineer_features, encode_data
from model_trainer import train_and_save_models, evaluate_models
from model_predictor import predict
from trade_utils import analyze_two_team_trade, get_champions


def identify_overpaid_underpaid(predictions_df):
    """Return the ten most overpaid and ten most underpaid players.

    ``Predicted_Salary`` is multiplied by ``Salary_Cap_Inflated`` before it is
    compared with ``Salary``.

    The work is done on a copy, so the caller's frame is left untouched.
    Rescaling in place meant a second call on the same frame multiplied
    ``Predicted_Salary`` by the cap again, and it also wrote into filtered
    slices of the caller's data.

    Args:
        predictions_df: DataFrame with ``'Salary'``, ``'Predicted_Salary'``
            and ``'Salary_Cap_Inflated'`` columns.

    Returns:
        Tuple ``(overpaid, underpaid)`` of DataFrames with at most ten rows
        each, carrying the added ``'Salary_Difference'``, ``'Overpaid'`` and
        ``'Underpaid'`` columns. ``overpaid`` is sorted by largest positive
        difference first, ``underpaid`` by most negative difference first.
    """
    scored = predictions_df.copy()
    scored['Predicted_Salary'] = scored['Predicted_Salary'] * scored['Salary_Cap_Inflated']

    scored['Salary_Difference'] = scored['Salary'] - scored['Predicted_Salary']
    scored['Overpaid'] = scored['Salary_Difference'] > 0
    scored['Underpaid'] = scored['Salary_Difference'] < 0

    overpaid = scored[scored['Overpaid']].sort_values('Salary_Difference', ascending=False)
    underpaid = scored[scored['Underpaid']].sort_values('Salary_Difference')

    return overpaid.head(10), underpaid.head(10)


# Utility functions
def load_processed_data(file_path):
    """Load the CSV at ``file_path`` and run it through the preprocessing steps.

    The steps applied, in order, are ``format_season``, ``clean_data`` and
    ``engineer_features``.
    """
    frame = load_data(file_path)
    for step in (format_season, clean_data, engineer_features):
        frame = step(frame)
    return frame

def filter_data_by_season(data, season):
    """Return the rows of ``data`` whose ``'Season'`` equals ``season``."""
    in_season = data['Season'] == season
    return data[in_season]

# Data visualization functions
def plot_feature_distribution(data, feature):
    """Draw a histogram with a KDE overlay for one column and return the figure.

    Args:
        data: DataFrame holding the column.
        feature: Name of the column to plot.
    """
    figure, axis = plt.subplots(figsize=(10, 6))
    column_values = data[feature]
    sns.histplot(column_values, kde=True, ax=axis)

    axis.set_title(f'Distribution of {feature}')
    axis.set_xlabel(feature)
    axis.set_ylabel('Count')
    return figure

def plot_correlation_heatmap(data):
    """Plot the pairwise correlations of the numeric columns and return the figure."""
    correlations = data.select_dtypes(include=[np.number]).corr()

    figure, axis = plt.subplots(figsize=(12, 10))
    sns.heatmap(correlations, annot=False, cmap='coolwarm', ax=axis)
    axis.set_title('Correlation Heatmap')
    return figure

# Model metrics function
def display_model_metrics(y_true, y_pred):
    """Show MSE, RMSE, MAE and R-squared as four side-by-side Streamlit metrics.

    Args:
        y_true: Observed target values.
        y_pred: Model predictions aligned with ``y_true``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    root_squared_error = np.sqrt(squared_error)
    absolute_error = mean_absolute_error(y_true, y_pred)
    r_squared = r2_score(y_true, y_pred)

    st.subheader("Model Performance Metrics")
    labelled_values = (
        ("Mean Squared Error", squared_error),
        ("Root Mean Squared Error", root_squared_error),
        ("Mean Absolute Error", absolute_error),
        ("R-squared", r_squared),
    )
    # One column per metric, each value shown with four decimals.
    for column, (label, value) in zip(st.columns(4), labelled_values):
        column.metric(label, f"{value:.4f}")

# Trade impact display function
def display_trade_impact(result, team1, team2):
    """Render the salary metrics and stat tables of a trade analysis for both teams.

    Args:
        result: Dict keyed by team abbreviation; each value holds
            ``'current_salary'``, ``'new_salary'`` and ``'comparison'``.
        team1, team2: Keys of ``result`` to display, in this order.
    """
    percentile_labels = [f"Top {100-p}%" for p in (99, 98, 97, 96, 95, 90, 75, 50)]

    for abbr in (team1, team2):
        st.subheader(f"{abbr} Trade Impact")

        impact = result[abbr]

        before_col, after_col, delta_col = st.columns(3)
        before_col.metric("Current Salary", f"${impact['current_salary']:,.2f}")
        after_col.metric("Salary After Trade", f"${impact['new_salary']:,.2f}")
        delta_col.metric("Salary Difference", f"${impact['new_salary'] - impact['current_salary']:,.2f}")

        st.subheader("Stat Comparisons")

        # One table row per stat with the headline before/after figures.
        comparison_rows = [
            {
                'Stat': stat,
                'Current': f"{figures['Current']:.2f} ({figures['Current Percentile']:.1f}%ile)",
                'After Trade': f"{figures['After Trade']:.2f} ({figures['After Trade Percentile']:.1f}%ile)",
                'Champion Average': f"{figures['Champ Average']:.2f}",
                'League Average': f"{figures['League Average']:.2f}",
                'Change vs League': f"{figures['After Trade vs League'] - figures['Current vs League']:.2f}",
                'Change vs Champ': f"{figures['After Trade vs Champ'] - figures['Current vs Champ']:.2f}",
            }
            for stat, figures in impact['comparison'].items()
        ]
        st.table(pd.DataFrame(comparison_rows))

        st.subheader("Percentile Counts")
        percentile_rows = []
        for stat, figures in impact['comparison'].items():
            row = {'Stat': stat}
            for label in percentile_labels:
                row[f"Current {label}"] = figures['Current Percentile Counts'][label]
                row[f"After Trade {label}"] = figures['After Trade Percentile Counts'][label]
                row[f"Champion {label}"] = figures['Champ Percentile Counts'][label]
            percentile_rows.append(row)

        st.table(pd.DataFrame(percentile_rows))

        st.markdown("---")

def display_overpaid_underpaid(predictions_df):
    """Show the top-10 overpaid and underpaid players with optional team/position filters.

    Args:
        predictions_df: DataFrame with ``'Team'``, ``'Position'`` and the
            columns ``identify_overpaid_underpaid`` needs.
    """
    st.subheader("Top 10 Overpaid and Underpaid Players")

    # Add filters
    team_col, position_col = st.columns(2)
    with team_col:
        team_filter = st.multiselect("Filter by Team", options=sorted(predictions_df['Team'].unique()))
    with position_col:
        position_filter = st.multiselect("Filter by Position", options=sorted(predictions_df['Position'].unique()))

    # An empty selection means "no restriction" for that filter.
    selection = predictions_df
    if team_filter:
        selection = selection[selection['Team'].isin(team_filter)]
    if position_filter:
        selection = selection[selection['Position'].isin(position_filter)]

    overpaid, underpaid = identify_overpaid_underpaid(selection)

    shown_columns = ['Player', 'Team', 'Position', 'Salary', 'Predicted_Salary', 'Salary_Difference']
    panels = (
        ("Top 10 Overpaid Players", overpaid),
        ("Top 10 Underpaid Players", underpaid),
    )
    for column, (heading, table) in zip(st.columns(2), panels):
        with column:
            st.subheader(heading)
            st.dataframe(table[shown_columns])


# Main Streamlit app
def main():
    """Render the Streamlit app.

    Loads the processed data, the stored predictions and the saved models,
    lets the user pick a season, then draws one of four pages chosen in the
    sidebar: Data Analysis, Model Results, Salary Evaluation or Trade Analysis.
    """
    st.set_page_config(page_title="NBA Salary Prediction and Trade Analysis", layout="wide")
    st.title("NBA Salary Prediction and Trade Analysis")

    # Sidebar navigation
    st.sidebar.title("Navigation")
    page = st.sidebar.radio("Go to", ["Data Analysis", "Model Results", "Salary Evaluation", "Trade Analysis"])

    # Load base data
    data = load_processed_data('data/processed/nba_player_data_final_inflated.csv')

    # Load existing predictions for 2023
    initial_predictions_df = pd.read_csv('data/processed/predictions_df.csv')

    # Season selection
    seasons = sorted(data['Season'].unique(), reverse=True)
    selected_season = st.selectbox("Select Season", seasons)

    # Load models at the beginning of main()
    # NOTE(review): this runs on every call of main(); confirm whether the
    # loaded models should be cached between runs.
    model_save_path = 'data/models'
    rf_model = joblib.load(f"{model_save_path}/best_rf_model.pkl")
    xgb_model = joblib.load(f"{model_save_path}/best_xgb_model.pkl")

    # Use initial predictions if 2023 is selected, otherwise retrain
    if selected_season == 2023:
        predictions_df = initial_predictions_df
    else:
        # Train model and make predictions
        # Train on all earlier seasons and predict the selected one.
        train_data = data[data['Season'] < selected_season]
        test_data = data[data['Season'] == selected_season]

        # Prepare the data for training
        X_train = train_data.drop(['SalaryPct', 'Salary', 'Player'], axis=1)
        y_train = train_data['SalaryPct']

        # Encode the training data
        X_train_encoded, _, encoders, scaler, numeric_cols, player_encoder = encode_data(X_train)

        # Train and save models
        # NOTE(review): models are saved to model_save_path, but rf_model and
        # xgb_model above were loaded before this retraining -- confirm that
        # the Model Results page should show the earlier models.
        train_and_save_models(X_train_encoded, y_train, model_save_path, scaler, X_train_encoded.columns, encoders, player_encoder, numeric_cols)

        # Make predictions on the test data
        predictions_df = predict(test_data, model_save_path)



    if page == "Data Analysis":
        st.header("Data Analysis")

        # Filter data by selected season
        season_data = filter_data_by_season(data, selected_season)

        # Display basic statistics
        st.subheader("Basic Statistics")
        st.write(season_data.describe())

        # Feature distribution
        st.subheader("Feature Distribution")
        feature = st.selectbox("Select Feature", season_data.columns)
        fig = plot_feature_distribution(season_data, feature)
        st.pyplot(fig)

        # Correlation heatmap
        st.subheader("Correlation Heatmap")
        fig = plot_correlation_heatmap(season_data)
        st.pyplot(fig)

        # Data handling explanation
        st.subheader("Data Handling")
        st.write("""
        We preprocessed the data to ensure it's suitable for our models:
        1. Cleaned missing values and outliers
        2. Engineered new features like PPG, APG, etc.
        3. Encoded categorical variables (Position, Team, Injury Risk)
        4. Scaled numerical features
        """)

    elif page == "Model Results":
        st.header("Model Results")

        # Model selection
        model_choice = st.selectbox("Select Model", ["Random Forest", "XGBoost"])

        # Pair the chosen model with its prediction column in predictions_df.
        if model_choice == "Random Forest":
            model = rf_model
            y_pred = predictions_df['RF_Predictions']
        else:
            model = xgb_model
            y_pred = predictions_df['XGB_Predictions']

        # Display model metrics
        display_model_metrics(predictions_df['SalaryPct'], y_pred)

        # Feature importance
        st.subheader("Feature Importance")
        feature_importance = pd.DataFrame({
            'feature': model.feature_names_in_,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)
        st.bar_chart(feature_importance.set_index('feature'))


        # Model explanation
        st.subheader("Model Explanation")
        st.write(f"""
        The {model_choice} model was trained on historical NBA player data to predict salary percentages.
        We used the following techniques to improve model performance:
        1. Feature engineering to create relevant statistics
        2. Proper encoding of categorical variables
        3. Scaling of numerical features
        4. Hyperparameter tuning using GridSearchCV
        """)
        
    elif page == "Salary Evaluation":
        st.header("Salary Evaluation")
        display_overpaid_underpaid(predictions_df)

    elif page == "Trade Analysis":
        st.header("Trade Analysis")
        st.write("""
        Analyze potential trades and their impact on team statistics and salary cap.
        For more information on trade rules, visit: [NBA Trade Rules](https://www.hoopsrumors.com/2023/09/salary-matching-rules-for-trades-during-2023-24-season.html)
        """)

        # Team selection
        teams = sorted(predictions_df['Team'].unique())
        col1, col2 = st.columns(2)
        with col1:
            team1 = st.selectbox("Select Team 1", teams)
        with col2:
            # NOTE(review): index=1 presumably requires at least two teams in
            # predictions_df -- confirm behaviour with a single team.
            team2 = st.selectbox("Select Team 2", teams, index=1)

        # Player selection
        team1_players = predictions_df[predictions_df['Team'] == team1]['Player'].tolist()
        team2_players = predictions_df[predictions_df['Team'] == team2]['Player'].tolist()

        col1, col2 = st.columns(2)
        with col1:
            players_leaving_team1 = st.multiselect(f"Select players leaving {team1}", team1_players)
        with col2:
            players_leaving_team2 = st.multiselect(f"Select players leaving {team2}", team2_players)

        if st.button("Analyze Trade"):
            # Champions of the ten seasons before the selected one.
            champions = get_champions(selected_season - 10, selected_season - 1)
            result = analyze_two_team_trade(team1, team2, players_leaving_team1, players_leaving_team2, predictions_df, champions)
            
            if result:
                display_trade_impact(result, team1, team2)
            else:
                st.error("Trade analysis failed. Please check your selections.")

        # Trade analysis explanation
        st.subheader("Trade Analysis Explanation")
        st.write("""
        Our trade analysis compares team statistics before and after the proposed trade.
        We consider:
        1. Changes in key performance metrics (PPG, RPG, APG, etc.)
        2. Salary implications and cap space impact
        3. Comparison to league averages and recent championship teams
        4. Distribution of top performers in various statistical categories
        5. Overpaid/Underpaid player analysis
        """)

# Start the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()

Overwriting ../src/salary_predict/updated/app.py
