In [8]:
%%writefile ../src/salary_predict/updated/data_loader_preprocessor.py

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns

def load_data(file_path):
    """Read a CSV into a DataFrame and print its shape.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts (path or buffer).

    Returns:
        pd.DataFrame: The parsed table.
    """
    frame = pd.read_csv(file_path)
    print("Data loaded. Shape:", frame.shape)
    return frame

def format_season(data):
    """Replace the 'Season' labels with the integer before the first '-'.

    For example '2022-23' becomes 2022. The frame is modified in place and
    also returned.

    Args:
        data (pd.DataFrame): Frame with a string 'Season' column.

    Returns:
        pd.DataFrame: The same object, with 'Season' converted.
    """
    def _start_year(label):
        # Keep only the part in front of the first dash.
        return int(label.split('-')[0])

    data['Season'] = data['Season'].map(_start_year)
    print("Seasons in data:", data['Season'].unique())
    return data

def clean_data(data):
    """Return a cleaned copy of ``data``; the input frame is left untouched.

    Steps, in order:
      1. drop the columns listed in ``unwanted`` (missing ones are ignored),
      2. fill NaNs in the percentage columns that exist with the column mean,
      3. drop every row that still contains a NaN.

    Args:
        data (pd.DataFrame): Raw player table.

    Returns:
        pd.DataFrame: The cleaned table.
    """
    unwanted = ['Injury_Periods', '2nd Apron', 'Wins', 'Losses']
    # drop() without inplace returns a new frame, so the caller's data is safe.
    cleaned = data.drop(columns=unwanted, errors='ignore')

    # Mean-impute only the percentage columns that are actually present.
    present_pct_cols = [c for c in ('3P%', '2P%', 'FT%', 'TS%') if c in cleaned.columns]
    for pct_col in present_pct_cols:
        column_mean = cleaned[pct_col].mean()
        cleaned[pct_col] = cleaned[pct_col].fillna(column_mean)

    # Anything still missing removes the whole row.
    cleaned = cleaned.dropna()
    print("Data cleaned. Remaining shape:", cleaned.shape)
    return cleaned

def engineer_features(data):
    """Add per-game and derived features to ``data`` and drop the raw inputs.

    The frame is modified in place and also returned.

    Per-game columns are named explicitly (PPG, APG, RPG, SPG, BPG, TOPG).
    Building the name from the first letter of the stat made both TRB and
    TOV write to 'TPG', so rebounds per game were silently overwritten by
    turnovers per game and the 'RPG' / 'TOPG' columns never existed.

    Args:
        data (pd.DataFrame): Must contain every column read below
            (counting stats, 'GP', 'Salary', 'Salary_Cap_Inflated', 'VORP',
            'Years of Service', 'Total_Days_Injured', 'WS', 'DWS', 'OWS',
            'PF', 'ORB', 'DRB', 'FGA', 'FTA').

    Returns:
        pd.DataFrame: The same object with the new columns added and the
        columns in ``columns_to_drop`` removed (absent ones are ignored).
    """
    # Explicit stat -> feature-name mapping; avoids first-letter collisions.
    per_game_names = {
        'PTS': 'PPG',
        'AST': 'APG',
        'TRB': 'RPG',
        'STL': 'SPG',
        'BLK': 'BPG',
        'TOV': 'TOPG',
    }
    for stat, feature in per_game_names.items():
        data[feature] = data[stat] / data['GP']

    # Derived features.
    data['Availability'] = data['GP'] / 82
    data['SalaryPct'] = data['Salary'] / data['Salary_Cap_Inflated']
    # "+ 1" keeps the denominators non-zero.
    data['Efficiency'] = (
        (data['PTS'] + data['TRB'] + data['AST'] + data['STL'] + data['BLK'])
        / (data['FGA'] + data['FTA'] + data['TOV'] + 1)
    )
    data['ValueOverReplacement'] = data['VORP'] / (data['Salary'] + 1)
    data['ExperienceSquared'] = data['Years of Service'] ** 2
    data['Days_Injured_Percentage'] = data['Total_Days_Injured'] / data['GP']
    data['WSPG'] = data['WS'] / data['GP']
    data['DWSPG'] = data['DWS'] / data['GP']
    data['OWSPG'] = data['OWS'] / data['GP']
    data['PFPG'] = data['PF'] / data['GP']
    data['ORPG'] = data['ORB'] / data['GP']
    data['DRPG'] = data['DRB'] / data['GP']

    # Drop columns used in feature creation or deemed less relevant.
    columns_to_drop = ['GP', '2PA', 'OBPM', 'BPM', 'DBPM', '2P', 'GS', 'PTS', 'AST', 'TRB', 'STL', 'BLK',
                       'TOV', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '2P', '2PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB',
                       'TS%', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'Luxury Tax', '1st Apron', 'BAE',
                       'Standard /Non-Taxpayer', 'Taxpayer', 'Team Room /Under Cap', 'WS', 'DWS', 'WS/48', 'PF', 'OWS', 'Injured']
    data.drop(columns_to_drop, axis=1, errors='ignore', inplace=True)
    print("New features added.")
    return data

def encode_injury_risk(data):
    """Map the 'Injury_Risk' labels to ordinal codes, in place.

    'Low' -> 0, 'Medium' -> 1, 'High' -> 2. Any label outside the mapping
    (including missing values) becomes 1, i.e. 'Medium'.

    Args:
        data (pd.DataFrame): Frame with an 'Injury_Risk' column.

    Returns:
        tuple: ``(data, risk_mapping)`` - the same frame and the mapping used.
    """
    risk_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
    codes = data['Injury_Risk'].map(risk_mapping)
    # Unmapped labels come back as NaN; default them to Medium.
    data['Injury_Risk'] = codes.fillna(1)
    return data, risk_mapping

def encode_categorical(data, columns):
    """One-hot encode each column in ``columns`` with its own fitted encoder.

    For every listed column a new ``OneHotEncoder`` is fit, the original
    column is removed, and the dummy columns are appended on the right.

    Args:
        data (pd.DataFrame): Input frame (not modified; a new frame is
            built as soon as one column is encoded).
        columns (list[str]): Names of the columns to encode.

    Returns:
        tuple: ``(encoded_frame, encoders)`` where ``encoders`` maps each
        column name to its fitted encoder.
    """
    fitted = {}
    for column in columns:
        one_hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        dummy_values = one_hot.fit_transform(data[[column]])
        dummy_frame = pd.DataFrame(
            dummy_values,
            columns=one_hot.get_feature_names_out([column]),
            index=data.index,
        )
        without_original = data.drop(column, axis=1)
        data = pd.concat([without_original, dummy_frame], axis=1)
        fitted[column] = one_hot
    return data, fitted


def encode_data(data, encoders=None, player_encoder=None, scaler=None):
    """Encode and scale a feature frame for modelling.

    Steps: ordinal-encode 'Injury_Risk', label-encode 'Player' (if present),
    one-hot encode 'Position' and 'Team', then standardise the numeric
    columns (everything int64/float64 except 'Season', 'Injury_Risk' and
    'Player_Encoded').

    The input frame is copied first, so the caller's data is not modified.

    Args:
        data (pd.DataFrame): Features; must contain 'Injury_Risk',
            'Position' and 'Team'.
        encoders (dict | None): Fitted one-hot encoders keyed by column
            name. When None, new encoders are fit on ``data``.
        player_encoder: Fitted label encoder for 'Player'. When None and a
            'Player' column exists, a new ``LabelEncoder`` is fit.
        scaler: Fitted scaler to apply with ``transform``. When None
            (the default, and the previous behaviour) a new
            ``StandardScaler`` is fit on ``data``. Pass the training scaler
            at inference time so new data is scaled with training statistics.

    Returns:
        tuple: ``(encoded_frame, risk_mapping, encoders, scaler,
        numeric_cols, player_encoder)``.
    """
    print("Columns before encoding:", data.columns)

    # Work on a copy so the column assignments below never touch the
    # caller's frame (which is often a slice of a larger frame).
    data = data.copy()

    # Encode Injury_Risk; unknown labels default to Medium (1).
    risk_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
    data['Injury_Risk'] = data['Injury_Risk'].map(risk_mapping).fillna(1)

    # Encode the Player column if it is present, then drop the original.
    if 'Player' in data.columns:
        if player_encoder is None:
            player_encoder = LabelEncoder()
            data['Player_Encoded'] = player_encoder.fit_transform(data['Player'])
        else:
            data['Player_Encoded'] = player_encoder.transform(data['Player'])
        data = data.drop(columns='Player')

    # Numeric columns are captured BEFORE one-hot encoding so the dummy
    # columns are never scaled.
    initial_numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # One-hot encode the categorical variables (Season is left as-is).
    categorical_cols = ['Position', 'Team']
    fit_new_encoders = encoders is None
    if fit_new_encoders:
        encoders = {}
    for col in categorical_cols:
        if fit_new_encoders:
            encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            encoded = encoder.fit_transform(data[[col]])
            encoders[col] = encoder
        else:
            encoder = encoders[col]
            encoded = encoder.transform(data[[col]])
        encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out([col]), index=data.index)
        data = pd.concat([data.drop(col, axis=1), encoded_df], axis=1)

    # Columns to standardise: numeric, minus identifiers / ordinal codes.
    numeric_cols = [col for col in initial_numeric_cols if col not in ['Season', 'Injury_Risk', 'Player_Encoded']]

    if scaler is None:
        # Training path: fit a fresh scaler on this data.
        scaler = StandardScaler()
        data[numeric_cols] = scaler.fit_transform(data[numeric_cols])
    else:
        # Inference path: reuse the statistics of the supplied scaler.
        data[numeric_cols] = scaler.transform(data[numeric_cols])

    print("Encoded data shape:", data.shape)
    print("Columns after encoding:", data.columns)

    return data, risk_mapping, encoders, scaler, numeric_cols, player_encoder



def scale_features(data, numeric_cols):
    """Standardise ``numeric_cols`` of ``data`` in place with a new scaler.

    Args:
        data (pd.DataFrame): Frame whose columns are overwritten.
        numeric_cols (list[str]): Columns to standardise.

    Returns:
        tuple: ``(data, fitted_scaler)``.
    """
    fitted_scaler = StandardScaler()
    standardised = fitted_scaler.fit_transform(data[numeric_cols])
    data[numeric_cols] = standardised
    return data, fitted_scaler

def decode_data(encoded_data, injury_risk_mapping, encoders, scaler, numeric_cols, player_encoder):
    """Undo the encoding steps on a copy of ``encoded_data``.

    Reverses, in order: the injury-risk codes, the player label encoding
    (when a 'Player_Encoded' column exists), each one-hot encoded column
    (dummy columns are found by the ``"<name>_"`` prefix), and finally the
    scaling of ``numeric_cols``.

    Args:
        encoded_data (pd.DataFrame): Encoded frame; not modified.
        injury_risk_mapping (dict): Label -> code mapping to invert.
        encoders (dict): Column name -> fitted one-hot encoder.
        scaler: Fitted scaler exposing ``inverse_transform``.
        numeric_cols (list[str]): Columns that were scaled.
        player_encoder: Fitted label encoder for players.

    Returns:
        pd.DataFrame: The decoded copy.
    """
    decoded = encoded_data.copy()

    # Injury risk: invert the label -> code dictionary.
    label_by_code = {code: label for label, code in injury_risk_mapping.items()}
    decoded['Injury_Risk'] = decoded['Injury_Risk'].map(label_by_code)

    # Player names.
    if 'Player_Encoded' in decoded.columns:
        decoded['Player'] = player_encoder.inverse_transform(decoded['Player_Encoded'])
        decoded = decoded.drop(columns='Player_Encoded')

    # One-hot encoded categoricals.
    for name, encoder in encoders.items():
        prefix = f"{name}_"
        dummy_cols = [c for c in decoded.columns if c.startswith(prefix)]
        labels = encoder.inverse_transform(decoded[dummy_cols])
        decoded[name] = labels.ravel()  # (n, 1) array -> flat
        decoded = decoded.drop(columns=dummy_cols)

    # Scaled numeric features.
    decoded[numeric_cols] = scaler.inverse_transform(decoded[numeric_cols])

    return decoded

def select_top_features(X, y, k=10):
    """Return the names of the ``k`` best columns of ``X`` by f_regression score.

    Args:
        X (pd.DataFrame): Feature matrix.
        y: Target values.
        k (int): Number of features to keep.

    Returns:
        list[str]: Selected column names, in the column order of ``X``.
    """
    kbest = SelectKBest(score_func=f_regression, k=k).fit(X, y)
    keep_mask = kbest.get_support()
    chosen = X.columns[keep_mask].tolist()
    print(f"Top {k} features:", chosen)
    return chosen

def calculate_tree_feature_importance(X, y):
    """Fit a random forest and rank the columns of ``X`` by importance.

    Also shows a bar plot of the 20 most important features.

    Args:
        X (pd.DataFrame): Feature matrix.
        y: Target values.

    Returns:
        pd.DataFrame: Columns 'Feature' and 'Importance', sorted by
        importance in descending order.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    forest.fit(X, y)

    ranking = pd.DataFrame({
        'Feature': X.columns,
        'Importance': forest.feature_importances_,
    })
    ranking = ranking.sort_values(by='Importance', ascending=False)

    # Plot only the 20 highest-ranked features.
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=ranking.head(20))
    plt.title('Top 20 Feature Importances from Random Forest')
    plt.show()

    return ranking

# Script entry point: runs the preprocessing pipeline end to end on the
# processed CSV, prints diagnostics, then round-trips the encoding.
if __name__ == "__main__":
    file_path = '../data/processed/nba_player_data_final_inflated.csv'
    data = load_data(file_path)
    data = format_season(data)
    data = clean_data(data)
    data = engineer_features(data)

    # Separate features and target. 'Salary' is dropped as well because the
    # target 'SalaryPct' is computed from it in engineer_features.
    # NOTE(review): 'ValueOverReplacement' is also derived from 'Salary' and
    # stays in X - confirm this is not leaking the target.
    X = data.drop(['SalaryPct', 'Salary'], axis=1)
    y = data['SalaryPct']

    # Encode data (fits new encoders because none are passed in)
    encoded_data, injury_risk_mapping, encoders, scaler, numeric_cols, player_encoder = encode_data(X)

    print("\nInjury Risk Mapping:", injury_risk_mapping)
    print("Encoded Injury Risk range:", encoded_data['Injury_Risk'].min(), "-", encoded_data['Injury_Risk'].max())
    print("\nNumeric columns for scaling:", numeric_cols)

    # Calculate feature importance (also opens a matplotlib window)
    feature_importances = calculate_tree_feature_importance(encoded_data, y)
    print("\nTree-based feature importances:")
    print(feature_importances.head(20))

    # Select top features
    top_features = select_top_features(encoded_data, y)
    print("\nTop features selected using statistical methods:", top_features)

    # Decoding example: invert the encoding with the artifacts returned above
    print("\nDecoding Example:")
    decoded_data = decode_data(encoded_data, injury_risk_mapping, encoders, scaler, numeric_cols, player_encoder)
    
    print("\nFirst few rows of decoded data:")
    # NOTE(review): if top_features contains 'Season' the selection below
    # repeats that column.
    print(decoded_data[['Player', 'Injury_Risk', 'Position', 'Team', 'Season'] + top_features].head())

    print("\nData types after decoding:")
    print(decoded_data.dtypes)

    print("\nData preprocessing completed. Ready for model training.")


Overwriting ../src/salary_predict/updated/data_loader_preprocessor.py


In [9]:
%%writefile ../src/salary_predict/updated/model_trainer.py
import joblib
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
import numpy as np

def inspect_data_types(X):
    """Print the dtype of every column and list the object-dtype columns.

    Args:
        X (pd.DataFrame): Feature frame to inspect.

    Returns:
        None. Output goes to stdout only.
    """
    print("Data types of features:")
    print(X.dtypes)

    object_columns = X.select_dtypes(include=['object']).columns
    if object_columns.empty:
        print("No columns with object data types.")
        return
    print("Columns with object data types:", object_columns.tolist())

def perform_grid_search(model, param_grid, X_train, y_train):
    """Run a 5-fold grid search and return the best estimator found.

    Candidates are scored with negative mean squared error; the best score
    is printed with its sign flipped (i.e. as an MSE).

    Args:
        model: Estimator to tune.
        param_grid (dict): Parameter name -> list of candidate values.
        X_train: Training features.
        y_train: Training target.

    Returns:
        The ``best_estimator_`` of the fitted search.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    model_name = model.__class__.__name__
    print(f"Best parameters for {model_name}: {search.best_params_}")
    print(f"Best score for {model_name}: {-search.best_score_}")
    return search.best_estimator_

def train_and_save_models(X_train, y_train, model_save_path, scaler, feature_names, encoders, player_encoder, numeric_cols, injury_risk_mapping=None):
    """Tune a random forest and an XGBoost regressor and persist all artifacts.

    Args:
        X_train: Encoded training features.
        y_train: Training target.
        model_save_path (str): Directory for the ``.pkl`` files; created if
            it does not exist.
        scaler: Scaler object to persist (it is re-fit on ``X_train`` here,
            see the note in the body).
        feature_names: Column names/order the models were trained with.
        encoders (dict): Fitted one-hot encoders.
        player_encoder: Fitted player label encoder (may be None).
        numeric_cols (list[str]): Names of the scaled numeric columns.
        injury_risk_mapping (dict | None): Injury-risk label -> code mapping
            to persist. When omitted, a module-level ``injury_risk_mapping``
            is used if one exists (the behaviour the script relied on).

    Raises:
        ValueError: If no injury-risk mapping is passed and none exists at
            module level. Previously this surfaced as a NameError only after
            training had finished and part of the artifacts were written.
    """
    import os  # local import: this module's header does not import os

    if injury_risk_mapping is None:
        # Backward compatibility: the function used to read this name
        # straight from module scope.
        injury_risk_mapping = globals().get('injury_risk_mapping')
    if injury_risk_mapping is None:
        raise ValueError("injury_risk_mapping must be provided to train_and_save_models")

    # Inspect data types before training
    inspect_data_types(X_train)

    # Initialize models with default parameters
    rf_model = RandomForestRegressor(random_state=42)
    xgb_model = xgb.XGBRegressor(random_state=42, enable_categorical=True)

    # Define parameter grids for grid search
    rf_param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    }
    xgb_param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2]
    }

    # perform_grid_search returns GridSearchCV.best_estimator_, which the
    # search has already refit on all of X_train (refit defaults to True),
    # so the estimators are not fitted a second time here.
    best_rf_model = perform_grid_search(rf_model, rf_param_grid, X_train, y_train)
    best_xgb_model = perform_grid_search(xgb_model, xgb_param_grid, X_train, y_train)

    # NOTE(review): this re-fits the caller's scaler on the already-encoded
    # training matrix (all columns), replacing whatever fit it arrived with.
    # Kept so the persisted scaler.pkl is unchanged; confirm whether the
    # scaler fitted on the raw numeric columns should be saved instead.
    scaler.fit(X_train)

    # Save models, scaler, feature names, encoders, and other artifacts
    os.makedirs(model_save_path, exist_ok=True)
    artifacts = {
        'best_rf_model': best_rf_model,
        'best_xgb_model': best_xgb_model,
        'scaler': scaler,
        'feature_names': feature_names,
        'encoders': encoders,
        'injury_risk_mapping': injury_risk_mapping,
        'numeric_cols': numeric_cols,
        'player_encoder': player_encoder,
    }
    for artifact_name, artifact in artifacts.items():
        joblib.dump(artifact, f"{model_save_path}/{artifact_name}.pkl")
    print("Models, scaler, feature names, encoders, and other artifacts trained and saved successfully.")

def evaluate_models(X_test, y_test, model_save_path):
    """Load the saved models and print MSE, RMSE, MAE and R-squared for each.

    Args:
        X_test: Encoded test features.
        y_test: True target values.
        model_save_path (str): Directory containing ``best_rf_model.pkl``
            and ``best_xgb_model.pkl``.

    Returns:
        None. Results are printed.
    """
    rf_model = joblib.load(f"{model_save_path}/best_rf_model.pkl")
    xgb_model = joblib.load(f"{model_save_path}/best_xgb_model.pkl")

    # Both predictions are computed before anything is printed.
    predictions_by_model = {
        'Random Forest': rf_model.predict(X_test),
        'XGBoost': xgb_model.predict(X_test),
    }

    for model_name, predicted in predictions_by_model.items():
        mse = mean_squared_error(y_test, predicted)
        report = [
            ("MSE", mse),
            ("RMSE", np.sqrt(mse)),
            ("MAE", mean_absolute_error(y_test, predicted)),
            ("R-squared", r2_score(y_test, predicted)),
        ]

        print(f"\n{model_name} Evaluation:")
        for label, value in report:
            print(f"{label}: {value}")
        
def filter_seasons(data, predict_season):
    """
    Split the dataset into the target season and the seasons before it.

    Args:
        data (pd.DataFrame): The dataset; must contain a numeric 'Season' column.
        predict_season (int): The season that you want to predict.

    Returns:
        tuple: ``(target_season_data, prior_seasons_data)`` - note the order:
            - target_season_data: Rows where Season == predict_season.
            - prior_seasons_data: Rows where Season < predict_season.
        Seasons after ``predict_season`` appear in neither frame.
    """
    season = data['Season']
    # Separate data into prior seasons and the target season
    prior_seasons_data = data[season < predict_season]
    target_season_data = data[season == predict_season]

    print(f"Data filtered. Prior seasons shape: {prior_seasons_data.shape}, Target season shape: {target_season_data.shape}")

    # The target season comes first; callers unpack in this order.
    return target_season_data, prior_seasons_data

# Data preprocessing
def load_and_preprocess_data(file_path, predict_season):
    """Load the CSV and return the engineered data of all prior seasons.

    Pipeline: load -> format seasons -> keep seasons before
    ``predict_season`` -> clean -> engineer features.

    Args:
        file_path (str): Path of the processed CSV.
        predict_season (int): Season to hold out; only earlier seasons are kept.

    Returns:
        pd.DataFrame: Cleaned, feature-engineered rows of the prior seasons.
    """
    all_seasons = format_season(load_data(file_path))
    _, prior_seasons = filter_seasons(all_seasons, predict_season)
    return engineer_features(clean_data(prior_seasons))

# Feature selection
def select_features(data, target_column, additional_features=None):
    """Pick the modelling features and the target from ``data``.

    The feature list is a fixed set of performance columns, plus
    'Injury_Risk', 'Position' and 'Team' (needed by the encoding step), plus
    any ``additional_features``. Names that are not columns of ``data`` are
    skipped and duplicates are removed (first occurrence wins).

    Args:
        data (pd.DataFrame): Preprocessed player data.
        target_column (str): Name of the target column.
        additional_features (iterable[str] | None): Extra column names to
            include. Defaults to none.

    Returns:
        tuple: ``(X, y)``. ``X`` is an independent copy, so later in-place
        encoding does not write into a slice of ``data``.
    """
    top_features = ['PPG', 'APG', 'RPG', 'SPG', 'TOPG', 'Years of Service', 'PER', 'VORP', 'WSPG', 'OWSPG']

    # 'Injury_Risk', 'Position', and 'Team' must be present for encoding.
    encoding_inputs = ['Injury_Risk', 'Position', 'Team']

    requested = top_features + encoding_inputs + list(additional_features or [])

    # dict.fromkeys de-duplicates while keeping the original order, so a
    # repeated name cannot produce duplicate columns in X.
    available_features = [col for col in dict.fromkeys(requested) if col in data.columns]

    print("Available features for modeling:", available_features)  # Debug statement

    X = data[available_features].copy()
    y = data[target_column]
    return X, y

# Main execution
# Script entry point: trains both models on every season before
# `predict_season` and evaluates them on a 20% hold-out split.
if __name__ == "__main__":
    file_path = '../data/processed/nba_player_data_final_inflated.csv'
    predict_season = 2023
    target_column = 'SalaryPct'

    # Load and preprocess data
    preprocessed_data = load_and_preprocess_data(file_path, predict_season)
    print("Columns after preprocessing:", preprocessed_data.columns)

    # Select features
    X, y = select_features(preprocessed_data, target_column)
    print("Columns after feature selection:", X.columns)

    # Encode data. `injury_risk_mapping` is deliberately kept as a
    # module-level name: it is not passed positionally below and
    # train_and_save_models picks it up from module scope.
    encoded_data, injury_risk_mapping, encoders, scaler, numeric_cols, player_encoder = encode_data(X)
    print("Columns after encoding:", encoded_data.columns)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(encoded_data, y, test_size=0.2, random_state=42)

    # Train and evaluate models. The seventh positional argument is the
    # player encoder; passing injury_risk_mapping there stored the mapping
    # dict in player_encoder.pkl.
    model_save_path = '../data/models'
    train_and_save_models(X_train, y_train, model_save_path, scaler, encoded_data.columns, encoders, player_encoder, numeric_cols)
    evaluate_models(X_test, y_test, model_save_path)


Overwriting ../src/salary_predict/updated/model_trainer.py


In [5]:
%%writefile ../src/salary_predict/updated/model_predictor.py

import joblib
import pandas as pd

def load_models_and_utils(model_save_path):
    """Load every persisted artifact from ``model_save_path``.

    Args:
        model_save_path (str): Directory containing the ``.pkl`` files.

    Returns:
        tuple: ``(rf_model, xgb_model, scaler, feature_names, encoders,
        injury_risk_mapping, numeric_cols, player_encoder)``.
    """
    # File stems in the exact order of the returned tuple.
    artifact_names = (
        'best_rf_model',
        'best_xgb_model',
        'scaler',
        'feature_names',
        'encoders',
        'injury_risk_mapping',
        'numeric_cols',
        'player_encoder',
    )
    return tuple(joblib.load(f"{model_save_path}/{name}.pkl") for name in artifact_names)

def predict(data, model_save_path):
    """Predict with the saved models and append the results to ``data``.

    The models were fit directly on the output of ``encode_data``, so the
    encoded frame is passed to them as-is. The extra ``scaler.transform``
    that used to be applied here standardised the features a second time
    (including the one-hot and injury-risk columns), giving the models
    inputs on a different scale from the one they were trained on.

    Args:
        data (pd.DataFrame): Feature-engineered rows to predict for. A
            'Player' column, if present, is carried through to the output.
        model_save_path (str): Directory with the persisted artifacts.

    Returns:
        pd.DataFrame: ``data`` (without 'Player', index reset) joined with
        'RF_Predictions', 'XGB_Predictions', their mean 'Predicted_Salary',
        and 'Player' when it was available.
    """
    rf_model, xgb_model, _, feature_names, encoders, _, _, player_encoder = load_models_and_utils(model_save_path)

    print("Original data shape:", data.shape)
    print("Original data columns:", data.columns.tolist())

    # Preserve player names
    player_names = data['Player'] if 'Player' in data.columns else None

    # Drop the player column before encoding
    data = data.drop(columns=['Player'], errors='ignore')

    # Encode the data using the loaded encoders.
    # NOTE(review): encode_data fits its own scaler on `data` here, so the
    # numeric columns are standardised with prediction-set statistics rather
    # than training statistics - confirm whether the training scaler should
    # be reused instead.
    encoded_data, _, _, _, _, _ = encode_data(data, encoders, player_encoder)

    print("Encoded data shape:", encoded_data.shape)
    print("Encoded data columns:", encoded_data.columns.tolist())

    # Align with the training layout in one step: columns missing from the
    # encoded frame are added as 0, extras are dropped, order is enforced.
    encoded_data = encoded_data.reindex(columns=list(feature_names), fill_value=0)

    print("Selected features shape:", encoded_data.shape)
    print("Selected features:", encoded_data.columns.tolist())
    print("Expected features:", feature_names)

    # Make predictions on the encoded frame itself (keeps column names too)
    rf_predictions = rf_model.predict(encoded_data)
    xgb_predictions = xgb_model.predict(encoded_data)

    # Create a DataFrame for predictions
    predictions_df = pd.DataFrame({
        'RF_Predictions': rf_predictions,
        'XGB_Predictions': xgb_predictions,
        'Predicted_Salary': (rf_predictions + xgb_predictions) / 2
    })

    # Attach player names back to the predictions
    if player_names is not None:
        predictions_df['Player'] = player_names.values

    # Combine the predictions with the original data (excluding player names);
    # the index is reset so both frames align row by row.
    result = pd.concat([data.reset_index(drop=True), predictions_df], axis=1)

    return result


# Script entry point: predicts for the 2023 season rows and writes the
# result to CSV.
# NOTE(review): load_data, format_season, filter_seasons, clean_data,
# engineer_features and encode_data are not imported in this cell (only
# joblib and pandas are) - confirm how they are brought into scope.
if __name__ == "__main__":
    file_path = '../data/processed/nba_player_data_final_inflated.csv'
    predict_season = 2023
    data = load_data(file_path)
    data = format_season(data)
    # filter_seasons returns (target season, prior seasons); only the
    # target season is needed here.
    current_season_data, _ = filter_seasons(data, predict_season)
    current_season_data = clean_data(current_season_data)
    current_season_data = engineer_features(current_season_data)
    model_save_path = '../data/models'
    predictions_df = predict(current_season_data, model_save_path)  # Save predictions as predictions_df
    print(predictions_df.head())
    
    # Save predictions_df for later use
    predictions_df.to_csv('../data/processed/predictions_df.csv', index=False)


Overwriting ../src/salary_predict/updated/model_predictor.py


Reference for the salary-matching trade rules implemented below (2023-24 season):

https://www.hoopsrumors.com/2023/09/salary-matching-rules-for-trades-during-2023-24-season.html


# Team-salary threshold that selects which matching rule applies.
FIRST_TAX_APRON = 172_346_000


def check_salary_matching_rules(outgoing_salary, incoming_salary, team_salary_before_trade):
    """Return True when ``incoming_salary`` fits under the matching limit.

    Teams whose pre-trade salary is below ``FIRST_TAX_APRON`` get a tiered
    limit based on the outgoing salary:
      * up to 7,500,000       -> 2 x outgoing + 250,000
      * up to 29,000,000      -> outgoing + 7,500,000
      * above 29,000,000      -> 1.25 x outgoing + 250,000
    All other teams may take back at most 1.10 x outgoing.

    Args:
        outgoing_salary: Salary sent out in the trade.
        incoming_salary: Salary received in the trade.
        team_salary_before_trade: Team payroll before the trade.

    Returns:
        bool: Whether the incoming salary is within the limit (inclusive).
    """
    # Apron teams (anything not strictly below the apron) use the flat rule.
    if not team_salary_before_trade < FIRST_TAX_APRON:
        return incoming_salary <= 1.10 * outgoing_salary

    if outgoing_salary <= 7_500_000:
        ceiling = 2 * outgoing_salary + 250_000
    elif outgoing_salary <= 29_000_000:
        ceiling = outgoing_salary + 7_500_000
    else:
        ceiling = 1.25 * outgoing_salary + 250_000
    return incoming_salary <= ceiling

In [6]:
import pandas as pd
import numpy as np
from nba_api.stats.endpoints import leaguegamefinder, playergamelogs
import time

# Stat columns that the aggregation helpers in this cell average per team
# or per player (see the groupby(...)[RELEVANT_STATS].mean() calls).
RELEVANT_STATS = ['PTS', 'AST', 'TOV', 'STL', 'BLK', 'OREB', 'DREB', 'FGM', 'FG3M', 'FGA']
# Percentile cut-offs; not referenced by any code visible in this cell.
PERCENTILE_THRESHOLDS = [99, 98, 97, 96, 95, 90, 75, 50]

def get_champion(season):
    """Return ``(TEAM_ID, TEAM_NAME)`` of the winner of the last playoff game.

    The playoff game rows of ``season`` are sorted by date, the two latest
    rows are taken, and the first of them with WL == 'W' is returned.
    This presumes those two rows are the two team entries of the deciding
    game - TODO confirm against the API output.

    Args:
        season (str): Season label such as '2022-23'.

    Returns:
        tuple: ``(team_id, team_name)``.
    """
    finder = leaguegamefinder.LeagueGameFinder(season_nullable=season, season_type_nullable='Playoffs')
    games = finder.get_data_frames()[0]
    games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

    final_rows = games.sort_values('GAME_DATE').tail(2)
    champion = final_rows.loc[final_rows['WL'] == 'W'].iloc[0]
    return champion['TEAM_ID'], champion['TEAM_NAME']

def get_champions(start_year, end_year):
    """Look up the champion of every season from ``start_year`` to ``end_year``.

    Args:
        start_year (int): Starting calendar year of the first season.
        end_year (int): Starting calendar year of the last season (inclusive).

    Returns:
        dict: Season label (e.g. '2022-23') ->
        ``{'ChampionTeamID': ..., 'ChampionTeamName': ...}``.
    """
    champions_by_season = {}
    for first_year in range(start_year, end_year + 1):
        label = f"{first_year}-{str(first_year+1)[-2:]}"
        team_id, team_name = get_champion(label)
        champions_by_season[label] = {
            'ChampionTeamID': team_id,
            'ChampionTeamName': team_name,
        }
        # Pause between requests to avoid overwhelming the API.
        time.sleep(1)
    return champions_by_season

def get_season_from_date(date):
    """Convert a 'YYYY-MM-...' date string into a season label.

    Months October-December belong to the season starting that year; all
    other months belong to the season that started the year before.

    Args:
        date (str): Date string with the year in characters 0-3 and the
            month in characters 5-6.

    Returns:
        str: Label such as '2023-24'.
    """
    year, month = int(date[:4]), int(date[5:7])
    start_year = year if month >= 10 else year - 1
    return f"{start_year}-{str(start_year + 1)[2:]}"

def analyze_leaguegamefinder_endpoint(start_season, end_season):
    """Download regular-season game rows for every season in the range.

    Each downloaded frame gets a 'SEASON' column derived from its
    'GAME_DATE' values, and all frames are concatenated.

    Args:
        start_season (str): First season label, e.g. '2022-23'.
        end_season (str): Last season label (inclusive).

    Returns:
        pd.DataFrame: All game rows with a fresh integer index.
    """
    first_year, last_year = int(start_season[:4]), int(end_season[:4])
    frames = []

    for year in range(first_year, last_year + 1):
        season_str = f"{year}-{str(year+1)[2:]}"
        print(f"Fetching data for season {season_str}")

        finder = leaguegamefinder.LeagueGameFinder(
            season_nullable=season_str,
            season_type_nullable='Regular Season',
        )
        season_games = finder.get_data_frames()[0]
        season_games['SEASON'] = season_games['GAME_DATE'].apply(get_season_from_date)
        frames.append(season_games)

        # Pause between requests to avoid overwhelming the API.
        time.sleep(1)

    return pd.concat(frames, ignore_index=True)

def calculate_per_game_stats(games_df):
    """Average ``RELEVANT_STATS`` per (season, team) and add 'eFG%'.

    Args:
        games_df (pd.DataFrame): Game rows with 'SEASON', 'TEAM_ID',
            'TEAM_NAME' and the columns in ``RELEVANT_STATS``.

    Returns:
        pd.DataFrame: One row per season/team with the averaged stats and
        'eFG%' = (FGM + 0.5 * FG3M) / FGA computed from those averages.
    """
    team_keys = ['SEASON', 'TEAM_ID', 'TEAM_NAME']
    averages = games_df.groupby(team_keys)[RELEVANT_STATS].mean().reset_index()

    # Effective field-goal percentage from the averaged makes and attempts.
    weighted_makes = averages['FGM'] + 0.5 * averages['FG3M']
    averages['eFG%'] = weighted_makes / averages['FGA']

    return averages

def calculate_percentiles(stats_df):
    """Add a within-season percentile-rank column for every tracked stat.

    For each column in ``RELEVANT_STATS`` plus 'eFG%', a
    '<column>_percentile' column is added in place.

    Args:
        stats_df (pd.DataFrame): Frame with 'SEASON' and the stat columns.

    Returns:
        pd.DataFrame: The same object with the percentile columns added.
    """
    for stat in [*RELEVANT_STATS, 'eFG%']:
        season_rank = stats_df.groupby('SEASON')[stat].rank(pct=True)
        stats_df[f'{stat}_percentile'] = season_rank

    return stats_df

def get_current_season_stats(all_seasons_data, current_season):
    """Return per-team stats for one season plus a 'League Average' row.

    Args:
        all_seasons_data (pd.DataFrame): Game rows with a 'SEASON' column.
        current_season (str): Season label to keep.

    Returns:
        pd.DataFrame: Per-team averages with percentile columns, followed by
        one extra row holding the mean of the team averages
        (TEAM_NAME 'League Average', TEAM_ID 'AVG').
    """
    season_rows = all_seasons_data[all_seasons_data['SEASON'] == current_season]
    team_stats = calculate_percentiles(calculate_per_game_stats(season_rows))

    # League average over the team rows, labelled so it can sit alongside them.
    league_avg = team_stats[RELEVANT_STATS + ['eFG%']].mean()
    league_avg['TEAM_NAME'] = 'League Average'
    league_avg['SEASON'] = current_season
    league_avg['TEAM_ID'] = 'AVG'
    league_row = league_avg.to_frame().T

    # Team rows first, league-average row last.
    return pd.concat([team_stats, league_row], ignore_index=True)

def get_champions_stats(all_seasons_data, start_season, end_season):
    """Return per-game stats and percentiles for each season's champion.

    Args:
        all_seasons_data (pd.DataFrame): Game rows with 'SEASON' and
            'TEAM_ID' columns.
        start_season (str): First season label, e.g. '2022-23'.
        end_season (str): Last season label (inclusive).

    Returns:
        pd.DataFrame: Output of ``calculate_percentiles`` for the rows that
        belong to the champion of their season.
    """
    first_year, last_year = int(start_season[:4]), int(end_season[:4])

    # Season label -> champion team id (the team name is not needed here).
    champion_ids = {}
    for year in range(first_year, last_year + 1):
        season = f"{year}-{str(year+1)[-2:]}"
        champ_id, _ = get_champion(season)
        champion_ids[season] = champ_id

    # Vectorised replacement for the former row-wise apply: map every row's
    # season to its champion id (NaN for seasons outside the range, which
    # never equals a team id) and compare with the row's own team id.
    is_champion_row = all_seasons_data['SEASON'].map(champion_ids) == all_seasons_data['TEAM_ID']

    champions_data = all_seasons_data[is_champion_row]
    champions_stats = calculate_per_game_stats(champions_data)
    return calculate_percentiles(champions_stats)

def compare_stats(current_stats, champions_stats, league_avg):
    """Add difference-to-champions and difference-to-league columns.

    For every stat in ``RELEVANT_STATS`` plus 'eFG%', the result gains
    '<stat>_vs_champs' (value minus the mean over ``champions_stats``) and
    '<stat>_vs_league' (value minus ``league_avg[stat]``).

    Args:
        current_stats (pd.DataFrame): Stats to compare; not modified.
        champions_stats (pd.DataFrame): Champion rows to average.
        league_avg: Mapping/Series indexed by stat name.

    Returns:
        pd.DataFrame: Copy of ``current_stats`` with the extra columns.
    """
    tracked = RELEVANT_STATS + ['eFG%']
    champs_avg = champions_stats[tracked].mean()

    comparison = current_stats.copy()
    for stat in tracked:
        values = comparison[stat]
        comparison[f'{stat}_vs_champs'] = values - champs_avg[stat]
        comparison[f'{stat}_vs_league'] = values - league_avg[stat]

    return comparison

def get_team_data(all_seasons_data, team_names, current_season):
    """Return per-game stats of the named teams for one season.

    Args:
        all_seasons_data (pd.DataFrame): Game rows with 'SEASON' and 'TEAM_NAME'.
        team_names (list[str]): Teams to keep.
        current_season (str): Season label to keep.

    Returns:
        pd.DataFrame: Output of ``calculate_per_game_stats`` for the
        matching rows.
    """
    in_season = all_seasons_data['SEASON'] == current_season
    is_selected_team = all_seasons_data['TEAM_NAME'].isin(team_names)
    selected_rows = all_seasons_data[in_season & is_selected_team]
    return calculate_per_game_stats(selected_rows)

def simulate_trade(all_seasons_data, team_from, team_to, trade_impact, current_season):
    """Shift team stats by ``trade_impact`` from one team to the other.

    Every numeric column that is also a key of ``trade_impact`` is reduced
    by that amount for ``team_from`` and increased by it for ``team_to``;
    'eFG%' is then recomputed for both teams from the adjusted
    FGM / FG3M / FGA.

    Args:
        all_seasons_data (pd.DataFrame): Game rows for all seasons.
        team_from (str): Team giving up production.
        team_to (str): Team receiving production.
        trade_impact (dict): Stat name -> amount moved.
        current_season (str): Season label to simulate.

    Returns:
        tuple: ``(before_trade, after_trade)`` per-game team frames.
    """
    before_trade = get_team_data(all_seasons_data, [team_from, team_to], current_season)
    after_trade = before_trade.copy()

    # Row selectors; TEAM_NAME is never modified below, so they stay valid.
    from_rows = after_trade['TEAM_NAME'] == team_from
    to_rows = after_trade['TEAM_NAME'] == team_to

    numeric_columns = before_trade.select_dtypes(include=[np.number]).columns
    for stat in (col for col in numeric_columns if col in trade_impact):
        amount = trade_impact[stat]
        after_trade.loc[from_rows, stat] -= amount
        after_trade.loc[to_rows, stat] += amount

    # Recalculate eFG% for both teams from the adjusted shooting numbers.
    for team_rows in (from_rows, to_rows):
        roster = after_trade[team_rows]
        efg = (roster['FGM'] + 0.5 * roster['FG3M']) / roster['FGA']
        after_trade.loc[team_rows, 'eFG%'] = efg.values[0]

    return before_trade, after_trade

def get_player_game_logs(team_id, season):
    """Fetch the player game logs of one team for one season.

    Args:
        team_id: Team identifier passed to the endpoint.
        season (str): Season label such as '2023-24'.

    Returns:
        pd.DataFrame: First data frame returned by the PlayerGameLogs endpoint.
    """
    endpoint = playergamelogs.PlayerGameLogs(team_id_nullable=team_id, season_nullable=season)
    return endpoint.get_data_frames()[0]

def process_player_data(player_logs):
    """Average ``RELEVANT_STATS`` per player/team and add 'eFG%'.

    Args:
        player_logs (pd.DataFrame): Game-log rows with 'PLAYER_ID',
            'PLAYER_NAME', 'TEAM_ID', 'TEAM_NAME' and the stat columns.

    Returns:
        pd.DataFrame: One row per player/team with averaged stats and
        'eFG%' = (FGM + 0.5 * FG3M) / FGA computed from those averages.
    """
    player_keys = ['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_NAME']
    averages = player_logs.groupby(player_keys)[RELEVANT_STATS].mean().reset_index()

    weighted_makes = averages['FGM'] + 0.5 * averages['FG3M']
    averages['eFG%'] = weighted_makes / averages['FGA']
    return averages

def get_champions_player_data(champions, start_season, end_season):
    """Collect per-player averages for the champion of every season.

    Args:
        champions (dict): Season label -> dict with a 'ChampionTeamID' entry.
        start_season (str): First season label, e.g. '2022-23'.
        end_season (str): Last season label (inclusive).

    Returns:
        pd.DataFrame: Player rows of all champions with a 'SEASON' column.
    """
    first_year, last_year = int(start_season[:4]), int(end_season[:4])
    per_season_frames = []

    for year in range(first_year, last_year + 1):
        season_str = f"{year}-{str(year+1)[2:]}"
        champion_id = champions[season_str]['ChampionTeamID']

        logs = get_player_game_logs(champion_id, season_str)
        season_players = process_player_data(logs)
        season_players['SEASON'] = season_str
        per_season_frames.append(season_players)

        # Pause between requests to avoid overwhelming the API.
        time.sleep(1)

    return pd.concat(per_season_frames, ignore_index=True)

def get_current_season_player_data(all_seasons_data, current_season):
    """Collect per-player averages for every team seen in ``current_season``.

    Args:
        all_seasons_data (pd.DataFrame): Game rows with 'SEASON' and 'TEAM_ID'.
        current_season (str): Season label to process.

    Returns:
        pd.DataFrame: Player rows of all teams with a 'SEASON' column.
    """
    in_season = all_seasons_data['SEASON'] == current_season
    team_ids = all_seasons_data[in_season]['TEAM_ID'].unique()

    per_team_frames = []
    for team_id in team_ids:
        logs = get_player_game_logs(team_id, current_season)
        team_players = process_player_data(logs)
        team_players['SEASON'] = current_season
        per_team_frames.append(team_players)

        # Pause between requests to avoid overwhelming the API.
        time.sleep(1)

    return pd.concat(per_team_frames, ignore_index=True)

def simulate_trade_with_players(team_from_data, team_to_data, traded_players):
    """Swap the named players between two rosters.

    Every player in ``traded_players`` who is on ``team_from_data`` moves to
    the other roster and vice versa. Moved rows are relabelled with the
    TEAM_ID / TEAM_NAME of their new roster so that grouping by team
    reflects the trade.

    Previously the players leaving ``team_to_data`` were removed but never
    added to the other roster, and players who moved kept their old team
    labels.

    Args:
        team_from_data (pd.DataFrame): Player rows of the first team; must
            have a 'PLAYER_NAME' column. Not modified.
        team_to_data (pd.DataFrame): Player rows of the second team. Not
            modified.
        traded_players (list[str]): Names of the players changing teams.

    Returns:
        tuple: ``(before_trade, after_trade)`` - both rosters concatenated,
        before and after the swap.
    """
    before_trade = pd.concat([team_from_data, team_to_data])

    leaving_from = team_from_data['PLAYER_NAME'].isin(traded_players)
    leaving_to = team_to_data['PLAYER_NAME'].isin(traded_players)

    # Players change sides in both directions.
    arriving_at_to = _relabel_team(team_from_data[leaving_from], team_to_data)
    arriving_at_from = _relabel_team(team_to_data[leaving_to], team_from_data)

    team_from_after = pd.concat([team_from_data[~leaving_from], arriving_at_from])
    team_to_after = pd.concat([team_to_data[~leaving_to], arriving_at_to])

    after_trade = pd.concat([team_from_after, team_to_after])

    return before_trade, after_trade


def _relabel_team(players, destination_roster):
    """Return a copy of ``players`` carrying the destination roster's team labels."""
    moved = players.copy()
    if destination_roster.empty:
        # No row to read the destination labels from; leave them unchanged.
        return moved
    for column in ('TEAM_ID', 'TEAM_NAME'):
        if column in moved.columns and column in destination_roster.columns:
            moved[column] = destination_roster[column].iloc[0]
    return moved

def analyze_trade_impact(before_trade, after_trade):
    """Return the per-team change in summed stats caused by a trade.

    The columns in ``RELEVANT_STATS`` are summed per team. 'eFG%' is a
    ratio, so instead of summing the per-player values (which has no
    meaning) it is recomputed per team from the summed FGM, FG3M and FGA.

    Args:
        before_trade (pd.DataFrame): Player rows before the trade, with
            'TEAM_NAME' and the stat columns.
        after_trade (pd.DataFrame): Player rows after the trade.

    Returns:
        pd.DataFrame: One row per team with 'TEAM_NAME', the change in each
        summed stat, and the change in team 'eFG%' (after minus before).
    """
    totals_before = _team_totals(before_trade)
    totals_after = _team_totals(after_trade)

    # Both frames are indexed by TEAM_NAME, so subtraction aligns by team.
    trade_impact = totals_after.subtract(totals_before).reset_index()
    return trade_impact


def _team_totals(player_rows):
    """Sum RELEVANT_STATS per team and derive team eFG% from those sums."""
    totals = player_rows.groupby('TEAM_NAME')[RELEVANT_STATS].sum()
    totals['eFG%'] = (totals['FGM'] + 0.5 * totals['FG3M']) / totals['FGA']
    return totals

def main():
    """Run the example trade analysis end to end.

    Downloads game and player data for the seasons between ``start_season``
    and ``end_season``, loads the saved salary predictions, simulates a
    two-team player swap and prints each intermediate result.

    Requires network access to the NBA stats API and the file
    '../data/processed/predictions_df.csv'.
    """
    start_season = "2022-23"
    end_season = "2023-24"
    current_season = end_season
    all_seasons_data = analyze_leaguegamefinder_endpoint(start_season, end_season)
    
    # 1. Get champions for every season from start_season to end_season.
    # NOTE(review): the printed headings below say "Past 10 Seasons", but
    # the range configured above covers only two seasons.
    champions = get_champions(int(start_season[:4]), int(end_season[:4]))
    
    # 2. Get player-level data for champions
    champions_player_data = get_champions_player_data(champions, start_season, end_season)
    print("Champions Player Data (Past 10 Seasons):")
    print(champions_player_data)
    
    # 3. Get current season player-level data (one API request per team)
    current_season_player_data = get_current_season_player_data(all_seasons_data, current_season)
    print("\nCurrent Season Player Data:")
    print(current_season_player_data)
    
    # 4. Load predictions dataframe
    predictions_df = pd.read_csv('../data/processed/predictions_df.csv')
    print("\nPredictions DataFrame (first few rows):")
    print(predictions_df.head())
    
    # 5. Simulate trade
    team_from = "Los Angeles Lakers"
    team_to = "Boston Celtics"
    traded_players = ["Anthony Davis", "Jayson Tatum"]  # Example players
    
    team_from_data = current_season_player_data[current_season_player_data['TEAM_NAME'] == team_from]
    team_to_data = current_season_player_data[current_season_player_data['TEAM_NAME'] == team_to]
    
    print("\nTeam Data Before Trade:")
    print(pd.concat([team_from_data, team_to_data]))
    
    before_trade, after_trade = simulate_trade_with_players(team_from_data, team_to_data, traded_players)
    
    print("\nTeam Data After Trade:")
    print(after_trade)
    
    # 6. Analyze trade impact
    trade_impact = analyze_trade_impact(before_trade, after_trade)
    print("\nTrade Impact (Difference in Team Stats):")
    print(trade_impact)
    
    # 7. Compare traded players to champions.
    # champion_avg is the mean of the per-season means of the champions' players.
    traded_player_stats = before_trade[before_trade['PLAYER_NAME'].isin(traded_players)]
    champion_avg = champions_player_data.groupby('SEASON')[RELEVANT_STATS + ['eFG%']].mean().mean()
    
    print("\nTraded Players vs. Champions Average:")
    for _, player in traded_player_stats.iterrows():
        print(f"\n{player['PLAYER_NAME']}:")
        for stat in RELEVANT_STATS + ['eFG%']:
            diff = player[stat] - champion_avg[stat]
            print(f"{stat}: {player[stat]:.2f} (Diff from Champs Avg: {diff:.2f})")
    
    # 8. Analyze salary based on predictions.
    # Expects 'Player', 'Salary' and 'Predicted_Salary' columns in the CSV.
    traded_players_salary = predictions_df[predictions_df['Player'].isin(traded_players)]
    print("\nSalary Analysis for Traded Players:")
    print(traded_players_salary[['Player', 'Salary', 'Predicted_Salary']])

# Run the analysis only when executed as a script / notebook cell.
if __name__ == "__main__":
    main()

Fetching data for season 2022-23
Fetching data for season 2023-24
Champions Player Data (Past 10 Seasons):
    PLAYER_ID               PLAYER_NAME     TEAM_ID       TEAM_NAME  \
0      201145                Jeff Green  1610612743  Denver Nuggets   
1      201599            DeAndre Jordan  1610612743  Denver Nuggets   
2      202397                 Ish Smith  1610612743  Denver Nuggets   
3      202704            Reggie Jackson  1610612743  Denver Nuggets   
4      203484  Kentavious Caldwell-Pope  1610612743  Denver Nuggets   
5      203932              Aaron Gordon  1610612743  Denver Nuggets   
6      203999              Nikola Jokic  1610612743  Denver Nuggets   
7     1627750              Jamal Murray  1610612743  Denver Nuggets   
8     1628418             Thomas Bryant  1610612743  Denver Nuggets   
9     1628427             Vlatko Cancar  1610612743  Denver Nuggets   
10    1628432                Davon Reed  1610612743  Denver Nuggets   
11    1628971               Bruce Brown  

KeyboardInterrupt: 

In [None]:
%%writefile ../src/salary_predict/updated/overall_team_trade_impact.py

import pandas as pd
import numpy as np
from nba_api.stats.endpoints import playergamelogs, leaguegamefinder
from tabulate import tabulate
import time
from nba_api.stats.endpoints import commonplayerinfo
from nba_api.stats.static import teams

# Constants
# Game-log stat columns that this module sums per team and converts to
# per-game values (see calculate_team_stats).
RELEVANT_STATS = ['PTS', 'AST', 'TOV', 'STL', 'BLK', 'OREB', 'DREB', 'FGM', 'FG3M', 'FGA']

def load_team_data():
    """Return the static NBA team list as a DataFrame of id, full name and abbreviation."""
    wanted_columns = ['id', 'full_name', 'abbreviation']
    all_teams = pd.DataFrame(teams.get_teams())
    return all_teams[wanted_columns]


def fetch_player_info(player_id, debug=False):
    """Look up a player's common info by ID.

    Returns the first data frame of the CommonPlayerInfo endpoint, or None when
    the request (or reading the response) raises. Messages are printed only
    when `debug` is true.
    """
    try:
        endpoint = commonplayerinfo.CommonPlayerInfo(player_id=player_id)
        info_frame = endpoint.get_data_frames()[0]
        if debug:
            display_name = info_frame['DISPLAY_FIRST_LAST'].values[0]
            print(f"Fetched info for player ID {player_id}: {display_name}")
        return info_frame
    except Exception as exc:
        # Best effort: any failure is reported (in debug mode) and signalled with None.
        if debug:
            print(f"Error fetching info for player ID {player_id}: {exc}")
        return None

def fetch_season_data_by_year(year, debug=False):
    """Download player game logs for the season that starts in `year`.

    The season label is built as "<year>-<last two digits of year + 1>". The
    returned frame gets a SEASON column holding that label and GAME_DATE parsed
    to datetimes. Returns None when fetching or parsing raises.
    """
    next_year_suffix = str(year + 1)[-2:]
    season = f"{year}-{next_year_suffix}"
    if debug:
        print(f"Fetching player data for season {season}")
    try:
        endpoint = playergamelogs.PlayerGameLogs(season_nullable=season)
        logs = endpoint.get_data_frames()[0]
        logs['SEASON'] = season
        logs['GAME_DATE'] = pd.to_datetime(logs['GAME_DATE'])
        if debug:
            row_count = logs.shape[0]
            print(f"Player data for season {season} contains {row_count} rows.")
        return logs
    except Exception as exc:
        # Best effort: failures are reported only in debug mode and yield None.
        if debug:
            print(f"Error fetching player data for season {season}: {exc}")
        return None

# Helper Functions
def get_champion(season, debug=False):
    """Return the TEAM_NAME of the winner of the latest playoff game in `season`.

    The playoff game list is sorted by date, the last two rows are taken, and
    the first of those marked 'W' is treated as the champion. Returns None if
    anything in that process raises.
    """
    try:
        finder = leaguegamefinder.LeagueGameFinder(
            season_nullable=season,
            season_type_nullable='Playoffs',
        )
        playoff_games = finder.get_data_frames()[0]
        playoff_games['GAME_DATE'] = pd.to_datetime(playoff_games['GAME_DATE'])

        # presumably the last two rows are the two team entries of the final game — verify
        closing_rows = playoff_games.sort_values('GAME_DATE').iloc[-2:]
        champion_row = closing_rows[closing_rows['WL'] == 'W'].iloc[0]

        if debug:
            print(f"Champion for season {season}: {champion_row['TEAM_NAME']} ({champion_row['TEAM_ID']})")
        return champion_row['TEAM_NAME']
    except Exception as exc:
        if debug:
            print(f"Error fetching champion for season {season}: {exc}")
        return None

def get_champions(start_year, end_year, debug=False):
    """Collect the champion of every season starting in start_year..end_year (inclusive).

    Returns a dict mapping the season label to {'ChampionTeamName': name};
    seasons for which no champion could be determined are left out.
    """
    champions_by_season = {}
    for first_year in range(start_year, end_year + 1):
        second_year_suffix = str(first_year + 1)[-2:]
        season_label = f"{first_year}-{second_year_suffix}"

        winner = get_champion(season_label, debug)
        if not winner:
            if debug:
                print(f"Champion data not available for season {season_label}")
        else:
            champions_by_season[season_label] = {'ChampionTeamName': winner}

        time.sleep(1)  # To avoid overwhelming the API

    if debug:
        print(f"Champions data: {champions_by_season}")
    return champions_by_season

def calculate_percentiles(stats_df, debug=False):
    """Add `<stat>_percentile` columns ranking each `<stat>_per_game` within its season.

    For every distinct SEASON value, each available per-game column (the
    RELEVANT_STATS plus 'eFG%') is ranked as a fraction in (0, 1] among that
    season's rows. `stats_df` is modified in place and also returned.
    """
    for season in stats_df['SEASON'].unique():
        season_rows = stats_df[stats_df['SEASON'] == season]
        row_labels = season_rows.index

        for stat in RELEVANT_STATS + ['eFG%']:
            source_col = f'{stat}_per_game'
            target_col = f'{stat}_percentile'
            if source_col not in season_rows.columns:
                continue

            stats_df.loc[row_labels, target_col] = season_rows[source_col].rank(pct=True)
            if debug:
                print(f"Calculated percentiles for {stat} in season {season}:")
                print(stats_df.loc[row_labels, [source_col, target_col]].head())

    return stats_df

def calculate_team_stats(player_data, period, debug=False):
    """Aggregate player game logs into per-team, per-season totals and per-game averages.

    Returns one row per (SEASON, TEAM_NAME) with the summed RELEVANT_STATS, the
    number of distinct GAME_ID values as GAMES_PLAYED, a `<stat>_per_game`
    column for every stat, and a PERIOD column set to `period`.
    """
    if debug:
        print(f"Calculating {period} team-level statistics.")
        print("Initial player_data head:")
        print(player_data.head())

    group_keys = ['SEASON', 'TEAM_NAME']
    by_team_season = player_data.groupby(group_keys)

    # Season totals per team, plus how many distinct games those totals cover.
    totals = by_team_season[RELEVANT_STATS].sum().reset_index()
    game_counts = by_team_season['GAME_ID'].nunique().reset_index(name='GAMES_PLAYED')
    team_stats = totals.merge(game_counts, on=group_keys)

    # Per-game value = season total / games played, row by row.
    per_game = team_stats[RELEVANT_STATS].div(team_stats['GAMES_PLAYED'], axis=0)
    for stat in RELEVANT_STATS:
        team_stats[f'{stat}_per_game'] = per_game[stat]

    team_stats['PERIOD'] = period

    if debug:
        print(f"{period} team-level statistics head:")
        print(team_stats.head())

    return team_stats

def process_champion_team_data(player_data, champions, debug=False):
    """Build team-level statistics for each season's champion.

    Args:
        player_data: Player game logs with SEASON and TEAM_NAME columns (plus
            whatever calculate_team_stats needs).
        champions: Mapping of season label -> {'ChampionTeamName': name}.
        debug: Print progress information when true.

    Returns:
        One row per champion season with totals, per-game values,
        'eFG%_per_game' and percentile columns. An empty DataFrame is returned
        when no champion has any rows in `player_data`.
    """
    per_season_stats = []

    for season, champ_info in champions.items():
        champ_name = champ_info['ChampionTeamName']

        # Keep only the champion's own game logs for that season.
        champ_data = player_data[(player_data['SEASON'] == season) & (player_data['TEAM_NAME'] == champ_name)]

        if champ_data.empty:
            if debug:
                print(f"No data found for champion team {champ_name} in season {season}")
            continue

        champ_stats = calculate_team_stats(champ_data, 'Champion', debug)
        champ_stats['ChampionTeamName'] = champ_name
        per_season_stats.append(champ_stats)

    # Without any champion rows the per-game columns do not exist, so the
    # eFG% computation below would fail with a KeyError.
    if not per_season_stats:
        if debug:
            print("No champion team data available.")
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    champion_team_stats = pd.concat(per_season_stats, ignore_index=True)

    # Effective field-goal percentage: (FGM + 0.5 * FG3M) / FGA
    champion_team_stats['eFG%_per_game'] = (
        (champion_team_stats['FGM_per_game'] + 0.5 * champion_team_stats['FG3M_per_game'])
        / champion_team_stats['FGA_per_game']
    )

    # Percentiles are computed within each season.
    champion_team_stats = calculate_percentiles(champion_team_stats, debug)

    return champion_team_stats

def _adjust_team_totals(team_stats, team_name, per_stat_delta, debug=False):
    """Add the signed amounts in `per_stat_delta` to every row of `team_name`, in place."""
    team_mask = team_stats['TEAM_NAME'] == team_name
    if not team_mask.any():
        # The team has no rows in this period, so there is nothing to adjust.
        if debug:
            print(f"  No post-trade rows for {team_name}; nothing to adjust.")
        return

    for stat, delta in per_stat_delta.items():
        if debug:
            print(f"  Before adjustment - {team_name} {stat}: {team_stats.loc[team_mask, stat].values[0]}")
        team_stats.loc[team_mask, stat] += delta
        if debug:
            print(f"  After adjustment - {team_name} {stat}: {team_stats.loc[team_mask, stat].values[0]}")


def calculate_post_trade_team_stats(player_data, traded_players, trade_date, season_data, debug=False):
    """Calculate post-trade team totals with traded players moved to their new teams.

    Team totals are computed from games on or after `trade_date`. For each
    traded player, their post-trade per-game average times the new team's game
    count is subtracted from the old team and added to the new team, and the
    per-game columns are then recomputed.

    Args:
        player_data: Player game logs (SEASON, GAME_DATE, PLAYER_ID, TEAM_NAME,
            GAME_ID and the RELEVANT_STATS columns).
        traded_players: Mapping of player ID -> (new team name, player name).
        trade_date: Anything pd.to_datetime accepts.
        season_data: Game logs used as-is when the trade date precedes the
            season start.
        debug: Print progress information when true.

    Returns:
        The adjusted team statistics frame (PERIOD == 'Post-trade').

    Notes:
        - The season start is derived from the first row's SEASON value only.
          NOTE(review): with multi-season input this is whichever season comes
          first in `player_data` — confirm that is intended.
        - A traded player with no rows in the post-trade data, no rows in
          `player_data`, or a new team without post-trade rows is skipped.
          Previously those cases produced NaN team totals or an IndexError.
    """
    if debug:
        print("Calculating post-trade team-level statistics.")

    trade_date = pd.to_datetime(trade_date)

    # Season start: October 1st of the first row's season start year.
    season_start_year = int(player_data['SEASON'].iloc[0].split('-')[0])
    season_start_date = pd.to_datetime(f"{season_start_year}-10-01")  # NBA season typically starts in October

    if trade_date < season_start_date:
        if debug:
            print(f"Warning: Trade date {trade_date} is earlier than the start of the season {season_start_date}. Using entire season data.")
        post_trade_data = season_data  # Use the entire season data
    else:
        post_trade_data = player_data[player_data['GAME_DATE'] >= trade_date].copy()

    if debug:
        print("Post-trade player data head:")
        print(post_trade_data.head())

    post_trade_stats = calculate_team_stats(post_trade_data, 'Post-trade', debug)
    # The adjustments below are fractional (average * games); make the total
    # columns float so they are not written into integer columns.
    post_trade_stats = post_trade_stats.astype({stat: float for stat in RELEVANT_STATS})

    # Post-trade per-game averages of each traded player.
    traded_player_stats = {}
    for player_id, (new_team_name, player_name) in traded_players.items():
        player_rows = post_trade_data[post_trade_data['PLAYER_ID'] == player_id]
        if player_rows.empty:
            # mean() over no rows is NaN and would turn both teams' totals into NaN.
            if debug:
                print(f"No post-trade data for {player_name}; skipping adjustment.")
            continue
        traded_player_stats[player_id] = player_rows[RELEVANT_STATS].mean().to_dict()
        if debug:
            print(f"{player_name} averages post-trade (to {new_team_name}): {traded_player_stats[player_id]}")

    # Move each traded player's projected production from the old team to the new one.
    for player_id, (new_team_name, player_name) in traded_players.items():
        averages = traded_player_stats.get(player_id)
        if averages is None:
            continue

        player_history = player_data[player_data['PLAYER_ID'] == player_id]
        new_team_rows = post_trade_stats[post_trade_stats['TEAM_NAME'] == new_team_name]
        if player_history.empty or new_team_rows.empty:
            if debug:
                print(f"Skipping adjustment for {player_name}: missing player or new-team data.")
            continue

        # The old team is the team on the player's first row in player_data.
        old_team_name = player_history['TEAM_NAME'].iloc[0]
        post_trade_games = new_team_rows['GAMES_PLAYED'].values[0]

        if debug:
            print(f"\nAdjusting stats for trade: {player_name} from {old_team_name} to {new_team_name}")

        contribution = {stat: averages[stat] * post_trade_games for stat in RELEVANT_STATS}
        removal = {stat: -amount for stat, amount in contribution.items()}

        _adjust_team_totals(post_trade_stats, old_team_name, removal, debug)
        _adjust_team_totals(post_trade_stats, new_team_name, contribution, debug)

    # Recalculate per-game stats from the adjusted totals.
    for stat in RELEVANT_STATS:
        post_trade_stats[f'{stat}_per_game'] = post_trade_stats[stat] / post_trade_stats['GAMES_PLAYED']

    if debug:
        print("Post-trade team stats calculated successfully.")
        print("Post-trade team stats head:")
        print(post_trade_stats.head())

    return post_trade_stats

def calculate_average_champion_stats(champion_team_data, debug=False):
    """Append an 'Average Champion' row to the champion table and refresh percentiles.

    The extra row holds the column means of the stat totals, the per-game
    columns and 'eFG%_per_game'. Its SEASON is 'Average', and its TEAM_NAME and
    ChampionTeamName are 'Average Champion'. Returns the extended table.
    """
    if debug:
        print("Calculating average champion team statistics.")

    per_game_columns = [f'{stat}_per_game' for stat in RELEVANT_STATS]
    averaged_columns = RELEVANT_STATS + per_game_columns + ['eFG%_per_game']
    column_means = champion_team_data[averaged_columns].mean()

    # One-row frame laid out like the input; columns that were not averaged stay NaN.
    summary_row = pd.DataFrame([column_means], columns=champion_team_data.columns).assign(
        SEASON='Average',
        TEAM_NAME='Average Champion',
        PERIOD='Champion',
        ChampionTeamName='Average Champion',
    )

    extended = pd.concat([champion_team_data, summary_row], ignore_index=True)

    # Percentiles are recomputed so the new row gets values too.
    extended = calculate_percentiles(extended, debug)

    if debug:
        print("\nChampion Team Stats with Average:")
        print(tabulate(extended, headers='keys', tablefmt='grid'))

    return extended

def _lookup_first_value(frame, column, frame_label):
    """Return the first value of `column` in `frame`, or None (with a warning) if unavailable."""
    if column in frame.columns and len(frame) > 0:
        return frame[column].values[0]
    print(f"Warning: {column} not found in {frame_label}")
    return None


def compare_team_performance(percentiles, average_champion_stats, traded_teams, debug=True):
    """Generate a comparison table for team performance before and after trades.

    Args:
        percentiles: Team statistics with TEAM_NAME and PERIOD columns, the
            `<stat>_per_game` columns and, optionally, `<stat>_percentile` ones.
        average_champion_stats: Champion statistics. When a row whose TEAM_NAME
            is 'Average Champion' exists, it supplies the champion values;
            otherwise the first row is used.
        traded_teams: Team names to include in the comparison.
        debug: Print progress information when true (default True).

    Returns:
        A DataFrame with one row per team that has both a 'Pre-trade' and a
        'Post-trade' row. For every stat it carries the pre/post per-game
        value, the pre/post percentile and the champion value. Entries that
        cannot be looked up are None and a warning is printed.
    """
    if debug:
        print("Comparing team performance:")
        print("Percentiles data head:")
        print(percentiles.head())
        print("Percentiles columns:")
        print(percentiles.columns)
        print("Average champion stats:")
        print(average_champion_stats)

    # The champion table may hold one row per champion season with the average
    # appended last. Taking the first row would compare against a single
    # season rather than the average, so prefer the labelled average row.
    champion_reference = average_champion_stats
    if 'TEAM_NAME' in average_champion_stats.columns:
        average_rows = average_champion_stats[average_champion_stats['TEAM_NAME'] == 'Average Champion']
        if not average_rows.empty:
            champion_reference = average_rows

    comparison_data = []

    for team in traded_teams:
        if debug:
            print(f"Processing team: {team}")

        team_rows = percentiles[percentiles['TEAM_NAME'] == team]
        pre_trade_stats = team_rows[team_rows['PERIOD'] == 'Pre-trade']
        post_trade_stats = team_rows[team_rows['PERIOD'] == 'Post-trade']

        if pre_trade_stats.empty or post_trade_stats.empty:
            if debug:
                print(f"No data available for comparison for {team}.")
                print("Pre-trade stats head:")
                print(pre_trade_stats.head())
                print("Post-trade stats head:")
                print(post_trade_stats.head())
            continue

        periods = (
            ('Pre-trade', pre_trade_stats, 'pre_trade_stats'),
            ('Post-trade', post_trade_stats, 'post_trade_stats'),
        )

        team_comparison = {'Team': team}
        for stat in RELEVANT_STATS + ['eFG%']:
            if debug:
                print(f"Processing stat: {stat}")
                print(f"Pre-trade stats columns: {pre_trade_stats.columns}")
                print(f"Post-trade stats columns: {post_trade_stats.columns}")

            per_game_col = f'{stat}_per_game'
            percentile_col = f'{stat}_percentile'

            # Per-game value followed by its percentile, for each period.
            for period_label, frame, frame_label in periods:
                team_comparison[f'{stat} {period_label}'] = _lookup_first_value(frame, per_game_col, frame_label)
                team_comparison[f'{stat} {period_label} Percentile'] = _lookup_first_value(frame, percentile_col, frame_label)

            # Champion stats
            team_comparison[f'{stat} Champion'] = _lookup_first_value(
                champion_reference, per_game_col, 'average_champion_stats'
            )

        comparison_data.append(team_comparison)

    comparison_df = pd.DataFrame(comparison_data)

    if debug:
        print("\nComparison Results:")
        print(comparison_df)

    return comparison_df

def validate_post_trade_stats(player_data, trade_date, traded_teams, post_trade_stats, debug=False):
    """Recompute post-trade points per game from raw logs and compare with the reported values.

    For each team in `traded_teams`, points scored on or after `trade_date` are
    divided by the number of distinct games (0 when there are none) and set
    against the team's 'PTS_per_game' in `post_trade_stats`.

    Returns a dict keyed by team name with 'Calculated PPG', 'Reported PPG',
    'Difference' and 'Games Played'.
    """
    cutoff = pd.to_datetime(trade_date)
    logs_after_trade = player_data[player_data['GAME_DATE'] >= cutoff]

    validation_results = {}
    for team in traded_teams:
        team_logs = logs_after_trade[logs_after_trade['TEAM_NAME'] == team]

        points_scored = team_logs['PTS'].sum()
        game_count = team_logs['GAME_ID'].nunique()
        if game_count > 0:
            calculated_ppg = points_scored / game_count
        else:
            calculated_ppg = 0

        reported_rows = post_trade_stats[post_trade_stats['TEAM_NAME'] == team]
        reported_ppg = reported_rows['PTS_per_game'].values[0]

        validation_results[team] = {
            'Calculated PPG': calculated_ppg,
            'Reported PPG': reported_ppg,
            'Difference': calculated_ppg - reported_ppg,
            'Games Played': game_count,
        }

    if debug:
        print("\nPost-Trade Statistics Validation:")
        summary_table = pd.DataFrame(validation_results).T
        print(tabulate(summary_table, headers='keys', tablefmt='grid'))

    return validation_results

import streamlit as st
from datetime import datetime
import plotly.graph_objects as go
from nba_api.stats.endpoints import commonplayerinfo
from nba_api.stats.static import teams

def load_team_data():
    """Return id, full_name and abbreviation for every team in the static team list."""
    team_table = pd.DataFrame(teams.get_teams())
    return team_table.loc[:, ['id', 'full_name', 'abbreviation']]

def load_player_data(start_year, end_year):
    """Fetch and stack player game logs for seasons starting in start_year..end_year.

    Seasons whose fetch returns None are skipped. Returns an empty DataFrame
    when nothing could be fetched (including when start_year > end_year).
    """
    season_frames = []
    for year in range(start_year, end_year + 1):
        data = fetch_season_data_by_year(year)
        if data is not None:
            season_frames.append(data)

    # pd.concat rejects an empty list, so keep the empty-frame result explicit.
    if not season_frames:
        return pd.DataFrame()

    # One concat at the end avoids re-copying the accumulated rows every
    # season and avoids concatenating onto an empty frame.
    return pd.concat(season_frames, ignore_index=True)

def trade_impact_simulator():
    """Render the Streamlit page that simulates a two-team trade.

    The user picks a trade date, two teams and the players each side sends.
    On 'Simulate Trade' the page computes pre- and post-trade team statistics,
    compares them with champion statistics, shows the comparison table and
    draws a bar chart for one selected metric. If no comparison can be built
    for the selected teams a warning is shown instead of the table and chart.
    """
    st.subheader("NBA Trade Impact Simulator")

    # Load team and player data
    team_data = load_team_data()
    player_data = load_player_data(2020, 2023)  # Adjust years as needed

    # User inputs
    trade_date = st.date_input('Trade Date', datetime(2023, 12, 20))

    col1, col2 = st.columns(2)
    with col1:
        team1 = st.selectbox('Select Team 1', team_data['full_name'].tolist())
    with col2:
        team2 = st.selectbox('Select Team 2', team_data['full_name'].tolist(), index=1)

    team1_players = player_data[player_data['TEAM_NAME'] == team1]['PLAYER_NAME'].unique()
    team2_players = player_data[player_data['TEAM_NAME'] == team2]['PLAYER_NAME'].unique()

    col1, col2 = st.columns(2)
    with col1:
        players1 = st.multiselect(f'Select Players from {team1}', team1_players)
    with col2:
        players2 = st.multiselect(f'Select Players from {team2}', team2_players)

    if not st.button('Simulate Trade'):
        return

    # Convert trade_date to pandas Timestamp for comparison
    trade_date = pd.Timestamp(trade_date)

    # Map each traded player's ID to (destination team, player name).
    traded_players = {}
    for outgoing, destination in ((players1, team2), (players2, team1)):
        for player in outgoing:
            player_id = player_data[player_data['PLAYER_NAME'] == player]['PLAYER_ID'].iloc[0]
            traded_players[player_id] = (destination, player)

    # Fetch champion data
    champions = get_champions(2020, 2023)

    # Process champion team data
    champion_team_data = process_champion_team_data(player_data, champions)

    # Calculate pre-trade and post-trade team statistics
    pre_trade_team_stats = calculate_team_stats(player_data[player_data['GAME_DATE'] < trade_date], 'Pre-trade')
    post_trade_team_stats = calculate_post_trade_team_stats(player_data, traded_players, trade_date, player_data)

    # Combine pre-trade and post-trade stats
    combined_stats = pd.concat([pre_trade_team_stats, post_trade_team_stats], ignore_index=True)

    # Calculate eFG% for the combined dataset
    combined_stats['eFG%_per_game'] = (combined_stats['FGM_per_game'] + 0.5 * combined_stats['FG3M_per_game']) / combined_stats['FGA_per_game']

    # Calculate percentiles for the combined stats
    combined_stats = calculate_percentiles(combined_stats)

    # Calculate average champion stats
    average_champion_stats = calculate_average_champion_stats(champion_team_data)

    # Compare pre-trade and post-trade stats for traded teams
    traded_teams = [team1, team2]
    comparison_table = compare_team_performance(combined_stats, average_champion_stats, traded_teams)

    # An empty comparison has no 'Team' column, so the chart code below would
    # fail with a KeyError; tell the user instead.
    if comparison_table.empty:
        st.warning('No pre-trade and post-trade data available for the selected teams.')
        return

    # Display the comparison table
    st.subheader('Trade Impact Comparison')
    st.dataframe(comparison_table)

    # Visualize the results
    st.subheader('Visual Comparison')
    metric = st.selectbox('Select Metric', ['PTS', 'AST', 'TOV', 'STL', 'BLK', 'OREB', 'DREB', 'FGM', 'FG3M', 'FGA', 'eFG%'])

    fig = go.Figure()
    for team in traded_teams:
        # Separate name so the loaded team table above is not shadowed.
        team_row = comparison_table[comparison_table['Team'] == team]
        if team_row.empty:
            # This team had no comparable data; skip its bars.
            continue
        for phase in ('Pre-trade', 'Post-trade', 'Champion'):
            fig.add_trace(go.Bar(
                x=[f'{team} {phase}'],
                y=[team_row[f'{metric} {phase}'].values[0]],
                name=f'{team} {phase}',
            ))

    fig.update_layout(title=f'{metric} Comparison', xaxis_title='Team', yaxis_title=metric)
    st.plotly_chart(fig)


def main(debug=True):
    """Run the example trade analysis end to end.

    Fetches champions and player game logs for 2020-2023, computes pre- and
    post-trade team statistics for the hard-coded example trade, compares the
    affected teams with champion statistics and validates the post-trade
    points per game.

    Args:
        debug: Print intermediate tables and progress information when true.

    Returns:
        The dict produced by validate_post_trade_stats, or None when no player
        data could be fetched.
    """
    start_year = 2020
    end_year = 2023
    trade_date = '2023-12-20'  # Example trade date
    # Parsed once; all date comparisons in this function use this timestamp.
    trade_timestamp = pd.to_datetime(trade_date)

    # Traded players with new team names
    traded_players = {
        1628369: ('Los Angeles Lakers', 'Jayson Tatum'),  # Example Player ID and new team
        1630559: ('Boston Celtics', 'Austin Reaves')      # Example Player ID and new team
    }

    # Replace the placeholder names with the names reported by the API.
    for player_id, (new_team_name, _) in list(traded_players.items()):
        player_info = fetch_player_info(player_id, debug)
        if player_info is not None:
            traded_players[player_id] = (new_team_name, player_info['DISPLAY_FIRST_LAST'].values[0])

    # Fetch champion data
    champions = get_champions(start_year, end_year, debug)

    # Fetch player data for each season and concatenate once at the end.
    season_frames = []
    for year in range(start_year, end_year + 1):
        data = fetch_season_data_by_year(year, debug)
        if data is not None:
            season_frames.append(data)

    player_data = pd.concat(season_frames, ignore_index=True) if season_frames else pd.DataFrame()
    season_data = player_data  # The full data doubles as the "entire season" fallback

    if player_data.empty:
        print("Failed to fetch player data. Exiting.")
        return

    # True when the trade happened before the earliest game in the data.
    trade_precedes_data = player_data['GAME_DATE'].min() > trade_timestamp
    before_trade = player_data['GAME_DATE'] < trade_timestamp
    after_trade = player_data['GAME_DATE'] >= trade_timestamp

    # Process champion team data
    champion_team_data = process_champion_team_data(player_data, champions, debug)

    if debug:
        print("\nChampion Team Stats and Percentiles:")
        print(tabulate(champion_team_data, headers='keys', tablefmt='grid'))

    # Debug: Print pre-trade stats for traded players and their teams
    if debug:
        print("\nPre-trade stats:")
        for player_id, (new_team_name, player_name) in traded_players.items():
            is_player = player_data['PLAYER_ID'] == player_id
            # Use all available data if trade date is before the season starts
            if trade_precedes_data:
                player_pre_trade = player_data[is_player]
            else:
                player_pre_trade = player_data[is_player & before_trade]

            if not player_pre_trade.empty:
                old_team_name = player_pre_trade['TEAM_NAME'].iloc[0]
                player_total_points = player_pre_trade['PTS'].sum()
                team_total_points = player_data[(player_data['TEAM_NAME'] == old_team_name) & before_trade]['PTS'].sum()
                print(f"{player_name} (Old team: {old_team_name}):")
                print(f"  Player total points: {player_total_points}")
                print(f"  Team total points: {team_total_points}")
            else:
                print(f"No data available for {player_name}.")

    # Calculate pre-trade and post-trade team statistics
    if trade_precedes_data:
        pre_trade_team_stats = calculate_team_stats(player_data, 'Pre-trade', debug)
    else:
        pre_trade_team_stats = calculate_team_stats(player_data[before_trade], 'Pre-trade', debug)

    post_trade_team_stats = calculate_post_trade_team_stats(player_data, traded_players, trade_date, season_data, debug)

    # Debug: Print post-trade stats for traded players and their new teams
    if debug:
        print("\nPost-trade stats:")
        for player_id, (new_team_name, player_name) in traded_players.items():
            player_post_trade = player_data[(player_data['PLAYER_ID'] == player_id) & after_trade]
            if not player_post_trade.empty:
                player_total_points = player_post_trade['PTS'].sum()
                team_total_points = player_data[(player_data['TEAM_NAME'] == new_team_name) & after_trade]['PTS'].sum()
                print(f"{player_name} (New team: {new_team_name}):")
                print(f"  Player total points: {player_total_points}")
                print(f"  Team total points: {team_total_points}")
            else:
                print(f"No post-trade data found for {player_name}.")

    # Combine pre-trade and post-trade stats
    combined_stats = pd.concat([pre_trade_team_stats, post_trade_team_stats], ignore_index=True)

    # Calculate eFG% for the combined dataset
    combined_stats['eFG%_per_game'] = (combined_stats['FGM_per_game'] + 0.5 * combined_stats['FG3M_per_game']) / combined_stats['FGA_per_game']

    # Calculate percentiles for the combined stats
    percentiles = calculate_percentiles(combined_stats, debug)

    if debug:
        print("\nCombined Team Stats and Percentiles:")
        print(tabulate(percentiles, headers='keys', tablefmt='grid'))

    # Calculate average champion stats
    average_champion_stats = calculate_average_champion_stats(champion_team_data, debug)

    # Distinct destination teams in first-seen order. A set would make the row
    # order of the comparison and validation output vary between runs.
    traded_teams = list(dict.fromkeys(team_name for team_name, _ in traded_players.values()))
    comparison_table = compare_team_performance(percentiles, average_champion_stats, traded_teams, debug)

    # Print the comparison table
    if debug:
        print("\nTrade Impact Comparison:")
        print(tabulate(comparison_table, headers='keys', tablefmt='grid'))

    # Validate post-trade statistics
    validation_results = validate_post_trade_stats(player_data, trade_date, traded_teams, post_trade_team_stats, debug)

    return validation_results

# Run the example analysis with debug output when this file is executed directly.
if __name__ == "__main__":
    main(debug=True)


Overwriting ../src/salary_predict/updated/overall_team_trade_impact.py


In [10]:
%%writefile ../src/salary_predict/updated/app.py

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import plotly.graph_objects as go
from datetime import datetime

# Import functions from other modules
from data_loader_preprocessor import load_data, format_season, clean_data, engineer_features, encode_data
from model_trainer import train_and_save_models, evaluate_models
from model_predictor import predict
from trade_utils import analyze_two_team_trade, get_champions
from overall_team_trade_impact import (
    fetch_season_data_by_year, get_champions, process_champion_team_data,
    calculate_team_stats, calculate_post_trade_team_stats,
    calculate_average_champion_stats, compare_team_performance,
    trade_impact_simulator
)


@st.cache_data
def load_team_data():
    """Return the static NBA team list (id, full_name, abbreviation), cached by Streamlit."""
    # None of this module's top-level imports bring in `teams`, so import it
    # here; otherwise the call below raises NameError.
    from nba_api.stats.static import teams

    nba_teams = teams.get_teams()
    team_df = pd.DataFrame(nba_teams)
    return team_df[['id', 'full_name', 'abbreviation']]

@st.cache_data
def load_player_data(start_year, end_year):
    """Fetch and stack player game logs for seasons starting in start_year..end_year.

    Seasons whose fetch returns None are skipped. Returns an empty DataFrame
    when nothing could be fetched. The result is cached by Streamlit.
    """
    fetched = (fetch_season_data_by_year(year) for year in range(start_year, end_year + 1))
    season_frames = [frame for frame in fetched if frame is not None]

    # pd.concat rejects an empty list, so keep the empty-frame result explicit.
    if not season_frames:
        return pd.DataFrame()

    # One concat at the end avoids re-copying the accumulated rows every
    # season and avoids concatenating onto an empty frame.
    return pd.concat(season_frames, ignore_index=True)

def identify_overpaid_underpaid(predictions_df):
    """Return the ten most overpaid and the ten most underpaid players.

    Predicted_Salary is multiplied by Salary_Cap_Inflated before being compared
    with Salary. A player is overpaid when Salary exceeds that scaled
    prediction and underpaid when it falls below it.

    Args:
        predictions_df: Frame with Salary, Predicted_Salary and
            Salary_Cap_Inflated columns. It is not modified.

    Returns:
        (overpaid, underpaid): two frames of at most ten rows each, sorted by
        the size of the gap. Both carry the scaled Predicted_Salary plus the
        Salary_Difference, Overpaid and Underpaid columns.
    """
    # Work on a copy: scaling Predicted_Salary in the caller's frame would
    # scale it again on the next call, and writing into a filtered slice
    # triggers pandas' chained-assignment warning.
    scored = predictions_df.copy()

    # Adjust Predicted_Salary calculation
    scored['Predicted_Salary'] = scored['Predicted_Salary'] * scored['Salary_Cap_Inflated']

    scored['Salary_Difference'] = scored['Salary'] - scored['Predicted_Salary']
    scored['Overpaid'] = scored['Salary_Difference'] > 0
    scored['Underpaid'] = scored['Salary_Difference'] < 0

    overpaid = scored[scored['Overpaid']].sort_values('Salary_Difference', ascending=False)
    underpaid = scored[scored['Underpaid']].sort_values('Salary_Difference')

    return overpaid.head(10), underpaid.head(10)


# Utility functions
def load_processed_data(file_path):
    """Load the CSV at `file_path` and run it through the preprocessing steps in order."""
    frame = load_data(file_path)
    # Each step takes the frame and returns the frame to hand to the next step.
    for step in (format_season, clean_data, engineer_features):
        frame = step(frame)
    return frame

def filter_data_by_season(data, season):
    """Return the rows of `data` whose 'Season' equals `season`."""
    in_season = data['Season'] == season
    return data.loc[in_season]

# Data visualization functions
def plot_feature_distribution(data, feature):
    """Draw a histogram with a KDE curve for one column and return the figure."""
    figure, axes = plt.subplots(figsize=(10, 6))
    sns.histplot(data[feature], kde=True, ax=axes)
    axes.set(title=f'Distribution of {feature}', xlabel=feature, ylabel='Count')
    return figure

def plot_correlation_heatmap(data):
    """Draw a heatmap of the pairwise correlations of the numeric columns and return the figure."""
    correlations = data.select_dtypes(include=[np.number]).corr()
    figure, axes = plt.subplots(figsize=(12, 10))
    sns.heatmap(correlations, annot=False, cmap='coolwarm', ax=axes)
    axes.set_title('Correlation Heatmap')
    return figure

# Model metrics function
def display_model_metrics(y_true, y_pred):
    """Show MSE, RMSE, MAE and R-squared for the predictions as four Streamlit metrics."""
    squared_error = mean_squared_error(y_true, y_pred)
    labelled_values = [
        ("Mean Squared Error", squared_error),
        ("Root Mean Squared Error", np.sqrt(squared_error)),
        ("Mean Absolute Error", mean_absolute_error(y_true, y_pred)),
        ("R-squared", r2_score(y_true, y_pred)),
    ]

    st.subheader("Model Performance Metrics")
    # One column per metric, in the order listed above.
    for column, (label, value) in zip(st.columns(4), labelled_values):
        column.metric(label, f"{value:.4f}")

# Trade impact display function
def display_trade_impact(result, team1, team2):
    """Render salary changes, stat comparisons and percentile counts for both teams.

    `result` is indexed by each team key and must provide 'current_salary',
    'new_salary' and a 'comparison' mapping of stat name -> values dict with
    the keys read below.
    """
    percentile_cutoffs = [99, 98, 97, 96, 95, 90, 75, 50]
    # (column prefix, key of the counts dict inside each stat's values)
    count_sources = (
        ('Current', 'Current Percentile Counts'),
        ('After Trade', 'After Trade Percentile Counts'),
        ('Champion', 'Champ Percentile Counts'),
    )

    for team_abbr in (team1, team2):
        st.subheader(f"{team_abbr} Trade Impact")

        team_data = result[team_abbr]
        comparison = team_data['comparison']

        current_col, after_col, diff_col = st.columns(3)
        current_col.metric("Current Salary", f"${team_data['current_salary']:,.2f}")
        after_col.metric("Salary After Trade", f"${team_data['new_salary']:,.2f}")
        diff_col.metric("Salary Difference", f"${team_data['new_salary'] - team_data['current_salary']:,.2f}")

        st.subheader("Stat Comparisons")

        # One table row per stat with formatted values.
        comparison_rows = [
            {
                'Stat': stat,
                'Current': f"{values['Current']:.2f} ({values['Current Percentile']:.1f}%ile)",
                'After Trade': f"{values['After Trade']:.2f} ({values['After Trade Percentile']:.1f}%ile)",
                'Champion Average': f"{values['Champ Average']:.2f}",
                'League Average': f"{values['League Average']:.2f}",
                'Change vs League': f"{values['After Trade vs League'] - values['Current vs League']:.2f}",
                'Change vs Champ': f"{values['After Trade vs Champ'] - values['Current vs Champ']:.2f}",
            }
            for stat, values in comparison.items()
        ]
        st.table(pd.DataFrame(comparison_rows))

        st.subheader("Percentile Counts")
        percentile_rows = []
        for stat, values in comparison.items():
            row = {'Stat': stat}
            for percentile in percentile_cutoffs:
                percentile_key = f"Top {100-percentile}%"
                for prefix, counts_key in count_sources:
                    row[f"{prefix} {percentile_key}"] = values[counts_key][percentile_key]
            percentile_rows.append(row)

        st.table(pd.DataFrame(percentile_rows))

        st.markdown("---")

def display_overpaid_underpaid(predictions_df):
    """Show the overpaid and underpaid player tables with optional filters.

    Renders two multiselect widgets (team and position) built from the values
    present in ``predictions_df``, narrows the DataFrame to the selected values
    (an empty selection means "no filter"), passes the result to
    ``identify_overpaid_underpaid`` and displays both returned tables side by
    side.

    Args:
        predictions_df: DataFrame with at least the 'Team' and 'Position'
            columns used for filtering; the tables returned by
            ``identify_overpaid_underpaid`` must expose the columns listed in
            ``shown_columns`` below.
    """
    st.subheader("Top 10 Overpaid and Underpaid Players")

    # Filter widgets, one per layout column
    team_slot, position_slot = st.columns(2)
    with team_slot:
        team_filter = st.multiselect("Filter by Team", options=sorted(predictions_df['Team'].unique()))
    with position_slot:
        position_filter = st.multiselect("Filter by Position", options=sorted(predictions_df['Position'].unique()))

    # Narrow the frame by every filter that has at least one value selected
    filtered_df = predictions_df
    for column_name, chosen_values in (('Team', team_filter), ('Position', position_filter)):
        if chosen_values:
            filtered_df = filtered_df[filtered_df[column_name].isin(chosen_values)]

    # Identify overpaid and underpaid players
    overpaid, underpaid = identify_overpaid_underpaid(filtered_df)

    shown_columns = ['Player', 'Team', 'Position', 'Salary', 'Predicted_Salary', 'Salary_Difference']
    left_slot, right_slot = st.columns(2)
    panels = (
        (left_slot, "Top 10 Overpaid Players", overpaid),
        (right_slot, "Top 10 Underpaid Players", underpaid),
    )
    for slot, heading, table in panels:
        with slot:
            st.subheader(heading)
            st.dataframe(table[shown_columns])


# Main Streamlit app
def main():
    """Render the Streamlit app: navigation, season selection and the chosen page.

    Loads the processed player data and the stored predictions, lets the user
    pick a season, obtains a predictions DataFrame for that season (the stored
    CSV when 2023 is selected, otherwise by retraining on all earlier seasons
    and predicting the selected one), loads the saved models, and finally
    renders whichever of the five pages is selected in the sidebar.
    """
    st.set_page_config(page_title="NBA Salary Prediction and Trade Analysis", layout="wide")
    st.title("NBA Salary Prediction and Trade Analysis")

    # Sidebar navigation
    st.sidebar.title("Navigation")
    page = st.sidebar.radio("Go to", ["Data Analysis", "Model Results", "Salary Evaluation", "Trade Analysis", "Trade Impact Simulator"])

    # Load base data
    data = load_processed_data('data/processed/nba_player_data_final_inflated.csv')

    # Load existing predictions for 2023
    initial_predictions_df = pd.read_csv('data/processed/predictions_df.csv')

    # Season selection
    seasons = sorted(data['Season'].unique(), reverse=True)
    selected_season = st.selectbox("Select Season", seasons)

    model_save_path = 'data/models'

    # Use initial predictions if 2023 is selected, otherwise retrain
    if selected_season == 2023:
        predictions_df = initial_predictions_df
    else:
        # Train model and make predictions
        train_data = data[data['Season'] < selected_season]
        test_data = data[data['Season'] == selected_season]

        # Prepare the data for training
        X_train = train_data.drop(['SalaryPct', 'Salary', 'Player'], axis=1)
        y_train = train_data['SalaryPct']

        # Encode the training data
        X_train_encoded, _, encoders, scaler, numeric_cols, player_encoder = encode_data(X_train)

        # Train and save models
        train_and_save_models(X_train_encoded, y_train, model_save_path, scaler, X_train_encoded.columns, encoders, player_encoder, numeric_cols)

        # Make predictions on the test data
        predictions_df = predict(test_data, model_save_path)

    # Load the models only after the optional retraining step above, so the
    # "Model Results" page works with the models currently stored on disk
    # instead of copies that were read before retraining replaced them.
    # NOTE(review): assumes train_and_save_models writes best_rf_model.pkl and
    # best_xgb_model.pkl into model_save_path - confirm against its definition.
    rf_model = joblib.load(f"{model_save_path}/best_rf_model.pkl")
    xgb_model = joblib.load(f"{model_save_path}/best_xgb_model.pkl")

    if page == "Data Analysis":
        st.header("Data Analysis")

        # Filter data by selected season
        season_data = filter_data_by_season(data, selected_season)

        # Display basic statistics
        st.subheader("Basic Statistics")
        st.write(season_data.describe())

        # Feature distribution
        st.subheader("Feature Distribution")
        feature = st.selectbox("Select Feature", season_data.columns)
        fig = plot_feature_distribution(season_data, feature)
        st.pyplot(fig)

        # Correlation heatmap
        st.subheader("Correlation Heatmap")
        fig = plot_correlation_heatmap(season_data)
        st.pyplot(fig)

        # Data handling explanation
        st.subheader("Data Handling")
        st.write("""
        We preprocessed the data to ensure it's suitable for our models:
        1. Cleaned missing values and outliers
        2. Engineered new features like PPG, APG, etc.
        3. Encoded categorical variables (Position, Team, Injury Risk)
        4. Scaled numerical features
        """)

    elif page == "Model Results":
        st.header("Model Results")

        # Model selection
        model_choice = st.selectbox("Select Model", ["Random Forest", "XGBoost"])

        if model_choice == "Random Forest":
            model = rf_model
            y_pred = predictions_df['RF_Predictions']
        else:
            model = xgb_model
            y_pred = predictions_df['XGB_Predictions']

        # Display model metrics
        display_model_metrics(predictions_df['SalaryPct'], y_pred)

        # Feature importance
        st.subheader("Feature Importance")
        feature_importance = pd.DataFrame({
            'feature': model.feature_names_in_,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)
        st.bar_chart(feature_importance.set_index('feature'))

        # Model explanation
        st.subheader("Model Explanation")
        st.write(f"""
        The {model_choice} model was trained on historical NBA player data to predict salary percentages.
        We used the following techniques to improve model performance:
        1. Feature engineering to create relevant statistics
        2. Proper encoding of categorical variables
        3. Scaling of numerical features
        4. Hyperparameter tuning using GridSearchCV
        """)

    elif page == "Salary Evaluation":
        st.header("Salary Evaluation")
        display_overpaid_underpaid(predictions_df)

    elif page == "Trade Analysis":
        st.header("Trade Analysis")
        st.write("""
        Analyze potential trades and their impact on team statistics and salary cap.
        For more information on trade rules, visit: [NBA Trade Rules](https://www.hoopsrumors.com/2023/09/salary-matching-rules-for-trades-during-2023-24-season.html)
        """)

        # Team selection
        teams = sorted(predictions_df['Team'].unique())
        col1, col2 = st.columns(2)
        with col1:
            team1 = st.selectbox("Select Team 1", teams)
        with col2:
            # Default to the second team when one exists; a fixed index of 1
            # is out of range when the predictions contain a single team.
            team2 = st.selectbox("Select Team 2", teams, index=1 if len(teams) > 1 else 0)

        # Player selection
        team1_players = predictions_df[predictions_df['Team'] == team1]['Player'].tolist()
        team2_players = predictions_df[predictions_df['Team'] == team2]['Player'].tolist()

        col1, col2 = st.columns(2)
        with col1:
            players_leaving_team1 = st.multiselect(f"Select players leaving {team1}", team1_players)
        with col2:
            players_leaving_team2 = st.multiselect(f"Select players leaving {team2}", team2_players)

        if st.button("Analyze Trade"):
            # Champions are requested for the ten seasons before the selected one
            champions = get_champions(selected_season - 10, selected_season - 1)
            result = analyze_two_team_trade(team1, team2, players_leaving_team1, players_leaving_team2, predictions_df, champions)

            if result:
                display_trade_impact(result, team1, team2)
            else:
                st.error("Trade analysis failed. Please check your selections.")

        # Trade analysis explanation
        st.subheader("Trade Analysis Explanation")
        st.write("""
        Our trade analysis compares team statistics before and after the proposed trade.
        We consider:
        1. Changes in key performance metrics (PPG, RPG, APG, etc.)
        2. Salary implications and cap space impact
        3. Comparison to league averages and recent championship teams
        4. Distribution of top performers in various statistical categories
        5. Overpaid/Underpaid player analysis
        """)

    elif page == "Trade Impact Simulator":
        trade_impact_simulator()

# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

Overwriting ../src/salary_predict/updated/app.py
