In [285]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from textblob import TextBlob
from nba_api.stats.endpoints import leaguedashteamstats
import re

In [286]:
# List of seasons to loop through
seasons = [f"{year}-{str(year+1)[-2:]}" for year in range(2015, 2025)]

# Dictionary to hold data for each season
season_data_frames = {}

# Loop through each season and fetch data
for season in seasons:
    print(f"Fetching data for season: {season}")
    df = leaguedashteamstats.LeagueDashTeamStats(
        season=season,
        per_mode_detailed='PerGame'
    ).get_data_frames()[0]
    season_data_frames[season] = df

df_2015_2016 = season_data_frames['2015-16']
sorted_pts = df_2015_2016.sort_values('REB_RANK')
sorted_pts.tail()


Fetching data for season: 2015-16
Fetching data for season: 2016-17
Fetching data for season: 2017-18
Fetching data for season: 2018-19
Fetching data for season: 2019-20
Fetching data for season: 2020-21
Fetching data for season: 2021-22
Fetching data for season: 2022-23
Fetching data for season: 2023-24
Fetching data for season: 2024-25


Unnamed: 0,TEAM_ID,TEAM_NAME,GP,W,L,W_PCT,MIN,FGM,FGA,FG_PCT,...,REB_RANK,AST_RANK,TOV_RANK,STL_RANK,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK
29,1610612764,Washington Wizards,82,41,41,0.5,48.2,39.5,85.8,0.46,...,26,5,16,8,26,6,21,16,9,17
16,1610612749,Milwaukee Bucks,82,33,49,0.402,48.4,38.4,82.2,0.467,...,27,9,25,13,7,26,17,21,25,26
14,1610612763,Memphis Grizzlies,82,42,40,0.512,48.4,36.8,83.6,0.44,...,28,25,6,6,21,29,27,9,24,20
17,1610612750,Minnesota Timberwolves,82,29,53,0.354,48.5,37.7,81.3,0.464,...,29,8,23,14,19,17,18,7,15,24
22,1610612755,Philadelphia 76ers,82,10,72,0.122,48.3,36.2,84.0,0.431,...,30,20,29,10,3,28,25,25,29,30


In [287]:
sorted_pts.columns

Index(['TEAM_ID', 'TEAM_NAME', 'GP', 'W', 'L', 'W_PCT', 'MIN', 'FGM', 'FGA',
       'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB',
       'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD', 'PTS',
       'PLUS_MINUS', 'GP_RANK', 'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK',
       'FGM_RANK', 'FGA_RANK', 'FG_PCT_RANK', 'FG3M_RANK', 'FG3A_RANK',
       'FG3_PCT_RANK', 'FTM_RANK', 'FTA_RANK', 'FT_PCT_RANK', 'OREB_RANK',
       'DREB_RANK', 'REB_RANK', 'AST_RANK', 'TOV_RANK', 'STL_RANK', 'BLK_RANK',
       'BLKA_RANK', 'PF_RANK', 'PFD_RANK', 'PTS_RANK', 'PLUS_MINUS_RANK'],
      dtype='object')

In [288]:
team_name_to_abbr = {
    "Atlanta Hawks": "ATL",
    "Boston Celtics": "BOS",
    "Brooklyn Nets": "BKN",
    "Charlotte Hornets": "CHO",
    "Chicago Bulls": "CHI",
    "Cleveland Cavaliers": "CLE",
    "Dallas Mavericks": "DAL",
    "Denver Nuggets": "DEN",
    "Detroit Pistons": "DET",
    "Golden State Warriors": "GSW",
    "Houston Rockets": "HOU",
    "Indiana Pacers": "IND",
    "LA Clippers": "LAC",
    "Los Angeles Lakers": "LAL",
    "Memphis Grizzlies": "MEM",
    "Miami Heat": "MIA",
    "Milwaukee Bucks": "MIL",
    "Minnesota Timberwolves": "MIN",
    "New Orleans Pelicans": "NOP",
    "New York Knicks": "NYK",
    "Oklahoma City Thunder": "OKC",
    "Orlando Magic": "ORL",
    "Philadelphia 76ers": "PHI",
    "Phoenix Suns": "PHX",
    "Portland Trail Blazers": "POR",
    "Sacramento Kings": "SAC",
    "San Antonio Spurs": "SAS",
    "Toronto Raptors": "TOR",
    "Utah Jazz": "UTA",
    "Washington Wizards": "WAS"
}

for season, df in season_data_frames.items():
    if 'TEAM_NAME' not in df.columns:
        print(f"'TEAM_NAME' column missing in DataFrame for season: {season}")
    else:
        # Map team names to abbreviations
        df['TEAM_ABBREVIATION'] = df['TEAM_NAME'].map(team_name_to_abbr)
        df = df.rename(columns = {"TEAM_ABBREVIATION": "Team"}).drop(columns = ["TEAM_ID", "MIN", "GP", "W", "L"])
        
        # Update the dictionary with the modified DataFrame
        season_data_frames[season] = df

# Verify the mapping
df_2017_2018 = season_data_frames['2017-18']
df_2017_2018.head()





Unnamed: 0,TEAM_NAME,W_PCT,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,...,AST_RANK,TOV_RANK,STL_RANK,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,Team
0,Atlanta Hawks,0.293,38.2,85.5,0.446,11.2,31.0,0.36,15.8,20.2,...,8,27,12,23,28,14,14,25,26,ATL
1,Boston Celtics,0.671,38.3,85.1,0.45,11.5,30.4,0.377,16.0,20.7,...,20,15,23,18,6,17,22,20,6,BOS
2,Brooklyn Nets,0.341,38.2,86.8,0.441,12.7,35.7,0.356,17.4,22.6,...,9,25,30,16,27,23,19,14,24,BKN
3,Charlotte Hornets,0.439,39.0,86.7,0.45,10.0,27.2,0.369,20.2,27.0,...,24,3,28,18,20,2,1,10,17,CHO
4,Chicago Bulls,0.329,38.7,88.8,0.435,11.0,31.1,0.355,14.6,19.2,...,10,13,18,30,24,9,29,26,29,CHI


In [289]:
playoff_results = pd.read_csv('data/playoff_results.csv') 
playoff_results.tail()

Unnamed: 0,Yr,Lg,Series,Unnamed: 3,Unnamed: 4,Team,W,Unnamed: 7,Team.1,W.1,Unnamed: 10,Favorite,Underdog
130,2016,NBA,Western Conf Semifinals,May 1 - May 11 2016,,Golden State Warriors (1),4,,Portland Trail Blazers (5),1,,GSW (-1600),POR (+1050)
131,2016,NBA,Western Conf Semifinals,Apr 30 - May 12 2016,,Oklahoma City Thunder (3),4,,San Antonio Spurs (2),2,,SAS (-270),OKC (+230)
132,2016,NBA,Eastern Conf Finals,May 17 - May 27 2016,,Cleveland Cavaliers (1),4,,Toronto Raptors (2),2,,CLE (-1350),TOR (+885)
133,2016,NBA,Western Conf Finals,May 16 - May 30 2016,,Golden State Warriors (1),4,,Oklahoma City Thunder (3),3,,GSW (-430),OKC (+345)
134,2016,NBA,Finals,Jun 2 - Jun 19 2016,,Cleveland Cavaliers (1),4,,Golden State Warriors (1),3,,GSW (-220),CLE (+180)


In [290]:
relabeled_dict = {
    "Team": "Team1",
    "Team.1": "Team2",
    "W": "Team1wins",
    "W.1": "Team2wins"
}

playoff_results = playoff_results.rename(columns=relabeled_dict)
playoff_results.head()

Unnamed: 0,Yr,Lg,Series,Unnamed: 3,Unnamed: 4,Team1,Team1wins,Unnamed: 7,Team2,Team2wins,Unnamed: 10,Favorite,Underdog
0,2024,NBA,Eastern Conf First Round,Apr 21 - May 1 2024,,Boston Celtics (1),4,,Miami Heat (8),1,,BOS (-10000),MIA (+2000)
1,2024,NBA,Eastern Conf First Round,Apr 20 - May 5 2024,,Cleveland Cavaliers (4),4,,Orlando Magic (5),3,,CLE (-210),ORL (+180)
2,2024,NBA,Eastern Conf First Round,Apr 21 - May 2 2024,,Indiana Pacers (6),4,,Milwaukee Bucks (3),2,,IND (-125),MIL (+105)
3,2024,NBA,Eastern Conf First Round,Apr 20 - May 2 2024,,New York Knicks (2),4,,Philadelphia 76ers (7),2,,NYK (-140),PHI (+120)
4,2024,NBA,Western Conf First Round,Apr 21 - Apr 29 2024,,Oklahoma City Thunder (1),4,,New Orleans Pelicans (8),0,,OKC (-700),NOP (+500)


In [291]:
def clean_and_extract_team_id(team_name):
    # Remove seed numbers and parentheses
    cleaned_name = re.sub(r'\s\(\d+\)', '', team_name)
    # Extract first three letters and convert to uppercase
    team_id = cleaned_name[:3].upper()
    return team_id



playoff_results['Team1'] = playoff_results['Team1'].apply(clean_and_extract_team_id)
playoff_results['Team2'] = playoff_results['Team2'].apply(clean_and_extract_team_id) 
playoff_results.tail(15)

Unnamed: 0,Yr,Lg,Series,Unnamed: 3,Unnamed: 4,Team1,Team1wins,Unnamed: 7,Team2,Team2wins,Unnamed: 10,Favorite,Underdog
120,2016,NBA,Eastern Conf First Round,Apr 17 - Apr 24 2016,,CLE,4,,DET,0,,CLE (-1700),DET (+1100)
121,2016,NBA,Eastern Conf First Round,Apr 16 - May 1 2016,,TOR,4,,IND,3,,TOR (-400),IND (+325)
122,2016,NBA,Eastern Conf First Round,Apr 17 - May 1 2016,,MIA,4,,CHA,3,,MIA (-170),CHO (+150)
123,2016,NBA,Eastern Conf First Round,Apr 16 - Apr 28 2016,,ATL,4,,BOS,2,,ATL (-165),BOS (+145)
124,2016,NBA,Western Conf First Round,Apr 16 - Apr 27 2016,,GOL,4,,HOU,1,,GSW (-10000),HOU (+4000)
125,2016,NBA,Western Conf First Round,Apr 17 - Apr 24 2016,,SAN,4,,MEM,0,,SAS (-17000),MEM (+6625)
126,2016,NBA,Western Conf First Round,Apr 16 - Apr 25 2016,,OKL,4,,DAL,1,,OKC (-3500),DAL (+1750)
127,2016,NBA,Western Conf First Round,Apr 17 - Apr 29 2016,,POR,4,,LOS,2,,LAC (-525),POR (+415)
128,2016,NBA,Eastern Conf Semifinals,May 2 - May 8 2016,,CLE,4,,ATL,0,,CLE (-500),ATL (+400)
129,2016,NBA,Eastern Conf Semifinals,May 3 - May 15 2016,,TOR,4,,MIA,3,,TOR (-145),MIA (+125)


In [292]:
# Define the mapping of teams
team_replacements = {
    'LOS': 'LAC',
    'CHA': 'CHO',
    'GOL': 'GSW',
    'BRO': 'BKN',
    'OKL': 'OKC',
    'NEW': 'NYK',
    'PHO': 'PHX'
}

# Replace values in 'Team1' and 'Team2' columns
playoff_results['Team1'] = playoff_results['Team1'].replace(team_replacements)
playoff_results['Team2'] = playoff_results['Team2'].replace(team_replacements)

# Print the unique values in 'Team1' to verify the changes
print(playoff_results['Team1'].unique())


['BOS' 'CLE' 'IND' 'NYK' 'OKC' 'DAL' 'MIN' 'DEN' 'MIA' 'PHI' 'PHX' 'GSW'
 'LAC' 'MIL' 'MEM' 'BKN' 'UTA' 'ATL' 'TOR' 'HOU' 'POR' 'WAS' 'SAN']


In [293]:
playoff_results["Year"] = playoff_results["Unnamed: 3"].str.extract(r"(\d{4})").astype(int)

# Step 2: Reshape the data to combine Team1 and Team2 information
team1_data = playoff_results[["Year", "Team1", "Team1wins"]].rename(columns={"Team1": "Team", "Team1wins": "Wins"})
team2_data = playoff_results[["Year", "Team2", "Team2wins"]].rename(columns={"Team2": "Team", "Team2wins": "Wins"})

# Combine both datasets
all_teams = pd.concat([team1_data, team2_data])

# Step 3: Group by year and team to calculate total wins
team_wins_by_year = all_teams.groupby(["Year", "Team"], as_index=False)["Wins"].sum()
team_wins_by_year.sample(20)

Unnamed: 0,Year,Team,Wins
1,2016,BOS,2
98,2022,DAL,9
36,2018,IND,3
76,2020,POR,1
65,2020,BOS,10
137,2024,PHI,2
64,2020,BKN,0
126,2024,CLE,5
138,2024,PHX,0
7,2016,HOU,1


In [294]:
# Create an empty dictionary to store the merged data for each season
merged_season_data = {}

# Iterate over the seasons and merge playoff wins with team stats
for season in available_seasons:
    # Extract the year (e.g., "2021" from "2021-22")
    season_year = season.split('-')[0]

    # Filter playoff wins for the current season from team_wins_by_year
    playoff_wins_season = team_wins_by_year[team_wins_by_year['Year'] == int(season_year)]
    
    if not playoff_wins_season.empty:
        # Get the team stats for the current season
        df = season_data_frames[season]

        # Merge playoff wins with team stats
        df_merged = pd.merge(df, playoff_wins_season[['Team', 'Wins']], how='left', left_on='Team', right_on='Team')

        # Store the merged data in the dictionary
        merged_season_data[season] = df_merged

        # Display the merged data for the season
        print(f"Updated Merged Data for {season} season:")
        print(df_merged.head())  # Print the top rows of the merged data to verify
    else:
        print(f"No playoff win data found for season {season}")



Updated Merged Data for 2016-17 season:
           TEAM_NAME  W_PCT   FGM   FGA  FG_PCT  FG3M  FG3A  FG3_PCT   FTM  \
0      Atlanta Hawks  0.524  38.1  84.4   0.451   8.9  26.1    0.341  18.1   
1     Boston Celtics  0.646  38.6  85.1   0.454  12.0  33.4    0.359  18.7   
2      Brooklyn Nets  0.244  37.8  85.2   0.444  10.7  31.6    0.338  19.4   
3  Charlotte Hornets  0.439  37.7  85.4   0.442  10.0  28.6    0.351  19.4   
4      Chicago Bulls  0.500  38.6  87.1   0.444   7.6  22.3    0.340  18.0   

    FTA  ...  TOV_RANK  STL_RANK  BLK_RANK  BLKA_RANK  PF_RANK  PFD_RANK  \
0  24.9  ...        28         7        14         22        6         3   
1  23.2  ...         8        18        23         23       21        11   
2  24.6  ...        29        21        18         30       25         9   
3  23.8  ...         1        27        17         28        1        15   
4  22.5  ...        12        15        16         12        2        25   

   PTS_RANK  PLUS_MINUS_RANK  Team

In [295]:
# Initialize an empty dictionary to store the correlation tables for each season
season_correlations = {}

# Loop through each season's data in the season_data_frames
for season, season_data in season_data_frames.items():
    # Merge the season data with the playoff wins for the corresponding year
    season_year = int(season.split('-')[0]) + 1  # Extract the year (e.g., '2016-17' -> 2017)
    playoff_wins_season = team_wins_by_year[team_wins_by_year['Year'] == season_year]
    df_merged = pd.merge(season_data, playoff_wins_season[['Team', 'Wins']], how='left', left_on='Team', right_on='Team')
    
    # Select only numeric columns for correlation computation
    df_numeric = df_merged.select_dtypes(include=[float, int])

    # Calculate the correlation matrix for the numeric columns
    correlation_matrix = df_numeric.corr()

    # Get the correlation of each feature with Playoff_Wins (assuming the column is named 'Wins')
    playoff_wins_correlation = correlation_matrix['Wins'].sort_values(ascending=False)

    # Store the correlation table for the season
    season_correlations[season] = playoff_wins_correlation

# Display the correlations for all seasons
# Loop through the dictionary and drop the "Wins" entry from each correlation table (Series)
for season, correlation_table in season_correlations.items():
    correlation_table = correlation_table.drop('Wins')  # Drop the 'Wins' entry
    season_correlations[season] = correlation_table
    print(f"Correlation Table for {season}:")
    print(correlation_table)
    print("\n")



Correlation Table for 2015-16:
W_PCT              0.836893
PLUS_MINUS         0.824299
FG_PCT             0.717308
FG3_PCT            0.624916
FGM                0.601993
DREB               0.568975
PTS                0.557585
REB                0.458896
AST                0.376497
FG3M               0.349493
BLK                0.347824
FTA_RANK           0.347435
FTM_RANK           0.312541
STL_RANK           0.311053
PFD_RANK           0.305498
TOV_RANK           0.288676
TOV                0.262638
FT_PCT             0.206585
FG3A               0.171940
MIN_RANK           0.059238
OREB               0.045726
FGA                0.020382
FT_PCT_RANK        0.017062
OREB_RANK          0.016135
PF                -0.066070
FGA_RANK          -0.078287
FG3A_RANK         -0.121332
PF_RANK           -0.124695
FTM               -0.178282
FG3M_RANK         -0.203622
BLKA              -0.225720
FTA               -0.298295
AST_RANK          -0.318484
PFD               -0.322317
STL              

In [296]:
for season, correlation_table in season_correlations.items():
  print(f"most important features for {season}:")
  print(correlation_table.head(7))
  print("\n")

most important features for 2015-16:
W_PCT         0.836893
PLUS_MINUS    0.824299
FG_PCT        0.717308
FG3_PCT       0.624916
FGM           0.601993
DREB          0.568975
PTS           0.557585
Name: Wins, dtype: float64


most important features for 2016-17:
W_PCT         0.839129
PLUS_MINUS    0.773503
FG3M          0.669983
PTS           0.668914
AST           0.661640
FGM           0.608016
FG3A          0.586860
Name: Wins, dtype: float64


most important features for 2017-18:
FG3_PCT      0.645146
MIN_RANK     0.615397
OREB_RANK    0.566101
FG3M         0.561717
PTS          0.523472
W_PCT        0.511614
FGA_RANK     0.496020
Name: Wins, dtype: float64


most important features for 2018-19:
W_PCT         0.847822
PLUS_MINUS    0.708557
PTS           0.661446
FT_PCT        0.633279
FGM           0.629078
FG_PCT        0.623721
BLK           0.489866
Name: Wins, dtype: float64


most important features for 2019-20:
FTM           0.564891
PFD           0.562837
FTA           0.

In [297]:
season_data_frames['2024-25'].head()

Unnamed: 0,TEAM_NAME,W_PCT,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,...,AST_RANK,TOV_RANK,STL_RANK,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,Team
0,Atlanta Hawks,0.5,42.8,92.1,0.465,13.1,36.8,0.354,18.6,23.7,...,2,23,2,11,19,13,6,6,21,ATL
1,Boston Celtics,0.711,41.6,90.6,0.46,18.0,49.5,0.364,16.7,21.0,...,17,2,22,6,4,1,15,5,3,BOS
2,Brooklyn Nets,0.342,38.0,84.9,0.447,14.3,39.3,0.364,16.9,20.9,...,18,20,28,30,26,29,10,28,25,BKN
3,Charlotte Hornets,0.229,38.2,89.9,0.425,14.3,41.9,0.341,15.4,19.9,...,27,24,18,18,20,26,12,29,24,CHO
4,Chicago Bulls,0.474,43.4,92.3,0.471,16.4,43.8,0.375,14.8,18.7,...,4,19,21,21,16,8,30,4,21,CHI


In [298]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


def preprocess_data(season_data, top_features, is_future=False):
    season_data = season_data.fillna(0)  # Replace NaN with 0 in the wins col
    
    if not is_future:
        y = season_data['Wins']  # Use 'Wins' column for past seasons
        season_data = season_data.drop(columns=['Wins'])  # Drop 'Wins' for features
    else:
        y = None  # No 'Wins' for future seasons (like 2024-25)
    
    X = season_data[top_features]  # use only most correlated  features
    return X, y

def train_season_model(season_data, top_features, is_future=False):
    X, y = preprocess_data(season_data, top_features, is_future)
    scaler = StandardScaler() # standardizing features 
    
    if y is not None:
        # Split data into training and test
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Fit the scaler on the training data
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)  # Use the same scaler for the test set
        
        # Create and train the model
        model = LinearRegression()
        model.fit(X_train_scaled, y_train)
        
        # Make predictions and calculate MSE
        y_pred = model.predict(X_test_scaled)
        mse = mean_squared_error(y_test, y_pred)
        print(f"Season Model Performance (MSE): {mse}")
    else:
        model = LinearRegression() #might have to change later to diff regression
    
    return model, scaler


available_seasons = ['2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24']
top_features = ['W_PCT', 'PLUS_MINUS', 'FG_PCT', 'FG3M', 'PTS', 'AST']

# Train models for each available season (excluding 2015-16 season)
models = {}  # Store the models and scalers for each season
for season in available_seasons:
    if season != '2015-16': 
        print(f"Training model for {season} season")
        season_data = merged_season_data[season]  # Access  data for the current season
        model, scaler = train_season_model(season_data, top_features, is_future=False) 
        models[season] = {'model': model, 'scaler': scaler}


#now predicitng 2025

X_2024_25 = season_data_frames['2024-25'][top_features]

# Use the most recent model (trained on 2023-24 season)
latest_model = models['2023-24']['model']
latest_scaler = models['2023-24']['scaler']


predicted_wins_2024_25 = latest_model.predict(X_2024_25_scaled)
predicted_wins_2024_25 = [max(0, win) for win in predicted_wins_2024_25] # ensures no neg playoff wins (min is 0)

# Combine predictions with team name
predictions = pd.DataFrame({
    'Team': season_data_frames['2024-25']['TEAM_NAME'],  
    'Predicted Playoff Wins': predicted_wins_2024_25
})

# Display the predictions
print(predictions)

Training model for 2016-17 season
Season Model Performance (MSE): 69.67831754587708
Training model for 2017-18 season
Season Model Performance (MSE): 25.616323035741488
Training model for 2018-19 season
Season Model Performance (MSE): 26.533485003377606
Training model for 2019-20 season
Season Model Performance (MSE): 62.62409665274072
Training model for 2020-21 season
Season Model Performance (MSE): 27.877913873549165
Training model for 2021-22 season
Season Model Performance (MSE): 33.79827785775375
Training model for 2022-23 season
Season Model Performance (MSE): 18.281717091217654
Training model for 2023-24 season
Season Model Performance (MSE): 20.159317187356574
                      Team  Predicted Playoff Wins
0            Atlanta Hawks                5.763707
1           Boston Celtics                6.715792
2            Brooklyn Nets                1.868912
3        Charlotte Hornets                0.000000
4            Chicago Bulls                3.363729
5      Cleveland 

In [299]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


def preprocess_combined_data(combined_season_data, selected_features):
    """
    Prepares data for training by splitting into features and target and applying scaling.
    """
    combined_season_data = combined_season_data.fillna(0)  # Replace NaN with 0
    
    # Target variable
    y_combined = combined_season_data['Wins']  # Use 'Wins' column as the target
    X_combined = combined_season_data[selected_features]  # Use only selected features for training

    return X_combined, y_combined


def train_combined_model(combined_season_data, selected_features):
    """
    Trains a model using data from all seasons combined.
    """
    X_combined, y_combined = preprocess_combined_data(combined_season_data, selected_features)
    
    # Split into training and test sets
    X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(
        X_combined, y_combined, test_size=0.2, random_state=42
    )
    
    # Standardize features
    scaler_combined = StandardScaler()
    X_train_scaled_combined = scaler_combined.fit_transform(X_train_combined)
    X_test_scaled_combined = scaler_combined.transform(X_test_combined)
    
    # Train the model
    model_combined = LinearRegression()
    model_combined.fit(X_train_scaled_combined, y_train_combined)
    
    # Evaluate model performance
    y_pred_combined = model_combined.predict(X_test_scaled_combined)
    mse_combined = mean_squared_error(y_test_combined, y_pred_combined)
    print(f"Combined Model Performance (MSE): {mse_combined}")
    
    return model_combined, scaler_combined


# Step 1: Combine all season data
all_season_data = pd.concat([merged_season_data[season] for season in available_seasons], axis=0)

# Step 2: Select common features across seasons (manually chosen or based on importance rankings)
selected_features_combined = ['W_PCT', 'PLUS_MINUS', 'FG3M', 'PTS', 'FG_PCT', 'AST', 'FG3_PCT', ]

# Step 3: Train the combined model
print("Training combined model with data from all seasons...")
combined_model_v1, combined_scaler_v1 = train_combined_model(all_season_data, selected_features_combined)

# Step 4: Predict playoff wins for the 2024-25 season using the combined model
X_2024_25_combined = season_data_frames['2024-25'][selected_features_combined]
X_2024_25_scaled_combined = combined_scaler_v1.transform(X_2024_25_combined)  # Scale features

# Generate predictions
predicted_wins_2024_25_combined = combined_model_v1.predict(X_2024_25_scaled_combined)
predicted_wins_2024_25_combined = [max(0, win) for win in predicted_wins_2024_25_combined]  # Ensure no negative values

# Combine predictions with team names
predictions_combined = pd.DataFrame({
    'Team': season_data_frames['2024-25']['TEAM_NAME'],
    'Predicted Playoff Wins': predicted_wins_2024_25_combined
})

# Display predictions
print(predictions_combined)


Training combined model with data from all seasons...
Combined Model Performance (MSE): 15.303912423079003
                      Team  Predicted Playoff Wins
0            Atlanta Hawks                2.907338
1           Boston Celtics                7.996623
2            Brooklyn Nets                4.417673
3        Charlotte Hornets                1.712157
4            Chicago Bulls                5.803665
5      Cleveland Cavaliers                9.994447
6         Dallas Mavericks                3.867761
7           Denver Nuggets                2.893950
8          Detroit Pistons                4.832239
9    Golden State Warriors                5.232766
10         Houston Rockets                2.621506
11          Indiana Pacers                4.210061
12             LA Clippers                4.559336
13      Los Angeles Lakers                4.055219
14       Memphis Grizzlies                3.314197
15              Miami Heat                5.170723
16         Milwaukee Bucks

In [300]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict

def train_and_evaluate_rf_model(all_season_data, selected_features, n_estimators=100, max_depth=None):
    """
    Train and evaluate a Random Forest model on combined data from all seasons.
    Also generates predictions for each season in the dataset.
    """
    # Prepare combined training data
    X_combined, y_combined = preprocess_combined_data(all_season_data, selected_features)
    
    # Standardize features (optional for Random Forest)
    scaler_rf = StandardScaler()
    X_scaled = scaler_rf.fit_transform(X_combined)
    
    # Train Random Forest model
    rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    rf_model.fit(X_scaled, y_combined)
    
    # Cross-validation predictions for combined data
    cross_val_predictions = cross_val_predict(rf_model, X_scaled, y_combined, cv=5)
    avg_mse_rf = ((cross_val_predictions - y_combined) ** 2).mean()  # Mean Squared Error
    print(f"Combined Random Forest Model Performance (Avg MSE): {avg_mse_rf}")
    
    # Generate predictions for each season
    season_predictions = {}
    for season, data_frame in merged_season_data.items():
        X_season = data_frame[selected_features]
        y_season = data_frame['Wins']
        X_season_scaled = scaler_rf.transform(X_season)
        season_predictions[season] = {
            'Team': data_frame['TEAM_NAME'],
            'Actual Wins': y_season,
            'Predicted Wins': rf_model.predict(X_season_scaled)
        }
    
    return rf_model, scaler_rf, season_predictions

# Train and evaluate the Random Forest model
print("Training and evaluating combined Random Forest model with data from all seasons...")
rf_model_combined, rf_scaler_combined, all_season_predictions = train_and_evaluate_rf_model(
    all_season_data, selected_features_combined, n_estimators=500, max_depth=10
)

# Display predictions for each season
for season, results in all_season_predictions.items():
    predictions_df = pd.DataFrame(results)
    print(f"\nPredictions for {season}:")
    print(predictions_df)


Training and evaluating combined Random Forest model with data from all seasons...
Combined Random Forest Model Performance (Avg MSE): 16.69586617854968

Predictions for 2016-17:
                      Team  Actual Wins  Predicted Wins
0            Atlanta Hawks          4.0        3.354355
1           Boston Celtics          2.0        3.607204
2            Brooklyn Nets          NaN        0.642369
3        Charlotte Hornets          3.0        1.770838
4            Chicago Bulls          NaN        1.113091
5      Cleveland Cavaliers         16.0       11.867480
6         Dallas Mavericks          1.0        2.224898
7           Denver Nuggets          NaN        0.405317
8          Detroit Pistons          0.0        0.684479
9    Golden State Warriors         15.0       14.070000
10         Houston Rockets          1.0        2.947752
11          Indiana Pacers          3.0        3.145073
12             LA Clippers          2.0        2.331922
13      Los Angeles Lakers          N