In [None]:
from nba_api.stats.endpoints import playercareerstats
from nba_api.stats.static import players
import pandas as pd
from tqdm import tqdm
from requests.exceptions import ReadTimeout


#obtaining data from nba_api, standard json access and conversion code
def fetch_player_career_stats(player_id, timeout=None, max_retries=3, backoff=1.0):
    """Fetch one player's career stats table from nba_api, retrying on read timeouts.

    Args:
        player_id: Player identifier passed through to ``PlayerCareerStats``.
        timeout: Request timeout forwarded to ``PlayerCareerStats`` (``None``
            leaves the library default in place).
        max_retries: Maximum number of attempts before giving up.
        backoff: Base pause in seconds between attempts; the pause grows
            linearly with the attempt number. Use ``0`` to retry immediately.

    Returns:
        The first DataFrame returned by ``get_data_frames()``.

    Raises:
        Exception: If every attempt ends in a ``ReadTimeout``. The last timeout
            is attached as ``__cause__`` so the root cause is not lost.
    """
    import time  # local import: the notebook's import cell does not bring in `time`

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            player_career = playercareerstats.PlayerCareerStats(
                player_id=player_id,
                timeout=timeout
            )
            return player_career.get_data_frames()[0]
        except ReadTimeout as e:
            last_error = e
            print(f"Read timeout. Retrying... (Attempt {attempt}/{max_retries})")
            # Pause before the next attempt so a struggling endpoint is not hit
            # again immediately; no pause after the final attempt.
            if attempt < max_retries and backoff > 0:
                time.sleep(backoff * attempt)
    raise Exception("Failed to fetch data after multiple retries.") from last_error

# Per-request timeout (seconds) handed to fetch_player_career_stats
timeout = 100

# Every player known to nba_api's static player list
all_players = players.get_players()

# One filtered DataFrame per player that has at least one season in range
career_dfs = []

# tqdm shows progress while the per-player requests run
for player in tqdm(all_players, desc="Processing Players"):
    player_id = player['id']

    # A failure for one player is reported and skipped so the rest still run
    try:
        career_df = fetch_player_career_stats(
            player_id=player_id,
            timeout=timeout
        )
    except Exception as e:
        print(f"Error fetching data for Player ID {player_id}: {e}")
        continue

    # Seasons inside the 2018-2023 window. SEASON_ID is compared as a string, so
    # the comparison is lexicographic.
    # NOTE(review): an ID such as '2023-24' sorts after '2023' and is therefore
    # excluded -- confirm that is the intended upper bound.
    mask = (career_df['SEASON_ID'] >= '2018') & (career_df['SEASON_ID'] <= '2023')
    if not mask.any():
        continue

    career_df_filtered = career_df.loc[mask].copy()

    # Tag each row with the player it belongs to
    career_df_filtered['Player_ID'] = player_id
    career_df_filtered['Player_Name'] = player['full_name']

    career_dfs.append(career_df_filtered)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that explains what actually went wrong instead.
if not career_dfs:
    raise ValueError("No player seasons were collected for 2018-2023; nothing to save.")

# Combine all per-player frames into a single table
all_players_df = pd.concat(career_dfs, ignore_index=True)

# Save to CSV. The directory and file name need an explicit separator; plain
# concatenation used to produce "/content/dataall_players_...csv".
csv_filename = 'all_players_career_stats_2018_2023.csv'
csv_filepath = "/content/data" + "/" + csv_filename
all_players_df.to_csv(csv_filepath, index=False)

print(f"CSV file saved to: {csv_filepath}")


ModuleNotFoundError: No module named 'nba_api'

In [None]:
###### FEATURE SELECTION ######
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def custom_feature_selection(data_path, target_variable='FG3M', n_features_to_select=9):
    """Pick features by correlation with the target, refine them with RFE, and split the data.

    The CSV at ``data_path`` is loaded and trimmed to columns ``5:-2``. The nine
    columns with the highest absolute correlation to ``target_variable`` become
    the candidate features, and recursive feature elimination with a linear
    regression keeps ``n_features_to_select`` of them.

    Args:
        data_path: Path of the CSV file to read.
        target_variable: Name of the column to predict.
        n_features_to_select: Number of candidate features RFE should keep.
            With the default of 9 all nine candidates are retained.

    Returns:
        Tuple ``(X_train_selected, X_test, y_train, y_test)``. ``X_train_selected``
        is the array produced by ``RFE.fit_transform`` on the 70% training split.
        ``X_test`` is a DataFrame holding the same selected columns for the 15%
        test split. ``y_train`` and ``y_test`` are the matching target arrays.
    """
    players_data = pd.read_csv(data_path)
    players_data = players_data.iloc[:, 5:-2]

    # Rank every column by absolute correlation with the target
    correlations = players_data.corr()[target_variable].abs().sort_values(ascending=False)

    # Position 0 is expected to be the target itself (self-correlation of 1),
    # so the candidates are the next nine entries.
    selected_features = correlations.index[1:10]

    X = players_data[selected_features]
    y = players_data[target_variable].values

    # 70% train, then the remaining 30% is halved; only the second half (test)
    # is returned, so the other half is discarded here.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    _, X_test, _, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=42
    )

    # Recursive feature elimination driven by a linear model
    model = LinearRegression()
    feature_selector = RFE(model, n_features_to_select=n_features_to_select)
    X_train_selected = feature_selector.fit_transform(X_train, y_train)
    selected_features_rfe = X.columns[feature_selector.support_]

    # Keep the test split on exactly the columns the training array contains,
    # otherwise the two diverge as soon as RFE drops a feature.
    X_test = X_test.loc[:, feature_selector.support_]

    # Report every selected feature; the label now reflects the real count
    # (the old "[1:10]" slice silently hid the first one).
    print(f"Top {len(selected_features_rfe)} Correlated Features:", list(selected_features_rfe))

    return X_train_selected, X_test, y_train, y_test


In [None]:
###### BASELINE MODEL ######
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

# Build the training and held-out splits. Nothing earlier in the notebook
# defined X_train_rfe / X_val / y_val, so this cell used to fail with a
# NameError. custom_feature_selection returns a training array plus one
# held-out split, and that split is used as the validation set here.
# NOTE(review): the CSV name mirrors the one read by the RNN cell -- confirm it
# is the intended input for the baseline too.
baseline_data_path = 'data_2018_2023.csv'
X_train_rfe, X_val, y_train, y_val = custom_feature_selection(baseline_data_path)

# The training features come back as a plain array; give the model the held-out
# features in the same form so fit and predict see matching input types.
if hasattr(X_val, "to_numpy"):
    X_val = X_val.to_numpy()

# Random Forest Regressor model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on the selected training features
rf_regressor.fit(X_train_rfe, y_train)

# Predict on the held-out (validation) features
y_pred_val = rf_regressor.predict(X_val)

# Evaluate the model on the validation set
mse_val = mean_squared_error(y_val, y_pred_val)
r2_val = r2_score(y_val, y_pred_val)

print("Mean Squared Error (MSE) on Validation Set:", mse_val)
print("R-squared (R2) on Validation Set:", r2_val)

# Plot predictions against the ground truth. The red points plot y against
# itself, i.e. they trace the y = x line a perfect model would fall on.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_val, y_pred_val, color='blue', alpha=0.5, label='Predicted')
ax.scatter(y_val, y_val, color='red', alpha=0.5, label='Actual')
ax.set_title('Actual vs. Predicted Values (Validation Set)')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.legend()
ax.grid(True)
plt.show()

NameError: name 'X_train_rfe' is not defined

In [None]:
###### RNN-LSTM MODEL ######
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt


# class LSTM_next(nn.Module):
#   def __init__(self, input_size, hidden_size, output_size, num_layers):
#     super(LSTM, self).__init__()
#     self.hidden_size = hidden_size
#     self.input_size = input_size
#     self.output_size = output_size
#     self.num_layers = num_layers

#     self.lstm = nn.LSTM(input_size, hidden_size, num_layers = 1, batch_first = True)
#     self.fc1 =  nn.Linear(hidden_size, num_layers)

#   def forward(self, x):
#     h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_()
#     c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_()
#     out, (hn, cn) = self.lstm(x, (h0, c0))
#     hn = hn[-1, :, :]
#     out = self.fc1(hn)
#     return out

class LSTM(nn.Module):
  """Recurrent regressor: an ``nn.RNN`` followed by a linear read-out.

  Despite the class name, the recurrent layer is a plain ``nn.RNN``.

  Args:
    input_size: Number of features per time step.
    hidden_size: Width of the recurrent hidden state.
    num_layers: Number of stacked recurrent layers.
    output_size: Number of values predicted per sample (default 1).
  """

  def __init__(self, input_size, hidden_size, num_layers, output_size=1):
    super(LSTM, self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers

    self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers, batch_first=True)
    # The read-out width is the number of predicted values, not the RNN depth.
    self.fc1 = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    """Return predictions of shape ``(batch, output_size)``.

    ``x`` may be ``(batch, seq_len, input_size)`` or ``(batch, input_size)``.
    A 2-D input is treated as a batch of length-1 sequences, because nn.RNN
    would otherwise read it as one unbatched sequence and reject the 3-D
    initial hidden state.
    """
    if x.dim() == 2:
      x = x.unsqueeze(1)
    # Zero initial state, created to match the input's dtype and device.
    h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size,
                     dtype=x.dtype, device=x.device)
    out, _ = self.rnn(x, h0)
    # Read out from the last time step of the top layer.
    return self.fc1(out[:, -1, :])

# Instantiation: model hyper-parameters and the network itself
input_size, hidden_size = 9, 1
output_size = 1
num_layers = 1

model1 = LSTM(input_size, hidden_size, num_layers)

# Data preparation: load the CSV and keep columns 5 up to (not including) the last two
data_path = 'data_2018_2023.csv'
players_data = pd.read_csv(data_path).iloc[:, 5:-2]

# Column to be predicted
target_variable = 'FG3M'

# Absolute correlation of every column with the target, strongest first
correlations = (
    players_data.corr()[target_variable]
    .abs()
    .sort_values(ascending=False)
)

# Entries 1..9 of the ranking are used as the input features
selected_features = correlations.index[1:10]

# Feature frame and target array
X = players_data[selected_features]
y = players_data[target_variable].values
print(X)
print(y)

# 70% train, with the remaining 30% divided equally into validation and test
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
x_val, x_test, y_val, y_test = train_test_split(
    x_test, y_test, test_size=0.5, random_state=42
)

# Torch tensors for the train and validation splits
x_train_tensor = torch.Tensor(x_train.to_numpy())
x_val_tensor = torch.Tensor(x_val.to_numpy())
y_train_tensor = torch.Tensor(y_train)
y_val_tensor = torch.Tensor(y_val)

batch_size = 64
epochs = 4

# Mini-batch loaders: training batches are shuffled, validation batches are not
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(x_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


# training


def _match_target_shape(pred, target):
  """Drop a trailing size-1 dimension from ``pred`` when ``target`` lacks it."""
  if pred.dim() == target.dim() + 1 and pred.size(-1) == 1:
    return pred.squeeze(-1)
  return pred


def training(model, criterion, optimizer, train_loader, val_loader, epochs, lr):
  """Train ``model`` and report the mean train and validation loss each epoch.

  Args:
    model: Module to optimise; it is called on every batch of inputs.
    criterion: Loss function applied as ``criterion(prediction, target)``.
    optimizer: Optimiser already bound to ``model``'s parameters.
    train_loader: Iterable of ``(inputs, targets)`` training batches.
    val_loader: Iterable of ``(inputs, targets)`` validation batches.
    epochs: Number of passes over ``train_loader``.
    lr: Unused here (the optimiser already carries its learning rate); kept
      so existing calls that pass it keep working.

  Returns:
    Dict with the per-epoch averages under ``"train_loss"`` and ``"val_loss"``.
  """
  history = {"train_loss": [], "val_loss": []}

  for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for x_batch, y_batch in train_loader:
      optimizer.zero_grad()
      # Use the model passed in rather than a notebook-level global.
      # (batch, 1) predictions are aligned with (batch,) targets so the loss
      # does not broadcast them into a (batch, batch) comparison.
      pred = _match_target_shape(model(x_batch), y_batch)
      loss = criterion(pred, y_batch)
      loss.backward()
      optimizer.step()
      total_loss += loss.item()

    # max(..., 1) keeps an empty loader from dividing by zero
    avg_loss = total_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

    # Validation: accumulator reset every epoch, no gradient tracking
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
      for x_batch, y_batch in val_loader:
        pred = _match_target_shape(model(x_batch), y_batch)
        val_loss += criterion(pred, y_batch).item()

    avg_val_loss = val_loss / max(len(val_loader), 1)
    print(f"Validation Loss: {avg_val_loss:.4f}")

    history["train_loss"].append(avg_loss)
    history["val_loss"].append(avg_val_loss)

  return history

# Optimisation setup: mean-squared-error loss and Adam over model1's parameters
lr = 0.001
criterion = nn.MSELoss()
optimizer = optim.Adam(model1.parameters(), lr=lr)

# Run four epochs of training with per-epoch validation
training(
    model1,
    criterion,
    optimizer,
    train_loader,
    val_loader,
    epochs=4,
    lr=lr,
)







      FG3A  FGA  PTS     MIN  FGM  TOV  STL  GP  GS
0      127  157  165   588.0   56   14   17  31   2
1        1  228  304   737.0  124   43   20  61   4
2      156  603  664  1725.0  265   84   37  73  28
3      108  404  508  1141.0  196   59   31  55  12
4       15   18   17   123.0    4    4    1  10   0
...    ...  ...  ...     ...  ...  ...  ...  ..  ..
3550     0  379  525  1039.0  212   70   14  59  37
3551     2  385  596  1326.0  236   61   16  72  70
3552     4  394  650  1609.0  257   81   24  72  33
3553     0  495  785  1852.0  310  114   36  76  76
3554     2  514  818  2169.0  326  117   29  76  76

[3555 rows x 9 columns]
[41  0 56 ...  1  0  0]


RuntimeError: For unbatched 2-D input, hx should also be 2-D but got 3-D tensor