In [29]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the player statistics CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='combined_players.csv')

# Function to convert percentage columns to numeric and handle errors
def convert_to_numeric(column):
    """Convert a Series of percentage strings (e.g. ``'45.5%'``) to fractions.

    A trailing ``%`` is stripped from each entry, the remainder is parsed as a
    number, and the result is divided by 100. Entries that cannot be parsed
    become NaN instead of raising.
    """
    without_percent_sign = column.str.rstrip('%')
    # errors='coerce' maps unparseable entries to NaN
    return pd.to_numeric(without_percent_sign, errors='coerce') / 100.0

# Replace each percentage-formatted stat column with its decimal-fraction form
percentage_columns = ['3P%', 'FG%', 'FT%']
df = df.assign(
    **{stat: convert_to_numeric(df[stat]) for stat in percentage_columns}
)

# Model inputs and the value to predict
features = ['3P%', 'FG%', 'FT%', '+/-', 'GmSc', 'MP']
target = 'PTS'
model_columns = features + [target]

# Discard rows that are missing any of the modelling columns
df = df.dropna(subset=model_columns)

# Coerce every modelling column to a numeric dtype; entries that cannot be
# parsed turn into NaN and are removed by the second dropna
for column_name in model_columns:
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')
df = df.dropna(subset=model_columns)

# Split the data into features (X) and target (y)
X = df[features]
y = df[target]

# Split the data into 80% train and 20% test BEFORE scaling, so that the
# scaler's mean/variance are learned from training rows only and no test-set
# statistics leak into preprocessing
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalize the features: fit on the training split, then apply the same
# transformation to the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix, kept under its original name for
# any later code that still refers to it
X_scaled = scaler.transform(X)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Function to calculate averages for a player
def calculate_averages(player_df):
    """Return the column means of ``player_df`` for the model's input stats.

    The result is a list ordered as 3P%, FG%, FT%, +/-, GmSc, MP.
    """
    ordered_columns = ('3P%', 'FG%', 'FT%', '+/-', 'GmSc', 'MP')
    return [player_df[name].mean() for name in ordered_columns]

# Per-player feature averages, keyed by player name
players = ['Jaylen Brown', 'Jayson Tatum', 'Kyrie Irving', 'Luka Dončić']
avg_data = {
    name: calculate_averages(df.loc[df['Player'] == name])
    for name in players
}

# Create a DataFrame with one row per player, using their averages as the
# assumed stat line for the next game. The rows are collected first and the
# frame is built in a single call: growing a DataFrame with pd.concat inside a
# loop copies the data on every iteration, and concatenating onto an empty
# frame is deprecated in recent pandas releases.
# Each list in avg_data is ordered like `features`, so zip pairs them up.
next_game_rows = [
    {'Player': player_name, **dict(zip(features, player_averages))}
    for player_name, player_averages in avg_data.items()
]
next_game_data = pd.DataFrame(next_game_rows, columns=['Player'] + features)

# Evaluate the model on the held-out test split.
# The predictions must be computed here: nothing earlier in this cell defines
# y_pred, so the metric calls below would otherwise raise NameError.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2): {r2}")

# Apply the already-fitted scaler to the averaged stat lines
feature_frame = next_game_data.loc[:, features]
next_game_data_scaled = scaler.transform(feature_frame)

# Ask the trained model for each player's predicted points and attach them
predicted_pts_next_game = model.predict(next_game_data_scaled)
next_game_data['Predicted_PTS'] = predicted_pts_next_game

# Show one row per player with the predicted points
predictions = next_game_data.loc[:, ['Player', 'Predicted_PTS']].reset_index(drop=True)
print(predictions)


Mean Squared Error (MSE): 8.584832058283505
Mean Absolute Error (MAE): 2.429827527482962
R-squared (R2): 0.8350796768730169
         Player  Predicted_PTS
0  Jaylen Brown      23.668850
1  Jayson Tatum      26.820100
2  Kyrie Irving      26.328407
3   Luka Dončić      32.463159
