# Website Notebook

In [None]:
# Standard imports
# If any of these don't work, try doing `pip install _____`, or try looking up the error message.
import numpy as np
import pandas as pd
import json
import time
import os.path
from os import path
import math
import datetime
import unidecode
import requests
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Modules from sportsreference.ncaab for college basketball
from sportsreference.ncaab.boxscore import Boxscore as NCAAB_Boxscore
from sportsreference.ncaab.conferences import Conferences as NCAAB_Conferences
from sportsreference.ncaab.rankings import Rankings as NCAAB_Rankings
from sportsreference.ncaab.roster import Player as NCAAB_Player
from sportsreference.ncaab.roster import Roster as NCAAB_Roster
from sportsreference.ncaab.schedule import Schedule as NCAAB_Schedule
from sportsreference.ncaab.teams import Teams as NCAAB_Teams

# Modules from sportsreference.nba for NBA basketball
from sportsreference.nba.boxscore import Boxscore as NBA_Boxscore
from sportsreference.nba.roster import Player as NBA_Player
from sportsreference.nba.roster import Roster as NBA_Roster
from sportsreference.nba.schedule import Schedule as NBA_Schedule
from sportsreference.nba.teams import Teams as NBA_Teams

# Introduction

Hi everyone! In this notebook, we will be constructing a variety of different machine learning models to predict NBA rookie stat lines for college players whose names are entered through the Anvil app!

Let's get started!

# A. College Data

**In order for us to pass a player's college stats up to the front-end for display, we will need to:**
1. Clean the name
2. Find the college player stats for the input name
    - If no stats are found for that name, tell the user and return an appropriate error message

### 1. Clean the name

In [None]:
# This method should hopefully reduce the number of failure cases.
def convert_nba_ncaa_name(name: str) -> str:
    """
    Converts a player's display name into the NCAA player_id format.

    The name is transliterated to ASCII, lower-cased, split on whitespace and
    re-joined with hyphens, and the suffix "-1" is appended
    (e.g. "Deandre Ayton" -> "deandre-ayton-1").

    Compared with a plain space-to-hyphen replacement this also:
      * ignores leading, trailing and repeated whitespace, so "  Stephen  Curry "
        no longer produces doubled or dangling hyphens;
      * transliterates before lower-casing, so any capitals produced by the
        transliteration are lower-cased as well;
      * drops periods and apostrophes ("De'Aaron Fox" -> "deaaron-fox-1").
        NOTE(review): this assumes Sports Reference ids omit that punctuation --
        confirm against a few known players.

    You may want to elaborate on the logic of this function further to reduce
    the number of failure cases (the "-1" suffix is always assumed).

    Parameters:
        name: The player's name as typed by the user.

    Returns:
        The candidate NCAA player_id string.
    """
    ascii_name = unidecode.unidecode(name).lower()
    for punctuation in (".", "'"):
        ascii_name = ascii_name.replace(punctuation, "")
    # str.split() with no argument collapses any run of whitespace.
    return "-".join(ascii_name.split()) + "-1"

### 2. Find the college player stats for the input name

In [None]:
def get_college_stats(player_name: str) -> dict:
    """
    Look up a college player and return their most recent season of stats.

    Return a dictionary with the following keys:

    success: If the player was correctly found (boolean)
    data: Player data (pd.DataFrame with a single row, indexed by the input
          name under the index name 'Name', every column prefixed 'NCAAB_'),
          or None on failure
    error: Error message if success is False (string), otherwise None
    """
    def failure() -> dict:
        # Single place that builds the "player not found" response.
        return {
            'success': False,
            'data': None,
            'error': "This player doesn't exist (or its name is not in the correct format). Please try a different player name.",
        }

    clean_name = convert_nba_ncaa_name(player_name)

    # Checking if player does exist with current name
    try:
        player_data = NCAAB_Player(clean_name).dataframe
    except TypeError:
        return failure()

    # NOTE(review): some sportsreference versions appear to return None rather
    # than raising for an unknown player -- treat that (and an empty frame) as
    # "not found" instead of crashing on the rename below.
    if player_data is None or player_data.empty:
        return failure()

    prefixed = player_data.rename(columns=lambda x: 'NCAAB_' + x)
    # Second-to-last row; presumably the last row is the career total, so this
    # is the final college season -- TODO confirm against the library output.
    # .copy() detaches the slice so re-indexing does not touch the parent frame.
    last_year = prefixed.iloc[[prefixed.shape[0] - 2]].copy()
    last_year = last_year.set_index(pd.Index(data=[player_name], name='Name'))

    return {
        'success': True,
        'data': last_year,  # Change for Matt's formatting
        'error': None,
    }

In [None]:
# Example lookup: fetch the response dictionary for one player.
ayton = get_college_stats('Deandre Ayton')

In [None]:
# Show the 'data' entry of the response (None if the lookup failed).
ayton['data']

In [None]:
# Pip install the library if you don't currently have it
!pip install anvil-uplink

### Making front-end endpoint for retrieving college data

In [None]:
import os

import anvil.server

# The uplink key is a credential, so prefer supplying it through the
# ANVIL_UPLINK_KEY environment variable instead of pasting it into the notebook.
# The original placeholder string is kept as the fallback.
anvil.server.connect(os.environ.get("ANVIL_UPLINK_KEY", "INSERT THE KEY FROM YOUR ANVIL APP HERE"))

@anvil.server.callable
def get_college(name):
    """
    Making the front-end to back-end connection, and passing up college data.

    Looks the player up with get_college_stats and, on success, replaces the
    'data' entry with a plain dictionary of per-game averages rounded to two
    decimals: keys 'pts', 'reb', 'ast', 'stl', 'blk', plus 'Name'.

    Returns the failure response unchanged when the player is not found, and a
    failure response of the same shape when the season has no games played
    (per-game averages would be undefined).
    """
    response = get_college_stats(name)
    if not response['success']:
        return response

    # Source column -> key shown in the front-end.
    stat_labels = {
        'NCAAB_points': 'pts',
        'NCAAB_total_rebounds': 'reb',
        'NCAAB_assists': 'ast',
        'NCAAB_steals': 'stl',
        'NCAAB_blocks': 'blk',
    }

    # get_college_stats returns a single-row frame; work with that row directly.
    season = response['data'].iloc[0]
    games_played = season['NCAAB_games_played']
    if pd.isna(games_played) or int(games_played) <= 0:
        return {
            'success': False,
            'data': None,
            'error': "This player has no recorded college games, so per-game stats can't be computed.",
        }
    games_played = int(games_played)

    per_game = {
        label: round(float(season[column]) / games_played, 2)
        for column, label in stat_labels.items()
    }
    per_game['Name'] = name
    response['data'] = per_game
    return response

In [None]:
# Call the endpoint function directly to inspect the response for one player.
get_college('Ben Simmons')

In [None]:
# Same check for a second player.
get_college('Stephen Curry')

# B. Clean/Process Data

In [None]:
# Splitting data
from sklearn.model_selection import train_test_split

# Load the prepared player dataset from disk.
data = pd.read_csv('data/player_data_final.csv')
# NBA stat columns that the models will predict.
NBA_col = ['NBA_points', 'NBA_total_rebounds', 'NBA_assists', 'NBA_steals', 'NBA_blocks']
# NOTE: despite the names, `train` holds the feature columns (everything except
# the NBA columns) and `test` holds the NBA target columns. Both end up indexed
# by the 'name' column.
train, test = data.drop(columns=NBA_col), data[NBA_col].set_index(pd.Index(data=data['name'], name='name'))
train.set_index('name', inplace=True)

In [None]:
def check_null(df):
    """
    List the names of the columns in `df` that contain at least one null value.
    """
    has_null = df.isna().any()
    null_columns = df.columns[has_null]
    return null_columns.tolist()

**In order for us to make predictions from the college data to be rendered in the front-end, we first need to:**
1. Drop unneeded columns
2. Clean the data the same way we did in our initial data

### 1. Drop unneeded columns

In [None]:
def drop_columns(data: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of the input college data without the columns we never use.

    Two groups are removed, qualitative columns first and then the columns
    that are dropped because of NaNs inside the data.
    """
    qualitative_columns = [
        'NCAAB_conference',
        'NCAAB_player_id',
        'NCAAB_team_abbreviation',
    ]
    nan_prone_columns = [
        'NCAAB_box_plus_minus',
        'NCAAB_defensive_box_plus_minus',
        'NCAAB_offensive_box_plus_minus',
        'NCAAB_player_efficiency_rating',
        'NCAAB_three_point_percentage',
    ]

    without_qualitative = data.drop(columns=qualitative_columns)
    return without_qualitative.drop(columns=nan_prone_columns)

In [None]:
# Fetch one player's college stats to use as a worked example for the cleaning steps.
curry = get_college_stats('Stephen Curry')

In [None]:
# Remove the unneeded columns from the example player's data.
new_curry = drop_columns(curry['data'])

In [None]:
# List any columns that still contain null values after the drop.
check_null(new_curry)

### 2. Clean the data the same way we did initially

In [None]:
def clean_qualitative(data: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the qualitative columns (height, position).

    Works on a copy, so the caller's frame is left untouched and the function
    can safely be called again on the same input. Works for any number of rows.

    Parameters:
        data: Player data with string columns 'NCAAB_height' (feet-inches,
              e.g. "6-11") and 'NCAAB_position' (e.g. "Forward-Center").

    Returns:
        A new DataFrame where 'NCAAB_height' is the height in inches,
        'NCAAB_position' keeps only the primary position, and three 0/1
        columns 'Center', 'Forward' and 'Guard' one-hot encode that position.
    """
    positions = ['Center', 'Forward', 'Guard']

    def convert_height(height: str) -> int:
        """
        Convert height from string to int (6-11 -> 83)
        """
        feet, inches = height.split("-")
        return int(feet) * 12 + int(inches)

    def clean_position(position: str) -> str:
        """
        If the player has a hyphenated position, keep only the first one.
        """
        return position.split('-')[0]

    cleaned = data.copy()
    cleaned['NCAAB_height'] = cleaned['NCAAB_height'].apply(convert_height)
    cleaned['NCAAB_position'] = cleaned['NCAAB_position'].apply(clean_position)

    # One-hot encode with a vectorized comparison so every row gets its own
    # flags (int() on a whole column only works when there is exactly one row).
    for position in positions:
        cleaned[position] = (cleaned['NCAAB_position'] == position).astype(int)
    return cleaned

In [None]:
# Turn the example player's height and position into numeric features.
clean_curry = clean_qualitative(new_curry)

In [None]:
# Re-check for null columns after the qualitative cleaning.
check_null(clean_curry)

In [None]:
def feature_extraction(data: pd.DataFrame) -> pd.DataFrame:
    """
    Keep only the columns we selected as model features, in a fixed order:
    the 'NCAAB_'-prefixed stats below followed by the position flags.
    """
    college_stats = (
        'assists', 'blocks',
        'field_goal_attempts', 'field_goal_percentage', 'field_goals',
        'free_throw_attempt_rate', 'free_throw_attempts',
        'free_throw_percentage', 'free_throws',
        'games_played', 'games_started',
        'height', 'personal_fouls', 'points',
        'steal_percentage', 'steals',
        'three_point_attempt_rate', 'three_point_attempts',
        'total_rebound_percentage', 'total_rebounds',
        'turnover_percentage', 'turnovers',
        'two_point_attempts', 'two_point_percentage',
        'win_shares',
    )
    position_flags = ['Guard', 'Forward', 'Center']

    selected = ['NCAAB_' + stat for stat in college_stats]
    selected.extend(position_flags)
    return data[selected]

In [None]:
# Keep only the model feature columns for the example player.
final_curry = feature_extraction(clean_curry)

In [None]:
# Confirm whether any of the selected feature columns still hold nulls.
check_null(final_curry)

In [None]:
def extra_cleaning(player_data, train=False):
    """
    Cleans the player's data further if there's any NaN values (older players).

    Each missing cell is replaced with the mean of the same column in the
    training set. Cells that already hold a value are left untouched, so the
    function is also correct for frames with more than one row.

    Parameters:
        player_data: DataFrame of player features.
        train: DataFrame used to compute the replacement means. It is only
               needed when `player_data` actually contains nulls.

    Returns:
        `player_data` itself when it has no nulls, otherwise a new DataFrame
        with the nulls filled.

    Raises:
        ValueError: if nulls are present but no training data was supplied.
    """
    null_columns = player_data.columns[player_data.isna().any()].tolist()
    if not null_columns:
        return player_data

    if train is None or train is False:
        raise ValueError(
            "player_data has missing values in %s but no training data was "
            "passed to compute replacement means." % null_columns
        )

    # Compute the means once, and only for the columns that need them.
    column_means = train[null_columns].mean()
    # fillna with a Series fills each column with the value under its label.
    return player_data.fillna(column_means)

In [None]:
def format_data(player_data, train):
    """
    Run the full preprocessing pipeline and return data ready for the models.

    Steps, in order: drop unneeded columns, clean the qualitative columns,
    select the feature columns, then fill any remaining nulls using `train`.
    """
    processed = player_data
    for step in (drop_columns, clean_qualitative, feature_extraction):
        processed = step(processed)
    return extra_cleaning(processed, train)

In [None]:
# Run the whole pipeline on the raw example data; `train` supplies the column
# means used to fill any missing values.
format_data(curry['data'], train)

# C. Make Models

**In order for us to make our predictions, we first take a look at all the different models we've made for the project.**

1. LinearRegression
2. K-Means
3. DecisionTrees
4. Neural Networks

In [None]:
# Initial random 80/20 split. Here `train` is the feature table and `test` is
# the table of NBA target columns built in section B.
X_train, X_test, y_train, y_test = train_test_split(train, test, test_size=0.2)

In [None]:
# Re-shuffle data
def shuffle(train, test, test_size=0.2, random_state=None):
    """
    Shuffle the data around for re-trying results.

    Parameters:
        train: Feature table.
        test: Target table, aligned row-for-row with `train`.
        test_size: Share of rows held out for evaluation (default 0.2).
        random_state: Optional seed; pass an int to make the split reproducible.

    Returns:
        The tuple (X_train, X_test, y_train, y_test).
    """
    return tuple(
        train_test_split(train, test, test_size=test_size, random_state=random_state)
    )

In [None]:
# Loss function
def RMSE(y, y_pred):
    """
    Root mean squared error between `y` and `y_pred`.
    """
    residual = y - y_pred
    mean_squared_error = np.mean(np.square(residual))
    return np.sqrt(mean_squared_error)

In [None]:
# Total loss function
def all_RMSE(y, y_pred):
    """
    Calculates the RMSE by each of the different stats, and the total RMSE.

    RMSE is symmetric, so the two arguments may be given in either order and
    each may be a DataFrame or a 2-D array (rows = players, columns = stats).
    At least one of them must be a DataFrame, because its column names label
    the result. `y_pred`'s columns are used when it has them, otherwise `y`'s.

    Returns:
        pd.Series with one RMSE per stat column plus a final 'Total RMSE'
        entry holding the sum of the per-stat values.

    Raises:
        TypeError: if neither argument has column names.
        ValueError: if the two arguments do not have the same shape.
    """
    stat_names = getattr(y_pred, 'columns', None)
    if stat_names is None:
        stat_names = getattr(y, 'columns', None)
    if stat_names is None:
        raise TypeError("all_RMSE needs at least one DataFrame argument to name the stats.")

    # Compare by position, exactly like array-vs-DataFrame arithmetic would.
    first = np.asarray(y, dtype=float)
    second = np.asarray(y_pred, dtype=float)
    if first.shape != second.shape:
        raise ValueError(
            "all_RMSE got mismatched shapes %s and %s." % (first.shape, second.shape)
        )

    per_stat = np.sqrt(np.mean((first - second) ** 2, axis=0))
    rmses = dict(zip(stat_names, per_stat))
    rmses['Total RMSE'] = sum(rmses.values())
    return pd.Series(rmses)

In [None]:
# Standardize our data
def standardize(train, test):
    """
    Standardize both train and test based off of train's means and std's.

    A column whose training standard deviation is exactly 0 (a constant
    column) would otherwise divide by zero and fill the column with NaN/inf.
    Such columns use a divisor of 1 instead, so they standardize to 0.

    Returns:
        (norm_train, norm_test, means, stds), where `stds` are the divisors
        actually used, so `norm * stds + means` always inverts the transform.
    """
    means, stds = train.mean(), train.std()
    if isinstance(stds, pd.Series):
        stds = stds.mask(stds == 0, 1.0)
    elif stds == 0:
        # Scalar statistics, e.g. when `train` is a single Series.
        stds = 1.0
    norm_train = (train - means) / stds
    norm_test = (test - means) / stds
    return norm_train, norm_test, means, stds

### 1A. Linear Regression (without standardization)

In [None]:
# Fresh random split for the unstandardized linear regression experiment.
X_train, X_test, y_train, y_test = shuffle(train, test)

In [None]:
# Create model
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
# Fit on the raw (unstandardized) features against all NBA target columns at once.
linear_model.fit(X_train, y_train)

In [None]:
# Make predictions for the held-out split (one column per NBA target).
y_pred = linear_model.predict(X_test)

In [None]:
# RMSE for points (column 0 is 'NBA_points', the first entry of NBA_col)
RMSE(y_test.values[:, 0], y_pred[:, 0])

In [None]:
# Total RMSE
# The prediction array is passed first and the true-value DataFrame (which
# supplies the stat names) second.
all_RMSE(y_pred, y_test)

### 1B. Linear Regression (with standardization)

In [None]:
# Fresh random split for the standardized linear regression experiment.
X_train, X_test, y_train, y_test = shuffle(train, test)

In [None]:
# Standardize data
# Both splits are scaled with the training split's statistics; the returned
# means and stds are discarded here.
norm_X_train, norm_X_test, _, _ = standardize(X_train, X_test)

In [None]:
# Create model
# (LinearRegression was already imported in section 1A; this re-import is redundant.)
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
# Fit on the standardized features; the targets are left on their original scale.
linear_model.fit(norm_X_train, y_train)

In [None]:
# Make predictions from the standardized held-out features.
y_pred = linear_model.predict(norm_X_test)

In [None]:
# RMSE for points (column 0 is 'NBA_points')
RMSE(y_test.values[:, 0], y_pred[:, 0])

In [None]:
# Total RMSE (prediction array first, true-value DataFrame second)
all_RMSE(y_pred, y_test)

### 2. K-Means

In [None]:
# Shuffle the data up: fresh random split for the K-Means experiment.
X_train, X_test, y_train, y_test = shuffle(train, test)

In [None]:
# Standardize data using the training split's means and stds.
norm_X_train, norm_X_test, _, _ = standardize(X_train, X_test)

In [None]:
# Fit model and produce labels
from sklearn.cluster import KMeans
# Cluster the standardized training features into 120 groups.
k_means_cluster = KMeans(n_clusters=120, max_iter=10000, n_init=100).fit(norm_X_train.values)
# Keep a labelled copy of the training features so the players in each
# cluster can be looked up by label.
X_copy = norm_X_train.copy(deep=True)
X_copy['label'] = k_means_cluster.labels_

In [None]:
# Find the predicted labels of the test set (the cluster each held-out player falls in)
label_pred = k_means_cluster.predict(norm_X_test)

In [None]:
def find_predictions(labels, X_copy, y_train):
    """
    Finds the predictions for these different players.

    Each player is predicted as the average target stats of the training
    players that share the player's cluster label.

    Parameters:
        labels: Iterable of cluster labels, one per player to predict.
        X_copy: Training features with an extra 'label' column holding each
                training player's cluster.
        y_train: Training targets, indexed like `X_copy`.

    Returns:
        np.ndarray with one row per entry of `labels` and one column per
        target column of `y_train`.
    """
    cluster_means = {}
    all_preds = []
    # Iterate the `labels` argument (not a notebook-level variable), and
    # compute each cluster's average only once.
    for label in labels:
        if label not in cluster_means:
            similar_players = X_copy[X_copy['label'] == label].index
            cluster_means[label] = y_train.loc[similar_players].mean().values
        all_preds.append(cluster_means[label])
    return np.array(all_preds)

In [None]:
# Find stat predictions for the test set from the cluster averages.
y_pred = find_predictions(label_pred, X_copy, y_train)

In [None]:
# RMSE for points (column 0 is 'NBA_points')
RMSE(y_test.values[:, 0], y_pred[:, 0])

In [None]:
# Total RMSE (prediction array first, true-value DataFrame second)
all_RMSE(y_pred, y_test)

### 2B. Run K-Means multiple times for long-term RMSEs

In [None]:
def run_x_times(x, n_clusters=120, max_iter=10000, n_init=100):
    """
    Runs K-Means 'x' number of times to average the RMSEs.

    Every run reshuffles the notebook-level `train`/`test` tables, standardizes
    the features, fits K-Means on the training split and predicts each held-out
    player as the average targets of the training players in the same cluster.

    Parameters:
        x: Number of independent runs.
        n_clusters, max_iter, n_init: Passed straight to sklearn's KMeans.

    Returns:
        np.ndarray with one row per run, holding the values returned by
        all_RMSE (one RMSE per stat, followed by the total).
    """
    from sklearn.cluster import KMeans

    def cluster_predictions(labels, X_copy, y_train):
        """
        Average the training targets of each label's cluster, one row per label.
        """
        all_preds = []
        for label in labels:
            similar_players = X_copy[X_copy['label'] == label].index
            all_preds.append(y_train.loc[similar_players].mean().values)
        return np.array(all_preds)

    total_avg = []
    for _ in range(x):
        # Shuffle the data up
        X_train, X_test, y_train, y_test = shuffle(train, test)

        # Standardize data
        norm_X_train, norm_X_test, _, _ = standardize(X_train, X_test)

        # Fit model and produce labels
        k_means_cluster = KMeans(
            n_clusters=n_clusters, max_iter=max_iter, n_init=n_init
        ).fit(norm_X_train.values)
        X_copy = norm_X_train.copy(deep=True)
        X_copy['label'] = k_means_cluster.labels_

        # Find the predicted labels of the test set
        label_pred = k_means_cluster.predict(norm_X_test)

        # Find stat predictions for the test set
        y_pred = cluster_predictions(label_pred, X_copy, y_train)
        total_avg.append(all_RMSE(y_pred, y_test).values)

    return np.array(total_avg)

In [None]:
# Collect the RMSE rows from repeated K-Means runs.
avgs = run_x_times(10)

In [None]:
# Print the mean over runs of the first five columns (the per-stat RMSEs).
# The trailing 'Total RMSE' column is not printed.
for i in range(5):
    print(np.mean(avgs[:, i]))

### 3. Decision Trees

In [None]:
# Shuffle data: fresh random split for the decision tree experiments.
X_train, X_test, y_train, y_test = shuffle(train, test)

In [None]:
# Standardize data using the training split's means and stds.
norm_X_train, norm_X_test, _, _ = standardize(X_train, X_test)

In [None]:
# Create model
from sklearn.tree import DecisionTreeRegressor
# One tree predicts all NBA target columns; depth and split size are capped.
DTR = DecisionTreeRegressor(min_samples_split=5, max_depth=15)
DTR.fit(norm_X_train, y_train)

In [None]:
# Training RMSE (error on the same rows the tree was fitted to)
all_RMSE(DTR.predict(norm_X_train), y_train)

In [None]:
# Test total RMSE (error on the held-out rows)
all_RMSE(DTR.predict(norm_X_test), y_test)

### 3.1 Trying variety of different Decision Trees

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Per-depth test errors for each model family; each entry is an array with one
# RMSE per NBA target column.
DT_errors = []
ET_errors = []
RF_errors = []

def fit_predict_error(model, X_train, y_train, X_test, y_test):
    """
    Fit `model` on the training split and return its per-column test RMSE.

    The mean is taken explicitly over rows (axis=0) so the result is always
    one RMSE per target column. np.mean on a DataFrame does not reduce the
    same way in every pandas version.

    Returns:
        pd.Series indexed by the columns of `y_test`.
    """
    model.fit(X_train, y_train)
    squared_error = (model.predict(X_test) - y_test) ** 2
    return np.sqrt(squared_error.mean(axis=0))

# Model class paired with the list that collects its errors. Within each depth
# the models are fitted in this order.
tree_models = [
    (DecisionTreeRegressor, DT_errors),
    (ExtraTreesRegressor, ET_errors),
    (RandomForestRegressor, RF_errors),
]

for depth in range(1, 25):
    for model_class, errors in tree_models:
        error = fit_predict_error(
            model_class(max_depth=depth), norm_X_train, y_train, norm_X_test, y_test
        )
        errors.append(error.values)

In [None]:
import matplotlib.pyplot as plt

# Plotting RMSE's for points (column 0 of each error row is 'NBA_points').
# Entry k of each error list was produced with max_depth = k + 1, so plot
# against the real depths instead of the default 0-based positions.
depths = range(1, len(DT_errors) + 1)

fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(depths, np.array(DT_errors)[:, 0], label='Decision Tree Error')
ax.plot(depths, np.array(ET_errors)[:, 0], label='Extra Trees Error')
ax.plot(depths, np.array(RF_errors)[:, 0], label='Random Forest Error')
ax.set(title='Test RMSE for points by tree depth', xlabel='max_depth', ylabel='RMSE')
ax.legend()
plt.show();

### 4. Neural Networks

In [None]:
# Shuffle the data up: fresh random split for the neural network experiments.
X_train, X_test, y_train, y_test = shuffle(train, test)

In [None]:
# Standardize data; unlike the earlier sections, the feature means and stds are kept.
norm_X_train, norm_X_test, x_means, x_stds = standardize(X_train, X_test)

In [None]:
# Y-value skewness
import seaborn as sns

# 2x3 grid: one distribution plot per target column, filled row by row
# (with five targets the last panel stays empty).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases -- confirm
# it is still available in the installed version.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
for col_num in range(len(y_train.columns)):
    sns.distplot(y_train.iloc[:, col_num].to_frame(), ax=axes[col_num // 3][col_num % 3]);

In [None]:
### STANDARDIZE Y-VALUES (this is because y_values are so skewed)
# Only the standardized training targets are kept; the standardized test
# targets are discarded, and y_means / y_stds are saved so predictions can be
# converted back to the original scale.
norm_y_train, _, y_means, y_stds = standardize(y_train, y_test)

In [None]:
# Import libraries for model
import keras
from keras.models import Sequential
from keras.layers import Dense

In [None]:
def make_nn_arch(input_dim=28, output_activation='linear'):
    """
    Make basic neural network architecture for our dataset.

    Two tanh hidden layers (35 and 15 units) feed a single output unit, and the
    model is compiled with SGD (Nesterov momentum) and mean squared error loss.

    Parameters:
        input_dim: Number of input features (default 28, the size of the
                   feature set used in this notebook).
        output_activation: Activation of the output unit. The default is
                   'linear' because the targets are standardized values that
                   are not confined to (-1, 1). A tanh output could never
                   predict more than one standard deviation from the mean.
                   Pass 'tanh' to get the bounded output instead.

    Returns:
        The compiled, untrained Sequential model.
    """
    model = Sequential()
    # Only the first layer needs the input shape; later layers infer theirs.
    model.add(Dense(35, activation='tanh', input_shape=(input_dim,)))
    model.add(Dense(15, activation='tanh'))
    model.add(Dense(1, activation=output_activation))
    # For a mean squared error regression problem
    model.compile(
        # NOTE(review): newer Keras releases name this argument `learning_rate`
        # -- confirm against the installed version.
        optimizer=keras.optimizers.SGD(
            lr=0.001,
            momentum=0.9,
            nesterov=True
        ), #How to Learn
        loss='mse', # What to Learn
    )
    return model

In [None]:
# Create and fit Neural Network
# This first model learns a single target: the standardized 'NBA_points' column.
model = make_nn_arch()
model.fit(
    x=norm_X_train,
    y=norm_y_train['NBA_points'], 
    epochs=100,
)

In [None]:
# Loss function
def RMSE(y, y_pred):
    """
    Root mean squared error between `y` and `y_pred`.

    This re-declares the loss function defined earlier in section C with the
    same behaviour.
    """
    residual = y - y_pred
    mean_squared_error = np.mean(np.square(residual))
    return np.sqrt(mean_squared_error)

In [None]:
# Make predictions (still on the standardized target scale)
y_pred = model.predict(norm_X_test)

In [None]:
# RMSE for points
# Undo the target standardization (multiply by the std, add the mean) before
# comparing with the unscaled test targets.
RMSE(((y_pred * y_stds['NBA_points']) + y_means['NBA_points']).flatten(), y_test['NBA_points'])

### 4.1 Make Neural Networks for all stats

In [None]:
def train_all_and_predict(epochs=100):
    """
    Train one neural network per NBA target column and return the test RMSEs.

    Relies on notebook-level variables prepared in section 4: norm_X_train,
    norm_X_test, norm_y_train, y_train, y_test, y_means and y_stds.

    Parameters:
        epochs: Number of training epochs for every network (default 100).

    Returns:
        List of RMSE values on the original (unstandardized) scale, in the
        order of `y_train.columns`.
    """
    rmses = []

    for column in y_train.columns:
        # Create and fit Neural Network
        model = make_nn_arch()
        model.fit(
            x=norm_X_train,
            y=norm_y_train[column],
            epochs=epochs,
        )
        y_pred = model.predict(norm_X_test)

        # RMSE by stat category, after mapping predictions back to real units.
        unscaled_pred = ((y_pred * y_stds[column]) + y_means[column]).flatten()
        rmses.append(RMSE(unscaled_pred, y_test[column]))

    return rmses

In [None]:
# Train one network per target column and collect the test RMSEs.
rmses = train_all_and_predict()

In [None]:
# Display the per-stat RMSE list (same order as y_train.columns).
rmses

# D. Training the final models

In [None]:
####################
### FINAL MODELS ###
####################

In [None]:
# Train your best models here! Remove any comments if you don't need them.

# Standardize data

# Split data

# Train models

# Predict

# Calculate RMSE

In [None]:
@anvil.server.callable
def get_predicts(name):
    """
    Placeholder endpoint meant to return NBA stat predictions for a player.

    Not implemented yet: the body currently returns the Ellipsis object
    regardless of `name`.
    """
    
    ### Let this method take in a name, and output predictions for this player!
    
    # TODO
    
    return ...

In [None]:
def run():
    """
    Read player names from standard input and print get_predicts(name) for
    each one, stopping when the input is exactly 'exit' or 'quit'.
    """
    stop_words = ('exit', 'quit')
    name = input()
    while name not in stop_words:
        print(get_predicts(name))
        name = input()
# name = 'Stephen Curry'

In [None]:
# Start the interactive prompt loop; type 'exit' or 'quit' to stop it.
run()

In [None]:
# Final check of the college-stats endpoint function for one player.
get_college('Stephen Curry')