In [230]:
from nba_api.stats.endpoints import commonplayerinfo
from nba_api.stats.endpoints import leaguegamelog
from nba_api.stats.endpoints import boxscoreplayertrackv2
from nba_api.stats.endpoints import teamgamelog
from nba_api.stats.static import teams
import pandas as pd
import requests
import time

# Step 1: Utility Stuff

We're going to get the IDs of every team and store them. Later, we can cross reference the IDs with the `list_of_teams` to figure out which team was which

In [228]:
list_of_teams = teams.get_teams()
all_team_ids = []
dict_ids_to_name = dict()
for team_obj in list_of_teams:
    all_team_ids.append(team_obj['id'])
    dict_ids_to_name[team_obj['id']] = team_obj['full_name']
dict_ids_to_name

{1610612737: 'Atlanta Hawks',
 1610612738: 'Boston Celtics',
 1610612739: 'Cleveland Cavaliers',
 1610612740: 'New Orleans Pelicans',
 1610612741: 'Chicago Bulls',
 1610612742: 'Dallas Mavericks',
 1610612743: 'Denver Nuggets',
 1610612744: 'Golden State Warriors',
 1610612745: 'Houston Rockets',
 1610612746: 'Los Angeles Clippers',
 1610612747: 'Los Angeles Lakers',
 1610612748: 'Miami Heat',
 1610612749: 'Milwaukee Bucks',
 1610612750: 'Minnesota Timberwolves',
 1610612751: 'Brooklyn Nets',
 1610612752: 'New York Knicks',
 1610612753: 'Orlando Magic',
 1610612754: 'Indiana Pacers',
 1610612755: 'Philadelphia 76ers',
 1610612756: 'Phoenix Suns',
 1610612757: 'Portland Trail Blazers',
 1610612758: 'Sacramento Kings',
 1610612759: 'San Antonio Spurs',
 1610612760: 'Oklahoma City Thunder',
 1610612761: 'Toronto Raptors',
 1610612762: 'Utah Jazz',
 1610612763: 'Memphis Grizzlies',
 1610612764: 'Washington Wizards',
 1610612765: 'Detroit Pistons',
 1610612766: 'Charlotte Hornets'}

Takes a string formatted as "x:xx" or "xx:xx" and returns the equivalent number of seconds as an int... will be useful for filtering players based on playing time

In [3]:
def str_to_mins(inp):
    lst = inp.split(":")
    sec = int(lst[0])*60 + int(lst[1])
    return sec

# Step 2: Storing Individual Games + Winning/Losing Players
We are going to create an empty dataframe where we are going to store everything. Our columns are going to be `game_id`, `winning_team_id`, `winning_team_players`, and `losing_team_players`. To build our basic model where we one-hot encode players names and try to see if our model can predict at better than 50% accuracy who is going to win, this information will suffice.

In [231]:
record = pd.DataFrame(columns=['game_id', 'winning_team_id', 'winning_team_players', 'losing_team_players'])

Now, we need to iterate through every season from 1997-1998 to 2018-2019, check every single team's game log (accessed through the `TeamGameLog` endpoint), and insert the game into `record` if the team won. By only inserting the game if the team won, we ensure that we don't duplicate any entries. When considering a given game, we will use the `BoxScorePlayerTrackV2` endpoint, which takes in a `GAME_ID` and returns the boxscore, to get the list of players who played at least 3 minutes on each team. Afterwards, we'll save to a pickle file so that we can access the data easily.

### WARNING: Takes very very long to run... stored in a pickle file so you can just unpack it

In [245]:
import timeit
for year in range(2016,2019):
    for team_id in all_team_ids:
        
        print(year, dict_ids_to_name[team_id])
        
        # a little code block to retry if the NBA site tries to block us
        max_retries = 10
        for _ in range(max_retries):
            try:
                gamelog_for_season = teamgamelog.TeamGameLog(team_id, year, timeout=15)
                break
            except:
                pass
        
        try:
            gamelog_for_season
        except NameError:
            break #raise NameError(f"Tried {max_retries} times but couldn't get gamelog")
                    
        gamelog_df = gamelog_for_season.get_data_frames()[0] #return statement is a list, we want the first element

        start = timeit.default_timer()
        for row in gamelog_df.itertuples():
            if row.WL == "W": #remember, only want the winning team so we don't duplicate anything
                time.sleep(.7)
                
                ministart = timeit.default_timer()
                # a little code block to retry if the NBA site tries to block us
                max_retries = 5
                for _ in range(max_retries):
                    try:
                        bx = boxscoreplayertrackv2.BoxScorePlayerTrackV2(row.Game_ID, timeout=15)
                        break
                    except:
                        pass
                    
                try:
                    bx
                except NameError:
                    break #raise NameError(f"Tried {max_retries} times but couldn't get gamelog")
                ministop = timeit.default_timer()
                if ministop-ministart > 10:
                    print("Fetched boxscore #", row.Index,"in",ministop-ministart,"seconds")
                
                #rest of code, here we are separating the boxscore into the winning team players and losing team players
                players = bx.get_data_frames()[0]
                team_list = players['TEAM_ID'].unique()
                players_dict = dict()
                
                for team in team_list:
                    players_dict[team] = set()
                
                for innerrow in players.itertuples():
                    if str_to_mins(innerrow.MIN) >= 3*60:
                        players_dict[innerrow.TEAM_ID].add(innerrow.PLAYER_NAME)
                        
                
                #definitions for our insertion into the dataframe
                game_id = row.Game_ID
                winning_team_id = team_id
                winning_team_players = players_dict[winning_team_id]
                losing_team_players = players_dict[next(iter(set(players_dict.keys()) - {winning_team_id}))]
                record = record.append({'game_id': game_id, \
                                        'winning_team_id': winning_team_id, \
                                        'winning_team_players': winning_team_players, \
                                        'losing_team_players': losing_team_players}, ignore_index=True)
        stop = timeit.default_timer()
        print("Analyzed games in",stop-start,"seconds")

2016 Atlanta Hawks
Analyzed games in 55.47568930000125 seconds
2016 Boston Celtics
Analyzed games in 69.32438439999532 seconds
2016 Cleveland Cavaliers
Analyzed games in 65.60309780000534 seconds
2016 New Orleans Pelicans
Analyzed games in 43.83745690000069 seconds
2016 Chicago Bulls
Analyzed games in 55.30768100000569 seconds
2016 Dallas Mavericks
Analyzed games in 45.270042499993 seconds
2016 Denver Nuggets
Analyzed games in 53.763859800004866 seconds
2016 Golden State Warriors
Analyzed games in 90.65585510000528 seconds
2016 Houston Rockets
Analyzed games in 70.22965439999825 seconds
2016 Los Angeles Clippers
Analyzed games in 67.40251020000142 seconds
2016 Los Angeles Lakers
Analyzed games in 34.48479590000352 seconds
2016 Miami Heat
Analyzed games in 53.67464579999796 seconds
2016 Milwaukee Bucks
Analyzed games in 55.54073829999834 seconds
2016 Minnesota Timberwolves
Analyzed games in 39.5967793000018 seconds
2016 Brooklyn Nets
Analyzed games in 28.4038670000009 seconds
2016 New Y

In [248]:
record = record.drop_duplicates(subset="game_id")

In [249]:
import pickle
record.to_pickle("one_hot_model_dataset")

# Step 3: Data Exploration
Now, we can import the pickled dataset and start using it for data exploration. This is a relatively uninteresting dataset, since we only have the names of players. However, one useful thing to do is to store the set of all unique players. We'll use it later when trying to encode our data.

In [251]:
record = pd.read_pickle("one_hot_model_dataset")
record.head()

Unnamed: 0,game_id,winning_team_id,winning_team_players,losing_team_players
0,29701183,1610612737,"{Tyrone Corbin, Drew Barry, Anthony Miller, Ch...","{Terry Mills, Rex Walters, Voshon Lenard, Mark..."
1,29701167,1610612737,"{Tyrone Corbin, Anthony Miller, Christian Laet...","{B.J. Armstrong, Dell Curry, Donald Royal, Ant..."
2,29701142,1610612737,"{Tyrone Corbin, Anthony Miller, Christian Laet...","{Theo Ratliff, Joe Smith, Benoit Benjamin, All..."
3,29701130,1610612737,"{Tyrone Corbin, Anthony Miller, Christian Laet...","{God Shammgod, Ledell Eackles, Ben Wallace, Ch..."
4,29701115,1610612737,"{Tyrone Corbin, Anthony Miller, Christian Laet...","{B.J. Armstrong, Dell Curry, Glen Rice, Matt G..."


In [252]:
#Number of unique players
all_players = set()
for index, row in record.iterrows():
    all_players.update(row["winning_team_players"], row["losing_team_players"])
all_players = list(all_players)
all_players = sorted(all_players)
print("The # of unique players that have played at least 3 minutes in a game from the 1997-98 season to the 2018-19 season is", len(all_players))

The # of unique players that have played at least 3 minutes in a game from the 1997-98 season to the 2018-19 season is 2023


# Step 4: Data Processing
Cool! Now we have our `record` which is a table that contains all the data we need. Unfortunately, a machine learning algorithm cannot take this as an input. We need to [one-hot encode](https://en.wikipedia.org/wiki/One-hot) our data, secifically, using the Python methods describe [here](https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/). This means we will have to create a column for every player, whose value will be 1 if the player was in the game and 0 otherwise.

However, here we run into our first issue. If we just create a single column for each player, the algorithm will have no way of knowing who was on what team. Therefore, we need to create two columns for each player (e.g., for LeBron James, we have columns `LeBronJames_A` and `LeBronJames_B`). So, now we can make one of the teams A, and one of the teams B. However, we need to make sure that we don't, for example, encode the winning team as team A always. Otherwise, the ML algorithm will think that being team A is what makes you win, instead of seeing how individual players affect the chance of winning.

One way we can solve this issue is by creating two entries for each game. We can encode (Winning Team -> A, Losing Team -> B) + make it a positive training example (i.e. 1), and then make an entry for the reverse, encoding (Winning Team -> B, Losing Team -> A) + make it a negative training example (i.e. 0). This way, the algorithm won't correlate being team A or team B with winning. This way, when we use the algorithm for inference, we can log the teams in arbitrary order without having to worry about who is A or who is B.

A useful method. We will require the condensed version of players names a lot, so makes sense to pay the overhead and do it now so that we can access it in constant time instead of having to do expensive string operations at each step

In [253]:
name_to_condensed_dict = dict()
for player in all_players:
    player_name_condensed = player.replace(" ", "")
    name_to_condensed_dict[player] = player_name_condensed

Now we're going to add columns to `ml_record` for every single player indicating whether they played on team A or B, plus a column for the outcome of the game (1 if team A won, 0 if team B won)

In [254]:
cols = []
for player in all_players:
    player_name_condensed = name_to_condensed_dict[player]
    str_a = f"{player_name_condensed}_A"
    str_b = f"{player_name_condensed}_B"
    cols.append(str_a)
    cols.append(str_b)
cols.append("label")
ml_record = pd.DataFrame(columns=cols, index=range(record.shape[0]*2))
ml_record

Unnamed: 0,A.C.Green_A,A.C.Green_B,A.J.Bramlett_A,A.J.Bramlett_B,A.J.Guyton_A,A.J.Guyton_B,AJHammons_A,AJHammons_B,AJPrice_A,AJPrice_B,...,ZhaireSmith_B,ZhouQi_A,ZhouQi_B,ZoranDragic_A,ZoranDragic_B,ZoranPlaninic_A,ZoranPlaninic_B,ZydrunasIlgauskas_A,ZydrunasIlgauskas_B,label
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52131,,,,,,,,,,,...,,,,,,,,,,
52132,,,,,,,,,,,...,,,,,,,,,,
52133,,,,,,,,,,,...,,,,,,,,,,
52134,,,,,,,,,,,...,,,,,,,,,,


This is another very important block of code. We are going through each game, considering each player, and changing the appropriate column for that player to 1. Additionally, we are creating a second training example with the opposite result.

In [None]:
offset = record.shape[0]
for row in record.itertuples():
    index = row.Index
    winning_team_players = row.winning_team_players
    losing_team_players = row.losing_team_players
    if index%100 == 0:
        print(f"iteratation {index} of {offset}: {round(index/offset*100,2)}%")
    #create the first training example, which has a result of 1 and the winning players on team A
    a_players = winning_team_players
    b_players = losing_team_players
    for player in a_players:
        player_name_condensed = name_to_condensed_dict[player]
        str_a = player_name_condensed + "_A"
        ml_record.at[index, str_a] = 1
    for player in b_players:
        player_name_condensed = name_to_condensed_dict[player]
        str_b = player_name_condensed + "_B"
        ml_record.at[index, str_b] = 1
    ml_record.at[index, "label"] = 1
    
    #create a second training example, which has a result of 0 and the winning players on team B
    b_players = winning_team_players
    a_players = losing_team_players
    for player in a_players:
        player_name_condensed = name_to_condensed_dict[player]
        str_a = player_name_condensed + "_A"
        ml_record.at[index+offset, str_a] = 1
    for player in b_players:
        player_name_condensed = name_to_condensed_dict[player]
        str_b = player_name_condensed + "_B"
        ml_record.at[index+offset, str_b] = 1
    ml_record.at[index+offset, "label"] = 0

Ok, so now we're 90% of the way there. Now, we only need to drop the `game_id`, `winning_team_players`, and `losing_team_players` columns (because they're not going into the model), and then we can start preparing the data for training. Additionally, we will pickle this new dataframe since the method above takes a while to finish.  

In [256]:
ml_record = ml_record.fillna(0)
ml_record.to_pickle("encoded_one_hot_model_dataset")

The extra taining example above unfortunately came out with the blank spaces as `NaN`, but we can replace those with zeroes easily

In [257]:
ml_record = pd.read_pickle("encoded_one_hot_model_dataset")
ml_record

Unnamed: 0,A.C.Green_A,A.C.Green_B,A.J.Bramlett_A,A.J.Bramlett_B,A.J.Guyton_A,A.J.Guyton_B,AJHammons_A,AJHammons_B,AJPrice_A,AJPrice_B,...,ZhaireSmith_B,ZhouQi_A,ZhouQi_B,ZoranDragic_A,ZoranDragic_B,ZoranPlaninic_A,ZoranPlaninic_B,ZydrunasIlgauskas_A,ZydrunasIlgauskas_B,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52133,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52134,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52135,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52136,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [258]:
ml_record.to_pickle("encoded_one_hot_model_dataset")

We're going to drop some useless columns and turn this into a numpy array, which is what scikit-learn likes as an input

In [259]:
#convert to final numpy form
import numpy as np 

# Labels are the values we want to predict
labels = np.array(ml_record['label'])
# Remove the labels column; axis 1 refers to the columns
values = ml_record.drop(['label'], axis = 1)
# Saving feature names for later use
feature_list = list(values.columns)
# Convert to numpy array
values = np.array(values)

Now, finally, we can split this into a training, validation, and testing dataset. We're ready!

In [260]:
#split into training, validation, and testing data
from sklearn.model_selection import train_test_split

X = values
y = labels

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=1) #this gives 20% test, 20% val, 60% train

## Step 6: Machine Learning - Random Forest
We're going to be using scikit-learn's `RandomForestClassifier` for our first attempt. 

In [269]:
#here's the magical step! We train the classifier
from sklearn.ensemble import RandomForestClassifier

# Instantiate model with 500 decision trees
rf = RandomForestClassifier(n_estimators = 500, random_state = 42)
# Train the model on training data
rf.fit(X_train, y_train);

In [270]:
from joblib import dump
dump(rf, 'random_forest_week6_model.joblib')

['random_forest_week6_model.joblib']

In [271]:
from joblib import load
rf = load('random_forest_week6_model.joblib')

In [272]:
# the next part of the code is to predict based on the validation set and evaluate the performance of the model...
# we don't want to use the test set until we're certain that we're absolutely done with tweaking parameters
from sklearn import metrics
from sklearn.metrics import classification_report

y_pred = rf.predict(X_val)
# Model Accuracy, how often is the classifier correct?
print("Accuracy: ",metrics.accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy:  0.6538166474875335
              precision    recall  f1-score   support

           0       0.69      0.69      0.69      5807
           1       0.61      0.61      0.61      4621

    accuracy                           0.65     10428
   macro avg       0.65      0.65      0.65     10428
weighted avg       0.65      0.65      0.65     10428



## Step 7: Machine Learning - Logistic Regression
We're going to be using scikit-learn's `LogisticRegression` for our first attempt. 

In [261]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=0, verbose = 1, max_iter=100000)
lr.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.2s finished


LogisticRegression(max_iter=100000, random_state=0, verbose=1)

In [262]:
from joblib import dump
dump(lr, 'logistic_regression_week6_model.joblib')

['logistic_regression_week6_model.joblib']

In [263]:
from joblib import load
lr = load('logistic_regression_week6_model.joblib')

In [264]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn import metrics

y_pred = lr.predict(X_val)
# Model Accuracy, how often is the classifier correct?
print("Accuracy: ",metrics.accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy:  0.6940928270042194
              precision    recall  f1-score   support

           0       0.72      0.73      0.73      5807
           1       0.66      0.64      0.65      4621

    accuracy                           0.69     10428
   macro avg       0.69      0.69      0.69     10428
weighted avg       0.69      0.69      0.69     10428



In [284]:
l = list(lr.coef_[0])
type(l)

list

In [293]:
d = dict()
for i in range(len(l)):
    coef = l[i]
    name = feature_list[i]
    d[name] = coef

In [299]:
d["DraymondGreen_B"]

0.25990892792187387

## Step 8: Machine Learning - Naive Bayes
We're going to be using scikit-learn's `BernoulliNB` for our first attempt. 

In [265]:
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB(binarize = None)
bnb.fit(X_train,y_train)

BernoulliNB(binarize=None)

In [266]:
from joblib import dump
dump(bnb, 'bernoulli_naivebayes_week6_model.joblib')

['bernoulli_naivebayes_week6_model.joblib']

In [267]:
from joblib import load
bnb = load('bernoulli_naivebayes_week6_model.joblib')

In [268]:
y_pred = bnb.predict(X_val)
# Model Accuracy, how often is the classifier correct?
print("Accuracy: ",metrics.accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy:  0.692270809359417
              precision    recall  f1-score   support

           0       0.72      0.72      0.72      5807
           1       0.65      0.66      0.65      4621

    accuracy                           0.69     10428
   macro avg       0.69      0.69      0.69     10428
weighted avg       0.69      0.69      0.69     10428



## Step 9: Machine Learning - K-Nearest Neighbors
We're going to be using scikit-learn's `svm` for our first attempt. 

In [273]:
from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [274]:
from joblib import dump
dump(neigh, 'knn_week6_model.joblib')

['knn_week6_model.joblib']

In [275]:
from joblib import load
neigh = load('knn_week6_model.joblib')

In [276]:
num_samples = 200
y_pred = neigh.predict(X_val[0:num_samples,:])
# Model Accuracy, how often is the classifier correct?
print("Accuracy: ",metrics.accuracy_score(y_val[0:num_samples], y_pred))
print(classification_report(y_val[0:num_samples], y_pred))

Accuracy:  0.61
              precision    recall  f1-score   support

           0       0.63      0.69      0.65       108
           1       0.59      0.52      0.55        92

    accuracy                           0.61       200
   macro avg       0.61      0.60      0.60       200
weighted avg       0.61      0.61      0.61       200

