# Predicting winners of the NBA 2016-2017 with Decision Trees

Take each of the months and amalgamate into a text file
- https://www.basketball-reference.com/leagues/NBA_2017_games.html

In [274]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
import numpy as np
import os
from collections import defaultdict
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.grid_search import GridSearchCV

In [275]:
data_folder = 'data'
data_filename = 'data/basketball.csv'
df = pd.read_csv(data_filename)

In [276]:
df.head()

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,Tue Oct 25 2016,7:30 pm,New York Knicks,88,Cleveland Cavaliers,117,Box Score,,20562,
1,Tue Oct 25 2016,10:30 pm,San Antonio Spurs,129,Golden State Warriors,100,Box Score,,19596,
2,Tue Oct 25 2016,10:00 pm,Utah Jazz,104,Portland Trail Blazers,113,Box Score,,19446,
3,Wed Oct 26 2016,7:30 pm,Brooklyn Nets,117,Boston Celtics,122,Box Score,,18624,
4,Wed Oct 26 2016,7:00 pm,Dallas Mavericks,121,Indiana Pacers,130,Box Score,OT,17923,


In [277]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 10 columns):
Date               1309 non-null object
Start (ET)         1309 non-null object
Visitor/Neutral    1309 non-null object
PTS                1309 non-null int64
Home/Neutral       1309 non-null object
PTS.1              1309 non-null int64
Unnamed: 6         1309 non-null object
Unnamed: 7         73 non-null object
Attend.            1309 non-null int64
Notes              3 non-null object
dtypes: int64(3), object(7)
memory usage: 102.3+ KB


In [278]:
#Clean up the header row into something more appropiate
df = pd.read_csv(data_filename, parse_dates=["Date"])

df.columns = ["Date", "Start (ET)", "Visitor Team", "VisitorPts", "Home Team", "HomePts", "OT?", "Score Type", "Attendance","Notes"]

In [279]:
df.head()

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Attendance,Notes
0,2016-10-25,7:30 pm,New York Knicks,88,Cleveland Cavaliers,117,Box Score,,20562,
1,2016-10-25,10:30 pm,San Antonio Spurs,129,Golden State Warriors,100,Box Score,,19596,
2,2016-10-25,10:00 pm,Utah Jazz,104,Portland Trail Blazers,113,Box Score,,19446,
3,2016-10-26,7:30 pm,Brooklyn Nets,117,Boston Celtics,122,Box Score,,18624,
4,2016-10-26,7:00 pm,Dallas Mavericks,121,Indiana Pacers,130,Box Score,OT,17923,


## Extracting new features

In [280]:
#investigate home win stats
df['HomeWin'] = df['VisitorPts'] < df['HomePts']
y_true = df['HomeWin'].values

In [281]:
hw_percent = df['HomeWin'].mean()
print('The percentage of home wins is {0:.2f}'.format(hw_percent*100))

The percentage of home wins is 58.29


In [282]:
#predict which team is doing well, by looking at win of previous game
from collections import defaultdict
won_last = defaultdict(int)

In [283]:
#create new feature to store results of new features
df['HomeLastWin'] = 0
df['VisitorLastWin'] = 0

In [284]:
#update over rows and update current row with teams last results
for index, row in df.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeLastWin"] = won_last[home_team]
    df.set_value(index,"HomeLastWin",won_last[home_team])
    df.set_value(index,"VisitorLastWin",won_last[visitor_team])
    #update dict with either 1 or 0 depending on which team won current game
    won_last[home_team] = int(row["HomeWin"])
    won_last[visitor_team] = 1 - int(row["HomeWin"])  

In [285]:
df.head()

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Attendance,Notes,HomeWin,HomeLastWin,VisitorLastWin
0,2016-10-25,7:30 pm,New York Knicks,88,Cleveland Cavaliers,117,Box Score,,20562,,True,0,0
1,2016-10-25,10:30 pm,San Antonio Spurs,129,Golden State Warriors,100,Box Score,,19596,,False,0,0
2,2016-10-25,10:00 pm,Utah Jazz,104,Portland Trail Blazers,113,Box Score,,19446,,True,0,0
3,2016-10-26,7:30 pm,Brooklyn Nets,117,Boston Celtics,122,Box Score,,18624,,True,0,0
4,2016-10-26,7:00 pm,Dallas Mavericks,121,Indiana Pacers,130,Box Score,OT,17923,,True,0,0


In [286]:
df.iloc[1000:1005]

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Attendance,Notes,HomeWin,HomeLastWin,VisitorLastWin
1000,2017-03-14,8:00 pm,Portland Trail Blazers,77,New Orleans Pelicans,100,Box Score,,15530,,True,1,1
1001,2017-03-14,7:30 pm,Indiana Pacers,81,New York Knicks,87,Box Score,,18261,,True,0,1
1002,2017-03-15,7:30 pm,Minnesota Timberwolves,104,Boston Celtics,117,Box Score,,18624,,True,1,1
1003,2017-03-15,8:00 pm,Memphis Grizzlies,98,Chicago Bulls,91,Box Score,,21583,,False,1,1
1004,2017-03-15,7:30 pm,Utah Jazz,97,Detroit Pistons,83,Box Score,,14033,,False,0,1


In [287]:
#use new feature alone for prediction
seed=7
kfold = 5
clf = DecisionTreeClassifier(random_state = seed)
X_previous_wins = df[["HomeLastWin", "VisitorLastWin"]].values

In [288]:
scores = cross_val_score(clf,X_previous_wins,y_true,scoring='accuracy',cv=kfold)
print("Accuracy: {0:.1f}".format(np.mean(scores)*100))

Accuracy: 58.3


Using only this new feature, gives a result (only slightly better) than random value above 

### Try different features:
 - which team generally considered better
 - which team won last encounter
 Use information about standings:
  - https://www.basketball-reference.com/leagues/NBA_2016_standings.html#all_expanded_standings
 

In [289]:
#make new dataframe from standings data and use this information in first dataframe
standings_filename = os.path.join(data_folder,'standings.csv')
standings = pd.read_csv(standings_filename,skiprows=1)

In [290]:
standings.head()

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,Post,≤3,≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
0,1,Golden State Warriors,73-9,39-2,34-7,27-3,46-6,9-1,8-2,10-0,...,25-5,7-2,44-5,3-0,16-0,11-2,14-2,9-1,15-2,5-2
1,2,San Antonio Spurs,67-15,40-1,27-14,24-6,43-9,9-1,7-3,8-2,...,22-7,4-4,44-6,1-1,13-3,14-2,11-2,11-1,13-3,4-3
2,3,Cleveland Cavaliers,57-25,33-8,24-17,35-17,22-8,14-4,8-8,13-5,...,19-11,4-7,32-8,2-1,11-3,8-5,13-3,8-5,11-5,4-3
3,4,Toronto Raptors,56-26,32-9,24-17,39-13,17-13,14-2,11-7,14-4,...,21-9,6-6,28-10,2-0,9-7,9-6,12-2,7-4,11-5,6-2
4,5,Oklahoma City Thunder,55-27,32-9,23-18,18-12,37-15,6-4,4-6,8-2,...,15-13,8-6,33-5,2-0,9-7,12-3,13-3,6-5,11-5,2-4


In [291]:
standings["Team"].values[0]

'Golden State Warriors'

In [292]:
#make new column in original dataset
df["HomeTeamRanksHigher"] = 0

for index, row in df.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    #get standings rank from standings dataset for the home and visitor team
    home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
    visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
    row["HomeTeamRanksHigher"] = int(home_rank > visitor_rank)
    df.set_value(index,"HomeTeamRanksHigher",int(home_rank < visitor_rank))
     

In [293]:
X_home_higher = df[["HomeLastWin", "VisitorLastWin","HomeTeamRanksHigher"]].values
clf = DecisionTreeClassifier(random_state = seed)
scores = cross_val_score(clf,X_home_higher,y_true,scoring='accuracy',cv=kfold)
print("Accuracy: {0:.1f}".format(np.mean(scores)*100))

Accuracy: 62.9


Improvement over previous result

In [294]:
#Add info about how the teams perform against each other
last_match_winner = defaultdict(int)
df["HomeTeamWonLast"] = 0

for index,row in df.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    teams = tuple(sorted([home_team,visitor_team]))
    #who won last encounter
    home_team_won_last = 1 if last_match_winner[teams] == row["Home Team"] else 0
    df.set_value(index,"HomeTeamWonLast",home_team_won_last)
    #who won this one?
    winner = row["Home Team"] if row["HomeWin"] else row["Visitor Team"]
    last_match_winner[teams] = winner

In [295]:
#Creates a tuple called teams, store the previous result in a 
#dictionary. When those two teams play each other next, recreate the tuple, and look up the previous result. 

In [296]:
X_lastwinner = df[[ "HomeTeamWonLast", "HomeTeamRanksHigher","HomeLastWin", "VisitorLastWin",]].values
clf = DecisionTreeClassifier(random_state=seed, criterion="entropy")
scores = cross_val_score(clf, X_lastwinner, y_true, scoring='accuracy',cv=kfold)
print("Accuracy: {0:.1f}".format(np.mean(scores)*100))

Accuracy: 64.8


Accuracy has improved with this new feature

In [297]:
#try encoding categorical data as integers
encoding = LabelEncoder()
encoding.fit(df["Home Team"].values)
#use same transformer for homw and away, so same team gets same integer value as both home/away
home_teams = encoding.transform(df["Home Team"].values)
visitor_teams = encoding.transform(df["Visitor Team"].values)
#result is [home[0] vis[0]] , [home[1], vis[1]]..... 
X_teams = np.vstack([home_teams, visitor_teams]).T

In [298]:
print(X_teams[:2]);print(home_teams[:2]);print(visitor_teams[:2]);

[[ 5 19]
 [ 9 26]]
[5 9]
[19 26]


In [299]:
#should have one hot encoded; teams an integer in range, algo. may place importance 
#on the int val eg 1 and 2 similar but 4 and 10 not

In [300]:
onehot = OneHotEncoder()
X_teams = onehot.fit_transform(X_teams)
X_teams

<1309x60 sparse matrix of type '<class 'numpy.float64'>'
	with 2618 stored elements in Compressed Sparse Row format>

In [301]:
#have to convert to dense matrix for later processing using hp stack
X_teams = X_teams.todense()
#X_teams

In [302]:
clf = DecisionTreeClassifier(random_state=seed)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy',cv=kfold)
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 59.1%


In [303]:
#Using just the teams playing does not give better results

In [304]:
X_teams

matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  1.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

## Try random forest?
 - 1) Bagging - randomly sample our datset - effectively creating new sets
 - 2) Random subset of features

In [305]:
clf = RandomForestClassifier(random_state=seed)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy',cv=kfold)
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 59.9%


Using just the teams and random forest see small improvement.  Try throwing more features at it:

In [307]:
X_all = np.column_stack([X_lastwinner, X_teams])
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy',cv=kfold)
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 60.3%


Better accuracy, lets try various parameters using grid search

In [311]:
params = {
    "max_features": [2, 10, 'auto'],
    "n_estimators": [100, 200],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6],
}
clf = RandomForestClassifier(random_state=seed)
grid = GridSearchCV(clf, parameter_space,cv=kfold)
grid.fit(X_all, y_true)
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))

Accuracy: 63.8%


In [312]:
print(grid.best_estimator_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=6, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=7, verbose=0, warm_start=False)


Overall better accuracy using more engineered features and random forest