In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import tree
import matplotlib.pyplot as plt 

In [2]:
home = pd.read_csv('home_completed.csv')
away = pd.read_csv('away_completed.csv')

In [3]:
#put all data into one large data frame
main = home.append(away, ignore_index=True).sort_values(['Checker']) # puts into date order
#put columns into more organized order
main = main[['Year','Month', 'Day' , 'Team', 'Spread', 'Line', '+/-', '3P', '3P%', '3PA', '3PAr', 'AST', 'BLK', 'DRB', 'Day',
       'FG', 'FG%', 'FGA', 'FT', 'FT%', 'FTA', 'FTr', 'Home', 'Losses',
        'ORB', 'PF', 'PTS', 'STL', 'Starters MP', 'TOV', 'TS%', 'Wins',
        'eFG%',  'Checker']]
#number of points a team won or lost by
main['+/-'] = main['+/-']/5
#puts the odds into floats
main['Spread'] = main['Spread'].replace(' PK', '0').astype(float)
#gets how many points a team went over or under the spread
main['Cover'] = main['Spread'] + main['+/-']
#true or false on whether a team covered the spread
main['ATS'] = np.where(main['Cover'].isnull(), np.nan,
          np.where(main['Cover'] > 0,   1, 0))
teams = list(main['Team'])
new_teams = []
#adds the opponent to the df
for i in range(0, len(teams), 2):
    new_teams.append(teams[i+1])
    new_teams.append(teams[i])
main['Opponent'] = new_teams
main = main.dropna()

In [4]:
def team_df(team):
    t = main[main['Team']== team].reset_index()
    del t['index']
    t['Season'] = t['Checker'].apply(season) #define what season a game is played
    t['Games Played'] = t['Wins'] + t['Losses'] #total games played
    #create running averages for all stats up to a given game date
    t_df = pd.DataFrame()
    for year in years:
        temp = t[t['Season']==year].reset_index()
        temp['Avg Win/Loss'] = (temp['+/-'].cumsum() - temp['+/-'])/temp.index
        temp['Avg 3P'] = (temp['3P'].cumsum() - temp['3P'])/temp.index
        temp['Avg 3PA'] = (temp['3PA'].cumsum() - temp['3PA'])/temp.index
        temp['Avg 3P%'] = (temp['Avg 3P']/temp['Avg 3PA'])
        temp['Avg Starter MP'] = (temp['Starters MP'].cumsum() - temp['Starters MP'])/temp.index
        temp['Avg AST'] = (temp['AST'].cumsum() - temp['AST'])/temp.index
        temp['Avg BLK'] = (temp['BLK'].cumsum() - temp['BLK'])/temp.index
        temp['Avg DRB'] = (temp['DRB'].cumsum() - temp['DRB'])/temp.index
        temp['Avg ORB'] = (temp['ORB'].cumsum() - temp['ORB'])/temp.index
        temp['Avg FG'] = (temp['FG'].cumsum() - temp['FG'])/temp.index
        temp['Avg FGA'] = (temp['FGA'].cumsum() - temp['FGA'])/temp.index
        temp['Avg FG%'] = (temp['Avg FG']/temp['Avg FGA'])
        temp['Avg FT'] = (temp['FT'].cumsum() - temp['FT'])/temp.index
        temp['Avg FTA'] = (temp['FTA'].cumsum() - temp['FTA'])/temp.index
        temp['Avg 3PAr'] = (temp['Avg 3PA']/temp['Avg FGA'])
        temp['Avg FTr'] = (temp['Avg FTA']/temp['Avg FGA'])
        temp['Avg 3P%'] = (temp['Avg FT']/temp['Avg FTA'])/temp.index
        temp['Avg PF'] = (temp['PF'].cumsum() - temp['PF'])/temp.index
        temp['Avg PTS'] = (temp['PTS'].cumsum() - temp['PTS'])/temp.index
        temp['Avg STL'] = (temp['STL'].cumsum() - temp['STL'])/temp.index
        temp['Avg TOV'] = (temp['TOV'].cumsum() - temp['TOV'])/temp.index
        temp['Avg TS%'] = temp['Avg PTS']/ (2*(temp['Avg FGA'] + .44 * temp['Avg FTA']))
        temp['Avg eFG%'] = (temp['Avg FG'] + .5 * temp['Avg 3P'])/ temp['Avg FGA']
        temp['Avg Starter MP'] = (temp['Starters MP'].cumsum() - temp['Starters MP'])/temp.index
        temp['Winning Percentage'] = temp['Wins']/temp.index
        t_df = t_df.append(temp)
    return t_df

In [5]:
#put each row into a season
def season(date):
    date = date[:6]
    if date < '200809':
        return '2007'
    elif date < '200909':
        return '2008'
    elif date < '201009':
        return '2009'
    elif date < '201109':
        return '2010'
    elif date < '201109':
        return '2010'
    elif date < '201209':
        return '2011'
    elif date < '201309':
        return '2012'
    elif date < '201409':
        return '2013'
    elif date < '201509':
        return '2014'
    elif date < '201609':
        return '2015'
    elif date < '201709':
        return '2016'
    elif date < '201809':
        return '2017'
    elif date < '201909':
        return '2018'

In [6]:
teams = set(teams)
years = ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']
all_team_data = {}
for team in teams:
    all_team_data[team] = team_df(team)

In [7]:
giant_df = pd.DataFrame()
for df in all_team_data.values():
    giant_df = giant_df.append(df)

In [8]:
for team in teams:
    t = all_team_data[team]
    opp = giant_df[giant_df['Opponent']==team]
    opp = opp.rename(columns = {'Avg Win/Loss':'oppAvg Win/Loss', 'Avg 3P':'oppAvg 3P', 'Avg 3PA':'oppAvg 3PA' ,
       'Avg 3P%':'oppAvg 3P%', 'Avg Starter MP':'oppAvg Starter MP', 'Avg AST':'oppAvg AST', 'Avg BLK':'oppAvg BLK', 'Avg DRB':'oppAvg DRB', 'Avg ORB':'oppAvg ORB',
       'Avg FG':'oppAvg FG', 'Avg FGA':'oppAvg FGA', 'Avg FG%':'oppAvg FG%', 'Avg FT':'oppAvg FT', 'Avg FTA':'oppAvg FTA', 'Avg 3PAr':'oppAvg 3PAr',
       'Avg FTr':'oppAvg FTr', 'Avg PF':'oppAvg PF', 'Avg PTS':'oppAvg PTS', 'Avg STL':'oppAvg STL', 'Avg TOV':'oppAvg TOV', 'Avg TS%':'oppAvg TS%',
       'Avg eFG%':'oppAvg eFG%', 'Winning Percentage':'oppWinning Percentage'})
    opp = opp[['Checker', 'oppAvg Win/Loss', 'oppAvg 3P', 'oppAvg 3PA',
       'oppAvg 3P%', 'oppAvg Starter MP', 'oppAvg AST', 'oppAvg BLK',
       'oppAvg DRB', 'oppAvg ORB', 'oppAvg FG', 'oppAvg FGA', 'oppAvg FG%',
       'oppAvg FT', 'oppAvg FTA', 'oppAvg 3PAr', 'oppAvg FTr', 'oppAvg PF',
       'oppAvg PTS', 'oppAvg STL', 'oppAvg TOV', 'oppAvg TS%', 'oppAvg eFG%',
       'oppWinning Percentage']]
    all_team_data[team] = pd.merge(t,opp)

In [9]:
phi = all_team_data['Philadelphia']

In [10]:
total_matrix = phi[['Cover', '+/-', 'Wins', 'Losses', 'Home', 'Avg Win/Loss', 'Avg 3P',
       'Avg 3PA', 'Avg 3P%', 'Avg Starter MP', 'Avg AST', 'Avg BLK', 'Avg DRB',
       'Avg ORB', 'Avg FG', 'Avg FGA', 'Avg FG%', 'Avg FT', 'Avg FTA',
       'Avg 3PAr', 'Avg FTr', 'Avg PF', 'Avg PTS', 'Avg STL', 'Avg TOV',
       'Avg TS%', 'Avg eFG%', 'Winning Percentage', 'oppAvg Win/Loss','oppAvg 3P',
        'oppAvg 3PA', 'oppAvg 3P%', 'oppAvg Starter MP',
        'oppAvg AST', 'oppAvg BLK', 'oppAvg DRB', 'oppAvg ORB', 'oppAvg FG',
        'oppAvg FGA', 'oppAvg FG%', 'oppAvg FT', 'oppAvg FTA', 'oppAvg 3PAr',
        'oppAvg FTr', 'oppAvg PF', 'oppAvg PTS', 'oppAvg STL', 'oppAvg TOV',
        'oppAvg TS%', 'oppAvg eFG%', 'oppWinning Percentage']]
total_matrix = total_matrix.dropna()

In [11]:
feature_matrix = total_matrix[['Avg 3P%', 'Avg Starter MP', 'Avg AST', 'Avg ORB', 'Avg FG%', 'Avg FT', 'Avg FTA',
                               'Avg 3PAr', 'Avg FTr', 'Avg PF', 'Avg TOV',
                               'Avg TS%', 'Avg eFG%', 'Winning Percentage', 'oppAvg Win/Loss','oppAvg 3P',
                               'oppAvg 3PA', 'oppAvg 3P%', 'oppAvg Starter MP',
                               'oppAvg AST', 'oppAvg BLK', 'oppAvg DRB', 'oppAvg ORB', 'oppAvg FG',
                               'oppAvg FGA', 'oppAvg FG%', 'oppAvg FT', 'oppAvg FTA', 'oppAvg 3PAr',
                               'oppAvg FTr', 'oppAvg PF', 'oppAvg PTS', 'oppAvg STL', 'oppAvg TOV',
                               'oppAvg TS%', 'oppAvg eFG%', 'oppWinning Percentage']]

In [12]:
results = total_matrix['+/-']
#create target matrix 
# -1 = loss, 0 = tie, 1 = win
temp_target = results.as_matrix()

  after removing the cwd from sys.path.


In [13]:
def create_target(temp_target):
    target = []
    for result in temp_target:
        if result < 0:
            target.append(-1)
        elif result == 0:
            target.append(0)
        else:
            target.append(1)
    return target
target = create_target(temp_target)

In [14]:
#split the data into training and testing data
feature_train, feature_test, target_train, target_test = train_test_split(feature_matrix, target, test_size=0.25, random_state=42)

In [15]:
#Create a decision tree model
DTmodel = tree.DecisionTreeClassifier()
DTmodel = DTmodel.fit(feature_train, target_train)

In [16]:
DTmodel

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [17]:
predict = DTmodel.predict(feature_test)

In [18]:
accuracy = accuracy_score(target_test, predict)
accuracy

0.6050420168067226

**Parameter tuning for Decision Tree Classifier:**

In [19]:
max_depths = np.linspace(1, 50, 50)

In [20]:
min_samples_split = np.linspace(0.1, 1.0, 10)

In [21]:
min_samples_leaf = np.linspace(0.1, .5, 10)

In [26]:
d = 0
s = 0
l = 0 
max_acc = 0
best_tree = tree.DecisionTreeClassifier()
for x in max_depths:
    for y in min_samples_split:
        for z in min_samples_leaf:
                model = tree.DecisionTreeClassifier(max_depth=x, min_samples_split = y, min_samples_leaf=z)
                model = model.fit(feature_train, target_train)
                predict = model.predict(feature_test)
                acc = accuracy_score(target_test, predict)
                if acc > max_acc:
                    max_acc = acc
                    best_tree = model
                    d, s, l = x, y, z

In [27]:
max_acc

0.6932773109243697

In [28]:
d, s, l

(3.0, 0.2, 0.1)

In [29]:
#The code below from: 
#https://stackoverflow.com/questions/25274673/is-it-possible-to-print-the-decision-tree-in-scikit-learn
from sklearn import tree
from sklearn.externals.six import StringIO  
import pydot 

dot_data = StringIO() 
tree.export_graphviz(best_tree, out_file=dot_data) 
graph = pydot.graph_from_dot_data(dot_data.getvalue()) 

graph[0].write_pdf("tree.pdf")

#This creates a pdf showing the decision tree created by the classifier. 

In [25]:
important = DTmodel.feature_importances_
len(important)
cols = np.array(feature_matrix.columns)
for c in cols:
    if important[np.where(cols == c)] > 0:
        print(c)

Avg 3P%
Avg Starter MP
Avg AST
Avg ORB
Avg FG%
Avg FT
Avg FTA
Avg 3PAr
Avg FTr
Avg PF
Avg TOV
Avg TS%
Avg eFG%
Winning Percentage
oppAvg Win/Loss
oppAvg 3P
oppAvg 3PA
oppAvg Starter MP
oppAvg AST
oppAvg BLK
oppAvg DRB
oppAvg ORB
oppAvg FG
oppAvg FGA
oppAvg FTA
oppAvg 3PAr
oppAvg FTr
oppAvg PF
oppAvg PTS
oppAvg STL
oppAvg TOV
oppAvg TS%
oppAvg eFG%
oppWinning Percentage


In [31]:
#get important features from tree
important = best_tree.feature_importances_
len(important)
cols = np.array(feature_matrix.columns)

for c in cols:
    if important[np.where(cols == c)] > 0:
        print(c)

Avg Starter MP
Avg PF
Avg TOV
Avg TS%
Winning Percentage
oppAvg Win/Loss
oppWinning Percentage


In [32]:
feature_matrix.columns

Index(['Avg 3P%', 'Avg Starter MP', 'Avg AST', 'Avg ORB', 'Avg FG%', 'Avg FT',
       'Avg FTA', 'Avg 3PAr', 'Avg FTr', 'Avg PF', 'Avg TOV', 'Avg TS%',
       'Avg eFG%', 'Winning Percentage', 'oppAvg Win/Loss', 'oppAvg 3P',
       'oppAvg 3PA', 'oppAvg 3P%', 'oppAvg Starter MP', 'oppAvg AST',
       'oppAvg BLK', 'oppAvg DRB', 'oppAvg ORB', 'oppAvg FG', 'oppAvg FGA',
       'oppAvg FG%', 'oppAvg FT', 'oppAvg FTA', 'oppAvg 3PAr', 'oppAvg FTr',
       'oppAvg PF', 'oppAvg PTS', 'oppAvg STL', 'oppAvg TOV', 'oppAvg TS%',
       'oppAvg eFG%', 'oppWinning Percentage'],
      dtype='object')

**Random forest classifier:**

In [28]:
RFmodel = RandomForestClassifier(n_estimators = 100)
RFmodel = RFmodel.fit(feature_train, target_train)
RFpredict = RFmodel.predict(feature_test)

In [29]:
acc = accuracy_score(target_test, RFpredict)
acc

0.6890756302521008

In [30]:
maxAcc = 0
est = 0
for i in range(1,100):
    RFmodel = RandomForestClassifier(n_estimators=i)
    RFmodel = RFmodel.fit(feature_train, target_train)
    RFpredict = RFmodel.predict(feature_test)
    acc = accuracy_score(target_test, RFpredict)
    if acc > maxAcc:
        maxAcc = acc
        est = i

In [31]:
print(maxAcc)
print(est)

0.6932773109243697
38


In [32]:
RFmodel = RandomForestClassifier(n_estimators=75)
RFmodel = RFmodel.fit(feature_train, target_train)
RFpredict = RFmodel.predict(feature_test)
acc = accuracy_score(target_test, RFpredict)
acc

0.680672268907563

In [80]:
def get_team_data(team):
    stats = all_team_data[team]
    stats = stats.dropna()
    feature_matrix = stats[['Avg 3P%', 'Avg Starter MP', 'Avg AST', 'Avg ORB', 'Avg FG%', 'Avg FT', 'Avg FTA',
                               'Avg 3PAr', 'Avg FTr', 'Avg PF', 'Avg TOV',
                               'Avg TS%', 'Avg eFG%', 'Winning Percentage', 'oppAvg Win/Loss','oppAvg 3P',
                               'oppAvg 3PA', 'oppAvg 3P%', 'oppAvg Starter MP',
                               'oppAvg AST', 'oppAvg BLK', 'oppAvg DRB', 'oppAvg ORB', 'oppAvg FG',
                               'oppAvg FGA', 'oppAvg FG%', 'oppAvg FT', 'oppAvg FTA', 'oppAvg 3PAr',
                               'oppAvg FTr', 'oppAvg PF', 'oppAvg PTS', 'oppAvg STL', 'oppAvg TOV',
                               'oppAvg TS%', 'oppAvg eFG%', 'oppWinning Percentage']]
    results = stats[['+/-']]
    results = results.values
    target = create_target(results)
    return feature_matrix, target

In [81]:
def get_team_season_data(team, year):
    stats = all_team_data[team]
    season_stats = stats.loc[stats['Season'] == year]
    season_stats = season_stats.dropna()
    feature_matrix = season_stats[['Avg 3P%', 'Avg Starter MP', 'Avg AST', 'Avg ORB', 'Avg FG%', 'Avg FT', 'Avg FTA',
                               'Avg 3PAr', 'Avg FTr', 'Avg PF', 'Avg TOV',
                               'Avg TS%', 'Avg eFG%', 'Winning Percentage', 'oppAvg Win/Loss','oppAvg 3P',
                               'oppAvg 3PA', 'oppAvg 3P%', 'oppAvg Starter MP',
                               'oppAvg AST', 'oppAvg BLK', 'oppAvg DRB', 'oppAvg ORB', 'oppAvg FG',
                               'oppAvg FGA', 'oppAvg FG%', 'oppAvg FT', 'oppAvg FTA', 'oppAvg 3PAr',
                               'oppAvg FTr', 'oppAvg PF', 'oppAvg PTS', 'oppAvg STL', 'oppAvg TOV',
                               'oppAvg TS%', 'oppAvg eFG%', 'oppWinning Percentage']]
    results = season_stats[['+/-']]
    results = results.values
    target = create_target(results)
    return feature_matrix, target

In [82]:
def season_model(team, year):
    feature_matrix, target = get_team_season_data(team, year)
    f_train, f_test, t_train, t_test = train_test_split(feature_matrix, target, test_size=0.5, random_state=42)
    model = tree.DecisionTreeClassifier(max_depth=d, min_samples_split =s, min_samples_leaf=l)
    model = model.fit(f_train, t_train)
    predict = model.predict(f_test)
    acc = accuracy_score(t_test, predict)
    
    #get important features for the season
    important = model.feature_importances_
    cols = np.array(feature_matrix.columns)
    print(team + " " + year + ":")
    print("Accuracy: " + str(acc))
    print("Important Features:")
    for c in cols:
        if important[np.where(cols == c)] > 0:
            print('\t' + c)
    print('\n')

In [83]:
for year in years:
    season_model('Philadelphia', year)

Philadelphia 2007:
Accuracy: 0.6666666666666666
Important Features:
	oppAvg Starter MP
	oppAvg PF
	oppAvg PTS
	oppAvg eFG%


Philadelphia 2008:
Accuracy: 0.4878048780487805
Important Features:
	Avg ORB
	Avg TOV
	Avg TS%
	oppAvg Win/Loss
	oppAvg PF


Philadelphia 2009:
Accuracy: 0.6216216216216216
Important Features:
	Avg PF
	oppAvg AST
	oppAvg ORB
	oppAvg FG


Philadelphia 2010:
Accuracy: 0.5897435897435898
Important Features:
	Winning Percentage
	oppAvg FG%


Philadelphia 2011:
Accuracy: 0.5945945945945946
Important Features:
	Avg TOV
	oppAvg eFG%
	oppWinning Percentage


Philadelphia 2012:
Accuracy: 0.358974358974359
Important Features:
	Avg eFG%
	oppAvg BLK
	oppAvg FG


Philadelphia 2013:
Accuracy: 0.675
Important Features:
	Avg FTr
	oppAvg Win/Loss
	oppAvg FTA


Philadelphia 2014:
Accuracy: 0.5853658536585366
Important Features:
	oppAvg Win/Loss
	oppAvg 3P%
	oppAvg BLK


Philadelphia 2015:
Accuracy: 0.8048780487804879
Important Features:
	Avg AST
	oppWinning Percentage


Philadelph

In [87]:
#Try model on other teams
def predict_team(model, team):
    feature, target = get_team_data(team)
    predict = model.predict(feature)
    acc = accuracy_score(target, predict)
    print(team + ": " + str(acc))

In [88]:
for team in teams:
    predict_team(tree_model, team)

New York: 0.5692963752665245
Milwaukee: 0.5264270613107822
L.A. Clippers: 0.601842374616172
San Antonio: 0.5771428571428572
Indiana: 0.5491388044579534
Minnesota: 0.6409978308026031
Chicago: 0.574468085106383
Charlotte: 0.588855421686747
Philadelphia: 0.6789473684210526
Brooklyn: 0.5723684210526315
Houston: 0.5869346733668341
New Orleans: 0.49375866851595007
Memphis: 0.5667686034658511
Oklahoma City: 0.6247357293868921
Utah: 0.5799793601651186
Toronto: 0.6086508753861998
Orlando: 0.5915637860082305
Detroit: 0.5615711252653928
Golden State: 0.5616438356164384
Phoenix: 0.6240681576144835
Denver: 0.570385818561001
Atlanta: 0.6067527308838133
Sacramento: 0.5610021786492375
Cleveland: 0.6085271317829457
L.A. Lakers: 0.5816733067729084
Boston: 0.586565752128666
Washington: 0.6058394160583942
Portland: 0.572463768115942
Dallas: 0.6069600818833163
Miami: 0.5672514619883041


In [None]:
def predict_plot_season(feature, target):
    predict = RFmodel.predict(feature)
    correctness = []
    for i in predict:
        if i == target[i]:
            correctness.append(1)
        else:
            correctness.append(0)
    game_count = np.linspace(1, len(predict), len(predict))
    plt.scatter(game_count, correctness)
    plt.show()
    print(acc)