In [8]:
import sqlalchemy
import pandas as pd
import numpy as np
engine = sqlalchemy.create_engine('mysql://root:password@127.0.0.1:3306/nba')

In [2]:
sql = """
select l.game_date_est,
-- PLAYER STATS
t.PLAYER_ID,
t.MIN as player_MIN,
t.FGA as player_FGA,
t.FGM as player_FGM,
t.FG3M as player_FG3M,
t.FG3A as player_FG3A,
t.FTA as player_FTA,
t.FTM as player_FTM,
t.OREB as player_OREB,
t.DREB as player_DREB,
t.AST as player_AST,
t.STL as player_STL,
t.BLK as player_BLK,
t.TO as player_TO,
t.PTS as player_PTS,
a.OFF_RATING as player_OFF_RATING,
a.DEF_RATING as player_DEF_RATING,
a.NET_RATING as player_NET_RATING,
a.PIE as player_PIE,
t.PLUS_MINUS as player_PLUS_MINUS,
-- PLAYER TEAM STATS
t_p.PLUS_MINUS as team_PLUS_MINUS,
-- OPPOSING TEAM STATS
t_o.OREB as opponent_OREB,
t_o.DREB as opponent_DREB,
t_o.STL as opponent_STL,
t_o.BLK as opponent_BLK,
a_o.OFF_RATING as opponent_OFF_RATING,
a_o.DEF_RATING as opponent_DEF_RATING,
a_o.NET_RATING as opponent_NET_RATING,
t.score
from traditional_boxscores t
join line_score l on l.game_id = t.game_id and l.team_id = t.team_id
join advanced_boxscores a on a.game_id = t.game_id and a.player_id = t.player_id and a.team_id = t.team_id
join advanced_boxscores_team a_o on t.game_id = a_o.game_id and t.team_id != a_o.team_id
join traditional_boxscores_team t_p on t_p.game_id = t.game_id and t.team_id = t_p.team_id
join traditional_boxscores_team t_o on t_o.game_id = t.game_id and t.team_id != t_o.team_id
"""
og_df = pd.read_sql(sql, engine)
og_df['game_date_est'] = pd.to_datetime(og_df.game_date_est)
og_df['player_MIN'] = og_df['player_MIN'].apply(lambda t: int(t.split(":")[0]) * 60 + int(t.split(":")[1]))
og_df.sort_values(['PLAYER_ID', 'game_date_est'], inplace=True)

df = og_df

In [3]:
grouped_players = df.groupby('PLAYER_ID')
player_play_time = grouped_players['player_MIN'].sum()

import operator

sorted_play_times = sorted(player_play_time.iteritems(), key=operator.itemgetter(1), reverse=True)

top_player_ids = [player_id for (player_id, playtime) in sorted_play_times[:200]]

In [4]:
cols = df.keys()
initial_cols = cols
excluded = ['game_date_est', 'PLAYER_ID', 'score', 'index']
# df for every game that every player played in
player_games = pd.DataFrame()

# watch out for PLUS_MINUS! everything else can be averaged. Need to delete index before render to CSV!

for i, player_id in enumerate(top_player_ids):
    if (i + 1) % 20 == 0:
        print "%0.2f%%" % ((float(i + 1) / len(top_player_ids)) * 100),
    player_df = df[(df['PLAYER_ID'] == player_id)].sort_values(['game_date_est'])
    player_df['index'] = range(len(player_df)) 
    for col in cols:
        if 'PLUS_MINUS' not in col and col not in excluded:
            # take a cumulative sum, but remove the value of the current row!
            cumulative_sum = player_df[col].cumsum() - player_df[col]
            player_df[col + '_prev_out'] = pd.rolling_mean(player_df[col], 1).shift(1)
            player_df[col + '_mean_3_out'] = pd.rolling_mean(player_df[col], 3).shift(1)
            player_df[col + '_mean_5_out'] = pd.rolling_mean(player_df[col], 5).shift(1)
            player_df[col + '_mean_out'] = cumulative_sum.div(player_df['index'] + 1)
        #elif 'PLUS_MINUS' in col:
        #    cumulative_sum = player_df[col].cumsum() - player_df[col]
        #    player_df[col + '_cumsum_out'] = cumulative_sum
    player_games = player_games.append(player_df.fillna(0))

10.00% 20.00% 30.00% 40.00% 50.00% 60.00% 70.00% 80.00% 90.00% 100.00%


In [5]:
cols = player_games.keys()
# we want score to come last!
csv_cols = [col for col in cols if col not in excluded and col.endswith('_out')] + ['score']

# normalize the output columns -- do we want to normalize the score here??
#player_games[csv_cols] = player_games[csv_cols].apply(lambda x: (x - x.mean()) / (x.max() - x.min()))

# exclude first 10 games of the season
output_player_games = player_games[player_games['index'] > 10]

In [9]:
data_cols = [col for col in csv_cols if col != 'score']
target_cols = ['score']
input_df = output_player_games[csv_cols]
shuffled = input_df.reindex(np.random.permutation(input_df.index))

nba_input_data  = shuffled[data_cols].as_matrix()
nba_target_data = shuffled['score'].as_matrix()


In [10]:
import numpy as np
from sklearn import cross_validation, preprocessing
from sklearn.metrics import mean_squared_error

import skflow

# Create random dataset.
X = nba_input_data
y = nba_target_data

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2, random_state=42)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# 9.8 mean error!
hidden_units = [1000, 1000]
regressor = skflow.TensorFlowDNNRegressor(hidden_units=hidden_units, steps=1000, learning_rate=0.1, batch_size=100)

regressor.fit(X_train, y_train)
score = mean_squared_error(regressor.predict(X_test), y_test)
print("Mean Squared Error for {0}: {1:f}".format(str(hidden_units), score))

Step #1, avg. loss: 782.17108
Step #101, epoch #1, avg. loss: 198.94139
Step #201, epoch #2, avg. loss: 175.67123
Step #301, epoch #3, avg. loss: 163.27388
Step #401, epoch #5, avg. loss: 151.62372
Step #501, epoch #6, avg. loss: 142.44727
Step #601, epoch #7, avg. loss: 129.51329
Step #701, epoch #9, avg. loss: 122.73361
Step #801, epoch #10, avg. loss: 113.05290
Step #901, epoch #11, avg. loss: 101.54024
Mean Squared Error for [1000, 1000]: 143.703241


In [11]:
result = regressor.predict(scaler.fit_transform(nba_input_data))

In [15]:
import urllib2
from bs4 import BeautifulSoup

schedule_url = "http://www.nba.com/gameline/20160310/"
html = urllib2.urlopen(schedule_url).read()

In [104]:
soup = BeautifulSoup(html, 'html.parser')
game_divs = soup.findAll("div", { "class" : "nbaTeamsRow" })
games = []
for div in game_divs:
    teams = tuple([str(team_div.text) for team_div in div.findAll("h5")])
    if teams not in games:
        games.append(teams)

todays_teams = []
for (t1, t2) in games:
    if t1 not in todays_teams:
        todays_teams.append(t1)
    if t2 not in todays_teams:
        todays_teams.append(t2)
teams_sql = ",".join(["'%s'" % team_abbr for team_abbr in todays_teams])
sql = "select distinct team_abbreviation, TEAM_ID from line_score where TEAM_ABBREVIATION in (%s)" % teams_sql
df = pd.read_sql(sql, engine)


abbr_id_map = {}
for (abbr, tid) in df.values:
    abbr_id_map[abbr] = int(tid)

sql = 'select distinct team_id, player_id from traditional_boxscores'
team_player_df = pd.read_sql(sql, engine)

from collections import defaultdict
    
team_player_map = defaultdict(list)
for (team_id, player_id) in team_player_df.values:
    if int(player_id) not in team_player_map[int(team_id)]:
        team_player_map[int(team_id)].append(int(player_id))

In [105]:
player_id_set = set()
predict_df = pd.DataFrame()
for player_ids in team_player_map.values():
    for player_id in player_ids:
        player_id_set.add(player_id)
        player_df = output_player_games[output_player_games['PLAYER_ID'] == str(player_id)]
        predict_df = predict_df.append(player_df)

In [218]:
def add_row(player_id): 
    player_df = predict_df[(predict_df['PLAYER_ID'] == str(player_id))].sort_values(['game_date_est'])
    player_df['index'] = range(len(player_df))
    
    if len(player_df) == 0:
        return None
    
    tmp_row = player_df.tail(1)
    new_row = pd.DataFrame(columns=["game_date_est"], data=[pd.to_datetime('2016-03-09')])
    for col in initial_cols:
        if col not in ['index', 'game_date_est']:
            new_row[col] = tmp_row[col].values[0]
    
    for col in initial_cols:
        if 'PLUS_MINUS' not in col and col not in excluded and len(player_df) > 0:
            cumulative_sum = player_df[col].cumsum()
            try:
                new_row[col + '_prev_out'] = pd.rolling_mean(player_df[col], 1).values[-1]
                new_row[col + '_mean_3_out'] = pd.rolling_mean(player_df[col], 3).values[-1]
                new_row[col + '_mean_5_out'] = pd.rolling_mean(player_df[col], 5).values[-1]
                new_row[col + '_mean_out'] = cumulative_sum.div(player_df['index'] + 1).values[-1]
            except ValueError as e:
                pass # lol
    return new_row

In [219]:
# add a new row for each player-game (above)!
# sort predict_df by player_id, game_date_est

player_games = pd.DataFrame()

for i, player_id in enumerate(player_id_set):
    if i % 100 == 0:
        print "%d out of %d [%d]" % (i, len(player_id_set), len(player_games))
    new_row = add_row(player_id)
    if new_row is None:
        continue
    player_games = player_games.append(new_row.fillna(0))

0 out of 463 [0]
100 out of 463 [35]
200 out of 463 [77]
300 out of 463 [125]
400 out of 463 [170]


In [282]:
data_cols = [col for col in csv_cols if col != 'score']
prediction_data = player_games[data_cols].as_matrix()

expected_scores = regressor.predict(scaler.fit_transform(prediction_data))

player_scores = {}
for player_id, score in zip(player_games['PLAYER_ID'].values, [i[0] for i in expected_scores]):
    player_scores[player_id] = score
    
sorted_players = sorted(player_scores.items(), key=operator.itemgetter(1), reverse=True)

predicted = []
for player_id, score in sorted_players[0:20]:
    sql = 'select player_name from traditional_boxscores where player_id = %d limit 1' % int(player_id)
    name = pd.read_sql(sql, engine).values[0][0]
    #print "%s: %d" % (name, player_scores[player_id])
    predicted.append((name, player_scores[player_id]))

In [288]:
real_data = open('actual_data.txt').read().split('\n')

In [295]:
actual = [(i.split(";")[3], i.split(";")[5]) for i in real_data[1:21]]
ideal = actual[0:20]

data = []
for p, i in zip(predicted, ideal):
    data.append([i[0], i[1], p[0], p[1]])
    
df = pd.DataFrame(columns=["Actual player", "Actual_Score", "Predicted Player", "Predicted Score"], data=data)

In [296]:
df

Unnamed: 0,Actual player,Actual_Score,Predicted Player,Predicted Score
0,"Westbrook, Russell",67.2,Draymond Green,42.408337
1,"Walker, Kemba",57.9,Stephen Curry,41.635914
2,"Harden, James",51.8,John Wall,41.100163
3,"Paul, Chris",48.6,Rajon Rondo,40.474224
4,"Holiday, Jrue",47.8,Russell Westbrook,40.241577
5,"Middleton, Khris",40.4,Reggie Jackson,39.392998
6,"Collison, Darren",38.0,Chris Paul,39.043102
7,"Irving, Kyrie",37.2,Jimmy Butler,37.785873
8,"Booker, Devin",34.1,Derrick Favors,37.706657
9,"Thomas, Isaiah",34.1,James Harden,37.639923
