In [1]:
from __future__ import absolute_import, division, print_function
import matplotlib.pyplot as plt  
import seaborn as seabornInstance 
from urllib.request import urlopen
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVC
import numpy as np
import pandas as pd
import os
import pickle
import pdb
import requests
import re
from IPython.display import display, HTML
from bs4 import BeautifulSoup, Comment
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras
%matplotlib inline

In [2]:
NBA_URL = 'https://www.basketball-reference.com'
TEAMS = [
    #Initials, Name
    ('ATL', 'Hawks'),
    ('BOS', 'Celtics'),
    ('BRK', 'Nets'),
    ('CHI', 'Bulls'),
    ('CHO', 'Hornets'),
    ('CLE', 'Cavaliers'),
    ('DAL', 'Mavericks'),
    ('DEN', 'Nuggets'),
    ('DET', 'Pistons'),
    ('GSW', 'Warriors'),
    ('HOU', 'Rockets'),
    ('IND', 'Pacers'),
    ('LAC', 'Clippers'),
    ('LAL', 'Lakers'),
    ('MEM', 'Grizzlies'),
    ('MIA', 'Heat'),
    ('MIL', 'Bucks'),
    ('MIN', 'Timberwolves'),
    ('NOP', 'Pelicans'),
    ('NYK', 'Knicks'),
    ('OKC', 'Thunder'),
    ('ORL', 'Magic'),
    ('PHI', '76ers'),
    ('PHO', 'Suns'),
    ('POR', 'Trailblazers'),
    ('SAC', 'Kings'),
    ('SAS', 'Spurs'),
    ('TOR', 'Raptors'),
    ('UTA', 'Jazz'),
    ('WAS', 'Wizards')
]
TEAM_NAMES = dict()

def get_season_data_per_poss(year):
    with (open("Data/" + str(year) + "/all_team-stats-per_poss.pkl", "rb")) as openfile:
        data = pickle.load(openfile)
    with (open("Data/" + str(year) + "/all_opponent-stats-per_poss.pkl", "rb")) as openfile:
        opp_data = pickle.load(openfile)
    return data, opp_data

def get_season_data(year):
    with (open("Data/" + str(year) + "/all_team-stats-base.pkl", "rb")) as openfile:
        data = pickle.load(openfile)
    with (open("Data/" + str(year) + "/all_opponent-stats-base.pkl", "rb")) as openfile:
        opp_data = pickle.load(openfile)
    return data, opp_data

    #ratio, wins = convert_data_to_ratio(data)
def convert_data_to_ratio(data):
    wins = data[data.columns[-1]]
    data = data.drop(labels='Win', axis=1)
    columns = data.columns
    for column in columns:
        if '%' in column:
            data = data.drop(column, axis=1)
    away = data[data.columns[:len(data.columns)//2]]
    home = data[data.columns[len(data.columns)//2:]]
    ratio = []
    for index, row in away.iterrows():
        away_row = np.array(away.iloc[index], dtype=np.float)
        home_row = np.array(home.iloc[index], dtype=np.float)
        home_row[home_row == 0] = 1
        ratio.append(away_row / home_row)
    assert(len(ratio) == len(wins))
    return ratio, wins
def get_team_names(year):
    rtn = []
    standings_soup = "https://www.basketball-reference.com/leagues/NBA_" + str(year) + "_standings.html"
    standings_soup = BeautifulSoup(urlopen(standings_soup), 'html.parser')
    comment = standings_soup.find("div",  id='all_expanded_standings')\
            .findAll(text=lambda text:isinstance(text, Comment))
    extracted_comment = comment[0].extract()
    standings_soup = BeautifulSoup(extracted_comment)
    rows = standings_soup.find('tbody').findAll("tr")
    for row in rows:
        rtn.append(row.find("td", attrs={'data-stat': "team_name"}).getText())
    return sorted(rtn)
    
    
# training data: previous year's stats + current season stats
def get_team_cumulative_stats(year):
    team_stats = dict()
    for initials, name in TEAMS:
        with (open("Data/" + str(year-1) + "/" + str(initials) + \
                   "_season_game_basic_cumulative_stats.pkl", "rb")) as openfile:    
            team_stats[initials] = pickle.load(openfile)
    return team_stats
def get_total_games(team_stats):
    num_games = 0
    for key in team_stats:
        num_games +=  team_stats[key].shape[0]
    return num_games
    
#create training data
def get_training_data(year):
    training_data = []
    results = []
    team_names = get_team_names(year)
    assert(team_names == get_team_names(year-1))
    
    team_cumulative_stats = get_team_cumulative_stats(year)
    season_data, season_opp_data = get_season_data_per_poss(year-1)
    season_data_dict = convert_season_data_to_dict(season_data, team_names)
    season_opp_data_dict = convert_season_data_to_dict(season_opp_data, team_names)
    for key in season_data_dict.keys():
        assert(key in season_opp_data_dict)
        season_data_dict[key] = np.concatenate((season_data_dict[key], season_opp_data_dict[key]), axis=None)
    sample = "https://www.basketball-reference.com/leagues/NBA_2019_games.html"
    year_soup = BeautifulSoup(urlopen(sample), 'html.parser')
    months = year_soup.find("div", attrs={'class': 'filter'})
    months = months.findAll("a")
    months = [NBA_URL + month['href'] for month in months]
    
    for month in months:
        month_soup = BeautifulSoup(urlopen(month), 'html.parser')
        games = month_soup.find("table", attrs={'id': 'schedule'}).find('tbody').findAll("tr")
        for game in games:
            game_data = []
            win = 0
            if game.find('td') and game.find('td', attrs={'data-stat':'visitor_pts'}) \
                and game.find('td', attrs={'data-stat':'visitor_pts'}).getText():
                away = game.find('td', attrs={'data-stat':'visitor_team_name'}).getText()
                home = game.find('td', attrs={'data-stat':'home_team_name'}).getText()
                away_pts = game.find('td', attrs={'data-stat':'visitor_pts'}).getText()
                away_pts = int(away_pts)
                home_pts = game.find('td', attrs={'data-stat':'home_pts'}).getText()
                home_pts = int(home_pts)
                game_data.extend(season_data_dict[away])
                game_data.extend(season_data_dict[home])
                assert(home_pts != away_pts)
                if home_pts > away_pts:
                    win = 1
                results.append(win)
                training_data.append(game_data)
    assert(len(training_data) == len(results))
    return training_data, results
            
                
                
                
def convert_season_data_to_dict(pandas_frame, team_names):
    pandas_frame.drop([column for column in list(pandas_frame) if '%' in column], inplace=True, axis=1)
    pandas_frame.drop([column for column in list(pandas_frame) if 'FG' in column], inplace=True, axis=1)
    pandas_frame.drop([column for column in list(pandas_frame) if 'PTS' in column], inplace=True, axis=1)
    output = dict()
    count = 0
    for team in pandas_frame['Team']:
        for team_name in team_names:
            if team_name in team:
                row = pandas_frame.iloc[count, 3:]
                output[team_name] = row.values
        count += 1
    return output


In [3]:
#get_training_data()
data, results = get_training_data(2018)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [50]:
data = np.array(data).astype(float)
results = np.array(results).astype(float)
print (data.shape)

(1307, 60)


In [55]:
tf.reset_default_graph()
X = tf.placeholder(shape=(None, data.shape[1]),  dtype=tf.float64, name='X')
y = tf.placeholder(shape=(None,2), dtype=tf.float64, name='y')
print(data.shape[1]/2)
W1 = tf.Variable(np.random.rand(data.shape[1], int(data.shape[1]/2)), dtype=tf.float64)
W2 = tf.Variable(np.random.rand(int(data.shape[1]/2), 2), dtype=tf.float64)
b1 = tf.Variable(np.random.rand(int(data.shape[1]/2)))
bo = tf.Variable(np.random.rand(2))
A1 = tf.sigmoid(tf.add(tf.matmul(X,W1), b1))
keep_prob = tf.placeholder("float")
#A1 = tf.nn.dropout(A1, keep_prob)
print(A1.shape, W2.shape, bo.shape)
predictions = tf.sigmoid(tf.add(tf.matmul(A1, W2), bo))

#deltas = tf.square(y_est - y)
#loss = tf.reduce_sum(deltas)


#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=predictions, labels=y))
#optimizer = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(cost)
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y * tf.log(predictions),reduction_indices=[1]))
optimizer = tf.train.GradientDescentOptimizer(0.005).minimize(cross_entropy)
training_epochs = 5000

train_data, test_data, train_results, test_results = train_test_split(data, results, test_size=0.1) 
print(train_results.shape)
train_result = []
for i in train_results:
    if i == 0:
        train_result.append([1,0])
    else:
        train_result.append([0,1])
test_result = []
for i in test_results:
    if i == 0:
        test_result.append([1,0])
    else:
        test_result.append([0,1])
        
total_batch = 10

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(training_epochs):
        avg_cost = 0.0
        total_batch = int(len(train_data)/ total_batch)
        train_data_batches = np.array_split(train_data, total_batch)
        train_result_batches = np.array_split(train_result, total_batch)
        for i in range(total_batch):
            batch_x, batch_y = train_data_batches[i], train_result_batches[i]
            _, c = sess.run([optimizer, cross_entropy], 
                            feed_dict={
                                X: batch_x, 
                                y: batch_y, 
                                keep_prob: 0.8
                            })
            avg_cost += c/ total_batch
        if epoch % 1000 == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", \
                "{:.9f}".format(avg_cost))
    correct_prediction = tf.equal(tf.argmax(predictions, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy:", accuracy.eval({X: test_data, y: test_result, keep_prob: 1.0}))
        
        

30.0
(?, 30) (30, 2) (2,)
(1176,)
Epoch: 0001 cost= 0.000000106
Epoch: 1001 cost= 0.000000106
Epoch: 2001 cost= 0.000000106
Epoch: 3001 cost= 0.000000106
Epoch: 4001 cost= 0.000000106
Accuracy: 0.57251906


In [29]:
kFold = KFold(n_splits=5, shuffle=True, random_state=None)
linear_results = []
polynomial_results = []
logistic_results = []
svm_linear_results = []
svm_polynomial_results = []
svm_rbf_results = []
for train_index, test_index in kFold.split(data):
    regression_model = LinearRegression()
    train_data, train_result = data[train_index], results[train_index]
    test_data, test_result = data[test_index], results[test_index]
    regression_model.fit(train_data, train_result)
    test_prediction = regression_model.predict(test_data)

    total = len(test_prediction)
    correct = 0
    for i in range(len(test_prediction)):
        if test_prediction[i] > 0.5:
            pred = 1
        else:
            pred = 0
        if pred == test_result[i]:
            correct += 1
    linear_results.append(float(correct)/ total)
    rmse = mean_squared_error(test_result, test_prediction)
    r2 = r2_score(test_result, test_prediction)
    
    
    polynomial_features = PolynomialFeatures(degree=2)
    train_data_poly = polynomial_features.fit_transform(train_data)
    test_data_poly = polynomial_features.fit_transform(test_data)
    polynomial_model = LinearRegression()
    polynomial_model.fit(train_data_poly,train_result)
    polynomial_test_prediction = polynomial_model.predict(test_data_poly)
    total = len(polynomial_test_prediction)
    correct = 0
    for i in range(len(polynomial_test_prediction)):
        if polynomial_test_prediction[i] > 0.5:
            pred = 1
        else:
            pred = 0
        if pred == test_result[i]:
            correct += 1
    polynomial_results.append(float(correct)/ total)
    
    rsme_polynomial = mean_squared_error(test_result, polynomial_test_prediction)
    r2_polynomial = r2_score(test_result, polynomial_test_prediction)
    
# model evaluation
    logistic_model = LogisticRegression()
    logistic_model.fit(train_data, train_result)
    logistic_test_prediction = logistic_model.predict(test_data)
    test_prediction = logistic_test_prediction
    
    total = len(test_prediction)
    correct = 0
    for i in range(len(test_prediction)):
        if test_prediction[i] > 0.5:
            pred = 1
        else:
            pred = 0
        if pred == test_result[i]:
            correct += 1
    logistic_results.append(float(correct)/ total)
    rmse_logistic = mean_squared_error(test_result, test_prediction)
    r2_logistic = r2_score(test_result, test_prediction)
    
    svm_linear_model = SVC(kernel='linear')
    svm_linear_model.fit(train_data, train_result)
    test_prediction = svm_linear_model.predict(test_data)
    total = len(test_prediction)
    correct = 0
    for i in range(len(test_prediction)):
        if test_prediction[i] > 0.5:
            pred = 1
        else:
            pred = 0
        if pred == test_result[i]:
            correct += 1
    svm_linear_results.append(float(correct)/total)
'''
    svm_polynomial_model = SVC(kernel='poly', degree = 2)
    svm_polynomial_model.fit(train_data, train_result)
    test_prediction = svm_polynomial_model.predict(test_data)
    total = len(test_prediction)
    correct = 0
    for i in range(len(test_prediction)):
        if test_prediction[i] > 0.5:
            pred = 1
        else:
            pred = 0
        if pred == test_result[i]:
            correct += 1
    svm_polynomial_results.append(float(correct)/total)
'''
    svm_rbf_model = SVC(kernel='rbf')
    svm_rbf_model.fit(train_data, train_result)
    test_prediction = svm_rbf_model.predict(test_data)
    total = len(test_prediction)
    correct = 0
    for i in range(len(test_prediction)):
        if test_prediction[i] > 0.5:
            pred = 1
        else:
            pred = 0
        if pred == test_result[i]:
            correct += 1
    svm_rbf_results.append(float(correct)/total)
    

# printing values
    #print('Slope:' ,regression_model.coef_)
    #print('Intercept:', regression_model.intercept_)
print("Linear results: ", np.mean(linear_results))
#print('Root mean squared error: ', rmse)
#print('R2 score: ', r2)

print("Polynomial results: ", np.mean(polynomial_results))
#print('Polynomial Root mean squared error: ', rsme_polynomial)
#print('Polynomial R2 score: ', r2_polynomial)

print("Logistic results: ", np.mean(logistic_results))
#print('Logistic Root mean squared error: ', rmse_logistic)
#print('Logistic R2 score: ', r2_logistic)

print("SVM linear results: ", np.mean(svm_linear_results))
print("SVM polynomial results: ", np.mean(svm_polynomial_results))
print("SVM rbf results: ", np.mean(svm_rbf_results))




Linear results:  0.6656693281857798
Polynomial results:  0.5677371238045099
Logistic results:  0.6427159193939926
SVM linear results:  0.6411570296276798
SVM polynomial results:  0.6434734286800621
SVM rbf results:  0.6243221900500131


