### Imports

In [1]:
spark.version
DSX=True

In [13]:
import re
import datetime
from pyspark.sql.functions import *
import pandas as pd
pd.set_option('display.max_colwidth', 80)

import numpy as np
import seaborn as sns
sns.set_palette("deep", desat=0.6)
sns.set_context(rc={"figure.figsize": (8,4)})

import matplotlib.pyplot as plt
%matplotlib notebook
%matplotlib inline

### Load In NBA Score Data Set

In [50]:
cleaned_dir = ''
if(DSX) :
    cleaned_dir = './nba-rt-prediction/sparkfiles/cleanedDF'
else :
    cleaned_dir = '/data2/nba-rt-prediction/sparkfiles/cleanedDF'

df = spark.read.format('csv')\
                    .option("header", "true")\
                    .option("inferSchema", "true")\
                    .option("dateFormat", "yyyy-MM-dd")\
                    .load(cleaned_dir).coalesce(2)

# For some reason my key is none upon load ! Rebuild
df = df.withColumn("key", concat(date_format(df.dateOrig, "yyyy-MM-dd"),lit("."),col("away_team"),lit("."),col("home_team")))
            


In [51]:
from pyspark.sql.functions import sum as sum_, lag, col, coalesce, lit, mean
from pyspark.sql.window import Window

#Define window for rolling average - use last 4 mins (8 data points)
w = Window.partitionBy("key").orderBy("pct_complete")

#score differences = away score momentum
for i in range(0,48,1) :
    df = df.withColumn("min_dly_" + str(i), lag(col("home_score"), i*2,default=0).over(w))

df = df.cache()

In [52]:
f_list = []
for i in range(0,48,1) :
    f_list.append("min_dly_" + str(i))
    
    
f_list2 = ["cf1","cf2","home_score", "away_score", "score_diff_amh", "home_team_spread","pct_complete","overunder"]

featl = list(f_list[:] + f_list2[:])
len(featl)

56

In [53]:
#df.printSchema()
df_pd_X= df.select(featl).toPandas()
df_pd_Y= df.select(["home_win"]).toPandas()

### Sanity Check with simple RF 

In [54]:
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
train_X, test_X, train_y, test_y = train_test_split(df_pd_X, df_pd_Y, train_size=0.7, random_state=0)

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(train_X, train_y)

#RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#            max_depth=2, max_features='auto', max_leaf_nodes=None,
#            min_impurity_decrease=0.0, min_impurity_split=None,
#            min_samples_leaf=1, min_samples_split=2,
#            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
#            oob_score=False, random_state=0, verbose=0, warm_start=False)
print(clf.feature_importances_)
pred_y = clf.predict(test_X)
print("Test fraction correct (Accuracy) = {:.3f}".format(clf.score(test_X, test_y)))


[ 0.02131294  0.          0.07908173  0.01174947  0.          0.          0.
  0.03048018  0.          0.          0.00324764  0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.00769305  0.0075895   0.          0.
  0.          0.          0.          0.0102238   0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.02960714  0.          0.          0.          0.          0.01210472
  0.          0.30453333  0.0253332   0.06680222  0.00761712  0.25083343
  0.082144    0.          0.04964654]
Test fraction correct (Accuracy) = 0.783




### Sanity Check with simple Logistic Reg

In [55]:
train_X.head()

Unnamed: 0,min_dly_0,min_dly_1,min_dly_2,min_dly_3,min_dly_4,min_dly_5,min_dly_6,min_dly_7,min_dly_8,min_dly_9,...,min_dly_46,min_dly_47,cf1,cf2,home_score,away_score,score_diff_amh,home_team_spread,pct_complete,overunder
994,22.615385,20.296296,18.0,15.0,12.6,11.0,9.909091,7.0,4.081081,2.0,...,0.0,0.0,0.349891,0.005337,22.615385,23.230769,0.615385,2.5,22.916667,208.5
7201,47.0,45.666667,42.0,39.0,37.0,34.0,32.0,30.0,27.0,25.944444,...,0.0,0.0,-2.711215,-0.05487,47.0,43.0,-4.0,-4.0,45.833333,203.0
5125,100.0,95.0,91.0,89.0,83.551724,82.0,78.866667,74.736842,72.833333,70.769231,...,3.447038,1.723519,210.0,8360.250582,100.0,121.0,21.0,1.0,100.0,209.0
21144,117.875,115.3125,112.75,110.1875,107.625,105.0625,102.5,99.9375,97.375,94.8125,...,0.0,0.0,-50.160513,-8.069551,117.875,96.791667,-21.083333,4.0,95.833333,217.5
14358,71.935484,70.0,66.0,64.0,62.56,60.32,56.323529,56.0,56.0,53.783784,...,0.0,0.0,7.960298,0.299678,71.935484,79.935484,8.0,-4.5,75.0,201.5


In [21]:
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.utils import np_utils

Using TensorFlow backend.


In [22]:
train_X, test_X, train_y, test_y = train_test_split(X, Y, train_size=0.7, random_state=0)
lr = LogisticRegressionCV()
lr.fit(train_X, train_y)
pred_y = lr.predict(test_X)
print("Test fraction correct (Accuracy) = {:.3f}".format(lr.score(test_X, test_y)))

  y = column_or_1d(y, warn=True)


Test fraction correct (Accuracy) = 0.793


### Use DNN Keras

In [41]:
model = Sequential()
model.add(Dense(56, input_shape=(56,)))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [57]:
train_X_np = train_X.values
train_y_np = train_y.values

cb = model.fit(train_X_np, train_y_np, verbose=0, batch_size=100, epochs=500)

In [59]:
test_X_np = test_X.values
test_y_np = test_y.values
(loss, accuracy) = model.evaluate(test_X_np, test_y_np, verbose=1,)

#print('Test score:', score[0])
#print('Test accuracy:', score[1])



In [60]:
cb.history 

{'acc': [0.81484909898328506,
  0.81670060836039438,
  0.81383077024729289,
  0.81744121285726179,
  0.81563599143640086,
  0.81651545713133689,
  0.81563599060871184,
  0.81424736098169592,
  0.81267357745494606,
  0.81591371756265663,
  0.81383077024729289,
  0.81563599198819359,
  0.81350675644068515,
  0.81855211856519261,
  0.8179503787647594,
  0.81420107026648425,
  0.81859840491020641,
  0.81609886796402498,
  0.80998888919142042,
  0.81526568921003884,
  0.81720977417960516,
  0.81137752125735985,
  0.81225698860767392,
  0.81359933019015462,
  0.81415478377800432,
  0.81452508545257363,
  0.81309016538624235,
  0.80938714773560916,
  0.8158211421578091,
  0.81683946887424008,
  0.81605258166315464,
  0.81508053977982553,
  0.81313645232719223,
  0.81336789422731803,
  0.81406221030443116,
  0.81526568700286817,
  0.81332160741879833,
  0.81572856628945567,
  0.81716348488801849,
  0.81267357607546442,
  0.81577485475335332,
  0.81503425214361691,
  0.81355304200215328,
  0.81

In [89]:
# Merge the results back to the Pandas DF for quick vis
#len(model.predict(test_X))
vis_pd = test_X
vis_pd["quarter"] = np.where(vis_pd['pct_complete'] > 75, 'Q4', np.where(vis_pd['pct_complete'] > 50, 'Q3', np.where(vis_pd['pct_complete'] > 25, 'Q2', 'Q1')))
vis_pd["probability"] = model.predict(test_X_np)
vis_pd["prediction"]  = np.where(vis_pd['probability'] > 0.5, 1.0, 0.0)
vis_pd["label"] = test_y_np
vis_pd["correct"] = np.where(vis_pd['prediction'] == vis_pd['label'], 'yes', 'no')





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [90]:
vis_pd.head(20)

Unnamed: 0,min_dly_0,min_dly_1,min_dly_2,min_dly_3,min_dly_4,min_dly_5,min_dly_6,min_dly_7,min_dly_8,min_dly_9,...,away_score,score_diff_amh,home_team_spread,pct_complete,overunder,probability,label,prediction,correct,quarter
8966,64.703704,61.0,61.0,58.0,57.918919,48.935484,46.878788,45.0,43.25,41.0,...,53.0,-11.703704,-7.5,58.333333,212.5,0.810699,0,1,no,Q3
21948,42.864865,42.0,38.871795,36.774194,34.947368,34.0,34.0,32.0,32.0,27.0,...,39.297297,-3.567568,-4.5,42.708333,220.0,0.7544,1,1,yes,Q2
9055,56.741514,47.371429,45.0,44.142857,43.0,39.0,38.208333,37.0,35.0,31.0,...,49.0,-7.741514,-3.5,50.0,193.0,0.792305,1,1,yes,Q2
26355,61.0,61.0,58.12,54.157895,50.0,49.351852,45.0,42.0,39.0,39.0,...,50.473684,-10.526316,-15.0,59.375,206.0,0.849684,1,1,yes,Q3
11599,75.285714,73.0,69.783784,68.0,65.5625,62.411765,59.0,59.0,57.965517,56.0,...,79.0,3.714286,-2.0,70.833333,195.5,0.582832,0,1,no,Q3
1992,37.0,37.0,36.5,33.142857,31.162162,29.0,25.0,25.0,23.846154,23.0,...,52.0,15.0,5.0,46.875,193.5,0.058643,1,0,no,Q2
4467,85.0,85.0,81.4,79.0,79.0,79.0,77.548387,73.04878,70.818182,67.395349,...,76.949802,-8.050198,-5.0,77.083333,206.0,0.934172,1,1,yes,Q4
25365,21.75,18.72,15.0,15.0,13.0,8.588235,6.3125,2.264151,0.0,0.0,...,18.0,-3.75,-5.5,15.625,207.0,0.760524,1,1,yes,Q1
27958,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,-2.0,5.5,2.083333,217.0,0.524831,0,1,no,Q1
8573,50.751938,49.20155,47.651163,46.100775,44.550388,43.0,43.0,43.0,43.0,41.0625,...,35.751938,-15.0,-6.25,60.416667,205.25,0.999713,1,1,yes,Q3


In [92]:
import brunel
%brunel data('vis_pd') x(quarter) y(correct) bin(correct) color(#count) label(#count) style('symbol:rect; border-radius:15')

<IPython.core.display.Javascript object>

In [None]:
#    Q3
# DL  16.5%
# Log 20.8

### Use DNN - Tflow  (sample .. not tested)

In [6]:
#Vector Assembler
#feature_cols  = ["home_score", "away_score", "score_diff_amh", "home_team_spread","pct_complete", "cf1", "cf2"]
feature_cols = ["home_score", "away_score", "score_diff_amh", "home_team_spread","pct_complete"]



In [45]:
!uname -a


Linux yp-spark-dal09-env5-0044 3.10.0-693.5.2.el7.x86_64 #1 SMP Fri Oct 20 20:32:50 UTC 2017 x86_64 x86_64 x86_64 GNU/Linux


In [None]:
## import tensorflow as tf
import numpy as np
import pandas as pd
import math

from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import tensorflow as tf
import numpy as np

def load_csv(filename):
    file = pd.read_csv(filename, header=0)

    # get sample's metadata
    n_samples = int(file.columns[0])
    n_features = int(file.columns[1])

    # divide samples into explanation variables and target variable
    data = np.empty((n_samples, n_features))
    target = np.empty((n_samples,), dtype=np.int)
    for i, row in enumerate(file.itertuples()):
        target[i] = np.asarray(row[-1], dtype=np.int)
        data[i] = np.asarray(row[1:n_features+1], dtype=np.float64)
    return (data, target)

# output train data 
def get_batch_data(x_train, y_train, size=None):
    if size is None:
        size = len(x_train)
    batch_xs = x_train
    batch_ys = []

    # convert to 1-of-N vector
    for i in range(len(y_train)):
        val = np.zeros((CLASS_SIZE), dtype=np.float64)
        val[y_train[i]] = 1.0
        batch_ys.append(val)
    batch_ys = np.asarray(batch_ys)
    return batch_xs[:size], batch_ys[:size]

# output test data
def get_test_data(x_test, y_test):
    batch_ys = []

    # convert to 1-of-N vector
    for i in range(len(y_test)):
        val = np.zeros((CLASS_SIZE), dtype=np.float64)
        val[y_test[i]] = 1.0
        batch_ys.append(val)
    return x_test, np.asarray(batch_ys)

# for parameter initialize
def get_stddev(in_dim, out_dim):
    return 1.3 / math.sqrt(float(in_dim) + float(out_dim))

# DNN Model Class
class Classifier:
    def __init__(self, hidden_units=[10], n_classes=0, data_size = 0):
        self._hidden_units = hidden_units
        self._n_classes = n_classes
        self._data_size = data_size
        self._sess = tf.Session()

    # build model
    def inference(self, x):
        hidden = []

        # Input Layer
        with tf.name_scope("input"):
            weights = tf.Variable(tf.truncated_normal([DATA_SIZE, self._hidden_units[0]], stddev=get_stddev(DATA_SIZE, self._hidden_units[0]), seed=42), name='weights')
            biases = tf.Variable(tf.zeros([self._hidden_units[0]]), name='biases')
            input = tf.matmul(x, weights) + biases

        # Hidden Layers
        for index, num_hidden in enumerate(self._hidden_units):
            if index == len(self._hidden_units) - 1: break
            with tf.name_scope("hidden{}".format(index+1)):
                weights = tf.Variable(tf.truncated_normal([num_hidden, self._hidden_units[index+1]], seed=42, stddev=get_stddev(num_hidden, self._hidden_units[index+1])), name='weights')
                biases = tf.Variable(tf.zeros([self._hidden_units[index+1]]), name='biases')
                inputs = input if index == 0 else hidden[index-1]
                hidden.append(tf.nn.relu(tf.matmul(inputs, weights) + biases, name="hidden{}".format(index+1)))
        
        # Output Layer
        with tf.name_scope('output'):
            weights = tf.Variable(tf.truncated_normal([self._hidden_units[-1], self._n_classes], seed=42, stddev=get_stddev(self._hidden_units[-1], self._n_classes)), name='weights')
            biases = tf.Variable(tf.zeros([self._n_classes]), name='biases')
            logits = tf.nn.softmax(tf.matmul(hidden[-1], weights) + biases)

        return logits

    # loss function
    def loss(self, logits, y):        
        #return -tf.reduce_mean(y * tf.log(logits))
        return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))

    # fitting function for train data
    def fit(self, x_train=None, y_train=None, steps=200):
        # build model
        x = tf.placeholder(tf.float32, [None, DATA_SIZE])
        y = tf.placeholder(tf.float32, [None, CLASS_SIZE])
        logits = self.inference(x)
        loss = self.loss(logits, y)
        train_op = tf.train.AdamOptimizer(0.003).minimize(loss)

        # save variables
        self._x = x
        self._y = y
        self._logits = logits
 
        # init parameters
        #init = tf.initialize_all_variables() 
        init = tf.global_variables_initializer()
        self._sess.run(init)

        # train
        for i in range(steps):
            batch_xs, batch_ys = get_batch_data(x_train, y_train)
            self._sess.run(train_op, feed_dict={x: batch_xs, y: batch_ys})

    # evaluation function for test data
    def evaluate(self, x_test=None, y_test=None):
        x_test, y_test = get_test_data(x_test, y_test)
        
        # build accuracy calculate step
        correct_prediction = tf.equal(tf.argmax(self._logits, 1), tf.argmax(self._y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        # evaluate
        return self._sess.run([accuracy], feed_dict={self._x: x_test, self._y: y_test})

    # label pridiction
    def predict(self, samples):
        predictions = tf.argmax(self._logits, 1)
        return self._sess.run(predictions, {self._x: samples})