#Load in Files

### Imports

In [1]:
spark.version

u'2.2.0'

In [2]:
import re
import datetime
from pyspark.sql.functions import *
import pandas as pd
pd.set_option('display.max_colwidth', 80)

import numpy as np
import seaborn as sns
sns.set_palette("deep", desat=0.6)
sns.set_context(rc={"figure.figsize": (8,4)})

import matplotlib.pyplot as plt
%matplotlib notebook
%matplotlib inline

### Load In NBA Score Data Set

In [12]:
cleaned_dir = '/data2/nba-rt-prediction/sparkfiles/cleanedDF'
df = spark.read.format('csv')\
                    .option("header", "true")\
                    .option("inferSchema", "true")\
                    .option("dateFormat", "yyyy-MM-dd")\
                    .load(cleaned_dir).coalesce(2)

                # For some reason my key is none upon load ! Rebuild
df = df.withColumn("key", concat(date_format(df.dateOrig, "yyyy-MM-dd"),lit("."),col("away_team"),lit("."),col("home_team")))
            
#df.printSchema()
df_pd_X= df.select(["cf1","cf2","home_score", "away_score", "score_diff_amh", "home_team_spread","pct_complete"]).toPandas()
X = df_pd_X.values
df_pd_Y= df.select(["home_win"]).toPandas()
Y = df_pd_Y.values

In [13]:
X
# Create Train / cv or Dev / Test Sets

array([[ 29.35296743,   8.39902472,  86.80487805, ...,   8.96747967,
         11.66666667,  97.91666667],
       [ 23.74256995,   4.84313707,  85.34146341, ...,   8.72357724,
         11.66666667,  96.875     ],
       [ 20.17445863,   3.24555738,  83.87804878, ...,   8.4796748 ,
         11.66666667,  95.83333333],
       ..., 
       [  0.        ,   0.        ,   2.09059233, ...,   0.        ,
          3.5       ,   2.08333333],
       [  0.        ,   0.        ,   1.04529617, ...,   0.        ,
          3.5       ,   1.04166667],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          3.5       ,   0.        ]])

### Use DNN Keras

In [14]:
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.utils import np_utils

In [16]:
train_X, test_X, train_y, test_y = train_test_split(X, Y, train_size=0.7, random_state=0)
lr = LogisticRegressionCV()
lr.fit(train_X, train_y)
pred_y = lr.predict(test_X)
print("Test fraction correct (Accuracy) = {:.2f}".format(lr.score(test_X, test_y)))

Test fraction correct (Accuracy) = 0.82


In [17]:
model = Sequential()
model.add(Dense(6, input_shape=(7,)))
model.add(Activation('sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(16))
model.add(Activation('sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
cb = model.fit(train_X, train_y, verbose=0, batch_size=100, epochs=50)

In [19]:

(loss, accuracy) = model.evaluate(test_X, test_y, verbose=1,)
#print('Test score:', score[0])
#print('Test accuracy:', score[1])



In [21]:
cb.history 

{'acc': [0.63935662219200218,
  0.63822568544463076,
  0.64249811546381286,
  0.65946217742988467,
  0.69062578396510765,
  0.72241769486499452,
  0.76514199562604834,
  0.77883890481653095,
  0.792535809887567,
  0.78864036207973254,
  0.79442070638393447,
  0.79768785734696734,
  0.79429504491780145,
  0.79643126478084936,
  0.79680824056586064,
  0.79454636566301606,
  0.79341542854114944,
  0.79454636522860167,
  0.80057803396305027,
  0.79467202672469428,
  0.79429504872267198,
  0.79819049647058726,
  0.79731087744250972,
  0.7965569187570799,
  0.80271424820867132,
  0.80145764818261034,
  0.80346820308923061,
  0.804599146607474,
  0.80522744045631467,
  0.80598139798829949,
  0.80598140098426052,
  0.80924855119830308,
  0.80937421225998141,
  0.80359386761124396,
  0.80560441920732728,
  0.80623271602216928,
  0.80774063379748384,
  0.80510178011366695,
  0.81063080629412032,
  0.80748931651260425,
  0.80309122474267269,
  0.80673536272557067,
  0.80660969863797172,
  0.81226

In [None]:
### RNN Attempt

In [33]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

rnn_train_X = sequence.pad_sequences(train_X, maxlen=10)
np.shape(rnn_train_X)
np.shape(train_X)


rnn_train_X[2]
#test_X, test_y

array([  0,   0,   0,  -8,   0,  77,  66, -10,  -2,  58], dtype=int32)

### Use DNN - Tflow

In [6]:
#Vector Assembler
#feature_cols  = ["home_score", "away_score", "score_diff_amh", "home_team_spread","pct_complete", "cf1", "cf2"]
feature_cols = ["home_score", "away_score", "score_diff_amh", "home_team_spread","pct_complete"]



In [None]:
## import tensorflow as tf
import numpy as np
import pandas as pd
import math

from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import tensorflow as tf
import numpy as np

def load_csv(filename):
    file = pd.read_csv(filename, header=0)

    # get sample's metadata
    n_samples = int(file.columns[0])
    n_features = int(file.columns[1])

    # divide samples into explanation variables and target variable
    data = np.empty((n_samples, n_features))
    target = np.empty((n_samples,), dtype=np.int)
    for i, row in enumerate(file.itertuples()):
        target[i] = np.asarray(row[-1], dtype=np.int)
        data[i] = np.asarray(row[1:n_features+1], dtype=np.float64)
    return (data, target)

# output train data 
def get_batch_data(x_train, y_train, size=None):
    if size is None:
        size = len(x_train)
    batch_xs = x_train
    batch_ys = []

    # convert to 1-of-N vector
    for i in range(len(y_train)):
        val = np.zeros((CLASS_SIZE), dtype=np.float64)
        val[y_train[i]] = 1.0
        batch_ys.append(val)
    batch_ys = np.asarray(batch_ys)
    return batch_xs[:size], batch_ys[:size]

# output test data
def get_test_data(x_test, y_test):
    batch_ys = []

    # convert to 1-of-N vector
    for i in range(len(y_test)):
        val = np.zeros((CLASS_SIZE), dtype=np.float64)
        val[y_test[i]] = 1.0
        batch_ys.append(val)
    return x_test, np.asarray(batch_ys)

# for parameter initialize
def get_stddev(in_dim, out_dim):
    return 1.3 / math.sqrt(float(in_dim) + float(out_dim))

# DNN Model Class
class Classifier:
    def __init__(self, hidden_units=[10], n_classes=0, data_size = 0):
        self._hidden_units = hidden_units
        self._n_classes = n_classes
        self._data_size = data_size
        self._sess = tf.Session()

    # build model
    def inference(self, x):
        hidden = []

        # Input Layer
        with tf.name_scope("input"):
            weights = tf.Variable(tf.truncated_normal([DATA_SIZE, self._hidden_units[0]], stddev=get_stddev(DATA_SIZE, self._hidden_units[0]), seed=42), name='weights')
            biases = tf.Variable(tf.zeros([self._hidden_units[0]]), name='biases')
            input = tf.matmul(x, weights) + biases

        # Hidden Layers
        for index, num_hidden in enumerate(self._hidden_units):
            if index == len(self._hidden_units) - 1: break
            with tf.name_scope("hidden{}".format(index+1)):
                weights = tf.Variable(tf.truncated_normal([num_hidden, self._hidden_units[index+1]], seed=42, stddev=get_stddev(num_hidden, self._hidden_units[index+1])), name='weights')
                biases = tf.Variable(tf.zeros([self._hidden_units[index+1]]), name='biases')
                inputs = input if index == 0 else hidden[index-1]
                hidden.append(tf.nn.relu(tf.matmul(inputs, weights) + biases, name="hidden{}".format(index+1)))
        
        # Output Layer
        with tf.name_scope('output'):
            weights = tf.Variable(tf.truncated_normal([self._hidden_units[-1], self._n_classes], seed=42, stddev=get_stddev(self._hidden_units[-1], self._n_classes)), name='weights')
            biases = tf.Variable(tf.zeros([self._n_classes]), name='biases')
            logits = tf.nn.softmax(tf.matmul(hidden[-1], weights) + biases)

        return logits

    # loss function
    def loss(self, logits, y):        
        #return -tf.reduce_mean(y * tf.log(logits))
        return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))

    # fitting function for train data
    def fit(self, x_train=None, y_train=None, steps=200):
        # build model
        x = tf.placeholder(tf.float32, [None, DATA_SIZE])
        y = tf.placeholder(tf.float32, [None, CLASS_SIZE])
        logits = self.inference(x)
        loss = self.loss(logits, y)
        train_op = tf.train.AdamOptimizer(0.003).minimize(loss)

        # save variables
        self._x = x
        self._y = y
        self._logits = logits
 
        # init parameters
        #init = tf.initialize_all_variables() 
        init = tf.global_variables_initializer()
        self._sess.run(init)

        # train
        for i in range(steps):
            batch_xs, batch_ys = get_batch_data(x_train, y_train)
            self._sess.run(train_op, feed_dict={x: batch_xs, y: batch_ys})

    # evaluation function for test data
    def evaluate(self, x_test=None, y_test=None):
        x_test, y_test = get_test_data(x_test, y_test)
        
        # build accuracy calculate step
        correct_prediction = tf.equal(tf.argmax(self._logits, 1), tf.argmax(self._y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        # evaluate
        return self._sess.run([accuracy], feed_dict={self._x: x_test, self._y: y_test})

    # label pridiction
    def predict(self, samples):
        predictions = tf.argmax(self._logits, 1)
        return self._sess.run(predictions, {self._x: samples})