In [61]:
import numpy as np
import pandas as pd
import csv
import pyarrow
import boto3
import datetime
import matplotlib.pyplot as plt
import nba_py
from keras.datasets import imdb
from keras.models import Sequential, Model
from keras.layers import Dropout, Dense, Input, LSTM, GRU, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import keras

In [15]:
raw_scores_2000 = pd.read_parquet('boxscores_raw_2000-01.parquet')
raw_scores_2001 = pd.read_parquet('boxscores_raw_2001-02.parquet')
raw_scores_2002 = pd.read_parquet('boxscores_raw_2002-03.parquet')

In [None]:
def get_team_ids_dict():
    '''
    Get unique team ids dictionary for 1 hot encoding
    '''
    return {nba_py.constants.TEAMS[team]['id']: i for i, team in enumerate(nba_py.constants.TEAMS)}

In [178]:
class team_sequence_generator():
    def __init__(self, input_df, sequence_length=20, process_on_init=False):
        self.raw_df = input_df
        self.df = None
        self.team_dict = get_team_ids_dict()
        self.sequence_data = None
    
        self.four_factor_cols = ['home_game', 'EFG_PCT', 'FTA_RATE', 'TM_TOV_PCT', 'OREB_PCT',
                                 'OPP_EFG_PCT', 'OPP_FTA_RATE', 'OPP_TOV_PCT', 'OPP_OREB_PCT']

        self.feature_cols = ['home_game', 'EFG_PCT', 'FTA_RATE', 'TM_TOV_PCT', 'OREB_PCT',
                             'OPP_EFG_PCT', 'OPP_FTA_RATE', 'OPP_TOV_PCT', 'OPP_OREB_PCT',
                             'OFF_RATING', 'DEF_RATING', 'NET_RATING', 'AST_PCT', 'AST_TOV',
                             'AST_RATIO', 'DREB_PCT', 'REB_PCT', 'TS_PCT', 'USG_PCT', 'PACE', 'PIE',
                             'PTS_OFF_TOV', 'PTS_2ND_CHANCE', 'PTS_FB', 'PTS_PAINT',
                             'OPP_PTS_OFF_TOV', 'OPP_PTS_2ND_CHANCE', 'OPP_PTS_FB', 'OPP_PTS_PAINT',
                             'BLK', 'BLKA', 'PF', 'PFD', 'PCT_FGA_2PT', 'PCT_FGA_3PT', 'PCT_PTS_2PT',
                             'PCT_PTS_2PT_MR', 'PCT_PTS_3PT', 'PCT_PTS_FB', 'PCT_PTS_FT',
                             'PCT_PTS_OFF_TOV', 'PCT_PTS_PAINT', 'PCT_AST_2PM', 'PCT_UAST_2PM',
                             'PCT_AST_3PM', 'PCT_UAST_3PM', 'PCT_AST_FGM', 'PCT_UAST_FGM',
                             'PTS_QTR1', 'PTS_QTR2', 'PTS_QTR3', 'PTS_QTR4',
                             'LARGEST_LEAD', 'LEAD_CHANGES']

        self.info_cols = ['GAME_ID', 'TEAM_ID', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'GAME_DATE_EST', 'PTS', 'ngame']
        
        if process_on_init:
            self.tidy_raw_data()
            self.generate_sequences(self.feature_cols, 20)

    def get_team_ids_dict(self):
        '''
        Get unique team ids dictionary for 1 hot encoding
        '''
        return {nba_py.constants.TEAMS[team]['id']: i for i, team in enumerate(nba_py.constants.TEAMS)}
    
    
    def normalise_input_features(self, input_df, features):
        df = input_df.copy()

        for feature in features:
            feat_max = df[feature].describe()['max']
            feat_mean = df[feature].describe()['mean']
            feat_std = df[feature].describe()['std']

            if feat_max > 1:
                df[feature] = (df[feature] - feat_mean) / feat_std

        return df

    def tidy_raw_data(self):
        # Munge a bit
        df = self.raw_df.copy()
        
        
        df['ngame'] = df[['GAME_ID']].apply(lambda x: int(x.GAME_ID[-4:]), axis=1)
        df['home_game'] = (df.TEAM_ID == df.HOME_TEAM_ID).astype(int)
        
        df = df[self.feature_cols + self.info_cols]
        
        df.loc[:, 'GAME_DATE_EST'] = pd.to_datetime(df['GAME_DATE_EST'], format='%Y-%m-%d')
        df = self.normalise_input_features(df, self.feature_cols)
        
        self.df = df
        return df


    def generate_sequences(self, features, sequence_length=15):
        df = self.df.copy()
        team_dict = get_team_ids_dict()

        team_a_sequences = np.empty((0, sequence_length, len(features)))
        team_b_sequences = np.empty((0, sequence_length, len(features)))
        targets = np.empty(0)
        home_inputs = np.empty(0)
        team_a_index = np.empty(0)
        team_b_index = np.empty(0)

        df.TEAM_ID = df.TEAM_ID.astype(str)
        df.VISITOR_TEAM_ID = df.VISITOR_TEAM_ID.astype(str)
        df.HOME_TEAM_ID = df.HOME_TEAM_ID.astype(str)

        for game in df.ngame:
            home_team, away_team = df.loc[df.ngame == game, ['HOME_TEAM_ID', 'VISITOR_TEAM_ID']].values[0]

            #Keep current game to get target, then remove
            home_df = df.loc[(df.TEAM_ID == home_team) & (df.ngame <= game), :]
            away_df = df.loc[(df.TEAM_ID == away_team) & (df.ngame <= game), :]

            home_win = int(home_df.loc[home_df.ngame == game, 'PTS'].values[0] >
                           away_df.loc[away_df.ngame == game, 'PTS'].values[0])

            home_df = home_df.loc[df.ngame < game, :]
            away_df = away_df.loc[df.ngame < game, :]

            if home_df.shape[0] > sequence_length and away_df.shape[0] > sequence_length:
                #TODO: DO SOME PADDING
                #             home_sequence = np.zeros((sequence_length, four_factors.shape[0]))
                home_sequence = home_df.iloc[:sequence_length, :][features].values
                away_sequence = away_df.iloc[:sequence_length, :][features].values

                # Add Home team as TEAM A
                team_a_sequences = np.append(team_a_sequences, [home_sequence], axis=0)
                team_a_index = np.append(team_a_index, self.team_dict[home_team])
                team_b_sequences = np.append(team_b_sequences, [away_sequence], axis=0)
                team_b_index = np.append(team_b_index, self.team_dict[away_team])
                home_inputs = np.append(home_inputs, [1])

                # Add Away team as TEAM A
                team_a_sequences = np.append(team_a_sequences, [away_sequence], axis=0)
                team_a_index = np.append(team_a_index, self.team_dict[away_team])
                team_b_sequences = np.append(team_b_sequences, [home_sequence], axis=0)
                team_b_index = np.append(team_b_index, self.team_dict[home_team])
                home_inputs = np.append(home_inputs, [0])

                targets = np.append(targets, [home_win] * 2)

        self.sequence_data = {'team_a_sequences': team_a_sequences, 'team_b_sequences': team_b_sequences, 
                              'team_a_index': team_a_index, 'team_b_sequence':team_b_index, 
                              'home_inputs': home_inputs, 'results':targets}
        return self.sequence_data 


y2000_sequences = team_sequence_generator(raw_scores_2000, process_on_init=True)
y2001_sequences = team_sequence_generator(raw_scores_2001, process_on_init=True)
y2002_sequences = team_sequence_generator(raw_scores_2002, process_on_init=True)

In [135]:
Xa = np.concatenate((Xa_2000, Xa_2001, Xa_2002[:1000]))
Xb = np.concatenate((Xb_2000, Xb_2001, Xb_2002[:1000]))
teamida = np.concatenate((team_id_a_2000, team_id_a_2001, team_id_a_2002[:1000]))
teamidb = np.concatenate((team_id_b_2000, team_id_b_2001, team_id_b_2002[:1000]))
home_inputs = np.concatenate((home_inputs_2000, home_inputs_2001, home_inputs_2002[:1000]))
y  = np.concatenate((y_2000, y_2001, y_2002[:1000]))

Xa_test = Xa_2002[1000:]
Xb_test = Xb_2002[1000:]
teamida_test = team_id_a_2002[1000:]
teamidb_test = team_id_b_2002[1000:]
home_inputs_test = home_inputs_2002[1000:]
y_test = y_2002[1000:]

In [30]:
X_comb = np.concatenate((X_2000, X_2001), axis=0)
y_comb = np.concatenate((y_2000, y_2001), axis=0)
home_inputs_comb = np.concatenate((home_inputs_2000, home_inputs_2001), axis=0)

In [140]:
y_test.shape

(1920,)

In [155]:
from keras import backend as K
from keras.layers import Lambda
# def build_model(X_train_a, X_train_b, team_id_train_a, team_id_train_b, home_input_train, y_train,
#                 X_test_a, X_test_b, team_id_train_a, team_id_train_b, home_input_test, y_test):

def initialise_model(sequence_length, feature_length, team_vector_dim):
    np.random.seed(42)
    
    hidden_dimension = team_vector_dim
    
    team_a_index = Input(shape=(1,), name='team_a_id_input')
    team_b_index = Input(shape=(1,), name='team_b_id_input') 
    team_embedding = Embedding(len(get_team_ids_dict())+1, team_vector_dim)

    squeeze = Lambda(lambda x: K.squeeze(x, axis=1))
    team_a_encoded =  squeeze(team_embedding(team_a_index))
    #print(team_embedding(team_a_index).shape)
    team_b_encoded = squeeze(team_embedding(team_b_index))
   
    team_a = Input(shape=(sequence_length, feature_length))
    team_b = Input(shape=(sequence_length, feature_length))
    
    shared_gru = GRU(hidden_dimension)
    
    # When we reuse the same layer instance multiple times, the weights of the layer
    # are also being reused (it is effectively *the same* layer)
    # Also: Initialise hidden state of the Payer level GRU with the player vector
    print(team_a_encoded.shape)
    encoded_a = shared_gru(team_a, initial_state=team_a_encoded)
    encoded_b = shared_gru(team_b, initial_state=team_b_encoded)

    # We can then concatenate the two vectors:
    
    gru_out = keras.layers.concatenate([encoded_a, encoded_b], axis=-1)
#     gru_flat = Flatten()(gru_out)
    
    # Add aux features
    # TODO: Add more aux. features, eg. TSL game, time, games in last week.
    # TODO: ADD TEAM EMBEDDINGS
    home_input = Input(shape=(1,), name='home_input')
    merged_vector = keras.layers.concatenate([gru_out, home_input])

    dense_pre_out = Dropout(0.25)(Dense(128)(merged_vector))
    
    # And add a logistic regression on top
    predictions = Dense(1, activation='sigmoid')(dense_pre_out)

    model = Model(inputs=[team_a, team_b, team_a_index, team_b_index, home_input], outputs=predictions)

    model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
    
    return model

def train_model(model, X_train_a, X_train_b, team_id_train_a, team_id_train_b, home_input_train, y_train,
                X_test_a, X_test_b, team_id_test_a, team_id_test_b, home_input_test, y_test):
    team_id_train_a = team_id_train_a.astype(int)
    team_id_train_b = team_id_train_b.astype(int)
    team_id_test_a = team_id_test_a.astype(int)
    team_id_test_b = team_id_test_b.astype(int)
    
    thing = [X_train_a, X_train_b, team_id_train_a, team_id_train_b, home_input_train,
            X_test_a, X_test_b, team_id_test_a, team_id_test_b, home_input_test, y_test]
    for i in thing:
        print(i.shape)
    model.fit([X_train_a, X_train_b, team_id_train_a, team_id_train_b, home_input_train], 
              y_train, epochs=15, batch_size=2)   
    
    return model

def eval_model(model, X_train_a, X_train_b, team_id_train_a, team_id_train_b, home_input_train, y_train,
                X_test_a, X_test_b, team_id_test_a, team_id_test_b, home_input_test, y_test):
#     print(model.summary())

    scores_train = model.evaluate([X_train_a, X_train_b, team_id_train_a, team_id_train_b,
                             home_input_train], y_train, verbose=0)
    scores = model.evaluate([X_test_a, X_test_b, team_id_test_a, team_id_test_b,
                             home_input_test], y_test, verbose=0)
#     preds_train = model.predict_classes(X_test)

    
    print("Train Accuracy: %.2f%%" % (scores_train[1]*100))
    print("Test Accuracy: %.2f%%" % (scores[1]*100))

    print("Train Loss: %.2f%%" % (scores_train[0]))
    print("Test Loss: %.2f%%" % (scores[0]))
    #     preds = model. ([X_test_a, X_test_b, home_input_test])
    
#     print(np.unique(preds_train))
#     print(np.unique(preds))


    
model = initialise_model(Xa_2000.shape[1], Xa_2000.shape[2], 64)
model = train_model(model, Xa, Xb, teamida, teamidb, home_inputs, y,
                    Xa_test, Xb_test, teamida_test, teamidb_test, home_inputs_test, y_test)

In [156]:
eval_model(model, Xa, Xb, teamida, teamidb, home_inputs, y,
           Xa_test, Xb_test, teamida_test, teamidb_test, home_inputs_test, y_test)

Train Accuracy: 60.32%
Test Accuracy: 62.71%
Train Loss: 0.68%
Test Loss: 0.68%


In [None]:
def train_sequential_basic(X_train, y_train, X_test=None, y_test=None, train_ratio = 0.9):
    numpy.random.seed(12)
    
    if X_test is None:    
        train_ratio = 0.9
        train_max_idx = int(train_ratio*X.shape[0])

        X_test = X_train[train_max_idx:]
        y_test = y[train_max_idx:]
        
        X_train = X_train[:train_max_idx]
        y_train = y[:train_max_idx]
        
    model = Sequential()
    model.add(GRU(100, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Input(input_shape=(X_train.shape[0], 1)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
    print(model.summary())
    model.fit(X_train, y_train, epochs=3, batch_size=32)
    
    # Final evaluation of the model
    scores_train = model.evaluate(X_train, y_train, verbose=0)
    scores = model.evaluate(X_test, y_test, verbose=0)
    preds_train = model.predict_classes(X_test)
    preds = model.predict_classes(X_test)
    
    print(np.unique(preds_train))
    print(np.unique(preds))
    
    print("Train Accuracy: %.2f%%" % (scores_train[1]*100))
    print("Test Accuracy: %.2f%%" % (scores[1]*100))

train_sequential(X_2000, y_2000, X_2001, y_2001)

In [183]:
np.unique(model.predict_classes(X_test))

array([1])

In [248]:
np.bincount(y_test.astype(int))#/y_test.shape[0]

array([ 728, 1014], dtype=int64)

In [226]:
def get_sin_cos_sequences(time_steps=20, count_rows=500, f1=1, f2=1.5, a1=1, a2=0.8):
    """
    Generate some dummy data for testing sequence to sequence clustering
    - Sine and cosine curves, with parameters provided as function inputs
    """
    
    x = np.arange(time_steps)
    
    _, X = np.mgrid[0:count_rows//2, 0:time_steps:1]
    targets = np.zeros(count_rows)
    targets[:count_rows//2] = 1
    rands1 = f1 * 2 * np.pi * np.random.rand(count_rows//2, 1)
    rands2 = f2 * 2 * np.pi * np.random.rand(count_rows//2, 1)
    
    feature_length = 2
    
    # sequences1a shape: 250, 50
    sequences1a =   a1 * np.sin(f1*X*rands1/time_steps)
    sequences1b =   np.random.rand(1) * a1 * np.sin(f1*X*rands1/time_steps)
    
    # Want sequences to be 250, 50, 2
    sequences1 = np.concatenate(([np.ravel(sequences1a)], [np.ravel(sequences1b)]), axis=0) 
    print(sequences1.shape)
    sequences1 = sequences1.T.reshape(count_rows//2, time_steps, feature_length)
    print(sequences1.shape)

    sequences2a =   a2 * np.cos(f2*X*rands2/time_steps)
    sequences2b =   np.random.rand(1) * a2* np.cos(f2*X*rands2/time_steps)
    sequences2 = np.concatenate(([np.ravel(sequences2a)], [np.ravel(sequences2b)]), axis=0) 
    sequences2 = sequences2.T.reshape(count_rows//2, time_steps, feature_length)
    
    sequences = np.concatenate((sequences1, sequences2), axis=0)
    print(sequences.shape)
    
#     for i in range(count_rows//2):
#         plt.plot(x, sequences1[i], alpha=0.05)
#         plt.plot(x, sequences1b[i], alpha=0.05)
#         plt.plot(x, sequences2[i], alpha=0.05)

#     plt.xlabel('sample(n)')
#     plt.ylabel('voltage(V)')
#     plt.show()
    
#     sequences = np.vstack([sequences1, sequences2])
    return sequences, targets #sequences.reshape(count_rows, time_steps, 1)
#     
X_dummy, y_dummy = get_sin_cos_sequences()

(2, 5000)
(250, 20, 2)
(500, 20, 2)


In [244]:
numpy.random.seed(12)

train_ratio = 0.9
train_max_idx = int(train_ratio*X.shape[0])

# X_train = X[:train_max_idx]
# y_train = y[:train_max_idx]
# X_test = X[train_max_idx:]
# y_test = y[train_max_idx:]
print(X.shape)

def train_sequential_dummy(X, y):
    model = Sequential()
    model.add(GRU(100, input_shape=(X.shape[1],X.shape[2])))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
    print(model.summary())
    model.fit(X, y, epochs=3, batch_size=32)
    # Final evaluation of the model
    scores = model.evaluate(X, y, verbose=0)
    preds = model.predict(X)
    print("Accuracy: %.2f%%" % (scores[1]*100))
    
train_sequential_dummy(X_dummy, y_dummy)

(1754, 20, 5)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_26 (GRU)                 (None, 100)               30900     
_________________________________________________________________
dense_40 (Dense)             (None, 1)                 101       
Total params: 31,001
Trainable params: 31,001
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 100.00%
