## Data Modeling and Evaluation
<img src="https://github.com/dustinvanstee/nba-rt-prediction/raw/master/pngs/model_eval_method.png" width="800" height="500" align="middle"/>


## Imports

In [None]:
# Bare reference to the Spark session: raises NameError early if `spark` is not defined.
# The value is not displayed because this is not the last statement of the cell.
spark.version
# Environment switch: when truthy the cleaned data is read from the relative
# ./nba-rt-prediction path, otherwise from the absolute /data2 path.
DSX=True

In [None]:
import re
import datetime
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType,DateType
import pandas as pd
pd.set_option('display.max_colwidth', 80)
pd.set_option('display.max_columns', 70)


from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import Bucketizer

import numpy as np
import seaborn as sns
sns.set_palette("deep", desat=0.6)
sns.set_context(rc={"figure.figsize": (8,4)})

import matplotlib.pyplot as plt
%matplotlib notebook
%matplotlib inline

## Load In NBA Score Data Set

In [3]:
# Pick the input location: relative path when running on DSX, absolute path otherwise.
if DSX:
    cleaned_dir = './nba-rt-prediction/sparkfiles/cleanedDF'
else:
    cleaned_dir = '/data2/nba-rt-prediction/sparkfiles/cleanedDF'

csv_reader = (spark.read.format('csv')
              .option("header", "true")
              .option("inferSchema", "true")
              .option("dateFormat", "yyyy-MM-dd"))
df = csv_reader.load(cleaned_dir).repartition(2)

# The key column comes back empty after the load, so rebuild it as
# <yyyy-MM-dd>.<away_team>.<home_team>.
key_parts = [date_format(df.dateOrig, "yyyy-MM-dd"), lit("."), col("away_team"), lit("."), col("home_team")]
df = df.withColumn("key", concat(*key_parts))

sides = ("home", "away")
pct_left = col("pct_left")
pct_complete = col("pct_complete")

# New target variables: points each team still scores after this snapshot.
for side in sides:
    df = df.withColumn("label_%s_pts_lts" % side, col("final_%s_score" % side) - (col("%s_score" % side)))

# Projection of the remaining points from the betting line (over/under and spread),
# scaled by the share of the game that is left.
for side in sides:
    vegas_final = col("overunder") / 2 - col("%s_team_spread" % side) / 2
    df = df.withColumn("%s_partial_proj" % side, (pct_left / 100) * vegas_final)

# Projection of the remaining points from the scoring rate of the game so far.
# The 0.01 keeps the denominator non-zero at the start of the game.
for side in sides:
    rate_so_far = col("%s_score" % side) / (pct_complete + 0.01)
    df = df.withColumn("%s_partial_proj2" % side, rate_so_far * pct_left)

# The rate-based projection blows up early in the game, so fall back to the
# betting-line projection while less than 33% of the game has been played.
for side in sides:
    clipped = when(pct_complete < 33, col("%s_partial_proj" % side)).otherwise(col("%s_partial_proj2" % side))
    df = df.withColumn("%s_partial_proj2_clip" % side, clipped)

# Scale the scoring over the last N minutes up to the rest of the game.
# N minutes expressed as a percentage of a 48-minute game: 3 -> 6.25, 6 -> 12.5, ...
window_pct = [(3, 6.25), (6, 12.5), (9, 18.75), (12, 25.0)]

# Score-margin windows exist for 3, 6 and 9 minutes only.
for minutes, pct in window_pct[:3]:
    df = df.withColumn("asm_%dmin_partial_proj" % minutes,
                       pct_left * (col("asm_%dmin" % minutes) / lit(pct)))

# Per-team windows exist for 3, 6, 9 and 12 minutes (home columns first, then away).
for side in sides:
    for minutes, pct in window_pct:
        df = df.withColumn("%s_%dmin_partial_proj" % (side, minutes),
                           pct_left * (col("%s_%dmin_abs" % (side, minutes)) / lit(pct)))

df = df.cache()

# Show a handful of rows by sampling roughly 0.1% of the data.
junk, showdata = df.randomSplit([0.999, 0.001], seed = 82)
showdata.toPandas().head(5)

Unnamed: 0,dateOrig,ts,away_team_full,away_score,home_team_full,home_score,timestring,timeleft,away_team,home_team,score_diff_amh,date,time_elapsed,pct_complete,pct_left,pct_complete_inv,pct_left_inv,home_points_per_pct,away_points_per_pct,FNL,Q4,Q3,Q2,Q1,quarter,home_3min_abs,home_6min_abs,home_9min_abs,home_12min_abs,away_3min_abs,away_6min_abs,away_9min_abs,away_12min_abs,amh_3min,asm_3min,...,asm_6min_partial_proj,asm_9min_partial_proj,cf1,cf2,away_team_spread,overunder,away_team_ml,home_team_spread,home_team_ml,dateStr,away_team_vegas_fscore,home_team_vegas_fscore,final_away_score,final_home_score,away_win,home_win,dfa,outlier,key,label_home_pts_lts,label_away_pts_lts,home_partial_proj,away_partial_proj,home_partial_proj2,away_partial_proj2,home_partial_proj2_clip,away_partial_proj2_clip,home_3min_partial_proj,home_6min_partial_proj,home_9min_partial_proj,home_12min_partial_proj,away_3min_partial_proj,away_6min_partial_proj,away_9min_partial_proj,away_12min_partial_proj
0,2016-04-05,21:21:08,Phoenix,56.0,Atlanta,45.887399,(10:48 IN 3RD),23.5,pho,atl,10.112601,2016-04-05,24.5,51.041667,48.958333,0.019591,0.020425,0.899001,1.097121,0,0,1,0,0,Q3,3.725909,8.887399,14.357988,17.887399,1.838509,10.0,14.705882,17.031944,12.0,-1.887399,...,4.357685,-2.233688,7.207979,0.158196,14.5,207.5,-110,-14.5,-110.0,2016-04-05,96.5,111.0,90,103,0,1,-3.376983,0,2016-04-05.pho.atl,57.112601,34.0,54.34375,47.244792,44.005823,53.703764,44.005823,53.703764,29.186285,34.808981,37.490301,35.029491,14.401656,39.166667,38.398693,33.354225
1,2016-04-06,21:58:19,New Orleans,95.0,Boston,102.666667,(0:29 IN 4TH),0.5,nor,bos,-7.666667,2016-04-06,47.5,98.958333,1.041667,0.010105,0.959079,1.037463,0.95999,0,1,0,0,0,Q4,11.666667,16.666667,21.041667,26.666667,6.0,9.481013,16.0,22.0,-2.0,-5.666667,...,-0.598805,-0.259259,-33.728836,-17.464692,14.25,208.25,-110,-14.25,-110.0,2016-04-06,97.0,111.25,97,104,0,1,1.239583,0,2016-04-06.nor.bos,1.333333,2.0,1.158854,1.010417,1.080593,0.999899,1.080593,0.999899,1.944444,1.388889,1.168981,1.111111,1.0,0.790084,0.888889,0.916667
2,2016-04-07,20:41:11,Phoenix,29.0,Houston,39.625,(10:38 IN 2ND),35.0,pho,hou,-10.625,2016-04-07,13.0,27.083333,72.916667,0.036922,0.013714,1.463023,1.07073,0,0,0,1,0,Q2,9.760747,18.720023,27.679299,36.638575,7.280543,13.79638,20.312217,26.828054,-8.144796,-2.480204,...,-28.721248,-38.152024,-6.210725,-0.099046,12.0,213.5,-110,-12.0,-110.0,2016-04-07,100.75,112.75,124,115,1,0,-3.895833,0,2016-04-07.pho.hou,75.375,95.0,82.213542,73.463542,106.643316,78.048105,82.213542,73.463542,113.875377,109.200132,107.641717,106.862509,84.939668,80.478884,78.991956,78.248492
3,2016-04-13,23:36:15,LA Clippers,55.0,Phoenix,62.0,(0:07 IN 2ND),24.5,lac,pho,-7.0,2016-04-13,23.5,48.958333,51.041667,0.020425,0.019591,1.266357,1.123381,0,0,0,1,0,Q2,7.555556,11.0,15.177778,19.0,0.0,6.529412,11.177778,13.076923,0.555556,-7.555556,...,-18.254902,-16.123932,-4.887026,-0.103732,5.0,206.0,170,-5.0,-200.0,2016-04-13,100.5,105.5,105,114,0,1,-9.78125,0,2016-04-13.lac.pho,52.0,50.0,53.848958,51.296875,64.625098,57.328716,64.625098,57.328716,61.703704,44.916667,41.317284,38.791667,0.0,26.661765,30.428395,26.698718
4,2016-04-21,19:47:06,Toronto,8.918919,Indiana,9.0,(8:10 IN 1ST),44.5,tor,ind,-0.081081,2016-04-21,3.5,7.291667,92.708333,0.137124,0.010786,1.234116,1.222998,0,0,0,0,1,Q1,9.0,9.0,9.0,9.0,8.918919,8.918919,8.918919,8.918919,0.0,-0.081081,...,-0.601351,-0.400901,-0.042048,-0.000553,-1.333333,193.0,-115,1.333333,-38.333333,2016-04-21,97.166667,95.833333,101,85,1,0,-4.356419,0,2016-04-21.tor.ind,76.0,92.081081,88.845486,90.081597,114.271856,113.24238,88.845486,90.081597,133.5,66.75,44.5,33.375,132.297297,66.148649,44.099099,33.074324


### Create Train / cv or Dev / Test Sets
#### What is unique here is that whole games, not individual in-game snapshots, must be randomized across the splits, because snapshots from the same game are correlated!

In [4]:
# Split by game, not by row: all snapshots of one game land on the same side of
# the split, so correlated rows of a single game cannot leak from train to test.
ddf = df.select(['key']).distinct()
trainingGames, testGames = ddf.randomSplit([0.7, 0.3], seed = 82)


def _rows_for_games(games):
    """Return the cached rows of `df` whose game key appears in `games`."""
    matched = df.join(games, df["key"] == games["key"], "inner")
    # Both join inputs carry a `key` column; keep a single one.
    return matched.drop(games["key"]).cache()


trainingData = _rows_for_games(trainingGames)
testData = _rows_for_games(testGames)

# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print("Training set size: " + str(trainingData.count()))
print("Testing set size: " + str(testData.count()))


Training set size: 20982
Testing set size: 9882


### Linear Regression to Predict Score for Home and Away Team

In [5]:
def linreg_train_and_test(f_cols,labelCol,trainingData,testData):
    """Cross-validate a linear-regression pipeline and score it on the test set.

    Parameters
    ----------
    f_cols : list of str
        Names of the columns assembled into the feature vector.
    labelCol : str
        Column that is cast to double and copied into ``label``.
    trainingData, testData : Spark DataFrame
        Frames containing ``f_cols`` and ``labelCol``.

    Returns
    -------
    tuple
        ``(mse, predictions, cv_model)``: the mean squared error on
        ``testData``, the transformed test frame, and the fitted
        cross-validator model.
    """
    # Both frames get the target copied into a double-typed `label` column.
    trainingData = trainingData.withColumn('label', trainingData[labelCol].cast(DoubleType()))
    testData = testData.withColumn('label', testData[labelCol].cast(DoubleType()))

    assembler = VectorAssembler(inputCols=f_cols, outputCol="features")
    # degree=1 and a scaler with both flags off are configured as pass-through
    # stages here; they are kept so the stage layout stays fixed for callers
    # that index into the fitted pipeline.
    polyExpansion = PolynomialExpansion(degree=1, inputCol="features", outputCol="polyFeatures")
    scaler = StandardScaler(withMean=False, withStd=False, inputCol="polyFeatures", outputCol="features_scaled")
    # Linear regression without an intercept term
    linreg = LinearRegression(labelCol='label',featuresCol="features_scaled",fitIntercept=False)

    pipeline = Pipeline(stages=[assembler,polyExpansion,scaler, linreg])

    # Only the regularisation strength is searched; the penalty mix and the
    # iteration cap are single-valued grid entries.
    paramGrid = (ParamGridBuilder()
             .addGrid(linreg.regParam, [0.01,0.1,1.0])
             .addGrid(linreg.elasticNetParam, [0.0])
             .addGrid(linreg.maxIter, [200])
             .build())

    linreg_evaluator = RegressionEvaluator(labelCol='label',metricName="mse")

    # 2-fold cross validation over the whole pipeline.
    # NOTE(review): folds are drawn per row, so snapshots of one game can fall
    # into different folds -- confirm this is acceptable for model selection.
    linreg_cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=linreg_evaluator, numFolds=2)

    linregCvModel = linreg_cv.fit(trainingData)

    predictions = linregCvModel.transform(testData)

    linreg_mse = linreg_evaluator.evaluate(predictions)

    # Parenthesised single-argument print behaves the same on Python 2 and Python 3.
    print("MSE for Linear Regression: " + str(linreg_mse))

    return (linreg_mse,predictions,linregCvModel)

### Run and Evaluate Linear Regression Tests

In [6]:
# Results keyed by 'home_model<i>' / 'away_model<i>'; each value is the tuple
# returned by linreg_train_and_test.
linreg_results_dict = {}

# Candidate feature sets for the home model, from simplest to richest.
home_feature_cols = [
    ["home_score", "pct_complete"],
    ["home_score", "pct_complete", "overunder", "home_team_spread"],
    ["home_partial_proj"],
    ["home_partial_proj2_clip"],
    ["home_partial_proj", "home_partial_proj2_clip"],
    ["home_partial_proj", "home_partial_proj2_clip","home_6min_partial_proj"],
    ["home_partial_proj", "home_partial_proj2_clip","home_6min_partial_proj","asm_3min_partial_proj", "asm_6min_partial_proj", "asm_9min_partial_proj"],
    ["home_partial_proj", "home_partial_proj2_clip","home_3min_partial_proj","home_6min_partial_proj","home_9min_partial_proj","home_12min_partial_proj"],
    ["home_partial_proj", "home_partial_proj2_clip","home_3min_partial_proj","home_6min_partial_proj","home_9min_partial_proj","home_12min_partial_proj","asm_3min_partial_proj", "asm_6min_partial_proj", "asm_9min_partial_proj"],
]

# The away feature sets mirror the home ones; columns without 'home' in their
# name (the asm_* margins) are shared by both.
away_feature_cols = [
    [name.replace('home','away') for name in feature_set]
    for feature_set in home_feature_cols
]

# Only experiment 6 is run here; use range(0,9) to run every feature set.
for i in range(6,7) :
    print("Iteration " + str(i) + ": Home features " + str(home_feature_cols[i]))
    linreg_results_dict['home_model'+str(i)] = linreg_train_and_test(home_feature_cols[i],"label_home_pts_lts",trainingData,testData)
    # Message corrected: it previously read "Away Home features".
    print("Iteration " + str(i) + ": Away features " + str(away_feature_cols[i]))
    linreg_results_dict['away_model'+str(i)] = linreg_train_and_test(away_feature_cols[i],"label_away_pts_lts",trainingData,testData)


Iteration 6: Home features ['home_partial_proj', 'home_partial_proj2_clip', 'home_6min_partial_proj', 'asm_3min_partial_proj', 'asm_6min_partial_proj', 'asm_9min_partial_proj']
MSE for Linear Regression: 62.6266460173
Iteration 6: Away Home features ['away_partial_proj', 'away_partial_proj2_clip', 'away_6min_partial_proj', 'asm_3min_partial_proj', 'asm_6min_partial_proj', 'asm_9min_partial_proj']
MSE for Linear Regression: 65.4165661904


### Select Best Linear Model from results above

In [7]:
# Experiment number of the linear model used from here on.
BEST_LINEAR_MODEL = 6
# Position of the fitted cross-validator model in the tuple returned by
# linreg_train_and_test: (mse, predictions, cv_model).
MODEL_IDX = 2

### Look at the Coefficients
best_model = linreg_results_dict["home_model" + str(BEST_LINEAR_MODEL)][MODEL_IDX]
# The regression model is the final pipeline stage; indexing from the end keeps
# this valid if stages are added or removed in front of it.
linreg_mdl = best_model.bestModel.stages[-1]
# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print(linreg_mdl.coefficients)
print(linreg_mdl.intercept)


[0.943573228271,0.043388835179,0.00976840377871,-0.00134801580747,0.00619385693795,0.0337017367907]
0.0


### Linear Regression Visualization - Plot the residuals

In [8]:
# Element 1 of the result tuple is the test-set prediction frame; bring it into
# pandas and add the residual (actual minus predicted points left to score).
home_result = linreg_results_dict["home_model" + str(BEST_LINEAR_MODEL)]
vis_pd = home_result[1].toPandas()
vis_pd['residual'] = vis_pd['label_home_pts_lts'] - vis_pd['prediction']

In [9]:
# Show the first two rows of the prediction frame as the cell result.
# Disabled alternative that looks at rows with a negative target instead:
#vis_pd[vis_pd.label_home_pts_lts < 0].head(2)
vis_pd.head(2)

Unnamed: 0,dateOrig,ts,away_team_full,away_score,home_team_full,home_score,timestring,timeleft,away_team,home_team,score_diff_amh,date,time_elapsed,pct_complete,pct_left,pct_complete_inv,pct_left_inv,home_points_per_pct,away_points_per_pct,FNL,Q4,Q3,Q2,Q1,quarter,home_3min_abs,home_6min_abs,home_9min_abs,home_12min_abs,away_3min_abs,away_6min_abs,away_9min_abs,away_12min_abs,amh_3min,asm_3min,...,away_team_ml,home_team_spread,home_team_ml,dateStr,away_team_vegas_fscore,home_team_vegas_fscore,final_away_score,final_home_score,away_win,home_win,dfa,outlier,label_home_pts_lts,label_away_pts_lts,home_partial_proj,away_partial_proj,home_partial_proj2,away_partial_proj2,home_partial_proj2_clip,away_partial_proj2_clip,home_3min_partial_proj,home_6min_partial_proj,home_9min_partial_proj,home_12min_partial_proj,away_3min_partial_proj,away_6min_partial_proj,away_9min_partial_proj,away_12min_partial_proj,key,label,features,polyFeatures,features_scaled,prediction,residual
0,2017-11-11,21:12:02,Orlando,3,Denver,1.583333,(10:55 IN 1ST),47,orl,den,1.416667,2017-11-11,1,2.083333,97.916667,0.47977,0.010213,0.759635,1.439309,0,0,0,0,1,Q1,1.583333,1.583333,1.583333,1.583333,3,3,3,3,0,1.416667,...,250,-7.5,-300,2017-11-11,106.25,113.75,107,125,0,1,0.25,0,123.416667,104,111.380208,104.036458,74.061173,140.326433,111.380208,104.036458,24.805556,12.402778,8.268519,6.201389,47,23.5,15.666667,11.75,2017-11-11.orl.den,123.416667,"[111.380208333, 111.380208333, 12.4027777791, 22.1944444418, 11.0972222209, ...","[111.380208333, 111.380208333, 12.4027777791, 22.1944444418, 11.0972222209, ...","[111.380208333, 111.380208333, 12.4027777791, 22.1944444418, 11.0972222209, ...",110.337342,13.079324
1,2017-11-11,21:13:03,Orlando,3,Denver,7.228571,(9:51 IN 1ST),46,orl,den,-4.228571,2017-11-11,2,4.166667,95.833333,0.239942,0.010435,1.734441,0.719827,0,0,0,0,1,Q1,7.228571,7.228571,7.228571,7.228571,3,3,3,3,0,-4.228571,...,250,-7.5,-300,2017-11-11,106.25,113.75,107,125,0,1,-0.561905,0,117.771429,104,109.010417,101.822917,165.859081,68.834796,109.010417,101.822917,110.838095,55.419048,36.946032,27.709524,46,23.0,15.333333,11.5,2017-11-11.orl.den,117.771429,"[109.010416667, 109.010416667, 55.4190476187, -64.8380952374, -32.4190476187...","[109.010416667, 109.010416667, 55.4190476187, -64.8380952374, -32.4190476187...","[109.010416667, 109.010416667, 55.4190476187, -64.8380952374, -32.4190476187...",107.28872,10.482709


In [10]:
import brunel
# Brunel chart-composition reference: http://brunel.mybluemix.net/docs/
# Reference line y = x over 0..149; points on it are predictions that equal the actual value.
x = {'col1': range(0,150), 'col2': range(0,150)}
xyline = pd.DataFrame(data=x)
# Chart `prediction` (x) against `label_home_pts_lts` (y), coloured by quarter,
# overlaid with the y = x reference line.
%brunel data('vis_pd') x(prediction) y(label_home_pts_lts)  color(quarter) + data('xyline') line  x(col1) y(col2)\
        title("Expected vs Actual") style(' .header {label-location:center}; size:5;') :: width=600, height=600

# Home-team linear regression:
# predicted points left to score (x) vs actual points left to score (y)

<IPython.core.display.Javascript object>

In [11]:
# Residual (actual minus predicted points left to score) against time_elapsed, coloured by home team.
%brunel data('vis_pd') x(time_elapsed) y(residual) color(home_team)

# The spread of the residuals shrinks as the game progresses, i.e. the error
# variance is not constant, which is usually a bad sign for a single linear regression.
# A likely improvement is to fit separate models for different time slices of the game.

<IPython.core.display.Javascript object>

In [12]:
# Disabled earlier variant of this chart (residual against time_elapsed):
#%brunel data('vis_pd') x(time_elapsed) y(residual) color(home_team)
# Bar chart of the residual per quarter. NOTE(review): stderr(residual:2) presumably
# draws a band of two standard errors -- confirm against the Brunel documentation.
%brunel data('vis_pd') bar x(quarter) yrange(residual) stderr(residual:2)

<IPython.core.display.Javascript object>

## Logistic Regression to predict outcome
<img src="https://github.com/dustinvanstee/nba-rt-prediction/raw/master/pngs/lr_flow.png" width="800" height="500" align="middle"/>


In [13]:
def logr_train_and_test(f_cols,trainingData,testData):
    """Cross-validate a logistic-regression pipeline and evaluate it on the test set.

    The pipeline buckets ``time_elapsed`` at the 0/12/24/36 boundaries, one-hot
    encodes the bucket, appends it to ``f_cols``, expands the result to
    degree-2 polynomial features, scales them by standard deviation and fits a
    logistic regression on ``label``.

    Parameters
    ----------
    f_cols : list of str
        Names of the feature columns (``time_elapsed`` is added internally).
    trainingData, testData : Spark DataFrame
        Frames containing ``f_cols``, ``time_elapsed``, ``label`` and ``home_win``.

    Returns
    -------
    tuple
        ``(areaUnderROC, predictions, cv_model, crosstab)``: test-set AUC, the
        transformed test frame, the fitted cross-validator model, and the
        ``home_win`` x ``prediction`` contingency table.
    """
    # Add Time Buckets
    splits = [-float("inf"), 0.0, 12.0, 24.0, 36.0, float("inf")]
    bucketizer = Bucketizer(splits=splits, inputCol="time_elapsed", outputCol="bucketedFeatures")
    encoder = OneHotEncoder(inputCol="bucketedFeatures", outputCol="bucketedFeaturesVec")

    assembler = VectorAssembler(inputCols=f_cols, outputCol="features1")
    assembler2 = VectorAssembler(inputCols=["features1","bucketedFeaturesVec"], outputCol="features")
    polyExpansion = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
    scaler = StandardScaler(withMean=False, withStd=True, inputCol="polyFeatures", outputCol="features_scaled")
    lr = LogisticRegression(labelCol="label", featuresCol="features_scaled", threshold=0.5)
    pipeline_lr = Pipeline(stages=[bucketizer, encoder, assembler,assembler2,polyExpansion,scaler, lr])

    # Only the regularisation strength is searched; the penalty mix and the
    # iteration cap are single-valued grid entries.
    paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.001,0.01,0.1])
             .addGrid(lr.elasticNetParam, [0.0])
             .addGrid(lr.maxIter, [200])
             .build())

    lr_evaluator = BinaryClassificationEvaluator(labelCol="label",metricName="areaUnderROC")

    # 2-fold cross validation over the whole pipeline
    lr_cv = CrossValidator(estimator=pipeline_lr, estimatorParamMaps=paramGrid, evaluator=lr_evaluator, numFolds=2)

    lrCvModel = lr_cv.fit(trainingData)

    predictions = lrCvModel.transform(testData)

    areaUnderROC = lr_evaluator.evaluate(predictions)

    # Parenthesised single-argument print behaves the same on Python 2 and Python 3.
    print("areaUnderROC for Logistic Regression: " + str(areaUnderROC))
    print("Cross tab for prediction vs actual table")
    # Reuse the frame scored above instead of transforming the test data again.
    crosstab = predictions.stat.crosstab("home_win", "prediction")
    crosstab.show()

    # Accuracy is the share of rows whose prediction equals home_win. It is
    # computed from the predictions themselves, not from fixed cell positions
    # of the crosstab: the crosstab row order is not guaranteed, and a model
    # that predicts a single class yields a table with only one value column.
    is_correct = (predictions["home_win"] == predictions["prediction"]).cast("double")
    accuracy = predictions.select(is_correct.alias("correct")).agg({"correct": "avg"}).first()[0]
    print("Accuracy = " + str(accuracy))

    return (areaUnderROC,predictions,lrCvModel,crosstab,)


### Build Logistic Dataframe based on Linear prediction results

In [14]:
# Fitted linear pipelines for the home and away "points left to score" targets.
home_mdl = linreg_results_dict['home_model'+str(BEST_LINEAR_MODEL)][MODEL_IDX].bestModel
away_mdl = linreg_results_dict['away_model'+str(BEST_LINEAR_MODEL)][MODEL_IDX].bestModel


def _with_score_projections(data, home_model, away_model):
    """Return `data` with final-score projections and the logistic label added.

    Adds home_score_projection, away_score_projection, home_win_proj,
    home_win_margin_proj and label (home_win cast to double).
    """
    for side, model in (("home", home_model), ("away", away_model)):
        # Projected final score = current score + predicted points left to score.
        # The pipeline's intermediate columns are dropped so the next model can
        # write columns with the same names.
        data = (model.transform(data)
                .withColumn(side + "_score_projection", col(side + "_score") + col("prediction"))
                .drop("prediction","features","polyFeatures","features_scaled"))

    data = data.withColumn("home_win_proj", when(col("home_score_projection") > col("away_score_projection"), 1.0).otherwise(0.0))
    data = data.withColumn("home_win_margin_proj", col("home_score_projection") - col("away_score_projection")).cache()
    return data.withColumn('label', data['home_win'].cast(DoubleType()))


logrTrainingData = _with_score_projections(trainingData, home_mdl, away_mdl)
logrTestData = _with_score_projections(testData, home_mdl, away_mdl)


# Run Logistic Regression
logr_results_dict = {}

# Feature sets for the experiments.
# Note: time_elapsed is added inside logr_train_and_test, so it is not listed here.
logr_feature_cols = [
    ["home_win_margin_proj","home_score", "away_score", "home_team_spread"],

    ["home_win_proj"],
    ["home_win_margin_proj"],
    ["home_win_proj","home_win_margin_proj"],

    ["home_score", "away_score", "pct_complete"],
    ["home_score", "away_score", "pct_complete", "pct_complete_inv"],
    ["home_score", "away_score", "pct_complete", "pct_complete_inv", "home_points_per_pct", "away_points_per_pct"],
    ["home_score", "away_score", "pct_complete", "pct_complete_inv", "home_points_per_pct", "away_points_per_pct", "overunder"],

    ["home_score", "away_score", "score_diff_amh", "pct_complete"],
    ["home_score", "away_score", "score_diff_amh", "home_team_spread","pct_complete"],
    ["home_score", "away_score", "score_diff_amh", "home_team_spread","pct_complete", "cf1", "cf2"],
    ["home_score", "away_score", "score_diff_amh", "home_team_spread","pct_complete", "cf1", "cf2","asm_3min","asm_6min","asm_9min"],
]

# Run the logistic experiments (only feature set 0 here).
for i in range(0,1) :
    # Parenthesised single-argument print behaves the same on Python 2 and Python 3.
    print("Iteration " + str(i) + " Cols = " + str(logr_feature_cols[i]))
    logr_results_dict['model'+str(i)] = logr_train_and_test(logr_feature_cols[i],logrTrainingData,logrTestData)


Iteration 0 Cols = ['home_win_margin_proj', 'home_score', 'away_score', 'home_team_spread']
areaUnderROC for Logistic Regression: 0.86284459829
Cross tab for prediction vs actual table
+-------------------+----+----+
|home_win_prediction| 0.0| 1.0|
+-------------------+----+----+
|                1.0|1098|5480|
|                0.0|2314| 990|
+-------------------+----+----+

Accuracy = 0.788706739526


### Select Best Logistic Model

In [15]:
# Number of the logistic experiment used from here on: it indexes
# logr_feature_cols and forms the 'model<N>' key of logr_results_dict.
BEST_LOGISTIC_MODEL = 0

### Overall Logistic Error Analysis

In [16]:
# Position of the prediction frame in the tuple returned by logr_train_and_test.
PREDICTION_DF_INDEX = 1
best_logr_result = logr_results_dict['model' + str(BEST_LOGISTIC_MODEL)]
# Flag every test row with whether the predicted class equals the actual outcome.
prediction_hit = col("home_win") == col("prediction")
vis_df = best_logr_result[PREDICTION_DF_INDEX].withColumn(
    "correct", when(prediction_hit, 'yes').otherwise('no'))
vis_pd = vis_df.toPandas()

In [17]:
# One point per test row: time_elapsed (x) against score_diff_amh (y),
# coloured by whether the logistic prediction matched home_win.
%brunel data('vis_pd') x(time_elapsed) y(score_diff_amh) color(correct) ::  width=900, height=1100
            
# Disabled alternative that colours by the predicted class instead:
# data('vis_pd') x(time_elapsed) y(score_diff_amh) color(prediction) | 

<IPython.core.display.Javascript object>

In [18]:
# Count of correct ('yes') and incorrect ('no') predictions per quarter;
# both the colour and the label of each cell show the count.
%brunel data('vis_pd') x(quarter) y(correct) bin(correct) color(#count) label(#count) style('symbol:rect; border-radius:15')

<IPython.core.display.Javascript object>

In [None]:
# TODO: Summarize the worst-performing games and analyze why the predictions did not work for them

### Game Visualization Utility
Here, let's select a game of interest and plot its predictions and scores (2 graphs).

In [19]:
# From logistic regression function ->    return (areaUnderROC,predictions,lrCvModel,crosstab,)
PREDICTION_DF_INDEX = 1
vis_df = logr_results_dict['model'+str(BEST_LOGISTIC_MODEL)][PREDICTION_DF_INDEX]

#vis_df.show()
GAME_KEY = "2017-11-16.gst.bos"
vis_df.select('key').distinct().toPandas().head(2)

Unnamed: 0,key
0,2016-04-13.sac.hou
1,2016-04-13.mia.bos


In [20]:
# Keep only the columns needed for the game plots and flag correct predictions.
vis_df = vis_df.select(["key","away_score","home_score","pct_complete","home_win","prediction","probability","time_elapsed","score_diff_amh"]). \
  withColumn("correct", when(col("home_win") == col("prediction"),1.0).otherwise(0.0))

#convert to pandas DF for vis
vis_pd = vis_df.toPandas()

# Select the game of interest. The explicit copy makes the result an independent
# frame, so the column assignments below neither raise SettingWithCopyWarning
# nor risk writing to a temporary slice.
vis_pd = vis_pd[vis_pd.key == GAME_KEY].copy()
# `probability` holds [P(prediction 0), P(prediction 1)], i.e. [away win, home win].
vis_pd['home_win_probability'] = vis_pd.probability.apply(lambda x: x[1])
vis_pd['away_win_probability'] = vis_pd.probability.apply(lambda x: x[0])

vis_pd.head(2)


Unnamed: 0,key,away_score,home_score,pct_complete,home_win,prediction,probability,time_elapsed,score_diff_amh,correct,home_win_probability,away_win_probability
464,2017-11-16.gst.bos,1.158088,0.827206,1.041667,1,0,"[0.674868753307, 0.325131246693]",0.5,0.330882,0,0.325131,0.674869
465,2017-11-16.gst.bos,3.474265,2.481618,3.125,1,0,"[0.697323007516, 0.302676992484]",1.5,0.992647,0,0.302677,0.697323


In [21]:
import brunel
# Brunel chart-composition reference: http://brunel.mybluemix.net/docs/
# Constant reference levels over x = 0..47; only y50 is drawn below.
x = {'x': range(0,48), 'y25': [0.25]*48, 'y50': [0.5]*48, 'y75': [0.75]*48}
xyline = pd.DataFrame(data=x)

# First chart: running home and away score of the selected game.
# Second chart: away-team win probability over time, with the 0.5 level for reference.
%brunel data('vis_pd') x(time_elapsed) y(home_score) + x(time_elapsed) y(away_score)  \
        title("Running Score") style(' .header {label-location:center}; width:500; height:500') | \
        data('vis_pd') line x(time_elapsed) y(away_win_probability:linear) + \
        data('xyline') line x(x) y(y50) \
        title("Away Win Probabilty") style('.header {label-location:center}') :: width=900, height=900
# Disabled score-only variant:
#%brunel data('vis_pd') x(time_elapsed) y(home_score) + x(time_elapsed) y(away_score)

<IPython.core.display.Javascript object>

### compare to ESPN.com ...
<img src="https://github.com/dustinvanstee/nba-rt-prediction/raw/master/pngs/game_comp.png" width="800" height="500" align="middle"/>


### Deployment
<img src="https://github.com/dustinvanstee/nba-rt-prediction/raw/master/pngs/deployment_method.png" width="800" height="500" align="middle"/>


In [None]:
import pandas as pd
pd.options.display.max_columns = 999
pd.set_option('display.max_colwidth', 200)

import warnings
warnings.filterwarnings('ignore')
import time
from datetime import datetime
import math
import urllib3, requests, json

In [None]:
#%%bash
#touch __init__.py
#rm -rf ./wml_deployfuncs.py
#wget https://github.com/dustinvanstee/lendingclub/raw/master/lendingclub-flask-demo/wml_deployfuncs.py

import wml_deployfuncs as wml

In [None]:
# @hidden_cell
# SECURITY: the Watson Machine Learning credentials used to be hard-coded in
# this cell, which exposes them to anyone who can read the notebook. They are
# now read from environment variables. Any value that was ever stored in the
# notebook should be treated as compromised and rotated.
import os
import sys

WML_DEFAULT_URL = "https://ibm-watson-ml.mybluemix.net"


def wml_creds_from_env(prefix, fields=("access_key", "username", "password")):
    """Build a WML credentials dict from environment variables.

    Each entry of `fields` is read from ``<prefix>_<FIELD>`` (upper-cased) and
    is None when the variable is not set. ``url`` is read from
    ``<prefix>_URL`` and falls back to the public WML endpoint.
    """
    loaded = {"url": os.environ.get(prefix + "_URL", WML_DEFAULT_URL)}
    for field in fields:
        loaded[field] = os.environ.get(prefix + "_" + field.upper())
    return loaded


cc_creds = wml_creds_from_env("WML_CC")
dv_creds = wml_creds_from_env("WML_DV", fields=("access_key", "username", "password", "instance_id"))

# Using Dustin's WML creds for now
creds = dv_creds

# Report missing values here, where the cause is obvious, instead of letting a
# later service call fail with an authentication error.
missing_creds = sorted(name for name, value in creds.items() if value is None)
if missing_creds:
    sys.stderr.write(
        "WML credentials not set; export: "
        + ", ".join("WML_DV_" + name.upper() for name in missing_creds)
        + "\n"
    )

#### Save the model to WML repository

In [None]:
# Preview the schema of the frame that is saved with the logistic model.
# The selection is built inline from logrTrainingData because logr_mdl_data is
# only assigned in a later cell, so referencing it here raised NameError when
# the notebook was run top to bottom.
logrTrainingData.select(logr_feature_cols[BEST_LOGISTIC_MODEL] + ['label','time_elapsed']).printSchema()

In [None]:
# WML requires the target column is called label
# Position of the fitted cross-validator model in each result tuple.
MODEL_IDX = 2

# Training frames stored with each model: the model's feature columns plus the target.
home_mdl_data = trainingData.withColumn('label', trainingData["label_home_pts_lts"].cast(DoubleType())).select(home_feature_cols[BEST_LINEAR_MODEL] + ['label','label_home_pts_lts'])
away_mdl_data = trainingData.withColumn('label', trainingData["label_away_pts_lts"].cast(DoubleType())).select(away_feature_cols[BEST_LINEAR_MODEL] + ['label','label_away_pts_lts','label_home_pts_lts'])
# time_elapsed is included because the logistic pipeline buckets it internally.
logr_mdl_data = logrTrainingData.select(logr_feature_cols[BEST_LOGISTIC_MODEL] + ['label','time_elapsed'])

# Select best model from above ...
home_mdl = linreg_results_dict['home_model'+str(BEST_LINEAR_MODEL)][MODEL_IDX].bestModel
away_mdl = linreg_results_dict['away_model'+str(BEST_LINEAR_MODEL)][MODEL_IDX].bestModel
logr_mdl = logr_results_dict['model'+str(BEST_LOGISTIC_MODEL)][MODEL_IDX].bestModel

# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print("Saving Modeling...Model ID:")
# Each call overwrites published_model_name_or_id; only the last value is kept.
published_model_name_or_id = wml.save_model_by_name(creds, "nba_home_linear", home_mdl, home_mdl_data)
published_model_name_or_id = wml.save_model_by_name(creds, "nba_away_linear", away_mdl, away_mdl_data)
published_model_name_or_id = wml.save_model_by_name(creds, "nba_homewin_logistic", logr_mdl, logr_mdl_data)



#### Deploy the saved model

In [None]:
# cannot re-run this cell.  Start from above
published_models_json = wml.get_published_models(creds)


def _deploy(model_name):
    """Deploy the published model called `model_name` and return what wml.deploy_model returns."""
    return wml.deploy_model(creds, published_models_json, model_name)


# One scoring URL per saved model, deployed in the order they were saved.
nba_home_linear_scoring_url = _deploy("nba_home_linear")
nba_away_linear_scoring_url = _deploy("nba_away_linear")
nba_homewin_logistic_scoring_url = _deploy("nba_homewin_logistic")


#### Scoring: Call REST API
    Create a JSON Sample record for scoring

In [None]:
# Example training row for orientation:
#	home_partial_proj	home_partial_proj2_clip	label_home_pts_lts	prediction	timeleft	home_score	overunder	home_team_spread
#   23.687500	        27.329689	            29.000000	        22.165724	12.0	   82.000000	194.5	5

# Game state to score: current scores, minutes left, and the betting line.
home_score = 80.0
away_score = 82.0
time_left  = 12.0
overunder  = 224.5
home_spread = 5.0

GAME_MINUTES = 48.0


def partial_proj2_clip(score, minutes_left, vegas_proj):
    """Return the clipped pace-based projection of the points left to score.

    Mirrors the training feature `<side>_partial_proj2_clip`: while less than
    33% of the game has been played the betting-line projection `vegas_proj`
    is used; afterwards the scoring rate so far is extrapolated over the rest
    of the game. The 0.01 in the denominator matches the training formula and
    keeps the division defined at tip-off.
    """
    pct_complete = 100.0 * (GAME_MINUTES - minutes_left) / GAME_MINUTES
    if pct_complete < 33:
        return vegas_proj
    pct_left = 100.0 * minutes_left / GAME_MINUTES
    return (score / (pct_complete + 0.01)) * pct_left


# Betting-line projection of the points left to score.
# The away spread is the negative of the home spread, hence the plus sign.
hpp = (overunder / 2 - home_spread /2) * time_left/48.0
app = (overunder / 2 + home_spread /2) * time_left/48.0

# Pace-based projection, computed exactly as in training (including the clip).
hpp2c = partial_proj2_clip(home_score, time_left, hpp)
app2c = partial_proj2_clip(away_score, time_left, app)

# momentum terms: no recent-window data in this sample, so the 6-minute
# projection reuses the betting-line value and the margin terms are zero.
h6m = hpp
a6m = app
asm3m = 0
asm6m = 0
asm9m = 0

# Field order follows the feature list of linear experiment 6.
home_sample_data = {
  "fields": ['home_partial_proj', 'home_partial_proj2_clip', 'home_6min_partial_proj', 'asm_3min_partial_proj', 'asm_6min_partial_proj', 'asm_9min_partial_proj'],
  "values": [[hpp, hpp2c, h6m, asm3m, asm6m, asm9m]]
}
away_sample_data = {
  "fields": ['away_partial_proj', 'away_partial_proj2_clip', 'away_6min_partial_proj', 'asm_3min_partial_proj', 'asm_6min_partial_proj', 'asm_9min_partial_proj'],
  "values": [[app, app2c, a6m, asm3m, asm6m, asm9m]]
}

home_sample_json = json.dumps(home_sample_data)
away_sample_json = json.dumps(away_sample_data)


#### Make API call for scoring

In [None]:
# Send the sample records to the deployed home and away linear models and keep the raw responses
home_scoring_response = wml.score_example(creds, nba_home_linear_scoring_url, home_sample_json)
away_scoring_response = wml.score_example(creds, nba_away_linear_scoring_url, away_sample_json)


#### Grab Prediction Value

In [None]:
home_inference = json.loads(home_scoring_response)
away_inference = json.loads(away_scoring_response)

# Pair each returned field name with its value in the last scored row.
# list(...) keeps the pairs reusable on Python 3, where zip is a one-shot
# iterator, and indexing with [-1] leaves the decoded response unmodified.
zipped_home_inference = list(zip(home_inference['fields'], home_inference['values'][-1]))
zipped_away_inference = list(zip(away_inference['fields'], away_inference['values'][-1]))

# Predicted points left to score for each team.
home_plts = dict(zipped_home_inference)['prediction']
away_plts = dict(zipped_away_inference)['prediction']

# Projected final score = current score + predicted points left to score.
home_score_projection = home_score + home_plts
away_score_projection = away_score + away_plts

# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print("HSP : " + str(home_score_projection))
print("ASP : " + str(away_score_projection))

#### Make Logistic API Call

In [None]:
# Inputs of the logistic model, derived from the two projected final scores.
hwp = 1.0 if home_score_projection > away_score_projection else 0.0  # not part of the feature set sent below
hwmp = home_score_projection - away_score_projection
time_elapsed = 48-time_left

# time_elapsed is sent as well because the logistic pipeline buckets it internally.
logr_sample_data = {
  "fields": ['home_win_margin_proj', 'away_score','home_score','home_team_spread','time_elapsed'],
  "values": [[hwmp,away_score,home_score,home_spread,time_elapsed]]
}

logr_sample_json = json.dumps(logr_sample_data)

logr_scoring_response = wml.score_example(creds, nba_homewin_logistic_scoring_url, logr_sample_json)

logr_inference = json.loads(logr_scoring_response)

# Pair each returned field name with its value in the last scored row.
# The pairs are materialised as a list because they are read three times
# below; on Python 3 a bare zip would be exhausted after the first read.
zipped_logr_inference = list(zip(logr_inference['fields'], logr_inference['values'][-1]))
logr_fields = dict(zipped_logr_inference)
logr_prediction = logr_fields['prediction']
logr_probability = logr_fields['probability']

# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print(zipped_logr_inference)
logr_probability

## NBA Real-Time Prediction App Powered by Watson Machine Learning
Go to the web app: https://nba-rt-demo.mybluemix.net/

To view the source of this web app, go here https://github.com/dustinvanstee/nba-rt-prediction