In [2]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import sklearn.naive_bayes as nb
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
import utils
import matplotlib.pyplot as plt



In [3]:
# Load the raw shot log (CSV file expected in the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='shot_logs.csv')

In [4]:
# Column whose value the models are asked to predict.
target = ['PTS']

# Model inputs, in the column order used when slicing the frame.
# Deliberately left out:
#   GAME_CLOCK - currently disabled
#   FGM        - only known after the shot is taken (it reveals the result)
features = [
    "SHOT_NUMBER", "PERIOD", "SHOT_CLOCK", "DRIBBLES",
    "TOUCH_TIME", "SHOT_DIST", "PTS_TYPE", "CLOSE_DEF_DIST",
]

# Passed to utils.rescale_features as its third argument; currently empty
# ("PTS_TYPE" was a previous entry and is disabled).
non_scaled_features = []

# Additional columns listed here for later experiments.
advanced_features = [
    "LOCATION", "GAME_ID", "MATCHUP",
    "player_id", "CLOSEST_DEFENDER_PLAYER_ID",
]

In [5]:
# Run normalization functions.
# NOTE(review): these statements overwrite columns of `df` in place, so running
# this cell twice feeds already-normalized values back into the helpers -
# confirm the helpers tolerate that, or reload `df` before re-running.
df['SHOT_CLOCK'] = utils.normalize_shotclock(df['SHOT_CLOCK'])
df['LOCATION'] = utils.normalize_location(df['LOCATION'])

# Store the target as float64.
df['PTS'] = df['PTS'].astype(np.float64)

# "PTS_TYPE" is withheld from the list given to rescale_features and put back
# afterwards (it ends up as the last entry of `features`).
# The try/finally guarantees it is restored even if rescaling raises; without
# it a failure would leave `features` one entry short and the next run of this
# cell would fail on `remove`.
features.remove("PTS_TYPE")
try:
    df = utils.rescale_features(features, df, non_scaled_features)
finally:
    features.append("PTS_TYPE")

In [6]:
# Fixed seed so the stochastic estimators (AdaBoost, MLP) behave the same on
# every Restart & Run All.
# NOTE(review): utils.get_cross_validated_score may introduce its own
# randomness when splitting the data - confirm and seed it there as well.
RANDOM_STATE = 42

# Candidate models, in the order they are scored and plotted further down.
# NOTE(review): LinearRegression is a regressor while the other three are
# classifiers; verify that the scoring helper compares them on the same metric.
clf_list = [
    LinearRegression(normalize=True, n_jobs=-1),
    nb.GaussianNB(),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    # One hidden layer with as many units as there are input features.
    # The trailing comma makes this an actual 1-tuple; "(len(features))" is
    # just a parenthesised int.
    MLPClassifier(
        hidden_layer_sizes=(len(features),),
        activation='logistic',
        random_state=RANDOM_STATE,
    ),
]

In [7]:
# Score every candidate model on the same feature columns and target.
# Iterating over clf_list itself (rather than a hard-coded range(4)) keeps this
# cell correct when models are added to or removed from the list.
score_list = []
for clf in clf_list:
    # Only the first element of the helper's result is kept; it is treated as
    # the mean cross-validated score.
    mean = utils.get_cross_validated_score(df[features], df[target], clf)[0]
    score_list.append(mean)



In [9]:
# Round every score to four decimals so the point annotations stay readable.
score_list = [round(score, 4) for score in score_list]

# Display names, in the same order the models were appended to clf_list.
clf_names = ['Linear Reg.', 'Gaussian NB', 'AdaBoost', 'MLP']
assert len(score_list) == len(clf_names), "need exactly one display name per scored model"

# One x position per model (1, 2, ...), derived from the name list instead of
# being hard-coded, with one unit of padding on either side of the axis.
x_axis = list(range(1, len(clf_names) + 1))

plt.plot(x_axis, score_list, 'ro')
# y range is fixed to [0, 1]; scores outside that interval will not be visible.
plt.axis([0, len(clf_names) + 1, 0, 1])
plt.xlabel('Model')
# NOTE(review): if the scoring helper relies on each estimator's default
# score, the LinearRegression value is R^2 rather than accuracy - confirm.
plt.ylabel('Mean Accuracy')
plt.xticks(x_axis, clf_names)
# Label each point with its rounded score.
for index, result in zip(x_axis, score_list):
    plt.annotate(result, xy=(index, result))
plt.title('Cross Validated Score Comparison')
plt.show()