In [1]:
import math
import numpy as np
import pandas as pd
import time
from datetime import datetime
# sklearn
import sklearn
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
# sklearn-panads
from scipy.sparse import csr_matrix
from sklearn_pandas import DataFrameMapper, cross_val_score
from fastFM import sgd



In [2]:
df = pd.read_csv('/Users/david/datasets/kaggle/kobe_bryant_shot_selection/data.csv')

In [3]:
cat_columns = ["action_type", "combined_shot_type", "period", "playoffs",
               "shot_type", "shot_zone_area", "shot_zone_basic","shot_zone_range",
                "team_id","team_name","matchup","opponent"] 
need_binarize_num_columns = ["season"]
num_columns = ["game_event_id","game_id","lat","loc_x","loc_y","lon","minutes_remaining",
               "seconds_remaining","shot_distance","game_date","shot_id"]


label_column = "shot_made_flag"

In [4]:
lb = sklearn.preprocessing.LabelBinarizer()
sc = sklearn.preprocessing.StandardScaler()
for col in df.columns:
    if col == "game_date":
        df["game_date"] = df["game_date"].apply(lambda d: time.mktime(time.strptime(d, "%Y-%m-%d")))
    if col in cat_columns + need_binarize_num_columns:
        # transform categorical data to one-hot encoding
        df[col] = lb.fit_transform(df[col])
    if col in num_columns:
        # apply standard scaling 
        df[col] = sc.fit_transform(df[col])



In [5]:
from fastFM.datasets import make_user_item_regression
from sklearn.cross_validation import train_test_split

vec_check_nan = np.vectorize(math.isnan)
df = df[~vec_check_nan(df["shot_made_flag"])]

y = np.array(df["shot_made_flag"])
X = csr_matrix(np.array(df.drop(["shot_made_flag"], axis=1)))

begin_cut_point = len(y) - 4200
seq_length = 5000

# agg validate
y_validate_pred = []
# Convert dataset to binary classification task.
y_labels = np.ones_like(y)
y_labels[y < np.mean(y)] = -1

for cut_point in range(begin_cut_point,len(y)):
    X_train = X[cut_point - seq_length : cut_point,]
    X_test = X[cut_point:,]
    y_train = y_labels[cut_point - seq_length : cut_point,]
    y_test = y_labels[cut_point:,]

    fm = sgd.FMClassification(n_iter=100000, init_stdev=0.1, l2_reg_w=0,
                          l2_reg_V=0, rank=2, step_size=0.001)
    fm.fit(X_train, y_train)
    y_pred = fm.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    
    # append the validation set sample prediction.
    y_validate_pred.append(y_pred[0])
    print(acc)

0.584047619048
0.58371040724
0.583373034778
0.583512032404
0.585557673975
0.58379022646
0.583214115403
0.583591700453
0.583730916031
0.584347411119
0.583293556086
0.583194079733
0.58476599809
0.586099832816
0.586000955566
0.584946236559
0.583173996176
0.583552474301
0.582735533238
0.582874910309
0.584449760766
0.58458961474
0.584729535663
0.583193679674
0.583333333333
0.58251497006
0.582654528031
0.584471603163
0.584132310642
0.584751858068
0.585611510791
0.583593187815
0.583253358925
0.582673386129
0.582333173308
0.583673469388
0.582853025937
0.58299303387
0.583133109082
0.583273251622
0.583173076923
0.582591969223
0.58177008177
0.581428915083
0.582050048123
0.581227436823
0.58112662494
0.580784974717
0.57972061657
0.580823897856
0.580963855422
0.58086285852
0.580038572806
0.580660718592
0.580318379161
0.579734620024
0.579150579151
0.579290369298
0.57870593916
0.577879739193
0.577777777778
0.577434162841
0.577815369744
0.576988155668
0.57833655706
0.578718258767
0.581035316884
0.58020

In [6]:
y_validate_true = y_labels[len(y)-len(y_validate_pred):,]

In [9]:
acc = accuracy_score(y_validate_true, y_validate_pred)

In [10]:
acc

0.58809523809523812