In [1]:
import math
import numpy as np
import pandas as pd
import time
from datetime import datetime
# sklearn
import sklearn
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
# sklearn-panads
from scipy.sparse import csr_matrix, vstack, hstack
from sklearn_pandas import DataFrameMapper, cross_val_score
from fastFM import sgd



In [2]:
df = pd.read_csv('/Users/david/datasets/kaggle/kobe_bryant_shot_selection/data.csv')

In [3]:
cat_columns = ["action_type", "combined_shot_type", "period", "playoffs",
               "shot_type", "shot_zone_area", "shot_zone_basic","shot_zone_range",
                "team_id","team_name","matchup","opponent"] 
need_binarize_num_columns = ["season"]
num_columns = ["game_event_id","game_id","lat","loc_x","loc_y","lon","minutes_remaining",
               "seconds_remaining","shot_distance","game_date","shot_id"]


label_column = "shot_made_flag"

In [4]:
lb = sklearn.preprocessing.LabelBinarizer()
sc = sklearn.preprocessing.StandardScaler()
for col in df.columns:
    if col == "game_date":
        df["game_date"] = df["game_date"].apply(lambda d: time.mktime(time.strptime(d, "%Y-%m-%d")))
    if col in cat_columns + need_binarize_num_columns:
        # transform categorical data to one-hot encoding
        df[col] = lb.fit_transform(df[col])
    if col in num_columns:
        # apply standard scaling 
        df[col] = sc.fit_transform(df[col])



In [5]:
# delay k data matrix.
def delay_k_data(data, k):
    w = data.shape[1]
    data_zeros = csr_matrix(np.zeros((k, w)))
    data_delayed = data[:-k,]
    return vstack((data_zeros, data_delayed))

In [6]:
vec_check_nan = np.vectorize(math.isnan)
# submission test data.
submit_data_indices = [i for i, x in enumerate(vec_check_nan(df["shot_made_flag"])) if x]
# data_available indices.
nrows = df.shape[0]
available_indices = [i for i in range(0, nrows) if i not in submit_data_indices]
# validation indices.
validation_data_indices = [x + 1 for x in submit_data_indices if x + 1 not in submit_data_indices]

In [7]:
from fastFM.datasets import make_user_item_regression

# parameters.
window = 15 # time window.
seq_length = 5000

y = np.array(df["shot_made_flag"])
# custom labels for factorization machines
y_labels = y.copy()
y_labels[y == 0] = -1
y_labels[vec_check_nan(y)] = 0

# original data matrix.
org_X = csr_matrix(np.array(df.drop(["shot_made_flag"], axis=1)))

# concatenate delay = 0 ~ k-1 data.
X = org_X.copy()
for i in range(1,window):
    X = hstack([X, delay_k_data(csr_matrix(y_labels).transpose(), i)])
X=X.tocsr()

# agg validate
y_validate_pred = []
y_validate_pred_proba = []

available_first = available_indices[0]

cut_point = validation_data_indices[0]
y_train_weird = True

for cut_point in validation_data_indices:
    begin_index = max(cut_point - seq_length, available_first)
    end_index = cut_point
    if (end_index > begin_index):
        train_range = range(begin_index, end_index)
        train_indices = [i for i in train_range if i not in submit_data_indices]
        test_index = cut_point

        X_train = X[train_indices,]
        y_train = y_labels[train_indices,]
        X_test = X[test_index]
        y_test = y_labels[test_index]

        if y_train_weird == True and len(set(y_train)) != 2:
            y_pred = y_train[0]
            y_validate_pred.append(y_pred)[0]
        else:
            y_train_weird = False
            fm = sgd.FMClassification(n_iter=10000, init_stdev=0.1, l2_reg_w=0,
                                  l2_reg_V=0, rank=3, step_size=0.001)
            fm.fit(X_train, y_train)
            y_pred = fm.predict(X_test)
            y_pred_proba = fm.predict_proba(X_test)
            y_validate_pred.append(y_pred)
            y_validate_pred_proba.append(y_pred_proba)

In [8]:
y_validate_true = y_labels[validation_data_indices[1:],]

In [9]:
acc = accuracy_score(y_validate_true, y_validate_pred)

ValueError: Found input variables with inconsistent numbers of samples: [4204, 8408]

In [None]:
acc

In [None]:
ll = logloss(y_validate_true, y_validate_pred_proba)

In [None]:
ll

In [None]:
auc = roc_auc_score(y_validate_true, y_validate_pred_proba)