In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import warnings
from multiprocessing import Pool

In [None]:
def create_supplimentary_logs_map(root_dir):
    """Index every ``.xml`` file under ``root_dir`` by its file-name stem.

    Walks ``root_dir`` recursively (with a tqdm progress bar over the visited
    directories) and records the full path of each file whose name ends in
    ``.xml``.

    Args:
        root_dir: Directory to search recursively.

    Returns:
        dict mapping the file name without its ``.xml`` suffix to the full
        path of that file. If two files share a stem, the one visited last
        by ``os.walk`` wins.
    """
    suffix = '.xml'
    logs_by_stem = {}
    for folder, _, files in tqdm(os.walk(root_dir)):
        for file_name in files:
            if not file_name.endswith(suffix):
                continue
            # Slice the suffix off instead of using str.strip('.xml'): strip()
            # removes any leading/trailing '.', 'x', 'm' or 'l' characters, so
            # e.g. 'max.xml' would have been keyed as 'a'.
            stem = file_name[:-len(suffix)]
            logs_by_stem[stem] = os.path.join(folder, file_name)
    return logs_by_stem

# Folder that is searched recursively for .xml logs (hard-coded absolute Windows path).
logs_dir = r"E:\Desktop\hudl-logs\DATA\player_stat_logs"
# File-name stem -> full path of every .xml file found; read by later cells.
log_map = create_supplimentary_logs_map(logs_dir)

In [None]:
def get_all_game_log_paths(dir):
    """Collect the CSV files under ``dir`` whose id is a key of ``log_map``.

    The id of a file is the part of its name before the first ``.``, cut
    again at the first ``_``. Only files ending in ``.csv`` whose id is
    present in the module-level ``log_map`` are kept.

    Args:
        dir: Directory to search recursively.

    Returns:
        dict mapping id -> full path of the CSV; when several files share an
        id, the one visited last by ``os.walk`` wins.
    """
    matched = {}
    for folder, _subdirs, file_names in os.walk(dir):
        for file_name in file_names:
            if not file_name.endswith('.csv'):
                continue
            game_id = file_name.split('.')[0].split('_')[0]
            if game_id in log_map:
                matched[game_id] = os.path.join(folder, file_name)
    return matched

# Folder that is searched recursively for game-log CSVs (hard-coded absolute Windows path).
# NOTE(review): the name `dir` shadows the builtin dir() for the rest of the notebook.
dir = r"E:\Desktop\hudl-logs\DATA\game_logs"
# id -> CSV path, restricted to ids that are also keys of log_map.
logs = get_all_game_log_paths(dir)

In [None]:
# Keep only the XML logs that pandas can parse (best-effort filter).
temp = {}
unreadable_logs = {}  # log id -> exception raised while parsing, kept for inspection
for k, v in tqdm(log_map.items()):
    try:
        pd.read_xml(v, attrs_only=True)
    except Exception as exc:
        # Skip files that fail to parse, but remember why. Catching Exception
        # rather than using a bare `except:` lets a kernel interrupt (Ctrl-C)
        # stop this long loop instead of being swallowed.
        unreadable_logs[k] = exc
    else:
        temp[k] = v
temp

In [15]:
# Load one example log (id '122806') from the set that parsed successfully.
fp = temp['122806']
df = pd.read_xml(fp)

# Print every value of the 'name' column, one per line, then display the frame.
for name in df['name']:
    print(name)
df

InStat Index
Seconds on the court
Points
Field goals made
Field goals attempted
Field goals, %
3-pt field goals made
3-pt field goals attempted
3-pt field goals, %
Free throws made
Free throws attempted
Free throws, %
Offensive rebounds
Defensive rebounds
Rebounds
Assists
Steals
Blocks
Turnovers
Fouls
2-pt field goals made
2-pt field goals attempted
2-pt field goals, %
Fouls drawn
Games played
Defensive rating
Offensive rating
Net rating
Opponent's field goals attempted
Opponent's field goals made
Opponent's field goals, %
Contested field goals made
Contested field goals, %
Uncontested field goals made
Uncontested field goals, %
Number of player's possessions
PnR Handlers attempted
PnR Handlers made
Catch and shoot attempted
Catch and shoot made
Screens off attempted
Isolations attempted
Isolations made
Hand off attempted
Hand off made
Cuts attempted
Cuts made
Transitions attempted
Transitions made
Plus/Minus
Usage Percentage
Deflections
Right drives
Right drives made
Right drives made

Unnamed: 0,id,name,value_sum,value_avg
0,0,InStat Index,2613.70,217.81
1,2,Seconds on the court,23475.70,1956.31
2,3,Points,165.00,13.75
3,4,Field goals made,55.00,4.58
4,5,Field goals attempted,130.00,10.83
...,...,...,...,...
115,544,Steals to turnovers,0.81,
116,545,Draw foul rate,0.73,
117,546,True shooting percentage,55.07,
118,547,Effective field goal percentage,46.92,


In [None]:
def load_and_concat_shots_from_log(fp: str, labels, df1=None):
    """Extract the shot rows of one game log and optionally stack them onto ``df1``.

    The log is read as a ``;``-separated CSV whose columns are named by
    ``labels``. A row is kept when its ``action_name`` contains ``-`` or ``+``
    and does not contain ``1``; rows with a missing ``action_name`` are
    dropped.

    Args:
        fp: Path of the game-log CSV.
        labels: Column names to assign to the log. Must include
            ``action_name`` and every raw column selected below.
        df1: Optional frame; when given, it is appended below the new rows.

    Returns:
        DataFrame with ``shot_outcome`` (0 when ``action_name`` contains ``+``,
        otherwise 1), ``attempt_type`` (first character of ``action_name``)
        and the selected raw columns.
    """
    feature_columns = [
        'shot_outcome',
        'attempt_type',
        'player_id',
        'team_id',
        'opponent_id',
        'opponent_team_id',
        'teammate_id',
        'possession_team_id',
        'playtype',
        'shot_type',
        'pos_x',
        'pos_y',
        'half',
    ]

    # Warnings raised while parsing are still silenced. The rest of the
    # function no longer needs a blanket filter because the filtered rows are
    # copied explicitly below.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        temp_df = pd.read_csv(fp, sep=';', names=labels)

    action = temp_df['action_name']
    # Plain substring tests (regex=False) avoid the invalid '\+' escape.
    # na=False makes a missing action_name count as "no match" instead of
    # leaving NaN in the mask, which made `~` raise on object-dtype masks.
    has_plus = action.str.contains('+', regex=False, na=False)
    has_minus = action.str.contains('-', regex=False, na=False)
    has_one = action.str.contains('1', regex=False, na=False)
    # presumably the '1' exclusion removes free throws — TODO confirm
    mask = (has_minus | has_plus) & ~has_one

    # Copy so the new columns are written to an independent frame, not a slice.
    shots = temp_df.loc[mask].copy()
    shots['attempt_type'] = shots['action_name'].str[0]
    # NOTE(review): '+' is encoded as 0 and everything else as 1 — confirm
    # this is the intended label direction.
    shots['shot_outcome'] = np.where(has_plus[mask], 0, 1)

    features = shots[feature_columns]
    return pd.concat([features, df1], axis=0) if df1 is not None else features

def generate_features(log_dir=r"E:\Desktop\hudl-logs\DATA\game_logs",
                      header_fp=r'logs\15-16\707.csv',
                      max_logs=1000):
    """Build one shot-feature table from the matched game logs.

    Args:
        log_dir: Folder passed to ``get_all_game_log_paths``.
        header_fp: ``;``-separated CSV whose header row supplies the column
            labels applied to every game log.
        max_logs: Maximum number of logs to load, taken in the order returned
            by ``get_all_game_log_paths``; ``None`` loads all of them.

    Returns:
        DataFrame with the rows of ``load_and_concat_shots_from_log`` for each
        log, concatenated along the row axis.

    Raises:
        ValueError: If no game log is found under ``log_dir``.
    """
    # get_all_game_log_paths returns a dict (id -> path). A dict cannot be
    # sliced, and iterating it yields ids rather than paths, so take the
    # values explicitly before limiting.
    log_paths = list(get_all_game_log_paths(log_dir).values())[:max_logs]
    if not log_paths:
        raise ValueError(f"No game logs returned by get_all_game_log_paths for {log_dir!r}")

    # Only the header is needed; nrows=0 avoids parsing the whole file.
    labels = pd.read_csv(header_fp, delimiter=';', nrows=0).columns

    all_features = [
        load_and_concat_shots_from_log(path, labels) for path in tqdm(log_paths)
    ]
    return pd.concat(all_features, axis=0)

# Build the shot-feature table used by the following cells.
features = generate_features()


In [None]:
# Drop rows missing any of the four model inputs. Reassigning instead of using
# inplace=True keeps the step a plain expression and avoids mutating a frame
# that other cells may still reference.
features = features.dropna(subset=['playtype', 'shot_type', 'pos_x', 'pos_y'])

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Split the shot table into model inputs and the binary target.
X = features.drop('shot_outcome', axis=1)
y = features['shot_outcome']

categorical_features = ['playtype', 'shot_type']
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the new spelling first and fall back for
# older installations.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', onehot_encoder)])

numeric_features = ['pos_x', 'pos_y']  # add more numerical features if available
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Only the columns listed above reach the model; ColumnTransformer drops the
# remaining columns of X by default.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the preprocessing on the training split only, then apply it to the test split.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
X

In [None]:
# Shape of the preprocessed training matrix; its second entry is the model's input width.
X_train.shape

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Fully-connected binary classifier: ReLU layers of 10, 8, 4 and 2 units
# followed by a single sigmoid output unit.
model = Sequential([
    Dense(10, input_dim=X_train.shape[1], activation='relu'),  # first layer declares the input width
    Dense(8, activation='relu'),
    Dense(4, activation='relu'),
    Dense(2, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# fit
# Train for 100 epochs with batches of 256. The test split is passed as
# validation data, so the val_* entries of `history` are measured on
# X_test / y_test.
history = model.fit(X_train, y_train, epochs=100, batch_size=256, validation_data=(X_test, y_test))

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation curves, one panel per metric (accuracy left, loss right).
plt.figure(figsize=(14, 5))

panels = [
    # (training key, validation key, panel title, y-axis label)
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
]
for position, (train_key, val_key, panel_title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')

plt.show()

In [None]:
# Fraction of targets equal to 1 (y holds 0/1 labels). The vectorised mean
# replaces a Python-level sum over the NumPy array and yields NaN instead of
# raising ZeroDivisionError when y is empty.
y.mean()

In [None]:
import pandas as pd

fp = r"162144_Brooklyn Nets - Denver Nuggets.csv"
fp_1 = r'logs\15-16\707.csv'
# The reference log supplies the column labels for the header-less game log.
df_1 = pd.read_csv(fp_1, delimiter=';')
labels = df_1.columns

# Raw comma-delimited read, kept so `df` stays available to any later cell.
df = pd.read_csv(fp)

# Parse the ';'-separated log directly, the same way
# load_and_concat_shots_from_log reads game logs. Compared with splitting each
# row of `df` by hand, this keeps the first line as data instead of consuming
# it as a header, is not broken by commas inside a field, and avoids growing
# the frame one row at a time. Column dtypes are inferred by pandas rather than
# left as strings.
new_df = pd.read_csv(fp, sep=';', names=labels)
new_df