- **Name:** Claude Fried
- **Pace:** Part-time
- **Scheduled project review date/time:** 
- **Instructor name:** James Irving
- **Blog post URL:** 

### Imports

In [1]:
import os
import re
import pickle
import bz2
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import sparse
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import (classification_report, 
                             balanced_accuracy_score, 
                             log_loss)
from sklearn.linear_model import RidgeClassifier, SGDClassifier
from sklearn.tree import DecisionTreeClassifier

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import Sequential

import baseball_support as bsb
from league_player_classes import *

In [2]:
%matplotlib inline
plt.style.use(['ggplot', 'seaborn-talk'])
pd.set_option('display.max_columns', None)

# Set colors for plotting.
COLORS = {
    'blue': '#1f77b4',
    'green': '#2eb41f',
    'yellow': '#b4b21f',
    'red': '#b41f1f',
    'purple': '#b41f94',
    'grey': '#7d7d7d'
}

# Set random seed.
np.random.seed(2020)
tf.random.set_seed(2020)
tf.compat.v1.set_random_seed(2020)

# Check for device.
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # Prevents using GPU.
print(bsb.headerize(tf.test.gpu_device_name()))

********************************************************************************
*                                /device:GPU:0                                 *
********************************************************************************


In [3]:
# Load the X/y preprocessors and the train/test data through the project
# helper; it must return exactly these six objects, in this order.
[X_preprocessor, y_preprocessor,
 X_train, y_train,
 X_test, y_test] = bsb.load_modeling_tools()

# Display the shape of each split as a quick sanity check.
tuple(split.shape for split in (X_train, y_train, X_test, y_test))

Loading...
Loaded: X_test_processed
Loaded: X_train_processed
Loaded: y_test_processed
Loaded: y_train_processed
Loaded: X_preprocessor
Loaded: y_preprocessor
Complete!


((5897770, 10067), (5897770,), (564198, 10067), (564198,))

# Model

## scikit-learn

### SGD (Logistic Regression)

In [None]:
# NOTE(review): this cell is disabled. Before re-enabling it:
#   * scikit-learn 1.1 renamed loss='log' to loss='log_loss' (the old
#     spelling was removed in 1.3);
#   * random_state=2021 differs from the 2020 seed set in the configuration
#     cell -- confirm that this is intentional.
# log_sgd = SGDClassifier(loss='log',
#                         random_state=2021)
# log_sgd.fit(X_train, y_train)

In [None]:
# NOTE(review): disabled. This cell needs a fitted `log_sgd`, which in the
# visible notebook is only created by the commented-out cell above.
# bsb.print_metrics(X_train, 
#                   y_train, 
#                   classifier=log_sgd, 
#                   target_names=y_preprocessor.classes_, 
#                   heading='SGD: Training Data')
# bsb.print_metrics(X_test, 
#                   y_test, 
#                   classifier=log_sgd, 
#                   target_names=y_preprocessor.classes_, 
#                   heading='SGD: Test Data')

### Decision Tree

In [None]:
# NOTE(review): this cell is disabled. Before re-enabling it:
#   * max_features='auto' was deprecated in scikit-learn 1.1 and removed in
#     1.3; for DecisionTreeClassifier it was equivalent to 'sqrt';
#   * random_state=2021 differs from the 2020 seed set in the configuration
#     cell -- confirm that this is intentional.
# dtc = DecisionTreeClassifier(max_features='auto',
#                              max_depth=50,
#                              random_state=2021)
# dtc.fit(X_train, y_train)

In [None]:
# NOTE(review): disabled. This cell needs a fitted `dtc`, which in the
# visible notebook is only created by the commented-out cell above.
# bsb.print_metrics(X_train, 
#                   y_train, 
#                   classifier=dtc, 
#                   target_names=y_preprocessor.classes_, 
#                   heading='DTC: Training Data')
# bsb.print_metrics(X_test, 
#                   y_test, 
#                   classifier=dtc, 
#                   target_names=y_preprocessor.classes_, 
#                   heading='DTC: Test Data')

## TensorFlow

In [None]:
def build_model(X_data_shape,
                y_data_shape,
                activation='relu',
                output_activation='softmax',
                optimizer='adam',
                loss='categorical_crossentropy',
                hidden_units=None,
                metrics=None):
    """Return a compiled three-layer dense `Sequential` model.

    The network is
    ``Dense(n_features) -> Dense(hidden_units) -> Dense(n_outputs)``.

    Parameters
    ----------
    X_data_shape : sequence of int
        Shape of the feature data. Index 1 is used both as the input
        dimension and as the width of the first dense layer.
    y_data_shape : sequence of int
        Shape of the target data. Index 1 is used as the number of output
        units, so the targets must be 2-D (e.g. one-hot encoded).
    activation : str or callable, default 'relu'
        Activation applied by the first two dense layers.
    output_activation : str or callable, default 'softmax'
        Activation applied by the output layer.
    optimizer : str or optimizer instance, default 'adam'
        Forwarded to `model.compile`.
    loss : str or loss instance, default 'categorical_crossentropy'
        Forwarded to `model.compile`.
    hidden_units : int, optional
        Width of the second dense layer. When omitted it is the floored
        mean of the input and output widths, as in the original design.
    metrics : list, optional
        Forwarded to `model.compile`. When omitted only the loss is
        tracked during training.

    Returns
    -------
    tensorflow.keras.Sequential
        The compiled, untrained model.
    """
    n_features = X_data_shape[1]
    n_outputs = y_data_shape[1]
    if hidden_units is None:
        # Default: halfway between the input and output widths.
        hidden_units = (n_features + n_outputs) // 2

    model = Sequential([
        # Input
        layers.Dense(n_features,
                     activation=activation,
                     input_shape=(n_features,)),
        # Hidden layer
        layers.Dense(hidden_units, activation=activation),
        # Output
        layers.Dense(n_outputs, activation=output_activation),
    ])

    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    return model

In [None]:
# Process y columns for tensorflow model.
ohe = OneHotEncoder(categories='auto', sparse=False)
ohe_y_train = ohe.fit_transform(
    y_preprocessor.inverse_transform(y_train).reshape(-1, 1)
)
ohe_y_test = ohe.transform(
    y_preprocessor.inverse_transform(y_test).reshape(-1, 1)
)

ohe_y_train.shape, ohe_y_test.shape

In [None]:
# Size the network from the feature matrix and the one-hot targets, then
# print its layer-by-layer summary.
model = build_model(X_data_shape=X_train.shape,
                    y_data_shape=ohe_y_train.shape)
model.summary()

In [None]:
# Train the network and keep the per-epoch history.
# NOTE(review): the test split doubles as the validation data here.
history = model.fit(
    x=X_train,
    y=ohe_y_train,
    validation_data=(X_test, ohe_y_test),
    epochs=5,
    batch_size=16000,
)

# Interpret