### Goal
- prepare stats data for machine learning

### Steps
- Review dtypes, and prepare continuous numeric, discrete numeric, categorical, character and identity columns
Input: nfl_weekly_stats.parquet
Work:
    - separate continuous numeric, discrete numeric, categorical, character and identity columns
    - label encode
    - merge all back together
Output:
    - nfl_ml_weekly_stats.parquet - cleaned, labeled data

In [1]:
import os


In [2]:
from src import *

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [3]:

# Notebook configuration: database handle, flags, and the location of the input parquet file.
# NOTE(review): db, DEBUG and SCHEMA are not referenced again in the visible cells -- confirm they are still needed.
# database_loader and get_config are presumably provided by `from src import *` -- verify.
db = database_loader.DatabaseLoader(get_config('connection_string'))
DEBUG = False
SCHEMA = 'controls'

# Base name (without extension) of the input file inside the configured data directory.
file_name = "nfl_weekly_stats"
data_directory = get_config('data_directory')

input_path = os.path.join(data_directory,  f"{file_name}.parquet")
# Stop here if the input path does not exist, before any work is done.
assert  os.path.exists(input_path)


In [4]:
# Load the weekly stats from the input parquet file and preview the first rows.
stats_df = pd.read_parquet(input_path)
stats_df.head()

Unnamed: 0_level_0,season,week,team,rush_yards,efficiency,percent_attempts_gte_eight_defenders,avg_time_to_los,rush_attempts,avg_rush_yards,rush_touchdowns,...,receiving_first_downs,receiving_2pt_conversions,special_teams_tds,opponent,team_score,team_coach,opposing_team,opposing_score,opposing_coach,spread
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2022,1,ARI,26,5.930769,10.0,2.445889,10,2.6,1,...,12,1,0,KC,21,Kliff Kingsbury,KC,44,Andy Reid,-23
1,2022,3,ARI,39,5.421026,7.692308,2.580545,13,3.0,0,...,15,0,0,LA,12,Kliff Kingsbury,LA,20,Sean McVay,-8
2,2022,4,ARI,55,3.860545,40.0,2.565267,15,3.666667,0,...,10,0,0,CAR,26,Kliff Kingsbury,CAR,16,Matt Rhule,10
3,2022,6,ARI,37,5.610541,6.666667,2.708,15,2.466667,0,...,10,0,0,SEA,9,Kliff Kingsbury,SEA,19,Pete Carroll,-10
4,2022,7,ARI,92,2.619457,8.333333,2.810818,12,7.666667,1,...,9,1,0,NO,42,Kliff Kingsbury,NO,34,Dennis Allen,8


#### drop columns we don't need

In [5]:

# Columns that are removed up front and never used again in this notebook.
unneeded_columns = ['opponent', 'team_coach', 'opposing_coach']
stats_df = stats_df.drop(columns=unneeded_columns)


#### ready_columns_df: split out the identity columns and the potential y (dependent) columns

In [6]:
# Baseline shape, captured before the frame is split apart; later cells assert
# against it to verify that no rows or columns are lost by splitting and re-merging.
stats_shape = stats_df.shape
print('stats_df      ', stats_shape)

stats_df       (3569, 65)


In [7]:
# Columns that need no further preparation: identity columns plus the score/spread columns.
# Kept as an ordered list on purpose: indexing with list(<set>) made the column order of
# ready_columns_df -- and therefore of the merged output file -- depend on string hash
# randomisation, so it could differ from one kernel session to the next.
ready_cols = ['season', 'week', 'team', 'team_score', 'opposing_score', 'spread']
set_ready_cols = set(ready_cols)  # retained under its original name

# .copy() makes ready_columns_df independent of stats_df, so dropping columns from it
# later does not operate on a slice of another frame.
ready_columns_df = stats_df[ready_cols].copy()
stats_df = stats_df.drop(columns=ready_cols)

print('ready_columns_df', ready_columns_df.shape)
print('stats_df        ', stats_df.shape)

ready_columns_df (3569, 6)
stats_df         (3569, 59)


#### character_columns_df :  split out the character columns

In [8]:
# Move every object-dtype column out of stats_df into its own frame.
character_columns_df = stats_df.select_dtypes(include='object')
character_column_names = character_columns_df.columns.tolist()
set_character_columns = set(character_column_names)
stats_df = stats_df.drop(columns=character_column_names)

print('character_columns', character_columns_df.shape)
print('stats_df      ', stats_df.shape)

character_columns (3569, 1)
stats_df       (3569, 58)


#### iterate over categories until we have the right set of categorical columns

In [9]:
from collections import defaultdict

# Group the low-cardinality columns (fewer than 35 distinct non-null values)
# by the name of their dtype.
categories = defaultdict(list)

for column_name, column_dtype in stats_df.dtypes.items():
    distinct_values = stats_df[column_name].nunique()
    if distinct_values < 35:
        categories[column_dtype.name].append(column_name)

for dtype_name, column_names in categories.items():
    print("---")
    print(f"{dtype_name} columns : {column_names}")

# Every low-cardinality column that is left must be an int64 column.
assert 'int64' in categories and len(categories) == 1


---
int64 columns : ['rush_attempts', 'rush_touchdowns', 'pass_touchdowns', 'interceptions', 'fumble', 'own_kickoff_recovery', 'safety', 'qb_hit', 'touchdown', 'passing_tds', 'ps_interceptions', 'sacks', 'sack_fumbles', 'sack_fumbles_lost', 'passing_first_downs', 'passing_2pt_conversions', 'rushing_tds', 'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_2pt_conversions', 'receiving_tds', 'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_first_downs', 'receiving_2pt_conversions', 'special_teams_tds']


### discrete_columns_df:  split out discrete columns

In [10]:
# The low-cardinality int64 columns found above become the discrete numeric columns.
discrete_column_names = categories['int64']
discrete_columns_df = stats_df[discrete_column_names]
set_discrete_numeric = set(discrete_column_names)
stats_df = stats_df.drop(columns=discrete_column_names)

print('discrete_columns', discrete_columns_df.shape)
print('stats_df      ', stats_df.shape)

discrete_columns (3569, 27)
stats_df       (3569, 31)


### numeric_columns_df: split out numeric columns

In [11]:
# all numeric columns and their values
# Whatever numeric columns are still left in stats_df form the continuous set.
numeric_columns_df = stats_df.select_dtypes(include='number')

# The continuous set should hold floats only: cast any remaining int64 column to float.
int64_column_names = numeric_columns_df.select_dtypes(include='int64').columns
dtype_mapping = dict.fromkeys(int64_column_names, 'float')
numeric_columns_df = numeric_columns_df.astype(dtype_mapping)

stats_df = stats_df.drop(columns=numeric_columns_df.columns)

print('numeric_columns_df', numeric_columns_df.shape)
print('stats_df      ', stats_df.shape)

numeric_columns_df (3569, 31)
stats_df       (3569, 0)


### encode discrete columns

In [12]:
from sklearn.preprocessing import LabelEncoder

# Registry of fitted encoders, keyed by column name; filled in by label_encode().
label_encoders = {}


def label_encode(labels_df: pd.DataFrame, labels: dict):
    """Label-encode every column of ``labels_df`` with its own ``LabelEncoder``.

    The encoding is applied to a copy, so the frame passed in by the caller is
    left untouched; callers must use the returned frame.

    Args:
        labels_df: frame whose columns are all to be encoded.
        labels: registry updated in place; for each encoded column it receives
            ``{'df': <encoded frame>, 'encoder': <fitted LabelEncoder>}``.

    Returns:
        A ``(encoded_frame, labels)`` tuple, where ``labels`` is the same dict
        object that was passed in.
    """
    print("shape before labels", labels_df.shape, end="; ")

    # Callers pass column subsets of another frame; assigning into those directly
    # mutates the caller's data and can raise SettingWithCopyWarning.
    labels_df = labels_df.copy()

    for col in labels_df.columns:
        encoder = LabelEncoder()
        labels_df[col] = encoder.fit_transform(labels_df[col])
        # Keep the encoded frame next to the encoder so the original values can
        # be recovered later with encoder.inverse_transform(df[col]).
        labels[col] = dict(df=labels_df, encoder=encoder)

    print("after labels", labels_df.shape)
    return labels_df, labels


In [13]:
# Encode the discrete numeric columns; label_encoders gains one entry per encoded column.
discrete_columns_df, label_encoders = label_encode(labels_df=discrete_columns_df, labels=label_encoders)
len(label_encoders)

shape before labels (3569, 27); after labels (3569, 27)


27

### encode character_columns_df columns

In [14]:
# Encode the character columns into the same registry, so its size grows by one per character column.
character_columns_df, label_encoders = label_encode(labels_df=character_columns_df, labels=label_encoders)
len(label_encoders)

shape before labels (3569, 1); after labels (3569, 1)


28

### split dependent columns and ready columns that need no prep

In [15]:
# two ways we can go
#    1) multi-column dependency, or
#    2) just use the difference between the team score and the opposing score - this is the 'spread' column

# because in this step we are only trying to get the important dimensions for further evaluation, we'll use the spread
# and we won't use encoding for this set - it is a set of integers both negative and positive where negative is 'lower' than positive

# The raw score columns are discarded outright, whereas 'spread' only moves
# from ready_columns_df to dependent_columns_df.
score_columns = ['team_score', 'opposing_score']

dependent_columns_df = ready_columns_df[['spread']].copy()
# Rebind instead of drop(..., inplace=True): ready_columns_df was produced by column
# selection, and in-place edits on such a frame can raise SettingWithCopyWarning.
ready_columns_df = ready_columns_df.drop(columns=['spread', *score_columns])

# Number of columns removed outright; derived from the list so the column-count
# assertions in the later cells stay correct if the list changes.
dropped_cols = len(score_columns)


### review dataframes

In [16]:

# Every partial frame that will be merged back together, by name.
frames_by_name = {
    'dependent_columns_df': dependent_columns_df,
    'discrete_columns_df': discrete_columns_df,
    'character_columns_df': character_columns_df,
    'numeric_columns_df': numeric_columns_df,
    'ready_columns_df': ready_columns_df,
}
datasets = {frame_name: frame.shape for frame_name, frame in frames_by_name.items()}

row_count = stats_df.shape[0]
col_count = 0
for frame_name, (n_rows, n_cols) in datasets.items():
    col_count += n_cols
    # each partial frame must still have every row
    assert n_rows == row_count
    print(frame_name, (n_rows, n_cols))

# stats_df must be fully consumed, and every original column must be accounted for
assert stats_df.shape[1] == 0
assert stats_shape[1] == col_count + dropped_cols


dependent_columns_df (3569, 1)
discrete_columns_df (3569, 27)
character_columns_df (3569, 1)
numeric_columns_df (3569, 31)
ready_columns_df (3569, 3)


### save original column values from labeled columns

In [17]:
# For each encoded column, map the encoded values back to the original values.
# NOTE(review): the result is not used again in the visible cells -- confirm whether it should be persisted.
original_column_names = {
    col: entry['encoder'].inverse_transform(entry['df'][col])
    for col, entry in label_encoders.items()
}


In [18]:
# Re-assemble the prepared pieces side by side (column-wise).
dataframes = [ready_columns_df, dependent_columns_df, discrete_columns_df]
dataframes += [character_columns_df, numeric_columns_df]

merge_df = pd.concat(dataframes, axis=1)

# Same rows as the original frame, and the same columns minus the ones dropped outright.
expected_shape = (stats_shape[0], stats_shape[1] - dropped_cols)
assert merge_df.shape == expected_shape

merge_df.shape

(3569, 63)

In [19]:
%%time

# Base name (without extension) of the output file.
file_name = "nfl_ml_weekly_stats"

data_directory = get_config('data_directory')
# exist_ok=True creates the directory only when it is missing, without a separate
# exists() check that could go stale between the check and the creation.
os.makedirs(data_directory, exist_ok=True)

output_path = os.path.join(data_directory, f"{file_name}.parquet")
merge_df.to_parquet(output_path, engine='fastparquet', compression='snappy')

CPU times: user 31.3 ms, sys: 8.38 ms, total: 39.7 ms
Wall time: 71.4 ms
