In [22]:
import pandas as pd
from pandasql import sqldf
from sqlalchemy import create_engine, text

import numpy as np
from numpy import dtype


In [23]:
import os
import sys

sys.path.append(os.path.abspath("../src"))

In [24]:
from src import *

In [25]:
db = database_loader.DatabaseLoader(get_config('connection_string'))
DEBUG = False
SCHEMA = 'controls'

In [26]:
stats_df = db.read_table(table_name="team_weekly_stats", schema="controls")
stats_df.head()

Unnamed: 0,season,week,team,rush_yards,efficiency,percent_attempts_gte_eight_defenders,avg_time_to_los,rush_attempts,avg_rush_yards,rush_touchdowns,...,receiving_2pt_conversions,special_teams_tds,opponent,team_score,team_coach,opposing_team,opposing_score,opposing_coach,spread,ops_load_timestamp
0,2022,1,ARI,26,5.930769,10.0,2.445889,10,2.6,1,...,1,0,KC,21,Kliff Kingsbury,KC,44,Andy Reid,-23,2023-07-07 07:52:13.473767
1,2022,3,ARI,39,5.421026,7.692308,2.580545,13,3.0,0,...,0,0,LA,12,Kliff Kingsbury,LA,20,Sean McVay,-8,2023-07-07 07:52:13.473767
2,2022,4,ARI,55,3.860545,40.0,2.565267,15,3.666667,0,...,0,0,CAR,26,Kliff Kingsbury,CAR,16,Matt Rhule,10,2023-07-07 07:52:13.473767
3,2022,6,ARI,37,5.610541,6.666667,2.708,15,2.466667,0,...,0,0,SEA,9,Kliff Kingsbury,SEA,19,Pete Carroll,-10,2023-07-07 07:52:13.473767
4,2022,7,ARI,92,2.619457,8.333333,2.810818,12,7.666667,1,...,1,0,NO,42,Kliff Kingsbury,NO,34,Dennis Allen,8,2023-07-07 07:52:13.473767


#### drop columns we dont need

In [27]:

stats_df.drop(columns=[
    'ops_load_timestamp', 'opponent', 'team_coach', 'opposing_coach'
], inplace=True)


#### dependent_columns_df: split out the potential y dependent columns

In [28]:
stats_shape = stats_df.shape
print('stats_df      ', stats_shape)

stats_df       (3569, 65)


In [29]:
set_ready_cols = {'season', 'week', 'opposing_score', 'team_score', 'spread'}
ready_columns_df = stats_df[list(set_ready_cols)]
stats_df = stats_df.drop(columns=set_ready_cols)

print('ready_columns_df', ready_columns_df.shape)
print('stats_df        ', stats_df.shape)

ready_columns_df (3569, 5)
stats_df         (3569, 60)


#### character_columns_df :  split out the character columns

In [30]:
character_columns_df = stats_df.select_dtypes(include='object')
stats_df = stats_df.drop(columns=character_columns_df.columns)
set_character_columns = set(character_columns_df.columns)

print('character_columns', character_columns_df.shape)
print('stats_df      ', stats_df.shape)

character_columns (3569, 2)
stats_df       (3569, 58)


#### Blah Blah

In [31]:
from collections import defaultdict

categories = defaultdict(lambda: [])

for column in stats_df.columns:
    n = len(stats_df[column].value_counts())
    if n < 35:
        categories[stats_df[column].dtype.name].append(column)

for k, v in categories.items():
    print("---")
    print(f"{k} columns : {v}")

assert 'int64' in categories and len(categories) == 1


---
int64 columns : ['rush_attempts', 'rush_touchdowns', 'pass_touchdowns', 'interceptions', 'fumble', 'own_kickoff_recovery', 'safety', 'qb_hit', 'touchdown', 'passing_tds', 'ps_interceptions', 'sacks', 'sack_fumbles', 'sack_fumbles_lost', 'passing_first_downs', 'passing_2pt_conversions', 'rushing_tds', 'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_2pt_conversions', 'receiving_tds', 'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_first_downs', 'receiving_2pt_conversions', 'special_teams_tds']


In [32]:
discrete_columns_df = stats_df[categories['int64']]
stats_df = stats_df.drop(columns=discrete_columns_df.columns)
set_discrete_numeric = set(discrete_columns_df.columns)

print('discrete_columns', discrete_columns_df.shape)
print('stats_df      ', stats_df.shape)

discrete_columns (3569, 27)
stats_df       (3569, 31)


In [33]:
# all numeric columns and their values
numeric_columns_df = stats_df.select_dtypes(include='number')  ## review whats left over

# this set should all be floats - find any int64's and convert them
dtype_mapping = {col: 'float' for col in numeric_columns_df.select_dtypes(include='int64')}
numeric_columns_df = numeric_columns_df.astype(dtype_mapping)

stats_df = stats_df.drop(columns=numeric_columns_df.columns)

print('numeric_columns_df', numeric_columns_df.shape)
print('stats_df      ', stats_df.shape)

numeric_columns_df (3569, 31)
stats_df       (3569, 0)


### encode discrete columns

In [34]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}


def label_encode(labels_df: pd.DataFrame, labels: dict):
    print("shape before labels", labels_df.shape, end="; ")

    for col in labels_df.columns:
        encoder = LabelEncoder()
        labels_df[col] = encoder.fit_transform(labels_df[col])
        labels[col] = dict(df=labels_df, encoder=encoder)

    print(f"after labels", labels_df.shape)
    return labels_df, labels


In [35]:
discrete_columns_df, label_encoders = label_encode(labels_df=discrete_columns_df, labels=label_encoders)
len(label_encoders)

shape before labels (3569, 27); after labels (3569, 27)


27

### encode character_columns_df columns

In [36]:
character_columns_df, label_encoders = label_encode(labels_df=character_columns_df, labels=label_encoders)
len(label_encoders)

shape before labels (3569, 2); after labels (3569, 2)


29

### encode dependent columns

In [37]:
# two ways we can go
#    1) multi-column dependency, or
#    2) just get the difference between the win and loss column - this is the 'spread' column

# because in this step we are only trying to get the important dimensions for further evaluation, we'll use the spread
# and we won't use encoding for this set - it is a set of integers both negative and positive where negative is 'lower' than positive

dependent_columns_df = ready_columns_df[['spread']]
ready_columns_df.drop(columns=['spread', 'team_score', 'opposing_score'], inplace=True)

dropped_cols = 2


In [41]:

datasets = {
    'dependent_columns_df': dependent_columns_df.shape,
    'discrete_columns_df': discrete_columns_df.shape,
    'character_columns_df': character_columns_df.shape,
    'numeric_columns_df': numeric_columns_df.shape,
    'ready_columns_df': ready_columns_df.shape
}

col_count = 0
row_count = stats_df.shape[0]
for key, shape in datasets.items():
    col_count += shape[1]
    assert shape[0] == row_count
    print(key, shape)

assert stats_df.shape[1] == 0
assert stats_shape[1] == col_count + dropped_cols


dependent_columns_df (3569, 1)
discrete_columns_df (3569, 27)
character_columns_df (3569, 2)
numeric_columns_df (3569, 31)
ready_columns_df (3569, 2)


In [39]:
original_column_names = {}

for col, labels_dict in label_encoders.items():
    df = labels_dict['df']
    label_encoder = labels_dict['encoder']
    original_labels = label_encoder.inverse_transform(df[col])
    original_column_names[col] = original_labels


In [43]:
datasets.values()

dict_values([(3569, 1), (3569, 27), (3569, 2), (3569, 31), (3569, 2)])

In [44]:
dataframes = [
    ready_columns_df,
    dependent_columns_df,
    discrete_columns_df,
    character_columns_df,
    numeric_columns_df
]

merge_df = pd.concat(dataframes, axis=1)

assert stats_shape[1] == merge_df.shape[1] + dropped_cols
assert stats_shape[0] == merge_df.shape[0]

merge_df.shape

(3569, 63)

In [48]:
merge_df

Unnamed: 0,week,season,spread,rush_attempts,rush_touchdowns,pass_touchdowns,interceptions,fumble,own_kickoff_recovery,safety,...,passing_air_yards,passing_yards_after_catch,avg_dakota,carries,rushing_yards,receptions,targets,receiving_yards,receiving_air_yards,receiving_yards_after_catch
0,1,2022,-23,0,1,2,0,6,0,0,...,267.0,115.0,0.007858,22.0,103.0,24.0,36.0,205.0,266.0,115.0
1,3,2022,-8,3,0,0,0,2,0,0,...,356.0,166.0,-0.000555,21.0,70.0,37.0,55.0,314.0,332.0,166.0
2,4,2022,10,5,0,2,1,3,0,0,...,180.0,87.0,0.011065,37.0,132.0,23.0,31.0,207.0,182.0,87.0
3,6,2022,-10,5,0,0,1,4,0,0,...,333.0,114.0,0.003919,28.0,144.0,23.0,36.0,222.0,340.0,114.0
4,7,2022,8,2,1,1,0,2,0,0,...,216.0,94.0,0.013561,29.0,137.0,20.0,29.0,204.0,216.0,94.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3564,12,2016,-5,4,0,3,0,1,0,0,...,393.0,161.0,0.025579,19.0,56.0,41.0,53.0,449.0,393.0,161.0
3565,13,2016,-8,4,0,1,1,3,0,0,...,286.0,120.0,0.006349,18.0,87.0,21.0,34.0,271.0,251.0,120.0
3566,14,2016,5,6,1,2,1,2,0,0,...,234.0,59.0,0.029430,23.0,107.0,14.0,21.0,234.0,234.0,59.0
3567,16,2016,20,9,0,1,0,0,0,0,...,349.0,107.0,0.026037,35.0,208.0,18.0,26.0,270.0,333.0,107.0


2