### Goal
- prepare stats data for machine learning

### Steps
- review dtypes, and prepare continuous numeric, discrete numeric, categorical, character and identity columns
Input: nfl_weekly_offense.parquet
Work:
    - separate continuous numeric, discrete numeric, categorical, character and identity columns
    - label encode
    - merge all back together
Output:
    - nfl_weekly_offense_ml.parquet - cleaned, labeled data

In [1]:
import os
import sys
# Make the sibling "src" directory importable. The relative path is resolved
# against the current working directory, so this assumes the notebook is run
# from its own folder — TODO confirm.
sys.path.append(os.path.abspath("../src"))


In [2]:
from src import *


Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [3]:
# Build the "pbp_logger" logger through the project's logging configuration helper.
# NOTE(review): `configs` and `logging` are not imported explicitly in this notebook;
# presumably both arrive via `from src import *` — confirm.
logger = configs.configure_logging("pbp_logger")
# Emit INFO-level messages and above.
logger.setLevel(logging.INFO)

#### load stats data

In [4]:

# Base name (without extension) of the weekly stats file to load; the save cell
# reuses it to derive the output file name.
input_file_name = "nfl_weekly_offense"
# Name of the column that will receive the computed power score.
power_column = "offense_power"

data_directory = get_config('data_directory')
input_path = os.path.join(data_directory, f"{input_file_name}.parquet")
# Fail early and report the expected path when the input file is missing.
assert os.path.exists(input_path), f"input file not found: {input_path}"

original_stats_df = pd.read_parquet(input_path)
original_stats_df.head()

Unnamed: 0_level_0,team,season,week,receiving_yards_after_catch,ps_completions,receiving_yards,targets,rushing_yards,rushing_tds,special_teams_tds,...,player_jersey_number,percent_attempts_gte_eight_defenders,team_score,team_coach,opposing_team,opposing_score,opposing_coach,spread,count,win
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,ARI,2016,1,105.0,24.0,271.0,37.0,92.0,1.0,0.0,...,31.0,43.75,21,Bruce Arians,NE,23,Bill Belichick,-2.0,1,loss
1,ARI,2016,2,157.0,18.0,315.0,34.0,105.0,1.0,0.0,...,23.0,50.0,40,Bruce Arians,TB,7,Dirk Koetter,33.0,1,win
2,ARI,2016,3,90.0,26.0,287.0,50.0,88.0,2.0,0.0,...,31.0,26.315789,18,Bruce Arians,BUF,33,Rex Ryan,-15.0,1,loss
3,ARI,2016,4,116.0,27.0,325.0,45.0,118.0,0.0,0.0,...,31.0,29.411765,13,Bruce Arians,LA,17,Jeff Fisher,-4.0,1,loss
4,ARI,2016,5,49.0,11.0,124.0,27.0,172.0,2.0,0.0,...,31.0,22.222222,33,Bruce Arians,SF,21,Chip Kelly,12.0,1,win


#### drop columns we don't need, encode the target, and scale the features

In [5]:
from src.build_power_scores import prepare_power_data

# Build the feature frame consumed by the correlation and feature-importance cells.
# According to the helper's log output, it encodes the win/loss target, creates a
# features dataframe for feature selection, and scales all features.
features_df = prepare_power_data(original_stats_df)


2023-07-18 06:39:38,901 - INFO - encode the target win/loss column
2023-07-18 06:39:38,902 - INFO - create a features dataframe for feature selection ...
2023-07-18 06:39:38,903 - INFO - scale all features  ...


#### scale numeric data for ml feature selection

#### review correlations

In [None]:
from src.utils_eda import plot_heatmap

# Plot the heatmap for the stats frame, excluding the bookkeeping columns.
# The full frame is passed in and plot_heatmap is told which columns to drop;
# presumably the helper restricts itself to numeric columns — TODO confirm.
# (A separately built numeric-only frame used to be computed here but was never
# passed to the plot, so it has been removed.)
plot_heatmap(original_stats_df, drop_columns=['season', 'week',  'count'])

### best correlations to target

In [None]:
from src.utils_eda import correlate_to_target, plot_correlations

# Correlate the features against the 'target' column. The third argument (30) is
# presumably the number of top features to keep — TODO confirm against the helper.
top_correlations, set_correlations = correlate_to_target(features_df, 'target', 30)
# Plot the 'corr' column against the 'y' column of the returned summary.
plot_correlations(top_correlations['corr'], top_correlations['y'], 'Feature Correlations')


### train xgboost feature model

In [None]:
# plot_correlations is imported here as well so this cell does not depend on the
# correlation cell having been run first.
from src.utils_eda import calc_feature_importance, plot_correlations

# Read the target without removing it from features_df. Popping the column in
# place made this cell fail on a second run and left features_df without the
# 'target' column that the correlation cell needs.
y = features_df['target']

# Rank features on a copy of the frame that excludes the target column.
top_features, set_features = calc_feature_importance(
    features_df.drop(columns=['target']), y, top_n=30
)
plot_correlations(top_features['corr'], top_features['y'], "Feature Importance")

### concat a weighted average "power" score to the stats dataset

In [None]:
from src.build_power_scores import concat_power_score

# Add the power score column (named by power_column) to the stats frame, driven by
# the feature-importance summary. The return value is not captured, so the helper
# presumably modifies original_stats_df in place — TODO confirm.
# NOTE(review): threshold=.01 looks like a minimum importance cut-off — confirm.
concat_power_score(df=original_stats_df, summary_data=top_features, threshold=.01, power_column=power_column)
original_stats_df.head()

#### sanity check the power score

In [None]:
# Single-column frames (double brackets keep them as DataFrames): the power score
# as the only input, the target as the label.
# NOTE(review): this notebook never shows a 'target' column being added to
# original_stats_df; presumably an earlier helper adds it in place — verify,
# otherwise the second line raises KeyError.
X = original_stats_df[[ power_column ]]
y = original_stats_df[[ 'target']]


In [None]:
from src.build_power_scores import create_shallow_model

# Build the model and its training callbacks from the single-feature input frame.
model, callbacks = create_shallow_model(X)

# Fit on the power score alone and keep the returned history object in `r` for
# the loss plot. The keyword names look like the Keras Model.fit API (epochs,
# batch_size, validation_split) — confirm against create_shallow_model.
# NOTE(review): no random seed is set in this notebook, so results may vary between runs.
r = model.fit(X, y, epochs=200, batch_size=64, verbose=0, validation_split=0.2, callbacks=callbacks)

In [None]:
from src.utils_eda import plot_loss

# Plot the training history returned by model.fit in the previous cell.
plot_loss(r)

### save the dataset

In [None]:
# NOTE(review): this cell used to start with a stray "#time" marker, which looks
# like a disabled %%time cell magic — restore the magic if timing is wanted.

# Ensure the output directory exists. exist_ok=True replaces the separate
# existence check with a single call that is a no-op when the directory is there.
data_directory = get_config('data_directory')
os.makedirs(data_directory, exist_ok=True)

# Write the stats frame next to the input file, with an "_ml" suffix on the name.
output_path = os.path.join(data_directory, f"{input_file_name}_ml.parquet")
original_stats_df.to_parquet(output_path, engine='fastparquet', compression='snappy')