### Goal
- prepare stats data for machine learning

### Steps
- Review dtypes and prepare continuous numeric, discrete numeric, categorical, character and identity columns
Input: nfl_weekly_defense.parquet or nfl_weekly_offense.parquet (chosen with the file dropdown below)
Work:
    - separate continuous numeric, discrete numeric, categorical, character and identity columns
    - label encode
    - merge all back together
Output:
    - {input_file_name}_ml.parquet (e.g. nfl_weekly_defense_ml.parquet) - cleaned, labeled data

In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from src import *


In [None]:

# Notebook-wide settings.
SCHEMA = 'controls'
DEBUG = False

# Loader object created from the configured connection string.
db = database_loader.DatabaseLoader(get_config('connection_string'))



#### load stats data

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Dropdown for choosing which weekly stats file to process; later cells read
# the choice from file_dropdown.value.
file_dropdown = widgets.Dropdown(
    description='Select File:',
    options=[
        "nfl_weekly_defense",
        "nfl_weekly_offense",
    ],
)

# Disabled sketch: a button that would trigger processing of the selected file.
# run_button = widgets.Button(description='Run Notebook')
#
# def run_notebook(_):
#     selected_file = file_dropdown.value
#     print(f"Processing {selected_file}...")
#     # ... rest of the notebook code would go here
#
# run_button.on_click(run_notebook)

# Render the dropdown in the cell output.
display(file_dropdown)
#




In [None]:
# Echo the dropdown's current value so the choice is visible in the cell output.
print("Selected ", file_dropdown.value)

In [None]:

# Name of the stats file chosen in the dropdown (without the .parquet suffix).
input_file_name = file_dropdown.value

data_directory = get_config('data_directory')
input_path = os.path.join(data_directory, f"{input_file_name}.parquet")

# Fail with an explicit error rather than an assert: asserts are stripped
# under `python -O` and give no hint about which path was missing.
if not os.path.exists(input_path):
    raise FileNotFoundError(f"Stats file not found: {input_path}")

original_stats_df = pd.read_parquet(input_path)
original_stats_df.head()

#### create the target label and drop columns we don't need

In [None]:
import numpy as np
# Encode the game result as a numeric target: 'win' -> 1, 'loss' -> 0,
# any other value -> 2.
original_stats_df['target'] = np.where(
    original_stats_df['win'] == 'loss',
    0,
    np.where(original_stats_df['win'] == 'win', 1, 2),
)

# Remove the columns that are not used as features.
raw_features_df = original_stats_df.drop(
    columns=[
        'season', 'week', 'team', 'win', 'spread', 'team_coach',
        'opposing_coach', 'count', 'team_score', 'opposing_team',
        'opposing_score',
    ]
)
raw_features_df.head()

#### scale numeric data for ml feature selection

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column of raw_features_df with a default MinMaxScaler and
# rebuild a DataFrame that keeps the original column names.
# Note: 'target' is still part of raw_features_df here, so it is rescaled too.
scaler = MinMaxScaler()
features = scaler.fit_transform(raw_features_df.to_numpy())
features_df = pd.DataFrame(data=features, columns=raw_features_df.columns)

print("Scaled Dataset Using MinMaxScaler")
features_df.head()

#### review correlations

In [None]:
# Correlation heatmap of the numeric columns, drawn with plt.imshow().
numeric_df = (
    original_stats_df
    .select_dtypes(include='number')
    .drop(columns=['season', 'week', 'count'])
)
correlation_matrix = numeric_df.corr()

plt.figure(figsize=(10, 8))
plt.imshow(correlation_matrix, interpolation='nearest', cmap='coolwarm')
plt.colorbar()
plt.title('Heatmap')

# Label both axes with the column names, one tick per column.
plt.yticks(np.arange(len(numeric_df.columns)), numeric_df.columns)
plt.xticks(np.arange(len(numeric_df.columns)), numeric_df.columns, rotation=45)

plt.show()

### best correlations to target

In [None]:
from src.utils_eda import correlate_to_target, plot_correlations

# Correlate the scaled features against the 'target' column using the project
# helper; the third argument (30) is presumably the number of top features to
# keep — TODO confirm against correlate_to_target.
top_correlations, set_correlations = correlate_to_target(features_df, 'target', 30)
# Plot the 'corr' and 'y' entries of the result under the given title.
plot_correlations(top_correlations['corr'], top_correlations['y'], 'Feature Correlations')


### train xgboost feature model

In [None]:
from src.utils_eda import calc_feature_importance

# Split the scaled frame into target and feature matrix without mutating
# features_df: the previous in-place pop('target') made this cell fail with a
# KeyError when re-run, and broke re-running the correlation cell above.
y = features_df['target']

# Rank the features with the project helper and plot the 'corr' / 'y' entries
# of its result.
top_features, set_features = calc_feature_importance(
    features_df.drop(columns=['target']), y, top_n=30
)
plot_correlations(top_features['corr'], top_features['y'], "Feature Importance")

### concat a weighted average "power" score to the stats dataset

In [None]:
from src.build_power_scores import concat_power_score

# Add a power-score column to original_stats_df using the feature summary from
# the importance step. The return value is discarded, so the helper presumably
# modifies the DataFrame in place — TODO confirm in build_power_scores.
# NOTE(review): power_column is fixed to "defense_power" even when
# "nfl_weekly_offense" is selected in the dropdown — confirm this is intended.
concat_power_score(df=original_stats_df, summary_data=top_features, threshold=.01, power_column="defense_power")
original_stats_df.head()

In [None]:
# Model input (the power score) and label (the encoded result), each selected
# as a one-column DataFrame.
X = original_stats_df.loc[:, ['defense_power']]
y = original_stats_df.loc[:, ['target']]


In [None]:
from src.build_power_scores import create_shallow_model

# Build the model and its callbacks with the project helper, then fit it on
# X / y; the fit result is kept in r for the plotting cells that follow.
model, callbacks = create_shallow_model(X)

r = model.fit(
    X,
    y,
    validation_split=0.2,
    batch_size=64,
    epochs=200,
    callbacks=callbacks,
    verbose=0,
)

In [None]:
from src.utils_eda import plot_loss

# Pass the object returned by model.fit to the project's plot_loss helper.
plot_loss(r)

In [None]:
from src.utils_eda import plot_accuracy

# Pass the object returned by model.fit to the project's plot_accuracy helper.
plot_accuracy(r)

### save the dataset

In [None]:
%%time

data_directory = get_config('data_directory')
# exist_ok=True replaces the separate exists() check, which could race with
# another process creating the directory between the check and makedirs().
os.makedirs(data_directory, exist_ok=True)

# Write the stats frame (now including the added columns) to the data
# directory under the input name suffixed with "_ml".
output_path = os.path.join(data_directory, f"{input_file_name}_ml.parquet")
original_stats_df.to_parquet(output_path, engine='fastparquet', compression='snappy')