#### Example pipeline showing how to use _firstDown_

In [37]:
import firstDown

# Step 1 - load data: nfl_data() returns the two objects unpacked below.
pbp, players = firstDown.load_data.datasets.nfl_data()

# Step 2 - get players' positions: both loaded objects go into
# get_positions(), whose result becomes the working dataset.
dataset = firstDown.feature_engineering.build_features.get_positions(pbp, players)

# Step 3 - clean data: drop_penalties() runs first (keyed on the
# 'first_down_penalty' column) and its result is passed straight to
# drop_control_rows() (keyed on the 'desc' column).
cleaning = firstDown.preprocessing.clean
dataset = cleaning.drop_control_rows(
    cleaning.drop_penalties(dataset, penalty_col='first_down_penalty'),
    control_col='desc',
)

In [38]:
# Search for NaN values: search_nan() is given the cleaned dataset and its
# result is rendered with display(). The rendered table under this cell lists,
# per variable, the columns var / rows_nan / rows_other / rows_total.
nan = firstDown.preprocessing.deal_nan.search_nan(dataset)
display(nan)

Unnamed: 0,var,rows_nan,rows_other,rows_total
0,play_id,0,36377,36377
1,game_id,0,36377,36377
2,old_game_id,0,36377,36377
3,home_team,0,36377,36377
4,away_team,0,36377,36377
...,...,...,...,...
367,xpass,8087,28290,36377
368,pass_oe,8863,27514,36377
369,rush_pos,24505,11872,36377
370,rec_pos,22604,13773,36377


In [39]:
# Feature Engineering: each builder is called with the current dataset and its
# return value replaces `dataset`, so the builders run strictly in the order
# listed here.
feature_builders = firstDown.feature_engineering.build_features
for add_feature in (
    feature_builders.inertia,
    feature_builders.play_type,
    feature_builders.defense_rush,
    feature_builders.defense_pass,
    feature_builders.defense_scramble,
):
    dataset = add_feature(dataset)

# Select relevant features: keep only the columns listed below, in this exact
# order. 'first_down' is the column later handed to split_data() as the target.
selected_columns = [
    'first_down',
    'ydstogo',
    'down',
    'inertia',
    'score_differential',
    'play_category',
    'rush_pos',
    'pass_pos',
    'rec_pos',
    'def_vs_rush',
    'def_vs_pass',
    'def_vs_qb_scramble',
    'shotgun',
    'wp',
    'temp',
    'wind',
    'roof',
    'surface',
    'location',
    'half_seconds_remaining',
    'game_half',
    'yardline_100',
]
dataset = dataset[selected_columns]

In [40]:
# Train / test split
# split_data() receives the selected-feature dataset, the target column name
# ('first_down') and the value 0.2, and returns the four objects unpacked here.
# NOTE(review): 0.2 looks like the held-out test fraction - the training log
# further down reports 29101 training rows, roughly 80% of the 36377 rows
# counted in the search_nan() table - confirm against split_data().
X_train, X_test, y_train, y_test = firstDown.preprocessing.split.split_data(dataset, 'first_down', 0.2)

In [41]:
# Optional step: if needed, replace or drop NaN values here using replace_nan
# or drop_nan. This cell is a placeholder and executes nothing.

In [42]:
# Fill nan for temp and wind.
# replace_nan() returns a pair: the updated frame and a value. For 'temp' the
# value returned by the training call (method='median') is passed back in as
# `num` for the test split, so the test split is filled with a value obtained
# from the training split.
# NOTE(review): 'wind' uses method='num' without an explicit `num` on both
# splits, so it relies on whatever default replace_nan() defines - confirm.
fill_missing = firstDown.preprocessing.deal_nan.replace_nan

# Training split
X_train, value_w = fill_missing(X_train, 'wind', method='num')
X_train, value_t = fill_missing(X_train, 'temp', method='median')

# Test split (value_w / value_t are reassigned from these calls)
X_test, value_w = fill_missing(X_test, 'wind', method='num')
X_test, value_t = fill_missing(X_test, 'temp', method='num', num=value_t)

In [43]:
# Scale numerical columns: scaler() is called on the training split and returns
# the updated frame together with a scaler object; that same object is then
# handed to scaler_transform() for the test split.
num_cols = [
    'ydstogo', 'inertia', 'score_differential',
    'def_vs_rush', 'def_vs_pass', 'def_vs_qb_scramble',
    'wp', 'temp', 'wind',
    'half_seconds_remaining', 'yardline_100',
]
scaling = firstDown.preprocessing.scale
X_train, get_scaler = scaling.scaler(X_train, num_cols=num_cols)
X_test = scaling.scaler_transform(X_test, num_cols=num_cols, scaler=get_scaler)

# Encoding columns: the encoder is built from the training split only and then
# applied to the training split first and the test split second.
one_hot_cols = [
    'down', 'play_category',
    'rush_pos', 'pass_pos', 'rec_pos',
    'roof', 'surface', 'location', 'game_half',
]
encoding = firstDown.feature_engineering.encode
encoder = encoding.one_hot(X_train, one_hot_cols)
X_train, X_test = (
    encoding.one_hot_transform(split, one_hot_cols, encoder)
    for split in (X_train, X_test)
)

In [44]:
from scipy.stats import randint

# Train: get_model() supplies the estimator that the search will tune.
clf = firstDown.train.models.get_model()

# Hyperparameter tuning: r_search() receives the estimator and the parameter
# space below. Integer ranges are scipy randint distributions; max_features is
# a plain list of choices. The log printed under this cell reports 3 folds for
# each of 20 candidates.
# NOTE(review): the fit log under this cell is emitted by LightGBM, while
# min_samples_split and max_features are scikit-learn tree parameter names -
# confirm that the estimator returned by get_model() honors every key here.
search = firstDown.hyper_tuning.random_search.r_search(
    model=clf,
    param_dist=dict(
        n_estimators=randint(50, 300),
        max_depth=randint(5, 50),
        min_samples_split=randint(2, 20),
        min_samples_leaf=randint(1, 10),
        max_features=["sqrt", "log2", None],
    ),
)

# Fit the search on the training split, then predict on the test split;
# do_predict() returns both the predictions and a second object that is
# later passed to the ROC AUC metric.
best_model = firstDown.train.models.do_fit(search, X_train, y_train)
y_pred, y_pred_prob = firstDown.train.models.do_predict(X_test, best_model)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Number of positive: 6256, number of negative: 22845
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013716 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1502
[LightGBM] [Info] Number of data points in the train set: 29101, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.214975 -> initscore=-1.295191
[LightGBM] [Info] Start training from score -1.295191


In [45]:
# Model performance metrics: every scorer is called as scorer(y_test, <preds>).
# ROC AUC is the only one fed y_pred_prob; all the others use y_pred.
model_metrics = firstDown.metrics.model_metrics
metrics = {
    label: scorer(y_test, predictions)
    for label, scorer, predictions in (
        ("Accuracy", model_metrics.accuracy, y_pred),
        ("Recall", model_metrics.recall, y_pred),
        ("Precision", model_metrics.precision, y_pred),
        ("F1 Score", model_metrics.f1, y_pred),
        ("ROC AUC Score", model_metrics.roc_auc, y_pred_prob),
    )
}

print(metrics)

{'Accuracy': 0.812671797691039, 'Recall': 0.3513862024500322, 'Precision': 0.6042128603104213, 'F1 Score': 0.4443538524256013, 'ROC AUC Score': 0.8316372871143844}
