#### Example notebook on how to use _firstDown_

In [31]:
import firstDown

# Step 1 - load the raw inputs. nfl_data() returns two objects, unpacked here
# as `pbp` and `players` (presumably play-by-play rows and a player table).
pbp, players = firstDown.load_data.datasets.nfl_data()

# Step 2 - combine both inputs into one working table with player positions.
build_features = firstDown.feature_engineering.build_features
dataset = build_features.get_positions(pbp, players)

# Step 3 - cleaning passes; each one takes the table and the name of the
# column it inspects, and its result replaces `dataset`.
clean = firstDown.preprocessing.clean
dataset = clean.drop_penalties(dataset, penalty_col='first_down_penalty')
dataset = clean.drop_control_rows(dataset, control_col='desc')

In [32]:
# NaN audit: build the missing-value report for the working table and render
# it. Judging by the rendered output, the report has one row per variable with
# rows_nan / rows_other / rows_total counts.
deal_nan = firstDown.preprocessing.deal_nan
nan = deal_nan.search_nan(dataset)
display(nan)

Unnamed: 0,var,rows_nan,rows_other,rows_total
0,play_id,0,36377,36377
1,game_id,0,36377,36377
2,old_game_id,0,36377,36377
3,home_team,0,36377,36377
4,away_team,0,36377,36377
...,...,...,...,...
367,xpass,8087,28290,36377
368,pass_oe,8863,27514,36377
369,rush_pos,24505,11872,36377
370,rec_pos,22604,13773,36377


In [33]:
# Feature engineering: every builder takes the table and returns it with the
# new feature added, so they are applied one after another in this order.
build_features = firstDown.feature_engineering.build_features
for add_feature in (
    build_features.inertia,
    build_features.play_type,
    build_features.defense_rush,
    build_features.defense_pass,
    build_features.defense_scramble,
):
    dataset = add_feature(dataset)

# Keep only the target ('first_down', used as the label in the split cell)
# and the columns that will be used as model inputs.
selected_cols = [
    'first_down',
    'ydstogo',
    'down',
    'inertia',
    'score_differential',
    'play_category',
    'rush_pos',
    'pass_pos',
    'rec_pos',
    'def_vs_rush',
    'def_vs_pass',
    'def_vs_qb_scramble',
    'shotgun',
    'wp',
    'temp',
    'wind',
    'roof',
    'surface',
    'location',
    'half_seconds_remaining',
    'game_half',
    'yardline_100',
]
dataset = dataset[selected_cols]

In [34]:
# Hold-out split: separate features from the target column and divide the
# rows into train and test parts.
# NOTE(review): 0.2 is presumably the test fraction - confirm against split_data.
target_col = 'first_down'
holdout = firstDown.preprocessing.split.split_data(dataset, target_col, 0.2)
X_train, X_test, y_train, y_test = holdout

In [36]:
# Peek at 23 random rows of the held-out features. The sample is seeded so the
# displayed rows are the same on every Restart & Run All.
X_test.sample(n=23, random_state=42)

Unnamed: 0,ydstogo,down,inertia,score_differential,play_category,rush_pos,pass_pos,rec_pos,def_vs_rush,def_vs_pass,...,shotgun,wp,temp,wind,roof,surface,location,half_seconds_remaining,game_half,yardline_100
20038,8.0,2.0,0.3,-8.0,pass,,QB,TE,0.722513,0.655303,...,1.0,0.183079,73.0,5.0,outdoors,grass,Home,19.0,Half1,67.0
696,10.0,1.0,0.0,0.0,rush,RB,,,,,...,1.0,0.417332,75.0,11.0,outdoors,grass,Home,1248.0,Half1,88.0
27061,0.0,,,8.0,other,,,,0.750916,0.677054,...,0.0,0.86659,58.0,3.0,outdoors,grass,Home,949.0,Half2,35.0
26348,10.0,2.0,0.0,-3.0,rush,RB,,,0.791469,0.687688,...,1.0,0.505119,38.0,20.0,outdoors,a_turf,Home,706.0,Half1,67.0
17015,14.0,2.0,0.0,-12.0,pass,,QB,TE,0.684524,0.740196,...,1.0,0.173146,,,dome,sportturf,Home,1337.0,Half2,50.0
10276,7.0,4.0,0.0,0.0,other,,,,0.8125,0.679245,...,0.0,0.580242,,,closed,fieldturf,Home,1549.0,Half1,20.0
21733,1.0,4.0,0.0,24.0,other,,,,0.763393,0.644444,...,0.0,0.999649,53.0,3.0,outdoors,grass,Home,290.0,Half2,63.0
19424,0.0,,0.0,-17.0,other,,,,0.719178,0.681507,...,0.0,0.06431,,,closed,fieldturf,Home,1570.0,Half2,35.0
7687,12.0,2.0,0.0,7.0,pass,,QB,TE,0.894737,0.575342,...,1.0,0.672714,66.0,5.0,outdoors,fieldturf,Home,1173.0,Half1,92.0
35587,10.0,2.0,0.25,0.0,pass,,QB,WR,0.750716,0.672249,...,1.0,0.606579,,,outdoors,,Home,1170.0,Half1,70.0


In [23]:
# If needed, replace or drop NaN values using replace_nan or drop_nan

In [24]:
# Fill nan for temp and wind
#
# replace_nan returns the filled frame together with a value. The values
# obtained from the TRAINING split stay in `value_w` / `value_t`; the values
# returned by the test-split calls go to throwaway names so they can no longer
# overwrite the training-derived ones.

X_train, value_w = firstDown.preprocessing.deal_nan.replace_nan(X_train, 'wind', method='num')
X_train, value_t = firstDown.preprocessing.deal_nan.replace_nan(X_train, 'temp', method='median')

# NOTE(review): 'wind' uses method='num' without an explicit `num`, so both
# splits rely on replace_nan's default fill number - confirm this is intended.
X_test, _test_value_w = firstDown.preprocessing.deal_nan.replace_nan(X_test, 'wind', method='num')
# 'temp' in the test split is filled with the value obtained from the training
# split (value_t), not with a statistic recomputed on the test rows.
X_test, _test_value_t = firstDown.preprocessing.deal_nan.replace_nan(X_test, 'temp', method='num', num=value_t)

In [None]:
# --- Scaling of numerical columns -------------------------------------------
num_cols = [
    'ydstogo',
    'inertia',
    'score_differential',
    'def_vs_rush',
    'def_vs_pass',
    'def_vs_qb_scramble',
    'wp',
    'temp',
    'wind',
    'half_seconds_remaining',
    'yardline_100',
]

scale = firstDown.preprocessing.scale
# The scaler object returned by the X_train call is reused as-is for X_test.
X_train, get_scaler = scale.scaler(X_train, num_cols=num_cols)
X_test = scale.scaler_transform(X_test, num_cols=num_cols, scaler=get_scaler)

# --- One-hot encoding of categorical columns --------------------------------
one_hot_cols = [
    'down',
    'play_category',
    'rush_pos',
    'pass_pos',
    'rec_pos',
    'roof',
    'surface',
    'location',
    'game_half',
]

encode = firstDown.feature_engineering.encode
# The encoder is built from X_train only, then applied to train and test in
# that order.
encoder = encode.one_hot(X_train, one_hot_cols)
X_train, X_test = (
    encode.one_hot_transform(part, one_hot_cols, encoder)
    for part in (X_train, X_test)
)

In [None]:
from scipy.stats import randint

# Base estimator supplied by the package.
clf = firstDown.train.models.get_model()

# Search space for the randomized hyperparameter search.
# scipy's randint(low, high) draws integers from [low, high): the upper bound
# is exclusive.
param_dist = {
    "n_estimators": randint(50, 300),
    "max_depth": randint(5, 50),
    "min_samples_split": randint(2, 20),
    "min_samples_leaf": randint(1, 10),
    "max_features": ["sqrt", "log2", None],
}
search = firstDown.hyper_tuning.random_search.r_search(
    param_dist=param_dist,
    model=clf,
)

# Fit the search on the training split, then predict on the held-out split.
# do_predict returns two objects, unpacked as y_pred and y_pred_prob.
models = firstDown.train.models
best_model = models.do_fit(search, X_train, y_train)
y_pred, y_pred_prob = models.do_predict(X_test, best_model)

In [27]:
# Model performance metrics
metrics = {
        "Accuracy": firstDown.metrics.model_metrics.accuracy(y_test, y_pred),
        "Recall": firstDown.metrics.model_metrics.recall(y_test, y_pred),
        "Precision": firstDown.metrics.model_metrics.precision(y_test, y_pred),
        "F1 Score": firstDown.metrics.model_metrics.f1(y_test, y_pred),
        "ROC AUC Score": firstDown.metrics.model_metrics.roc_auc(y_test, y_pred_prob)
}

print(metrics)

{'Accuracy': 0.812671797691039, 'Recall': 0.3513862024500322, 'Precision': 0.6042128603104213, 'F1 Score': 0.4443538524256013, 'ROC AUC Score': 0.8316372871143844}
