## TOCQ: Tackle Opportunity Containment Quotient (Metrics Track)

In [1]:
## Import packages
import pandas as pd
import numpy as np
from zipfile import ZipFile
import matplotlib.pyplot as plt
import matplotlib.patches as patch
from matplotlib.collections import PatchCollection
from matplotlib.animation import ArtistAnimation, PillowWriter
import matplotlib
import seaborn as sns
import math
from shapely import geometry
import shapely
import imageio
import os
from tqdm import tqdm
from IPython import display

import warnings
# Suppress every warning for the rest of the session so cell output stays
# clean. NOTE(review): this also hides deprecation warnings from pandas and
# matplotlib; drop it when debugging.
warnings.filterwarnings("ignore")

# Global matplotlib default: draw ticks and grid lines underneath the plotted
# artists, so the white yard lines do not cover the player markers.
plt.rc('axes', axisbelow=True)

In [2]:
## Load Data
# Read the four lookup tables in one pass; the paths are resolved relative to
# the current working directory. `plays` is read later by plot_play for titles.
games, plays, tackles, players = map(
    pd.read_csv,
    ("games.csv", "plays.csv", "tackles.csv", "players.csv"),
)

In [3]:
# Functions Needed
# Events that open the analysis window, applied in order: each one that is
# present moves the start of the window forward to its first frame, so a
# completed pass starts at the catch rather than at the snap.
_PLAY_START_EVENTS = ("ball_snap", "pass_outcome_caught")

# Events that close the analysis window, in priority order: only the first one
# found in the (already start-trimmed) play is used.
_PLAY_END_EVENTS = ("tackle", "out_of_bounds", "qb_slide", "qb_sack",
                    "fumble", "safety")


def _first_event_frame(events, name):
    """Return the frameId of the first row tagged `name`, or None if absent."""
    hits = events.loc[events.event == name, "frameId"]
    return None if hits.empty else hits.iloc[0]


def prep_play(tracking_data, gameID, playID, def_team):
    """Select the defensive tracking rows and frame window for one play.

    Parameters
    ----------
    tracking_data : pandas.DataFrame
        Tracking rows; the columns read here are ``gameId``, ``playId``,
        ``club``, ``frameId``, ``event``, ``x`` and ``y``.
    gameID, playID
        Identifiers of the play to extract.
    def_team : str
        ``club`` value of the defending team. Rows for this club and for the
        ``"football"`` pseudo-club are kept; all other rows are dropped.

    Returns
    -------
    list
        ``[play, frames, (xmin, xmax), (ymin, ymax)]`` where ``play`` holds the
        kept rows restricted to the frame window, ``frames`` is the array of
        unique frame ids in that window, and the two tuples are the x/y extent
        of *all* tracked rows in the play (before the club filter) padded by
        one unit on each side.

    Raises
    ------
    ValueError
        If no rows match ``gameID``/``playID``, or none of the matching rows
        belong to ``def_team`` or the football.
    """
    play = tracking_data[(tracking_data.gameId == gameID) &
                         (tracking_data.playId == playID)]
    if play.empty:
        raise ValueError(
            f"No tracking rows found for gameId={gameID}, playId={playID}")

    # Plot bounds come from every tracked row (both teams and the ball) so the
    # view does not change with the choice of defending team. The vectorized
    # min/max skip NaN coordinates instead of depending on row order.
    xmin, xmax = play.x.min() - 1, play.x.max() + 1
    ymin, ymax = play.y.min() - 1, play.y.max() + 1

    play = play[(play.club == def_team) | (play.club == "football")]
    if play.empty:
        raise ValueError(
            f"No rows for club {def_team!r} or the football in "
            f"gameId={gameID}, playId={playID}")

    # One row per (event, frame) pair; frames without a tagged event remain.
    events = play[["event", "frameId"]].drop_duplicates()

    for name in _PLAY_START_EVENTS:
        start = _first_event_frame(events, name)
        if start is not None:
            events = events.loc[events.frameId >= start]

    for name in _PLAY_END_EVENTS:
        end = _first_event_frame(events, name)
        if end is not None:
            events = events.loc[events.frameId <= end]
            break  # only the highest-priority end event closes the window

    frames = pd.unique(events.frameId)

    play = play[(play.frameId >= frames.min()) & (play.frameId <= frames.max())]

    return [play, frames, (xmin, xmax), (ymin, ymax)]

def plot_play(prepped, frames, bounds, save = True):
    """Animate the defenders' tackle zones and the football for one play.

    Each defender is drawn as a black dot with a gray 150-degree, radius-2
    wedge centred on his orientation. In any frame where the football lies
    inside a wedge, that wedge is drawn blue instead.

    Parameters
    ----------
    prepped : pandas.DataFrame
        Rows for the defense and the football (first element returned by
        ``prep_play``). Columns read: ``gameId``, ``playId``, ``frameId``,
        ``club``, ``x``, ``y``, ``o``.
    frames : iterable
        Frame ids to draw, in display order.
    bounds : tuple
        ``((xmin, xmax), (ymin, ymax))`` axis limits.
    save : bool, default True
        When true, also write the animation to ``{gameId}_{playId}.gif``.

    Notes
    -----
    Reads the module-level ``plays`` table to use the play description as the
    title. The animation is shown inline through ``IPython.display`` and the
    figure is closed afterwards; nothing is returned.
    """
    game_id = prepped.gameId.iloc[0]
    play_id = prepped.playId.iloc[0]

    fig, ax = plt.subplots()
    ax.set_xlim(left = bounds[0][0], right = bounds[0][1])
    ax.set_ylim(bottom = bounds[1][0], top = bounds[1][1])
    # Relabel field x-coordinates as yard lines: x=60 maps to the 50 and the
    # labels count down symmetrically on either side of it.
    ax.set_xticklabels([50 - np.abs(60-int(k)) for k in ax.get_xticks()])
    ax.set_yticks([])
    ax.set_title(plays[(plays.gameId == game_id) &
              (plays.playId == play_id)].playDescription.iloc[0])
    ax.set_facecolor("lawngreen")
    ax.grid(which = "major", axis = "x", color = "white", linewidth = 2)

    ims = []
    for i in frames:
        frame = prepped[prepped.frameId == i]
        # Split the frame up front so the highlight check does not depend on
        # the football row coming after every defender row.
        is_ball = frame.club == "football"
        defenders = frame[~is_ball]
        balls = frame[is_ball]

        zones = []      # tackle-zone wedges only (candidates for highlighting)
        patches = []    # everything drawn this frame, in z-order

        for x, y, o in zip(defenders.x, defenders.y, defenders.o):
            # Convert the tracking orientation to a matplotlib angle
            # (presumably `o` is degrees clockwise from the +y axis, whereas
            # matplotlib measures counter-clockwise from +x; confirm against
            # the data dictionary).
            facing = -o + 90
            zone = patch.Wedge((x, y), r = 2,
                               theta1 = (facing - 75) % 360,
                               theta2 = (facing + 75) % 360,
                               color = "dimgray", alpha = 0.7)
            zones.append(zone)
            patches.append(zone)
            patches.append(patch.Circle((x, y), radius = 0.5, color = "black"))

        for ball_x, ball_y in zip(balls.x, balls.y):
            for zone in zones:
                # radius=0 tests the exact wedge. Left at its default, the
                # hit-test margin falls back to the patch linewidth when the
                # edge is visible, which would pad the zone beyond 2 yards.
                if zone.contains_point((ball_x, ball_y), radius = 0):
                    zone.set_color("blue")
            # Plot the football
            patches.append(patch.Circle((ball_x, ball_y), radius = 0.5,
                                        color = "saddlebrown"))

        # A PatchCollection ignores per-patch colours unless they are passed
        # explicitly, so forward each patch's own face and edge colour.
        collection = PatchCollection(
            patches,
            facecolors = [p.get_facecolor() for p in patches],
            edgecolors = [p.get_edgecolor() for p in patches],
        )
        im = ax.add_collection(collection)
        ims.append([im])

    animation_play = ArtistAnimation(fig = fig, artists = ims, interval = 150,
                                    blit = True, repeat_delay = 2000)
    if save:
        animation_play.save(f"{game_id}_{play_id}.gif", writer = PillowWriter())
    video = animation_play.to_jshtml()
    html = display.HTML(video)
    display.display(html)
    # Close this figure specifically so only the HTML animation is shown and
    # no unrelated current figure is closed by accident.
    plt.close(fig)

### Introduction and Motivation

During a play, the primary goal of the defenders is to stop the ball carrier. This can be by tackle, out of bounds, forcing a fumble, or forcing a slide. The box score, however, only credits the player(s) who actually "make the tackle", as if they did so alone. A player who is often in the "right place" but who is not credited for the tackle does not receive credit for being "in the right place at the right time". Conversely, a player who sees tackle "opportunities" slip away is not knocked for it in the box score. With this in mind, I define **TOCQ**, or **Tackle Opportunity Containment Quotient**, which reflects a defender's ability to "contain" the "tackle opportunities" they interact with. I will first define **TOCQ** and explain the calculation process before discussing **TOCQ** in the first half of the 2022 NFL Season.

### Defining TOCQ

**TOCQ** is the quotient of "tackles contained" and "tackle opportunities." Consider a defender $X$. The *tackle zone* of defender $X$ is the sector of a circle, centered on the player with radius $r = 2\ yards$, with a $150^{\circ}$ sector angle bisected by a vector in defender $X$'s orientation direction. This tackle zone is the zone wherein it is feasible the defender, facing the ball, could both see and react to the ball carrier. Given NFL speeds [rarely exceed 22 mph](https://www.sportingnews.com/us/nfl/news/nfl-fastest-player-top-2023-speeds/80dde018741ad620c97859e5) or ~10.5 yards/second, and a frame rate of 10 fps, most players would take ~3 frames to pass through a tackle zone if completely unencumbered, at top speed, and the defender were static. Likewise, the area of the tackle zone is only $A = \frac{600}{360}\pi \approx 5.24\ yd^2$, which is small enough to be feasible for defenders to have a chance at a tackle, given correct orientation towards the ball carrier.

Now, consider an arbitrary play. For each frame between the ball snap (for run plays) or the pass completion (for pass plays) and the end of the play, we calculate each defender's tackle zone and check if the football is inside the zone. If this is true for defender $X$, he is credited with a *tackle opportunity*. Importantly, tackle opportunities are binary in a given play; even if the ball carrier leaves the tackle zone and later returns to it, the defender receives only one tackle opportunity. When a play ends, whether by tackle, fumble, sack, slide, or out of bounds, that frame is the "action frame," which is used to count "tackles contained." If the ball is within a defender's tackle zone during this frame, the defender is credited with a *tackle contained*. How that involvement occurs is ignored; the "tackler" is treated the same as all other players with the ball in their zones. Any tackle contained is also simultaneously a tackle opportunity.

For defender $X$, there are three outcomes:
1.  Defender $X$ has no *tackle opportunity*. This is neutral for **TOCQ**.
2.  Defender $X$ has a *tackle opportunity* but no *tackle contained*. This is negative for **TOCQ**.
3.  Defender $X$ has a *tackle contained* and corresponding *tackle opportunity*. This is positive for **TOCQ**.

I now define **TOCQ$_X$** for defender $X$ as:

$$ TOCQ_X = \frac{\text{Number of Tackles Contained by Defender } X}{\text{Number of Tackle Opportunities for Defender } X} $$

Similarly, I define **TOCQ$_T$** for a team as:

$$ TOCQ_T = \frac{\text{Number of Tackles Contained by Team } T}{\text{Number of Tackle Opportunities for Team } T} $$

For consistency, **TOCQ** without a subscript refers to the general metric and the subscript is for disambiguation in discussing actual values between teams and players. The two metric forms above have slightly different interpretations. For the individual player, a high **TOCQ$_X$** reflects a defender who "makes the most of his opportunities" and is often "in the right place at the right time." Conversely, a low **TOCQ$_X$** could reflect a defender who has opportunities pass him by but is not involved in as many plays. For a team, a high **TOCQ$_T$** reflects quick team containment of ball carriers and a lower **TOCQ$_T$** indicates a team that has difficulty quickly containing ball carriers or that has more "hero" plays where one defender makes a play solo.

### An example play

With **TOCQ** defined, consider a play from the New York Jets-Cleveland Browns game from Week 2 of 2022, [linked here](https://youtu.be/oT0-zDpuIIk?si=pazd7MDyxQBADVxT&t=392). We can visualize this play frame by frame; if the football is in a defender's tackle zone in a frame, the zone turns blue:

In [4]:
# Week 2 tracking data, which contains the example Jets-Browns play.
t2 = pd.read_csv("tracking_week_2.csv")
# prep_play returns [rows, frame ids, x-bounds, y-bounds] for the Jets defense.
play = prep_play(tracking_data=t2, gameID=2022091801, playID=2090,
                 def_team="NYJ")
# Animate the play; the two bounds tuples are passed together as one argument.
plot_play(prepped=play[0], frames=play[1], bounds=(play[2], play[3]))

We can now count tackle opportunities and tackles contained for this play and get the result in Table 1:

<img src="https://github.com/elignesin/Big-Data-Bowl-2024/raw/eb46cdc79507f333b2cc185ee12b4235d4598c1d/Figures%20and%20Tables/Table1.png" width="50%" height="auto">

### TOCQ for Weeks 1-9 2022

Now, we can explore **TOCQ** for individual players and teams in the first half of the 2022 Season. There are 136 games and 869 players who played a defensive snap (including 4 offensive players). First, I calculate **TOCQ$_X$** for single games and for the half season. To focus on regular players and to rule out players who may have only played a few plays, I am only presenting **TOCQ$_X$** for players with 10+ opportunities in single games or 30+ opportunities for the half season.

<img src="https://github.com/elignesin/Big-Data-Bowl-2024/raw/eb46cdc79507f333b2cc185ee12b4235d4598c1d/Figures%20and%20Tables/Table2.png" width="75%" height="auto">

Table 2 shows the top and bottom individual player games. 10 opportunities is still a small sample, so we cannot draw statistically significant conclusions here. Notably, the bottom 10 are all defensive tackles, and the top 10 only includes 1 defensive lineman. Then, the half-season **TOCQ$_X$** scores for players are in Table 3:

<img src="https://github.com/elignesin/Big-Data-Bowl-2024/raw/eb46cdc79507f333b2cc185ee12b4235d4598c1d/Figures%20and%20Tables/Table3.png" width="75%" height="auto">

Table 3 continues the same trends as Table 2. The top 10 players by half-season **TOCQ$_X$** are mostly defensive backs, led by Pittsburgh cornerback Levi Wallace, and a couple linebackers. The bottom 10 players are all defensive tackles. This is reasonable; defensive tackles are likely to pick up opportunities on every single snap (due to the "ball snap" frame), and likely see more opportunities than secondary players as well. Secondary players, conversely, likely only have opportunities when they are "in on the play" and may often be the primary defender responsible on a pass play, which defensive linemen rarely are. With this, it seems reasonable to consider the average **TOCQ$_X$** (among players with 10+ opportunities) by position and position group:

<img src="https://github.com/elignesin/Big-Data-Bowl-2024/raw/eb46cdc79507f333b2cc185ee12b4235d4598c1d/Figures%20and%20Tables/Figure4.png" width="75%" height="auto">

Figure 4 includes boxplots of half-season **TOCQ$_X$** scores by position, colored by the three "position groups," defensive line, linebackers, and secondary. Figure 4 reinforces the trends from the above tables. Secondary players have marginally higher **TOCQ$_X$** than linebackers, who have much higher scores than defensive linemen. Among defensive linemen, however, there is a wide gap between DEs, who are just behind linebackers, and DTs and NTs, who are much lower. This supports the earlier supposition about defensive tackles and nose tackles getting penalized for "ball snap" frames and snaps under center, since defensive ends, who are further from the ball on the line, are not nearly as negatively affected.

As noted, **TOCQ** at the team level is denoted as **TOCQ$_T$**. As with **TOCQ$_X$** for individual players, we can calculate **TOCQ$_T$** for teams in single games and over the half season. Unlike **TOCQ$_X$** above, **TOCQ$_T$** includes players with fewer than 10 opportunities, since the overall team calculation of **TOCQ$_T$** includes significantly more than 10 opportunities.

<img src="https://github.com/elignesin/Big-Data-Bowl-2024/raw/eb46cdc79507f333b2cc185ee12b4235d4598c1d/Figures%20and%20Tables/Table5.png" width="75%" height="auto">

Table 5 is the team equivalent of Table 2. **TOCQ$_T$** scores are much more tightly concentrated around 0.5 and have much less variance than the **TOCQ$_X$** scores above. Looking at Table 5, there are few patterns or trends, but Carolina has 3 of the 5 worst team performances in the dataset in weeks 1, 5, and 7. Given Carolina was poor last year, this is unsurprising, and suggests their defense struggled to finish plays when they were close to ball carriers. Table 6, below, reinforces this. Carolina is far and away the worst team by **TOCQ$_T$** in the dataset, and appears to almost singlehandedly drag the "league average" below the league median. There are fewer noticeable trends in the teams than in the player **TOCQ$_X$** scores. The "below average teams" include playoff teams such as Dallas, Jacksonville, and Tampa Bay, as well as Arizona, Houston, and Chicago, the three worst teams in 2022. The best team by **TOCQ$_T$** is Minnesota, but the top 6 include San Francisco, Kansas City, and Cincinnati, all of which went to their Conference Championship games or further.

<img src="https://github.com/elignesin/Big-Data-Bowl-2024/raw/eb46cdc79507f333b2cc185ee12b4235d4598c1d/Figures%20and%20Tables/Figure6.png" width="75%" height="auto">

Finally, Figure 7 shows the average half-season **TOCQ$_T$** broken down into the three major position groups. This is a team level equivalent to Figure 4. Like Figure 4, there is a clear distinction for most teams between defensive linemen and linebackers and secondary players, but there is not as much visible distinction between the same position groups on different teams, with the notable exception of Pittsburgh's secondary, which has a much higher score than any other secondary and is especially prominent compared to their linebackers.

<img src="https://github.com/elignesin/Big-Data-Bowl-2024/raw/eb46cdc79507f333b2cc185ee12b4235d4598c1d/Figures%20and%20Tables/Figure7.png" width="75%" height="auto">

### Conclusions, Limitations, Future Steps

This notebook introduces the metric **TOCQ: Tackle Opportunity Containment Quotient**, parameterized for both individual players and full teams. **TOCQ** also introduces the *tackle zone*, the *tackle opportunity* when the football enters the tackle zone, and the *tackle containment* when the play ends within a defender's tackle zone. At the individual player level, defensive linemen, specifically defensive tackles, have generally lower **TOCQ$_X$** scores than linebackers and secondary players. At the team level, the top six teams by **TOCQ$_T$** include three conference championship participants and the worst three teams by record are below league average.

There are several limitations to this analysis and **TOCQ** as currently presented. With the data, only 9 weeks of games from 1 NFL season are available. Though `plays.csv` contains approximately 12500 plays, restricting to only these weeks puts many players under the number of opportunities I thought sufficient to draw conclusions from (I chose 30 as the traditional statistical threshold for "small sample size"). For individual games, only 6 players had 30 opportunities in a single game, so I chose 10 opportunities, but it is inadvisable to make concrete statements about individual players from single game **TOCQ$_X$** scores. This does not affect **TOCQ$_T$**, as noted, because a full game has a large enough sample of opportunities, but comparing teams on a sample of 8-9 games (depending on bye week) is also inadvisable due to the small sample.

Within the dataset, I made several decisions which may have had adverse effects on the results. First, at least three plays were removed for missing player tracking data, and it is possible that other plays have missing or incomplete player tracking data. More important were the frame events in the player tracking data. I chose to only consider frames from the "ball snap" or "pass outcome caught" frame through the tackle/out of bounds/slide/sack/fumble frame. I chose this because, for pass plays, we do not have every frame from the ball snap, so I sought consistency. This, however, may artificially inflate pass rusher **TOCQ** scores by removing opportunities from pass rush in passes which did not result in sacks, and also creates inconsistency between pass and run plays. Conversely, there may be errors in the event tagging which would cause the incorrect frames to be used in individual plays and could, if numerous enough, impact the results.

In future revisions of the **TOCQ** metric and of the definition or calculation of tackle zones and tackle opportunities, it could be beneficial to move from a constant 2 yard tackle zone radius and $150^{\circ}$ sector to a tackle zone dependent on the individual defender, for example based on speed or acceleration and direction of movement at each frame. Likewise, the tackle zone currently ignores the presence of blockers within the zone. This was deliberate; on many run plays, for example, an interior defensive lineman may be "obstructed" by a blocker and still make a tackle, but this could be changed. **TOCQ** would, in this vein, benefit from a more consistent set of frames in each play, instead of separate frame windows for run and pass plays.

### Appendix

The code and visualizations are in [this github repository](https://github.com/elignesin/Big-Data-Bowl-2024/tree/main).

For questions, please email eli.gnesin@duke.edu.

Words: 1999