This is a series of kernels for the NFL Punt Analytics competition:
1. [Game Mechanics](https://www.kaggle.com/argentium/nfl-punt-game-mechanics)
2. [Group Dynamics](https://www.kaggle.com/argentium/nfl-punt-group-dynamics)
3. [Penalties](https://www.kaggle.com/argentium/nfl-punt-penalties)

In [None]:
import os
import re
import pandas as pd
import numpy as np
import seaborn as sns

import scipy
import math
from matplotlib import pyplot as plt

import statsmodels.api as sm
from statsmodels.formula.api import ols

In [None]:
# Load the three competition tables used by this kernel:
# play-level information, the injury video review, and per-play player roles.
df_play_info = pd.read_csv('../input/play_information.csv')
df_injury = pd.read_csv('../input/video_review.csv')
df_punt_role = pd.read_csv('../input/play_player_role_data.csv')

In [None]:
# Punt roles grouped by the unit they line up with.
# 'PLM' and 'PLM1' are included under 'Return' so that every role recognised
# by get_side (which lists them next to PDM/PFB/PR) also resolves to a team.
team_positions = {
    'Return': [
        'VR', 'VRo', 'VRi',
        'PDR1', 'PDR2', 'PDR3', 'PDR4', 'PDR5', 'PDR6',
        'PLR', 'PLR1', 'PLR2', 'PLR3',
        'PLM', 'PLM1',
        'PR', 'PFB', 'PDM', 'VL', 'VLo', 'VLi',
        'PDL1', 'PDL2', 'PDL3', 'PDL4', 'PDL5', 'PDL6',
        'PLL', 'PLL1', 'PLL2', 'PLL3', 'PLLi',
    ],
    'Coverage': [
        'GR', 'GRo', 'GRi',
        'PRG', 'PRT', 'PRW',
        'PPR', 'PPRo', 'PPRi', 'P', 'PC', 'PLS',
        'GL', 'GLo', 'GLi',
        'PLW', 'PLT', 'PLG',
        'PPL', 'PPLo', 'PPLi',
    ],
}


# Map a role code to the team (unit) it belongs to
def set_team(role):
    """Return the team name ('Return' or 'Coverage') for a role code.

    The role is converted with ``str`` before the lookup, so missing values
    (e.g. NaN) simply fail to match. Returns ``None`` when the role is not
    listed under any team in ``team_positions``.
    """
    role_name = str(role)
    for team, roles in team_positions.items():
        if role_name in roles:
            return str(team)
    return None

def get_goal(activity):
    """Classify a player activity as 'Offensive' or 'Defensive'.

    'Blocking' and 'Tackled' are labelled 'Offensive'; any other value
    (including missing ones) is labelled 'Defensive'.
    """
    return 'Offensive' if activity in ('Blocking', 'Tackled') else 'Defensive'

# Derive the play phase (1 or 2) from the player's team and activity
def set_phase(row):
    """Return the phase code (1 or 2) for an injury row.

    Reads ``row['Player_Activity_Derived']`` (classified through ``get_goal``)
    and ``row['Team']``. Coverage players get 1 for an 'Offensive' activity
    and 2 otherwise; every other team value gets the mirrored mapping.
    Later cells label phase 1 as "before the punt" and phase 2 as
    "after the punt".
    """
    is_offensive = get_goal(row['Player_Activity_Derived']) == 'Offensive'
    if row['Team'] == 'Coverage':
        return 1 if is_offensive else 2
    # Return team (and any unrecognised team value)
    return 2 if is_offensive else 1

# Tag every role row with the team returned by set_team (None when unmatched)
df_punt_role['Team'] = df_punt_role.apply(lambda row: set_team(row['Role']), 
                                                axis=1)

# Clean the partner id so it can serve as an integer merge key:
# the 'Unclear' placeholder and missing values both become 0.
df_injury['Primary_Partner_GSISID'] = df_injury.apply(lambda row: 
                                                                  row['Primary_Partner_GSISID'] 
                                                                  if (row['Primary_Partner_GSISID'] != 'Unclear')
                                                                 else 0,
                                                                 axis=1)
df_injury['Primary_Partner_GSISID'] = df_injury['Primary_Partner_GSISID'].fillna(0)
df_injury['Primary_Partner_GSISID'] = df_injury['Primary_Partner_GSISID'].astype(int)

# Identify roles for player and partner
# First merge: role/team of the injured player (matched on GSISID).
df_injury = df_injury.merge(df_punt_role, 
                                  left_on=['GameKey', 'PlayID', 'GSISID'],
                                 right_on=['GameKey', 'PlayID', 'GSISID'],
                                 how='left')

# Second merge: role/team of the primary partner; clashing column names from
# the right side receive the '_Partner' suffix.
# NOTE(review): rows whose partner id was set to 0 presumably match no role
# row, leaving the partner columns empty - confirm against the data.
df_injury = df_injury.merge(df_punt_role, 
                                 suffixes=('', '_Partner'),
                                  left_on=['GameKey', 'PlayID', 'Primary_Partner_GSISID'],
                                 right_on=['GameKey', 'PlayID', 'GSISID'],
                                 how='left')
# The right-hand key duplicates Primary_Partner_GSISID, so drop it
df_injury = df_injury.drop(['GSISID_Partner'], axis=1)
# Phase code per injury row, derived by set_phase from team and activity
df_injury['Phase'] = df_injury.apply(lambda row: 
                                                set_phase(row), 
                                                axis=1)

In [None]:
# Outcome keywords, checked in this order when the play is not a return
results = [
    'downed',
    'fair catch',
    'Touchback',
]


def get_result(row):
    """Classify the outcome of a play from its text description.

    Reads ``row['PlayDescription']`` and returns:
    * 'return' when the description contains the word ' for '
      (e.g. "... to NYG 32 for 12 yards"),
    * otherwise the first entry of ``results`` found in the description,
    * otherwise 'others'.
    """
    description = row['PlayDescription']

    # A substring test accepts exactly the same descriptions as the previous
    # re.search('.* for .*', ...) but avoids its quadratic backtracking.
    if ' for ' in description:
        return 'return'

    for result in results:
        if result in description:
            return result

    return 'others'

# Label every play with its outcome category
df_play_info['Result'] = df_play_info.apply(get_result, axis=1)

### How many Punt plays were there?

In [None]:
# Number of rows (punt plays) in the play information table
print(len(df_play_info))

### How many Punt plays have a corresponding Penalty?

In [None]:
def get_penalty(row):
    """Return 'Yes' if the row's PlayDescription mentions 'PENALTY', else 'No'."""
    return 'Yes' if 'PENALTY' in row['PlayDescription'] else 'No'

# Flag every play whose description mentions a penalty
df_play_info['Penalty'] = df_play_info.apply(get_penalty, axis=1)

# Show info: bars ordered from most to least frequent, then the raw counts
ax = sns.countplot(data=df_play_info,
                   x="Penalty",
                   order=df_play_info['Penalty'].value_counts().index)
ax.set_title('Penalty Count in Punt Plays')
df_play_info['Penalty'].value_counts()

19% (1077 out of 5604) of the Punt plays have violations.

### What are the most common Punt play violations?

In [None]:
def identify_penalty(row):
    """Return the name of the first known penalty found in the play description.

    The names are tried in the order listed below and matched as plain
    substrings of ``row['PlayDescription']``; 'Unknown' is returned when
    none of them occurs.
    """
    known_penalties = (
        'Offensive Holding',
        'Taunting',
        'Disqualification',
        'Running Into the Kicker',
        'Interference',
        'Unnecessary Roughness',
        'Face Mask',
        'Neutral Zone Infraction',
        'Horse Collar Tackle',
        'Ineligible Downfield Kick',
        'Player Out of Bounds on Punt',
        'Defensive 12 On-field',
        'Offensive 12 On-field',
        'Chop Block',
        'Illegal Block Above the Waist',
        'Illegal Blindside Block',
        'Illegal Touch',
        'Illegal Use of Hands',
        'Illegal Substitution',
        'Illegal Formation',
        'Illegal Motion',
        'Illegal Shift',
        'Clipping',
        'Tripping',
        'Invalid Fair Catch Signal',
        'Delay of Game',
        'Defensive Holding',
        'Roughing the Kicker',
        'Unsportsmanlike Conduct',
        'Defensive Offside',
        'False Start',
    )
    matches = (name for name in known_penalties
               if name in row['PlayDescription'])
    return next(matches, 'Unknown')

def has_penalty(row):
    """Return 'Yes' when the row's PlayDescription contains 'PENALTY', otherwise 'No'."""
    flagged = 'PENALTY' in row['PlayDescription']
    return {True: 'Yes', False: 'No'}[flagged]

# Flag plays with a penalty and record the name of the penalty found
df_play_info['Penalty'] = df_play_info.apply(lambda row: has_penalty(row), axis=1)
df_play_info['PenaltyID'] = df_play_info.apply(lambda row: identify_penalty(row), axis=1)
# Keep only the plays whose description contains 'PENALTY'
df_play_info_penalty = df_play_info[df_play_info['Penalty']=='Yes']

# Show info: horizontal bars, most frequent penalty first
sns.set(rc={'figure.figsize':(8,9)})
ax = sns.countplot(y="PenaltyID", 
                   order=df_play_info_penalty['PenaltyID'].value_counts().index,
                   data=df_play_info_penalty)
ax.set_title('Penalty Frequency')

There are already quite a lot of penalties in place during Punt plays. Among the most common are 'Offensive Holding' and 'Illegal Block Above the Waist'.

#### Offensive Holding:


#### Illegal Block Above the Waist:
The [Illegal Block Above the Waist](http://insidethepylon.com/football-101/glossary-football-101/2016/09/28/itp-glossary-illegal-block-back/) is defined as:
> An illegal block in the back penalty (officially known as illegal block above the waist) is called when a player makes contact with an opposing player, who does not have the ball, above the waist from behind. A block in the back penalty costs the team (generally on offense or receiving a kick / punt) ten yards from the spot of the foul.

Such offense is common because:
1. The Coverage team runs straight to the PR
2. The PR is behind the Return team. The Return team must turn around.
By the time the Return team turns around, they would be behind the Coverage team, who just ran past their defense.

Other terms such as Offensive Holding are defined in the [NFL Rulebook](https://operations.nfl.com/media/2646/2017-playing-rules.pdf).

#### Interference Rule:
The interesting rule to note is the Interference rule. It protects the player from getting tackled while he is distracted watching out for the ball. It is an essential rule in a Punt Play because the PR is a dedicated player to catch a kicked ball. The downside of the rule, however, is the immediate removal of protection once the ball is caught (unless he signaled a fair catch). This does not give the PR sufficient time to adjust to his surroundings.

Let us check if penalties and injuries have some relationship.

## Penalty Injuries:

### How many punt play injuries have a corresponding Penalty?

In [None]:
# Attach the play-level columns of df_play_info to every injury row
df_injury_plays = df_injury.merge(df_play_info, 
                                  left_on=['GameKey', 'PlayID'],
                                 right_on=['GameKey', 'PlayID'],
                                 how='left')
# Recompute the penalty flag from the merged PlayDescription
df_injury_plays['Penalty'] = df_injury_plays.apply(lambda row: get_penalty(row), axis=1)
# Injury rows whose play carried a penalty
df_injury_penalty = df_injury_plays[df_injury_plays['Penalty']=='Yes']
print('Penalty count: ' + str(len(df_injury_penalty)))

In 10 out of 37 injuries, there are already existing rules for some injury incidents. The penalties indicate that a number of the injuries could be prevented if the deterring rules were followed. However, the penalties do not completely deter the players from performing certain moves.

### What are the penalties given to plays with injuries?

In [None]:
# Set the default figure size for later plots, then list how often each
# penalty appears among the injury plays (no chart is drawn in this cell)
sns.set(rc={'figure.figsize':(6,4)})
df_injury_penalty['PenaltyID'].value_counts()

The most common violations are Illegal Block Above the Waist, Offensive Holding, and Interference.

### What is the risk of injury for each penalty violation?

In [None]:
# Occurrences of each penalty over all plays vs. over injury plays.
# rename_axis('index') pins the label column to 'index' on every pandas
# version (pandas >= 2.0 would otherwise name it after the source column,
# which breaks the y='index' lookup below).
df_total = (df_play_info['PenaltyID'].value_counts()
            .rename_axis('index').reset_index(name='total'))
df_injury_total = (df_injury_penalty['PenaltyID'].value_counts()
                   .rename_axis('index').reset_index(name='injured'))

# Right join: keep only the penalties that occur in at least one injury play
df_merged = df_total.merge(df_injury_total, on='index', how='right')
df_merged = df_merged.fillna(0)
# Percentage of the penalty's occurrences that involved an injury
df_merged['ratio'] = 100*df_merged['injured'] / df_merged['total']

ax = sns.barplot(x='ratio', y='index',
                 data=df_merged)
ax.set_title('Penalty Injury Risk')

Illegal Blindside Block has 10% injury rate. This makes sense because the blindside catches the target unprepared and consequently, unable to brace for impact.

The interference penalty has a 7% injury rate. This means that about 7 out of every 100 violations involve an injury. The rule is intentionally created to protect the receiver as he diverts his attention from the players and focuses on catching the ball. However, there are some limitations to the rule. The player cannot be tackled while the ball is in the air, but once it touches his hands, the protection is immediately removed. There is no adjustment period between catching and either running with the ball or passing the ball. The video reviews will show that in some cases, the injury occurred right after the ball is caught because the other players are just waiting for the ball to fall into the receiver's hands.

### What are the common play outcomes for those with penalties?

In [None]:
# Penalty name (rows) against play outcome (columns) for injury plays with penalties
df_cross_injured = pd.crosstab(
    index=df_injury_penalty['PenaltyID'],
    columns=df_injury_penalty['Result'],
).fillna(0)

ax = sns.heatmap(df_cross_injured, annot=True, fmt='.2g')
ax.set_title('Penalty-Event')

Most injuries came from the runbacks of the Return team. Interestingly, there are injuries from interference even if it was declared a fair catch.

### What is the common activity of the injured?

In [None]:
# Play outcome (rows) against the injured player's activity (columns)
df_cross_injured = pd.crosstab(
    index=df_injury_penalty['Result'],
    columns=df_injury_penalty['Player_Activity_Derived'],
).fillna(0)

ax = sns.heatmap(df_cross_injured, annot=True, fmt='.2g')
ax.set_title('Event-Activity')

Checking the activity, the fair catch injury comes from blocking.

### Game Phases

### What Phase has the most penalties?

In [None]:
# Phase code (rows) against the penalty flag (columns) for injury plays with penalties
df_cross_injured = pd.crosstab(
    index=df_injury_penalty['Phase'],
    columns=df_injury_penalty['Penalty'],
).fillna(0)

ax = sns.heatmap(df_cross_injured, annot=True, fmt='.2g')
ax.set_title('Phase-Penalty Frequency')

There are only 2 injury penalties before the punt. However, there are only 4 injuries in the first phase. In other words, half of the injuries before the punt involve violations.

### What are the common violations before the kick that lead to an injury?

In [None]:
# Injury plays with penalties that fall in phase 1, and their penalty counts
df_injury_penalty_phase1 = df_injury_penalty[df_injury_penalty['Phase']==1]
df_injury_penalty_phase1['PenaltyID'].value_counts()

### What is the play event and activity for injuries before the punt?

In [None]:
# Play outcome against player activity, phase-1 injury plays with penalties
df_cross_injured = pd.crosstab(
    index=df_injury_penalty_phase1['Result'],
    columns=df_injury_penalty_phase1['Player_Activity_Derived'],
).fillna(0)

ax = sns.heatmap(df_cross_injured, annot=True, fmt='.2g')
ax.set_title('Event-Activity Before the Punt\n(With Penalties)')

### What is the impact type and activity pairs of the penalty injuries before the punt?

In [None]:
# Phase-1 subset of the injury plays that carried a penalty
df_injury_plays_phase1 = df_injury_penalty[df_injury_penalty['Phase'] == 1]
# Impact type (rows) against player activity (columns)
df_cross_injured = pd.crosstab(
    index=df_injury_plays_phase1['Primary_Impact_Type'],
    columns=df_injury_plays_phase1['Player_Activity_Derived'],
).fillna(0)

ax = sns.heatmap(df_cross_injured, annot=True, fmt='.2g')
ax.set_title('Impact Type-Activity Before the Punt\n(With Penalties)')

### What are the common penalties after the kick that lead to an injury?

In [None]:
# Injury plays with penalties that fall in phase 2, and their penalty counts
df_injury_penalty_phase2 = df_injury_penalty[df_injury_penalty['Phase']==2]
df_injury_penalty_phase2['PenaltyID'].value_counts()

### What is the play event and activity for injuries after the punt?

In [None]:
# Play outcome against player activity, phase-2 injury plays with penalties
df_cross_injured = pd.crosstab(
    index=df_injury_penalty_phase2['Result'],
    columns=df_injury_penalty_phase2['Player_Activity_Derived'],
).fillna(0)

ax = sns.heatmap(df_cross_injured, annot=True, fmt='.2g')
ax.set_title('Event-Activity After the Punt\n(With Penalties)')

### What is the common combination of impact type and player activity for the injury plays with penalties?

In [None]:
# Impact type against player activity, phase-2 injury plays with penalties
df_cross_injured = pd.crosstab(
    index=df_injury_penalty_phase2['Primary_Impact_Type'],
    columns=df_injury_penalty_phase2['Player_Activity_Derived'],
).fillna(0)

ax = sns.heatmap(df_cross_injured, annot=True, fmt='.2g')
ax.set_title('Impact Type-Activity After the Punt\n(With Penalties)')

We observe two important details:
- Helmet-to-body injuries come from tackle activities
- Helmet-to-helmet injuries come from blocking activities. This is most likely because the players are facing each other.

## No Penalty

### How many injuries have no penalties?

In [None]:
# Injury rows whose play description carries no 'PENALTY' marker, and their count
df_injury_no_penalty = df_injury_plays[df_injury_plays['Penalty']=='No']
len(df_injury_no_penalty)

### What are the common play outcomes for those without penalties?

In [None]:
# Outcome counts for the injury plays without penalties (tabular output, no chart)
df_injury_no_penalty['Result'].value_counts()

### What is the play event and activity for injuries after the punt (no penalties)?

In [None]:
# Play outcome against player activity, injury plays without penalties
df_cross_injured = pd.crosstab(
    index=df_injury_no_penalty['Result'],
    columns=df_injury_no_penalty['Player_Activity_Derived'],
).fillna(0)

ax = sns.heatmap(df_cross_injured, annot=True, fmt='.2g')
ax.set_title('Event-Activity\n(No Penalties)')

In [None]:
# Phase-1 subset of the injury plays without penalties
df_injury_no_penalty_phase1 = df_injury_no_penalty[df_injury_no_penalty['Phase'] == 1]
# Play outcome (rows) against player activity (columns)
df_cross_injured = pd.crosstab(
    index=df_injury_no_penalty_phase1['Result'],
    columns=df_injury_no_penalty_phase1['Player_Activity_Derived'],
).fillna(0)

ax = sns.heatmap(df_cross_injured, annot=True, fmt='.2g')
ax.set_title('Event-Activity Before the Punt\n(No Penalties)')

In [None]:
# Phase-2 subset of the injury plays without penalties
df_injury_no_penalty_phase2 = df_injury_no_penalty[df_injury_no_penalty['Phase'] == 2]
# Play outcome (rows) against player activity (columns)
df_cross_injured = pd.crosstab(
    index=df_injury_no_penalty_phase2['Result'],
    columns=df_injury_no_penalty_phase2['Player_Activity_Derived'],
).fillna(0)

ax = sns.heatmap(df_cross_injured, annot=True, fmt='.2g')
ax.set_title('Event-Activity After the Punt\n(No Penalties)')

### What is the most common combination of activity and impact type for those without penalties?

In [None]:
# Impact type against player activity, injury plays without penalties
df_cross_injured = pd.crosstab(
    index=df_injury_no_penalty['Primary_Impact_Type'],
    columns=df_injury_no_penalty['Player_Activity_Derived'],
).fillna(0)

sns.heatmap(df_cross_injured, annot=True, fmt='.1g')

An important aspect of separating the penalty-based injuries is that specific action-based injuries are eliminated from the dataset.

There is a new NFL rule not yet considered in the dataset.
The Helmet Rule states the following:
> It is a foul if a player lowers his head to initiate and make contact with his helmet against an opponent. 
>-[Article 8: Use of Helmet](https://operations.nfl.com/the-rules/nfl-video-rulebook/use-of-the-helmet/)

If we consider the new helmet rule, all active roles such as blocking and tackling will be covered. However, there is still a need to account for passive injuries.

### How many passive injuries have no penalties?

In [None]:
# "Passive" injuries: the injured player's activity is 'Blocked' or 'Tackled'
df_injury_np_passive = df_injury_no_penalty[(df_injury_no_penalty['Player_Activity_Derived']=='Blocked') |
                                            (df_injury_no_penalty['Player_Activity_Derived']=='Tackled')]
len(df_injury_np_passive)

### What is the most common impact type of the passive non-penalty injuries?

In [None]:
# Impact type counts for the passive non-penalty injuries (tabular output, no chart)
df_injury_np_passive['Primary_Impact_Type'].value_counts()

Helmet-to-helmet impact types account for 64% (7 out of 11) of the passive non-penalty injuries. Consequently, the identification of how such injuries occur can reduce the overall injuries without existing penalty rules.

## Impact Type and Position
Let us check if the player position has any effect on the impact type. For this analysis, I included all injury plays (both with and without penalties).

In [None]:
def get_side(role):
    """Return the formation side of a role: 'left', 'right' or 'center'.

    Roles that appear in none of the groups below (including missing
    values) yield an empty string.
    """
    role_groups = (
        ('left', ('GL', 'GLo', 'GLi', 'PLW', 'PLT', 'PLG',
                  'VL', 'VLo', 'VLi',
                  'PDL1', 'PDL2', 'PDL3', 'PDL4', 'PDL5', 'PDL6',
                  'PLL', 'PLL1', 'PLL2', 'PLL3', 'PLLi',
                  'PPL', 'PPLo', 'PPLi')),
        ('right', ('GR', 'GRo', 'GRi', 'PRG', 'PRT', 'PRW',
                   'VR', 'VRo', 'VRi',
                   'PDR1', 'PDR2', 'PDR3', 'PDR4', 'PDR5', 'PDR6',
                   'PLR', 'PLR1', 'PLR2', 'PLR3',
                   'PPR', 'PPRo', 'PPRi')),
        ('center', ('PLS', 'PC', 'P', 'PDM',
                    'PLM', 'PLM1',
                    'PFB', 'PR')),
    )
    for side, roles in role_groups:
        if role in roles:
            return side
    return ''

# Formation side of the injured player and of the primary partner
# (empty string when the role is missing or not recognised by get_side)
df_injury_plays['Player_Side'] = df_injury_plays.apply(lambda row: get_side(row['Role']), axis=1)
df_injury_plays['Partner_Side'] = df_injury_plays.apply(lambda row: get_side(row['Role_Partner']), axis=1)

def get_facing(row):
    """Describe how player and partner line up relative to each other.

    Reads ``row['Player_Side']`` and ``row['Partner_Side']`` and returns
    'Yes' for left/right, right/left or center/center pairs (facing each
    other), 'Off-center' when exactly one of them is 'center', and 'No'
    in every other case.
    """
    pair = (row['Player_Side'], row['Partner_Side'])
    facing_pairs = (('left', 'right'), ('right', 'left'), ('center', 'center'))
    if pair in facing_pairs:
        return 'Yes'  # Facing each other
    if 'center' in pair:
        return 'Off-center'
    return 'No'
    
# Classify each injury row by how the two players line up in the formation
df_injury_plays['Facing'] = df_injury_plays.apply(get_facing, axis=1)

### What are the common impact types according to whether the players are directly facing each other in the formation?

In [None]:
# Impact type (rows) against the facing classification (columns), all injury plays
df_cross_injured = pd.crosstab(
    index=df_injury_plays['Primary_Impact_Type'],
    columns=df_injury_plays['Facing'],
).fillna(0)

sns.heatmap(df_cross_injured, annot=True, fmt='.2g')

Note that the left side of the coverage team is the right side of the return team.

Observe the helmet-to-helmet injuries:
- The players tend to be facing each other in the formation. 
- In off-center situations, one of the players has a center position, but it would be easy for them to move either left or right. 
- There is only 1 helmet-to-helmet injury for those on the opposite sides of the formation.

This means that the helmet-to-helmet injuries tend to be a head-on collision much like horned animals charging at each other.

![Helmet-to-helmet](http://www.animatedimages.org/data/media/164/animated-american-football-image-0032.gif)

There are various common forms that expose the head to injuries:
1. Blocking Form

> Based on the analysis on Game Mechanics, most of the coverage formation blocking injuries (before the kick) are helmet-to-helmet collisions. In the initial phase of the play, the opponents are facing each other as the blocking occurs. However, the blocking forms may have an influence in the impact type.
    ![Wrestling](http://www.animatedimages.org/data/media/164/animated-american-football-image-0015.gif)
The Greco-Roman wrestling form best illustrates the form used in blocking defense. This form, however, naturally exposes the head to injuries. In one of the blocking injuries, there were two defenders who accidentally bumped into each other as they were double-teaming an opponent. At the 0:02 mark of the video, we can see that all the heads of the people involved are leaning forward. This seems to be the case in most blocking or blocked injuries, even in one-on-one face-offs.
Since the concussion rates from such wrestling forms are beyond the given dataset, I will just refer to the following articles:
- [Wrestling's the most dangerous sport for concussions](https://www.athleticscholarships.net/concussion-wrestling-football-college.htm)
- [Studies shine harsh light on wrestling's concussion rate, impact on athletes](https://www.chicagotribune.com/news/ct-wrestling-concussions-met-20160110-story.html)
- [Control The Head: Combating Concussions In Wrestling](https://woub.org/2014/04/14/control-head-combating-concussions-wrestling/)

2. Running Form
>Based on the analysis on Game Mechanics, most of the gunner's injuries are helmet-to-helmet collisions. In the Collision Pairs analysis, the gunner's injuries occurred at high speeds of more than 20kph (in 4/5 injuries).
![Running](http://www.animatedimages.org/data/media/164/animated-american-football-image-0071.gif)
The standard running form has some forward lead that exposes the head. When people are running, they both expose their heads to anyone in front of them. Again, the appropriate field of study for this is kinematics.

Identification of which forms cause more injuries would require consolidation with non-punt related injuries.

## Season Types
### What is the ratio of injuries per season?

In [None]:
# Punt plays per season type vs. injury plays per season type.
# rename_axis('index') pins the label column to 'index' on every pandas
# version (pandas >= 2.0 would otherwise name it after the source column,
# which breaks the y='index' lookup below).
df_total = (df_play_info['Season_Type'].value_counts()
            .rename_axis('index').reset_index(name='total'))
df_injury_total = (df_injury_plays['Season_Type'].value_counts()
                   .rename_axis('index').reset_index(name='injured'))

# Right join: keep only season types with at least one injury play
df_merged = df_total.merge(df_injury_total, on='index', how='right')
df_merged = df_merged.fillna(0)
# Percentage of plays in the season type that had an injury
df_merged['ratio'] = 100*df_merged['injured'] / df_merged['total']
# Complement of that percentage (previously subtracted the raw count)
df_merged['ratio_safe'] = 100 - df_merged['ratio']

sns.barplot(x='ratio', y='index',
            data=df_merged)

There is a proportionally higher injury rate during the pre-season than in the regular season. Let us check the penalty rates per season.
### What is the ratio of penalties per season?

In [None]:
# Plays whose description carries a penalty
df_penalty = df_play_info[df_play_info['Penalty']=='Yes']

# Punt plays per season type vs. penalty plays per season type.
# rename_axis('index') pins the label column to 'index' on every pandas
# version (pandas >= 2.0 would otherwise name it after the source column,
# which breaks the y='index' lookup below).
# The 'injured' column holds the penalty-play count in this cell.
df_total = (df_play_info['Season_Type'].value_counts()
            .rename_axis('index').reset_index(name='total'))
df_injury_total = (df_penalty['Season_Type'].value_counts()
                   .rename_axis('index').reset_index(name='injured'))

# Right join: keep only season types with at least one penalty play
df_merged = df_total.merge(df_injury_total, on='index', how='right')
df_merged = df_merged.fillna(0)
# Percentage of plays in the season type that had a penalty
df_merged['ratio'] = 100*df_merged['injured'] / df_merged['total']
# Complement of that percentage (previously subtracted the raw count)
df_merged['ratio_safe'] = 100 - df_merged['ratio']

sns.barplot(x='ratio', y='index',
            data=df_merged)

There is a higher penalty rate for the pre-season. This means that the players are less mindful of the rules during the pre-season. Let us check if the seasonal penalties are related with the injuries.

### For the plays with injuries, what is the ratio of penalties per season?

In [None]:
# Injury plays whose description carries a penalty
df_penalty = df_injury_plays[df_injury_plays['Penalty']=='Yes']

# Injury plays per season type vs. injury plays with a penalty per season type.
# rename_axis('index') pins the label column to 'index' on every pandas
# version (pandas >= 2.0 would otherwise name it after the source column,
# which breaks the y='index' lookup below).
# The 'injured' column holds the count of injury plays with a penalty here.
df_total = (df_injury_plays['Season_Type'].value_counts()
            .rename_axis('index').reset_index(name='total'))
df_injury_total = (df_penalty['Season_Type'].value_counts()
                   .rename_axis('index').reset_index(name='injured'))

# Right join: keep only season types with at least one penalised injury play
df_merged = df_total.merge(df_injury_total, on='index', how='right')
df_merged = df_merged.fillna(0)
# Percentage of injury plays in the season type that also had a penalty
df_merged['ratio'] = 100*df_merged['injured'] / df_merged['total']
# Complement of that percentage (previously subtracted the raw count)
df_merged['ratio_safe'] = 100 - df_merged['ratio']

sns.barplot(x='ratio', y='index',
            data=df_merged)

The pre-season has a higher ratio of penalties for the plays with injuries. This indicates that the rampant rule violations during the pre-season are a leading cause of injuries during the pre-season.