In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [3]:
games = pd.read_csv('../../Dataset/games.csv')
players = pd.read_csv('../../Dataset/players.csv')
plays = pd.read_csv('../../Dataset/plays.csv')
tackles = pd.read_csv('../../Dataset/tackles.csv')

files=[]
for i in range(1,10):
    file='../../Dataset/tracking_week_'+str(i)+'.csv'
    files.append(pd.read_csv(file))
    tracking=pd.concat(files)

公制化转换

In [4]:
def metric_transform(players):
    players['height'] = players['height'].apply(lambda x: int(x.split('-')[0]) * 12 + int(x.split('-')[1]))
    players['height'] = players['height'] * 2.54  # 转换为厘米
    players['weight'] = players['weight'] * 0.453592  # 转换为千克

def convert_to_metric_units(df):
    yards_to_meters = 0.9144  # 码米转换
    df['s'] = df['s'] * yards_to_meters
    df['a'] = df['a'] * yards_to_meters
    return df

metric_transform(players)
convert_to_metric_units(tracking)

players.head()
tracking.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
0,2022090800,56,35472.0,Rodger Saffold,1,2022-09-08 20:24:05.200000,76.0,BUF,left,88.37,27.27,1.481328,1.05156,0.16,231.74,147.9,
1,2022090800,56,35472.0,Rodger Saffold,2,2022-09-08 20:24:05.299999,76.0,BUF,left,88.47,27.13,1.527048,0.557784,0.17,230.98,148.53,pass_arrived
2,2022090800,56,35472.0,Rodger Saffold,3,2022-09-08 20:24:05.400000,76.0,BUF,left,88.56,27.01,1.435608,0.448056,0.15,230.98,147.05,
3,2022090800,56,35472.0,Rodger Saffold,4,2022-09-08 20:24:05.500000,76.0,BUF,left,88.64,26.9,1.316736,0.813816,0.14,232.38,145.42,
4,2022090800,56,35472.0,Rodger Saffold,5,2022-09-08 20:24:05.599999,76.0,BUF,left,88.72,26.8,1.179576,1.133856,0.13,233.36,141.95,


In [5]:
"""
标准化方向数据，统一参考系，向左的都向右。
"""
def reverse_deg(deg):
    if deg < 180:
        return deg + 180
    if deg >= 180:
        return deg - 180
    
tracking["o_standard"]=np.where(tracking["playDirection"] == "left", tracking["o"].apply(reverse_deg), tracking["o"])
        
tracking["dir_standard"] = np.where(tracking["playDirection"] == "left", tracking["dir"].apply(reverse_deg), tracking["dir"])
        
tracking["x_standard"] = np.where(tracking["playDirection"] == "left", tracking["x"].apply(lambda x: 120 - x), tracking["x"])
        
tracking["y_standard"] =np.where(tracking["playDirection"] == "left",  tracking["y"].apply(lambda y: 160/3 - y), tracking["y"])

In [6]:
"""
plays新建一个'gameplayid'属性，gameId + playId
"""
def create_gameplayid(df):
    df['gameplayid']=df['gameId'].astype(str)+df['playId'].astype(str)
    return df

"""
tracking和tackles新建一个'index'属性，nflId + gameplayid
"""
def create_index(df):
    df['index']=df['nflId'].astype(str)+df['gameplayid']
    return df

tracking=create_index(create_gameplayid(tracking))
plays=create_gameplayid(plays)
tackles=create_index(create_gameplayid(tackles))

In [7]:
"""
统计表格信息
"""
def generate_summary_table(data):
    summary_table = pd.DataFrame(columns=['Column', 'Data Type', 'Missing Values', 'Missing %', 'Unique Values', 'Min', 'Max', 'Mean', 'Median'])
    
    for column in data.columns:
        data_type = str(data[column].dtype)
        
        missing_values = data[column].isnull().sum()
        missing_percentage = (missing_values / len(data)) * 100
        
        if data[column].dtype == 'object':
            min_value, max_value, mean_value, median_value = '', '', '', ''
        else:
            min_value = data[column].min()
            max_value = data[column].max()
            mean_value = data[column].mean()
            median_value = data[column].median()
        
        unique_values = data[column].nunique()
        
        summary_table = summary_table.append({
            'Column': column,
            'Data Type': data_type,
            'Missing Values': missing_values,
            'Missing %': f'{missing_percentage:.2f}%',
            'Unique Values': unique_values,
            'Min': min_value,
            'Max': max_value,
            'Mean': mean_value,
            'Median': median_value
        }, ignore_index=True)
    
    formatted_table = summary_table.style.set_properties(**{'text-align': 'center'})
    
    display(formatted_table)
    
    return summary_table

In [8]:
df_list = [games, plays, players, tackles, tracking]
df_names = ['games', 'plays', 'players', 'tackles', 'tracking']
for df, df_name in zip(df_list, df_names):
    print(f'{df_name}统计信息：')
    summary = generate_summary_table(df)

games统计信息：


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,gameId,int64,0,0.00%,136,2022090800.0,2022110700.0,2022098922.117647,2022100902.5
1,season,int64,0,0.00%,1,2022.0,2022.0,2022.0,2022.0
2,week,int64,0,0.00%,9,1.0,9.0,4.845588,5.0
3,gameDate,object,0,0.00%,27,,,,
4,gameTimeEastern,object,0,0.00%,8,,,,
5,homeTeamAbbr,object,0,0.00%,32,,,,
6,visitorTeamAbbr,object,0,0.00%,32,,,,
7,homeFinalScore,int64,0,0.00%,38,3.0,49.0,22.669118,22.5
8,visitorFinalScore,int64,0,0.00%,35,0.0,48.0,20.948529,20.0


plays统计信息：


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,gameId,int64,0,0.00%,136,2022090800.0,2022110700.0,2022098953.855598,2022100903.0
1,playId,int64,0,0.00%,3974,54.0,5096.0,1986.603476,1990.5
2,ballCarrierId,int64,0,0.00%,480,25511.0,55158.0,48072.271664,47789.0
3,ballCarrierDisplayName,object,0,0.00%,480,,,,
4,playDescription,object,0,0.00%,12486,,,,
5,quarter,int64,0,0.00%,5,1.0,5.0,2.550136,3.0
6,down,int64,0,0.00%,4,1.0,4.0,1.727054,2.0
7,yardsToGo,int64,0,0.00%,32,1.0,38.0,8.469085,10.0
8,possessionTeam,object,0,0.00%,32,,,,
9,defensiveTeam,object,0,0.00%,32,,,,


players统计信息：


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,nflId,int64,0,0.00%,1683,25511.0,55241.0,48221.702317,47872.0
1,height,float64,0,0.00%,16,167.64,205.74,188.604433,190.5
2,weight,float64,0,0.00%,179,69.399576,172.36496,111.458578,107.047712
3,birthDate,object,479,28.46%,985,,,,
4,collegeName,object,0,0.00%,226,,,,
5,position,object,0,0.00%,19,,,,
6,displayName,object,0,0.00%,1672,,,,


tackles统计信息：


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,gameId,int64,0,0.00%,136,2022090800.0,2022110700.0,2022098971.441123,2022100903.0
1,playId,int64,0,0.00%,3943,54.0,5096.0,1982.974578,1991.0
2,nflId,int64,0,0.00%,800,33131.0,55241.0,47602.719442,46669.0
3,tackle,int64,0,0.00%,2,0.0,1.0,0.569207,1.0
4,assist,int64,0,0.00%,2,0.0,1.0,0.315276,0.0
5,forcedFumble,int64,0,0.00%,2,0.0,1.0,0.005681,0.0
6,pff_missedTackle,int64,0,0.00%,2,0.0,1.0,0.119936,0.0
7,gameplayid,object,0,0.00%,12025,,,,
8,index,object,0,0.00%,17426,,,,


tracking统计信息：


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,gameId,int64,0,0.00%,136,2022090800.0,2022110700.0,2022099002.717871,2022100904.0
1,playId,int64,0,0.00%,3974,54.0,5096.0,1981.028187,1997.0
2,nflId,float64,530060,4.35%,1683,25511.0,55241.0,47459.700224,46521.0
3,displayName,object,0,0.00%,1673,,,,
4,frameId,int64,0,0.00%,164,1.0,164.0,25.511774,23.0
5,time,object,0,0.00%,488149,,,,
6,jerseyNumber,float64,530060,4.35%,99,1.0,99.0,48.947584,52.0
7,club,object,0,0.00%,33,,,,
8,playDirection,object,0,0.00%,2,,,,
9,x,float64,0,0.00%,22592,-3.55,122.96,60.413032,60.83


In [9]:
"""
合并数据
"""
def merge_clean_data(tracking, df_plays, df_tackles):
    # 合并
    merged_df = pd.merge(
        pd.merge(tracking, df_plays, on=['gameplayid', 'gameId', 'playId'], how='inner'),
        df_tackles,
        on=['gameplayid', 'gameId', 'playId', 'nflId'],
        how='inner'
    )
    
    # 创建‘event’列
    for index, row in merged_df.iterrows():
        if row['tackle'] == 1:
            merged_df.at[index, 'event'] = 'tackle'
        elif row['pff_missedTackle'] == 1:
            merged_df.at[index, 'event'] = 'missed_tackle'
        elif row['assist'] == 1:
            merged_df.at[index, 'event'] = 'assist'
        else:
            merged_df.at[index, 'event'] = 'Other'

    
    # 再次合并
    df_tackles_final = pd.merge(
        merged_df,
        df_tackles,
        on=['gameplayid', 'gameId', 'playId', 'nflId', 'tackle', 'assist', 'pff_missedTackle'],
        how='inner',
        suffixes=['_1', '_2']
    )
    
    # 过滤行并创建'index'列
    index_values = []
    for index, row in df_tackles_final.iterrows():
        # 过滤全零
        if (row['tackle'] + row['assist'] + row['pff_missedTackle']) >= 1:
            # 过滤空id
            if not pd.isna(row['gameplayid']):
                # 计算index
                index_value = str(row['nflId']) + str(int(row['gameplayid']))
                index_values.append(index_value)

    # 将index加入df
    df_tackles_final['index'] = index_values
    
    return df_tackles_final


df_tackles_final=merge_clean_data(tracking, plays, tackles)

  if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():


In [10]:
df_tackles_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 804159 entries, 0 to 804158
Data columns (total 63 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   gameId                            804159 non-null  int64  
 1   playId                            804159 non-null  int64  
 2   nflId                             804159 non-null  float64
 3   displayName                       804159 non-null  object 
 4   frameId                           804159 non-null  int64  
 5   time                              804159 non-null  object 
 6   jerseyNumber                      804159 non-null  float64
 7   club                              804159 non-null  object 
 8   playDirection                     804159 non-null  object 
 9   x                                 804159 non-null  float64
 10  y                                 804159 non-null  float64
 11  s                                 804159 non-null  f

In [11]:
"""
将player中是持球人信息提出来，作为新的df
"""
def create_ballcarrier_dataframe(df_players, tracking):
    # 提取ballCarriers信息
    df_ballcarrier_players = df_players.copy()
    df_ballcarrier_players.columns = [
        'nflId_ballcarrier', 'ballCarrierHeight', 'ballCarrierWeight',
        'ballcarrierBirth', 'ballcarrierCollege', 'ballcarrierPosition',
        'ballCarrierDisplayName'
    ]

    # 选择tracking中的相关列
    df_tracking_ballcarrier = tracking[[
        'gameplayid', 'frameId', 'nflId', 'x', 'y', 's', 'a', 'dis', 'o', 'dir',
        'event', 'o_standard', 'dir_standard', 'x_standard', 'y_standard'
    ]]

    # 合并players和tracking
    df_ballcarrier = pd.merge(df_tracking_ballcarrier, df_ballcarrier_players, left_on='nflId', right_on='nflId_ballcarrier')

    # 选择所需的列
    df_ballcarrier = df_ballcarrier[[
        'gameplayid', 'frameId', 'x', 'y', 's', 'a', 'dis', 'o', 'dir', 'event',
        'o_standard', 'dir_standard', 'x_standard', 'y_standard',
        'nflId_ballcarrier', 'ballCarrierHeight', 'ballCarrierWeight',
        'ballcarrierBirth', 'ballcarrierCollege', 'ballcarrierPosition',
        'ballCarrierDisplayName'
    ]]

    # 重命名列名
    df_ballcarrier.columns = [
        'gameplayid', 'frameId', 'x_ballcarrier', 'y_ballcarrier',
        's_ballcarrier', 'a_ballcarrier', 'dis_ballcarrier', 'o_ballcarrier',
        'dir_ballcarrier', 'event_ballcarrier', 'o_standard_ballcarrier',
        'dir_standard_ballcarrier', 'x_standard_ballcarrier',
        'y_standard_ballcarrier', 'nflId_ballcarrier', 'ballCarrierHeight',
        'ballCarrierWeight', 'ballcarrierBirth', 'ballcarrierCollege',
        'ballcarrierPosition', 'ballCarrierDisplayName'
    ]

    return df_ballcarrier

df_ballcarrier=create_ballcarrier_dataframe(players, tracking)
df_ballcarrier.head()

Unnamed: 0,gameplayid,frameId,x_ballcarrier,y_ballcarrier,s_ballcarrier,a_ballcarrier,dis_ballcarrier,o_ballcarrier,dir_ballcarrier,event_ballcarrier,...,dir_standard_ballcarrier,x_standard_ballcarrier,y_standard_ballcarrier,nflId_ballcarrier,ballCarrierHeight,ballCarrierWeight,ballcarrierBirth,ballcarrierCollege,ballcarrierPosition,ballCarrierDisplayName
0,202209080056,1,88.37,27.27,1.481328,1.05156,0.16,231.74,147.9,,...,327.9,31.63,26.063333,35472,195.58,147.4174,1988-06-06,Indiana,G,Rodger Saffold
1,202209080056,2,88.47,27.13,1.527048,0.557784,0.17,230.98,148.53,pass_arrived,...,328.53,31.53,26.203333,35472,195.58,147.4174,1988-06-06,Indiana,G,Rodger Saffold
2,202209080056,3,88.56,27.01,1.435608,0.448056,0.15,230.98,147.05,,...,327.05,31.44,26.323333,35472,195.58,147.4174,1988-06-06,Indiana,G,Rodger Saffold
3,202209080056,4,88.64,26.9,1.316736,0.813816,0.14,232.38,145.42,,...,325.42,31.36,26.433333,35472,195.58,147.4174,1988-06-06,Indiana,G,Rodger Saffold
4,202209080056,5,88.72,26.8,1.179576,1.133856,0.13,233.36,141.95,,...,321.95,31.28,26.533333,35472,195.58,147.4174,1988-06-06,Indiana,G,Rodger Saffold


In [12]:
def merge_tackles_and_ballcarriers(df_tackles_final, players, df_ballcarrier):
    # 提取Tacklers信息
    df_players_tackler = players.copy()
    df_players_tackler.columns = [
        'nflId_tackler', 'tacklerHeight', 'tacklerWeight', 'tacklerBirth',
        'tacklerCollege', 'tacklerPosition', 'tacklerDisplayName'
    ]

    # 合并 df_tackles_final 与 df_players_tackler
    df_tackles_final = pd.merge(
        df_tackles_final, df_players_tackler,
        left_on='nflId', right_on='nflId_tackler'
    )

    # 合并 df_tackles_final 与 df_ballcarrier
    df_full = pd.merge(
        df_tackles_final, df_ballcarrier,
        left_on=['gameplayid', 'ballCarrierDisplayName', 'frameId'],
        right_on=['gameplayid', 'ballCarrierDisplayName', 'frameId']
    )

    # 重命名列名
    df_full.rename(columns={
        'nflId': 'nflId_tackler',
        'x': 'x_tackler',
        'y': 'y_tackler',
        's': 's_tackler',
        'a': 'a_tackler',
        'dis': 'dis_tackler',
        'o': 'o_tackler',
        'dir': 'dir_tackler',
        'event': 'event_tackler',
        'o_standard': 'o_standard_tackler',
        'dir_standard': 'dir_standard_tackler',
        'x_standard': 'x_standard_tackler',
        'y_standard': 'y_standard_tackler'
    }, inplace=True)

    return df_full

df_full=merge_tackles_and_ballcarriers(df_tackles_final, players, df_ballcarrier)
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 804159 entries, 0 to 804158
Data columns (total 88 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   gameId                            804159 non-null  int64  
 1   playId                            804159 non-null  int64  
 2   nflId_tackler                     804159 non-null  float64
 3   displayName                       804159 non-null  object 
 4   frameId                           804159 non-null  int64  
 5   time                              804159 non-null  object 
 6   jerseyNumber                      804159 non-null  float64
 7   club                              804159 non-null  object 
 8   playDirection                     804159 non-null  object 
 9   x_tackler                         804159 non-null  float64
 10  y_tackler                         804159 non-null  float64
 11  s_tackler                         804159 non-null  f

In [13]:
"""
研究攻防球员的身体动态和静态属性、距离
"""
def create_motion_features(df_full):
    force_carrier = df_full['ballCarrierWeight'] * df_full['a_ballcarrier']
    force_tackler = df_full['tacklerWeight'] * df_full['a_tackler']

    # 动态(公制单位)
    df_full['force_ballcarrier'] = force_carrier
    df_full['force_tackler'] = force_tackler
    df_full['momentum_ballcarrier'] = df_full['ballCarrierWeight'] * df_full['s_ballcarrier']
    df_full['momentum_tackler'] = df_full['tacklerWeight'] * df_full['s_tackler']
    df_full['power_ballcarrier'] = force_carrier * df_full['s_ballcarrier']
    df_full['power_tackler'] = force_tackler * df_full['s_tackler']

    # 静态（公制单位）
    df_full['height_tackler_to_ballcarrier'] = df_full['tacklerHeight'] / df_full['ballCarrierHeight']
    df_full['dis_ballcarrier_tackler'] = np.sqrt(
        (df_full['x_standard_ballcarrier'] - df_full['x_standard_tackler']) ** 2 +
        (df_full['y_standard_ballcarrier'] - df_full['y_standard_tackler']) ** 2
    )

    return df_full

df_full=create_motion_features(df_full)
df_full.head()


Unnamed: 0,gameId,playId,nflId_tackler,displayName,frameId,time,jerseyNumber,club,playDirection,x_tackler,...,ballcarrierCollege,ballcarrierPosition,force_ballcarrier,force_tackler,momentum_ballcarrier,momentum_tackler,power_ballcarrier,power_tackler,height_tackler_to_ballcarrier,dis_ballcarrier_tackler
0,2022090800,56,43294.0,Jalen Ramsey,1,2022-09-08 20:24:05.200000,5.0,LA,left,77.95,...,Maryland,WR,71.298022,258.813063,493.540751,474.490616,406.164296,1301.622659,1.013889,3.636289
1,2022090800,56,43294.0,Jalen Ramsey,2,2022-09-08 20:24:05.299999,5.0,LA,left,78.02,...,Maryland,WR,100.609431,197.560638,494.332951,496.921082,574.062924,1040.540819,1.013889,3.52881
2,2022090800,56,43294.0,Jalen Ramsey,3,2022-09-08 20:24:05.400000,5.0,LA,left,78.06,...,Maryland,WR,196.46566,153.562418,473.735745,515.037996,1074.296234,838.292324,1.013889,3.41321
3,2022090800,56,43294.0,Jalen Ramsey,4,2022-09-08 20:24:05.500000,5.0,LA,left,78.1,...,Maryland,WR,266.971482,101.799805,449.177537,519.351547,1384.153159,560.376165,1.013889,3.258543
4,2022090800,56,43294.0,Jalen Ramsey,5,2022-09-08 20:24:05.599999,5.0,LA,left,78.13,...,Maryland,WR,289.153088,83.682891,430.164732,516.763417,1435.700602,458.352614,1.013889,3.08026


In [14]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 804159 entries, 0 to 804158
Data columns (total 96 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   gameId                            804159 non-null  int64  
 1   playId                            804159 non-null  int64  
 2   nflId_tackler                     804159 non-null  float64
 3   displayName                       804159 non-null  object 
 4   frameId                           804159 non-null  int64  
 5   time                              804159 non-null  object 
 6   jerseyNumber                      804159 non-null  float64
 7   club                              804159 non-null  object 
 8   playDirection                     804159 non-null  object 
 9   x_tackler                         804159 non-null  float64
 10  y_tackler                         804159 non-null  float64
 11  s_tackler                         804159 non-null  f

In [15]:
def group_by_play(df_full):
    aggregation = {
        'time': ['min', 'max'],
        'offenseFormation': 'first',
        'tacklerPosition': pd.Series.mode,
        'ballcarrierPosition': pd.Series.mode,
        'x_tackler': ['min', 'max', 'mean', 'std', 'skew'],
        'y_tackler': ['min', 'max', 'mean', 'std', 'skew'],
        's_tackler': ['min', 'max', 'mean', 'std', 'skew'],
        'a_tackler': ['min', 'max', 'mean', 'std', 'skew'],
        'dis_tackler': ['min', 'max', 'mean', 'std', 'skew'],
        'o_tackler': ['min', 'max', 'mean', 'std', 'skew'],
        'dir_tackler': ['min', 'max', 'mean', 'std', 'skew'],
        'o_standard_tackler': ['min', 'max', 'mean', 'std', 'skew'],
        'dir_standard_tackler': ['min', 'max', 'mean', 'std', 'skew'],
        'x_standard_tackler': ['min', 'max', 'mean', 'std', 'skew'],
        'y_standard_tackler': ['min', 'max', 'mean', 'std', 'skew'],
        'quarter': ['min', 'max', 'mean', 'std', 'skew'],
        'down': ['min', 'max', 'mean', 'std', 'skew'],
        'yardsToGo': ['min', 'max', 'mean', 'std', 'skew'],
        'gameClock': ['min', 'max'],
        'preSnapHomeScore': ['min', 'max', 'mean', 'std', 'skew'],
        'preSnapVisitorScore': ['min', 'max', 'mean', 'std', 'skew'],
        'passLength': ['min', 'max', 'mean', 'std', 'skew'],
        'absoluteYardlineNumber': ['min', 'max', 'mean', 'std', 'skew'],
        'defendersInTheBox': ['min', 'max', 'mean', 'std', 'skew'],
        'expectedPoints': ['min', 'max', 'mean', 'std', 'skew'],
        'tackle': 'max',
        'assist': 'max',
        'pff_missedTackle': 'max',
        'tacklerWeight': 'mean',
        'x_ballcarrier': ['min', 'max', 'mean', 'std', 'skew'],
        'y_ballcarrier': ['min', 'max', 'mean', 'std', 'skew'],
        's_ballcarrier': ['min', 'max', 'mean', 'std', 'skew'],
        'a_ballcarrier': ['min', 'max', 'mean', 'std', 'skew'],
        'dis_ballcarrier': ['min', 'max', 'mean', 'std', 'skew'],
        'o_ballcarrier': ['min', 'max', 'mean', 'std', 'skew'],
        'o_standard_ballcarrier': ['min', 'max', 'mean', 'std', 'skew'],
        'dir_standard_ballcarrier': ['min', 'max', 'mean', 'std', 'skew'],
        'x_standard_ballcarrier': ['min', 'max', 'mean', 'std', 'skew'],
        'y_standard_ballcarrier': ['min', 'max', 'mean', 'std', 'skew'],
        'ballCarrierWeight': 'mean',
        'force_ballcarrier': ['min', 'max', 'mean', 'std', 'skew'],
        'force_tackler': ['min', 'max', 'mean', 'std', 'skew'],
        'momentum_ballcarrier' : ['min', 'max', 'mean', 'std', 'skew'],
        'momentum_tackler' : ['min', 'max', 'mean', 'std', 'skew'],
        'power_ballcarrier': ['min', 'max', 'mean', 'std', 'skew'],
        'power_tackler': ['min', 'max', 'mean', 'std', 'skew'],
        'height_tackler_to_ballcarrier' : 'mean',
        'dis_ballcarrier_tackler': ['min', 'max', 'mean', 'std', 'skew']
    }
    
    df_full_grouped = df_full.groupby(['gameplayid', 'displayName']).agg(aggregation).reset_index()
    df_full_grouped.columns = ['|'.join(col).strip('|') for col in df_full_grouped.columns.values]
    
    return df_full_grouped

df_full_grouped=group_by_play(df_full)
df_full_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17420 entries, 0 to 17419
Columns: 200 entries, gameplayid to dis_ballcarrier_tackler|skew
dtypes: float64(163), int64(28), object(9)
memory usage: 26.6+ MB


In [16]:
df_full_grouped.head()


Unnamed: 0,gameplayid,displayName,time|min,time|max,offenseFormation|first,tacklerPosition|mode,ballcarrierPosition|mode,x_tackler|min,x_tackler|max,x_tackler|mean,...,power_tackler|max,power_tackler|mean,power_tackler|std,power_tackler|skew,height_tackler_to_ballcarrier|mean,dis_ballcarrier_tackler|min,dis_ballcarrier_tackler|max,dis_ballcarrier_tackler|mean,dis_ballcarrier_tackler|std,dis_ballcarrier_tackler|skew
0,2022090800101,Troy Hill,2022-09-08 20:25:08.200000,2022-09-08 20:25:13.000000,I_FORM,CB,RB,64.58,69.59,66.814898,...,1725.548112,620.835754,570.326424,0.406975,1.059701,0.353412,18.915079,11.59905,7.123702,-0.357283
1,20220908001030,David Long,2022-09-08 21:01:44.799999,2022-09-08 21:01:46.700000,SHOTGUN,CB,WR,58.42,62.51,60.197,...,1410.463799,912.115757,323.313177,-0.307915,0.946667,0.319531,3.061111,1.103907,0.81395,1.389278
2,20220908001102,Terrell Lewis,2022-09-08 21:03:51.299999,2022-09-08 21:03:56.200000,SHOTGUN,OLB,RB,84.39,88.01,85.324,...,1963.953349,642.233991,734.559958,0.766512,1.1,0.325576,6.310127,2.745755,1.235816,0.755065
3,20220908001102,Troy Hill,2022-09-08 21:03:51.299999,2022-09-08 21:03:56.200000,SHOTGUN,CB,RB,84.89,91.48,86.9382,...,1479.210714,391.896282,481.875368,1.129921,1.014286,0.375366,10.380694,2.575385,3.053362,1.452462
4,20220908001187,DaQuan Jones,2022-09-08 21:08:54.200000,2022-09-08 21:08:59.599999,SINGLEBACK,DT,RB,66.02,69.97,68.721091,...,1309.379222,450.068196,362.838584,0.860153,1.117647,0.63,7.788922,3.802571,2.961672,0.212843


In [19]:
generate_summary_table(df_full_grouped)
df_full_grouped.to_csv('../../processed_dataset/df_full_grouped.csv')

Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,gameplayid,object,0,0.00%,12022,,,,
1,displayName,object,0,0.00%,799,,,,
2,time|min,object,0,0.00%,11995,,,,
3,time|max,object,0,0.00%,12001,,,,
4,offenseFormation|first,object,4,0.02%,7,,,,
5,tacklerPosition|mode,object,0,0.00%,10,,,,
6,ballcarrierPosition|mode,object,0,0.00%,5,,,,
7,x_tackler|min,float64,0,0.00%,7773,4.59,110.06,57.771858,58.035
8,x_tackler|max,float64,0,0.00%,7859,9.32,119.97,63.252029,63.725
9,x_tackler|mean,float64,0,0.00%,17403,7.585,112.353421,60.502523,60.83135


In [20]:
"""
摘要统计
"""
# 计算总的进攻次数
total_plays = df_full_grouped['gameplayid'].nunique()
total_rows = len(df_full_grouped)

# 计算总的未能完成的擒抱次数、助攻次数和擒抱次数
total_missed_tackles = len(df_full_grouped[df_full_grouped['pff_missedTackle|max'] == 1])
total_assists = len(df_full_grouped[(df_full_grouped['assist|max'] == 1) & (df_full_grouped['pff_missedTackle|max'] == 0)])
total_tackles = len(df_full_grouped[(df_full_grouped['tackle|max'] == 1) & 
                                   (df_full_grouped['assist|max'] == 0) & 
                                   (df_full_grouped['pff_missedTackle|max'] == 0)])

print('Total plays:', total_plays)
print('Total row count:', total_rows)
print('Total missed tackles:', total_missed_tackles)
print('Total assists:', total_assists)
print('Total tackles:', total_tackles)


Total plays: 12022
Total row count: 17420
Total missed tackles: 2090
Total assists: 5446
Total tackles: 9884


In [21]:
"""
特征选取
"""

# 增加位置变量
def create_position_variables(df_full_grouped):
    # 持球进攻球员位置
    ballcarrier_position = pd.get_dummies(
        df_full_grouped[['gameplayid', 'displayName', 'ballcarrierPosition|mode']],
        columns=['ballcarrierPosition|mode'], drop_first=True
    ).reset_index(drop=True)
    ballcarrier_position.columns = ['gameplayid', 'displayName', 'QB_ballcarrier', 'RB_ballcarrier', 'TE_ballcarrier', 'WR_ballcarrier']

    # 防守队员位置
    tackler_position = pd.get_dummies(
        df_full_grouped[['gameplayid', 'displayName', 'tacklerPosition|mode']],
        columns=['tacklerPosition|mode'], drop_first=True
    ).reset_index(drop=True)
    tackler_position.columns = ['gameplayid', 'displayName', 'tacklerPosition|mode_DB', 'tacklerPosition|mode_DE',
                                'tacklerPosition|mode_DT', 'tacklerPosition|mode_FS',
                                'tacklerPosition|mode_ILB', 'tacklerPosition|mode_MLB',
                                'tacklerPosition|mode_NT', 'tacklerPosition|mode_OLB',
                                'tacklerPosition|mode_SS']

    # 进攻阵型
    offense_formation = pd.get_dummies(
        df_full_grouped[['gameplayid', 'displayName', 'offenseFormation|first']],
        columns=['offenseFormation|first'], drop_first=True
    ).reset_index(drop=True)

    # 合并
    df_merge = df_full_grouped.merge(ballcarrier_position, on=['gameplayid', 'displayName'])
    df_merge = df_merge.merge(tackler_position, on=['gameplayid', 'displayName'])
    df_merge = df_merge.merge(offense_formation, on=['gameplayid', 'displayName'])

    return df_merge

df_full_grouped=create_position_variables(df_full_grouped)

In [22]:
# 合成最终的target
condition = (df_full_grouped['tackle|max'] + df_full_grouped['assist|max'] >= 1) & (df_full_grouped['pff_missedTackle|max'] == 0)
df_full_grouped['tackle_assist_success'] = np.where(condition, 1, 0)

df_full_grouped.info()
# df_full_grouped.to_csv('../../Dataset/full_grouped.csv')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17420 entries, 0 to 17419
Columns: 220 entries, gameplayid to tackle_assist_success
dtypes: float64(163), int32(1), int64(28), object(9), uint8(19)
memory usage: 27.1+ MB


In [23]:
df_final = df_full_grouped[df_full_grouped['tackle|max'] + df_full_grouped['pff_missedTackle|max'] >= 1]
print(f'合法进攻样本数：{len(df_final)}')
print(f'成功tackle/assist数：{df_final.tackle_assist_success.sum()}')
print(f"失败tackle数：{df_final['pff_missedTackle|max'].sum()}")

合法进攻样本数：11974
成功tackle/assist数：9884
失败tackle数：2090


In [24]:
# final增加相对运动信息
def create_diff_range(df_final):
    df_final['difference_speed'] = df_final['s_tackler|max'] - df_final['s_ballcarrier|max']
    df_final['difference_a'] = df_final['a_tackler|max'] - df_final['a_ballcarrier|max']
    df_final['difference_orientation'] = df_final['o_standard_tackler|max'] - df_final['o_standard_ballcarrier|max']
    df_final['tackler_x_range'] = df_final['x_standard_tackler|max'] - df_final['x_standard_tackler|min']
    df_final['tackler_y_range'] = df_final['y_standard_tackler|max'] - df_final['y_standard_tackler|min']
    df_final['tackler_s_range'] = df_final['s_tackler|max'] - df_final['s_tackler|min']
    df_final['tackler_a_range'] = df_final['a_tackler|max'] - df_final['a_tackler|min']
    df_final['o_range_tackler'] = df_final['o_standard_tackler|max'] - df_final['o_standard_tackler|min']
    df_final['dir_range_tackler'] = df_final['dir_standard_tackler|max'] - df_final['dir_standard_tackler|min']
    df_final['ballcarrier_x_range'] = df_final['x_standard_ballcarrier|max'] - df_final['x_standard_ballcarrier|min']
    df_final['ballcarrier_y_range'] = df_final['y_standard_ballcarrier|max'] - df_final['y_standard_ballcarrier|min']
    df_final['ballcarrier_s_range'] = df_final['s_ballcarrier|max'] - df_final['s_ballcarrier|min']
    df_final['ballcarrier_a_range'] = df_final['a_ballcarrier|max'] - df_final['a_ballcarrier|min']
    df_final['o_range_ballcarrier'] = df_final['o_standard_ballcarrier|max'] - df_final['o_standard_ballcarrier|min']
    df_final['dir_range_ballcarrier'] = df_final['dir_standard_ballcarrier|max'] - df_final['dir_standard_ballcarrier|min']
    return df_final

Cleaned_Data = create_diff_range(df_final)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['difference_speed'] = df_final['s_tackler|max'] - df_final['s_ballcarrier|max']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['difference_a'] = df_final['a_tackler|max'] - df_final['a_ballcarrier|max']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['difference_orientatio

In [25]:
generate_summary_table(Cleaned_Data)
Cleaned_Data.to_csv('../../processed_dataset/Cleaned_Data.csv')

Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,gameplayid,object,0,0.00%,10298,,,,
1,displayName,object,0,0.00%,769,,,,
2,time|min,object,0,0.00%,10277,,,,
3,time|max,object,0,0.00%,10282,,,,
4,offenseFormation|first,object,4,0.03%,7,,,,
5,tacklerPosition|mode,object,0,0.00%,10,,,,
6,ballcarrierPosition|mode,object,0,0.00%,5,,,,
7,x_tackler|min,float64,0,0.00%,6596,4.59,109.4,57.635908,58.135
8,x_tackler|max,float64,0,0.00%,6599,9.32,119.97,63.393949,64.085
9,x_tackler|mean,float64,0,0.00%,11961,7.585,111.900625,60.511115,60.992542


In [22]:
# 选择Cleaned_Data中的数值列
Cleaned_Data_num = Cleaned_Data.select_dtypes(include=[np.number])

Cleaned_Data_num.head()

Unnamed: 0,x_tackler|min,x_tackler|max,x_tackler|mean,x_tackler|std,x_tackler|skew,y_tackler|min,y_tackler|max,y_tackler|mean,y_tackler|std,y_tackler|skew,...,tackler_s_range,tackler_a_range,o_range_tackler,dir_range_tackler,ballcarrier_x_range,ballcarrier_y_range,ballcarrier_s_range,ballcarrier_a_range,o_range_ballcarrier,dir_range_ballcarrier
0,64.58,69.59,66.814898,2.092568,0.150261,39.18,44.13,41.712245,1.458103,0.097733,...,4.105656,5.80644,335.82,352.7,14.42,11.11,6.638544,4.910328,105.02,107.21
1,58.42,62.51,60.197,1.49175,0.303782,36.76,38.61,37.428,0.561873,0.56522,...,2.048256,3.447288,342.47,163.92,1.21,1.05,2.852928,4.02336,330.49,191.01
2,84.39,88.01,85.324,1.01552,1.176394,41.76,44.98,42.9078,1.061124,0.776466,...,4.315968,6.08076,268.54,313.25,5.18,3.65,4.25196,4.178808,341.84,324.63
3,84.89,91.48,86.9382,1.752662,1.060643,38.92,44.44,42.9406,1.67028,-1.403647,...,4.233672,4.25196,164.37,138.38,5.18,3.65,4.25196,4.178808,341.84,324.63
5,65.53,70.91,68.878182,1.72019,-0.613643,24.99,40.86,31.508909,5.15533,0.122112,...,4.544568,5.952744,100.7,163.28,11.4,9.6,4.645152,4.562856,139.27,113.53


In [23]:
generate_summary_table(Cleaned_Data_num)

Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,x_tackler|min,float64,0,0.00%,6596,4.59,109.4,57.635908,58.135
1,x_tackler|max,float64,0,0.00%,6599,9.32,119.97,63.393949,64.085
2,x_tackler|mean,float64,0,0.00%,11961,7.585,111.900625,60.511115,60.992542
3,x_tackler|std,float64,0,0.00%,11974,0.032808,26.635287,1.919744,1.481959
4,x_tackler|skew,float64,0,0.00%,11974,-2.267549,2.825899,0.005834,-0.002717
5,y_tackler|min,float64,0,0.00%,4381,-4.63,52.55,22.114776,23.32
6,y_tackler|max,float64,0,0.00%,4383,1.43,57.79,31.879618,30.56
7,y_tackler|mean,float64,0,0.00%,11962,-1.944286,55.059375,26.996302,26.919198
8,y_tackler|std,float64,0,0.00%,11974,0.025872,14.305164,3.332538,2.752519
9,y_tackler|skew,float64,0,0.00%,11974,-3.08464,2.590824,-0.001621,0.005989


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,x_tackler|min,float64,0,0.00%,6596,4.590000,109.400000,57.635908,58.135000
1,x_tackler|max,float64,0,0.00%,6599,9.320000,119.970000,63.393949,64.085000
2,x_tackler|mean,float64,0,0.00%,11961,7.585000,111.900625,60.511115,60.992542
3,x_tackler|std,float64,0,0.00%,11974,0.032808,26.635287,1.919744,1.481959
4,x_tackler|skew,float64,0,0.00%,11974,-2.267549,2.825899,0.005834,-0.002717
...,...,...,...,...,...,...,...,...,...
221,ballcarrier_y_range,float64,0,0.00%,5195,0.070000,46.220000,10.471810,8.230000
222,ballcarrier_s_range,float64,0,0.00%,2307,0.237744,9.875520,4.705704,4.745736
223,ballcarrier_a_range,float64,0,0.00%,1730,0.411480,11.503152,4.278417,4.306824
224,o_range_ballcarrier,float64,0,0.00%,8980,0.000000,359.770000,220.065176,213.580000


In [24]:
# 将Cleaned_Data_num中的所有NaN值替换为0
Cleaned_Data_num = Cleaned_Data_num.fillna(0)
generate_summary_table(Cleaned_Data_num)

Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,x_tackler|min,float64,0,0.00%,6596,4.59,109.4,57.635908,58.135
1,x_tackler|max,float64,0,0.00%,6599,9.32,119.97,63.393949,64.085
2,x_tackler|mean,float64,0,0.00%,11961,7.585,111.900625,60.511115,60.992542
3,x_tackler|std,float64,0,0.00%,11974,0.032808,26.635287,1.919744,1.481959
4,x_tackler|skew,float64,0,0.00%,11974,-2.267549,2.825899,0.005834,-0.002717
5,y_tackler|min,float64,0,0.00%,4381,-4.63,52.55,22.114776,23.32
6,y_tackler|max,float64,0,0.00%,4383,1.43,57.79,31.879618,30.56
7,y_tackler|mean,float64,0,0.00%,11962,-1.944286,55.059375,26.996302,26.919198
8,y_tackler|std,float64,0,0.00%,11974,0.025872,14.305164,3.332538,2.752519
9,y_tackler|skew,float64,0,0.00%,11974,-3.08464,2.590824,-0.001621,0.005989


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,x_tackler|min,float64,0,0.00%,6596,4.590000,109.400000,57.635908,58.135000
1,x_tackler|max,float64,0,0.00%,6599,9.320000,119.970000,63.393949,64.085000
2,x_tackler|mean,float64,0,0.00%,11961,7.585000,111.900625,60.511115,60.992542
3,x_tackler|std,float64,0,0.00%,11974,0.032808,26.635287,1.919744,1.481959
4,x_tackler|skew,float64,0,0.00%,11974,-2.267549,2.825899,0.005834,-0.002717
...,...,...,...,...,...,...,...,...,...
221,ballcarrier_y_range,float64,0,0.00%,5195,0.070000,46.220000,10.471810,8.230000
222,ballcarrier_s_range,float64,0,0.00%,2307,0.237744,9.875520,4.705704,4.745736
223,ballcarrier_a_range,float64,0,0.00%,1730,0.411480,11.503152,4.278417,4.306824
224,o_range_ballcarrier,float64,0,0.00%,8980,0.000000,359.770000,220.065176,213.580000


In [25]:
X = Cleaned_Data_num.drop(columns=['tackle_assist_success', 'pff_missedTackle|max', 'tackle|max'])
y = Cleaned_Data_num['tackle_assist_success']

# 使用随机森林回归模型
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)

feature_importances = rf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})



In [None]:
# 按相关性绝对值大小排序
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
final_features_df = feature_importance_df[feature_importance_df['Importance'] >= 0.01]
final_features = final_features_df['Feature'].tolist()


In [None]:
"""
保存清洗后的数据和最终特征
"""
Cleaned_Data_num.to_csv('../../processed_dataset/Data.csv')
file_path = '../../processed_dataset/final_features.txt'
with open(file_path, 'w') as file:
    for feature in final_features:
        file.write(f'{feature}\n')
