目标：分析球员是否能铲球成功

# 1. 导入数据 <a class="anchor"  id="1"></a>

In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# Read games.csv
games_data = pd.read_csv('../../Dataset/games.csv')

# Read players.csv
players_data = pd.read_csv('../../Dataset/players.csv')

# Read plays.csv
plays_data = pd.read_csv('../../Dataset/plays.csv')

# Read tackles.csv
tackles_data = pd.read_csv('../../Dataset/tackles.csv')

建立一个函数用于统计表格相关列的值信息

In [3]:
def generate_summary_table(data):
    """Build, display and return a per-column summary of a DataFrame.

    For every column the summary records its dtype, the number and percentage
    of missing values and the number of distinct non-null values. Numeric,
    datetime and timedelta columns additionally get min, max, mean and median;
    columns of any other dtype get empty strings for those four statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to summarize.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the columns 'Column', 'Data Type',
        'Missing Values', 'Missing %', 'Unique Values', 'Min', 'Max', 'Mean'
        and 'Median'.

    Notes
    -----
    As a side effect the function prints a heading and renders the table
    through ``display``, a name the notebook environment is expected to
    provide (it is not imported here).
    """
    summary_columns = ['Column', 'Data Type', 'Missing Values', 'Missing %',
                       'Unique Values', 'Min', 'Max', 'Mean', 'Median']
    total_rows = len(data)

    # Collect one record per column and build the frame once at the end:
    # DataFrame.append no longer exists in pandas 2.x and row-wise growth is
    # quadratic anyway.
    records = []
    for column in data.columns:
        series = data[column]

        missing_values = series.isnull().sum()
        # Avoid a 0/0 division (NaN plus a RuntimeWarning) on an empty table.
        missing_percentage = (missing_values / total_rows) * 100 if total_rows else 0.0

        # Only these dtypes support all of min/max/mean/median; text,
        # categorical and other dtypes would raise on mean()/median().
        has_statistics = (
            pd.api.types.is_numeric_dtype(series)
            or pd.api.types.is_datetime64_any_dtype(series)
            or pd.api.types.is_timedelta64_dtype(series)
        )
        if has_statistics:
            min_value = series.min()
            max_value = series.max()
            mean_value = series.mean()
            median_value = series.median()
        else:
            min_value, max_value, mean_value, median_value = '', '', '', ''

        records.append({
            'Column': column,
            'Data Type': str(series.dtype),
            'Missing Values': missing_values,
            'Missing %': f'{missing_percentage:.2f}%',
            'Unique Values': series.nunique(),  # distinct non-null values
            'Min': min_value,
            'Max': max_value,
            'Mean': mean_value,
            'Median': median_value
        })

    summary_table = pd.DataFrame(records, columns=summary_columns)

    try:
        formatted_table = summary_table.style.set_properties(**{'text-align': 'center'})
    except ImportError:
        # DataFrame.style needs the optional jinja2 package; fall back to the
        # plain frame so the summary is still shown.
        formatted_table = summary_table

    print("表格信息统计:")
    display(formatted_table)

    return summary_table


1. 比赛数据games.csv分析

In [4]:
# Preview the first rows of the games table.
games_data.head()

Unnamed: 0,gameId,season,week,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,homeFinalScore,visitorFinalScore
0,2022090800,2022,1,09/08/2022,20:20:00,LA,BUF,10,31
1,2022091100,2022,1,09/11/2022,13:00:00,ATL,NO,26,27
2,2022091101,2022,1,09/11/2022,13:00:00,CAR,CLE,24,26
3,2022091102,2022,1,09/11/2022,13:00:00,CHI,SF,19,10
4,2022091103,2022,1,09/11/2022,13:00:00,CIN,PIT,20,23


生成表格信息统计图

In [5]:
# Summarize every column of games_data; the helper displays the table and returns it.
games_data_summary = generate_summary_table(games_data)

表格信息统计:


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,gameId,int64,0,0.00%,136,2022090800.0,2022110700.0,2022098922.117647,2022100902.5
1,season,int64,0,0.00%,1,2022.0,2022.0,2022.0,2022.0
2,week,int64,0,0.00%,9,1.0,9.0,4.845588,5.0
3,gameDate,object,0,0.00%,27,,,,
4,gameTimeEastern,object,0,0.00%,8,,,,
5,homeTeamAbbr,object,0,0.00%,32,,,,
6,visitorTeamAbbr,object,0,0.00%,32,,,,
7,homeFinalScore,int64,0,0.00%,38,3.0,49.0,22.669118,22.5
8,visitorFinalScore,int64,0,0.00%,35,0.0,48.0,20.948529,20.0


2. 球员数据players.csv分析

In [6]:
# Preview the first rows of the players table.
players_data.head()

Unnamed: 0,nflId,height,weight,birthDate,collegeName,position,displayName
0,25511,6-4,225,1977-08-03,Michigan,QB,Tom Brady
1,29550,6-4,328,1982-01-22,Arkansas,T,Jason Peters
2,29851,6-2,225,1983-12-02,California,QB,Aaron Rodgers
3,30842,6-6,267,1984-05-19,UCLA,TE,Marcedes Lewis
4,33084,6-4,217,1985-05-17,Boston College,QB,Matt Ryan


生成表格信息统计图

In [7]:
# 调用函数并打印生成的表格
# Summarize every column of players_data; the helper displays the table and returns it.
players_data_summary = generate_summary_table(players_data)

表格信息统计:


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,nflId,int64,0,0.00%,1683,25511.0,55241.0,48221.702317,47872.0
1,height,object,0,0.00%,16,,,,
2,weight,int64,0,0.00%,179,153.0,380.0,245.724302,236.0
3,birthDate,object,479,28.46%,985,,,,
4,collegeName,object,0,0.00%,226,,,,
5,position,object,0,0.00%,19,,,,
6,displayName,object,0,0.00%,1672,,,,


3. 进攻回合数据plays.csv分析（每行对应一次进攻回合 play，而不是一场比赛）

In [8]:
# Preview the first rows of the plays table.
plays_data.head()

Unnamed: 0,gameId,playId,ballCarrierId,ballCarrierDisplayName,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,...,preSnapHomeTeamWinProbability,preSnapVisitorTeamWinProbability,homeTeamWinProbabilityAdded,visitorTeamWinProbilityAdded,expectedPoints,expectedPointsAdded,foulName1,foulName2,foulNFLId1,foulNFLId2
0,2022100908,3537,48723,Parker Hesse,(7:52) (Shotgun) M.Mariota pass short middle t...,4,1,10,ATL,TB,...,0.976785,0.023215,-0.00611,0.00611,2.360609,0.981955,,,,
1,2022091103,3126,52457,Chase Claypool,(7:38) (Shotgun) C.Claypool right end to PIT 3...,4,1,10,PIT,CIN,...,0.160485,0.839515,-0.010865,0.010865,1.733344,-0.263424,,,,
2,2022091111,1148,42547,Darren Waller,(8:57) D.Carr pass short middle to D.Waller to...,2,2,5,LV,LAC,...,0.756661,0.243339,-0.037409,0.037409,1.312855,1.133666,,,,
3,2022100212,2007,46461,Mike Boone,(13:12) M.Boone left tackle to DEN 44 for 7 ya...,3,2,10,DEN,LV,...,0.620552,0.379448,-0.002451,0.002451,1.641006,-0.04358,,,,
4,2022091900,1372,47857,Devin Singletary,(8:33) D.Singletary right guard to TEN 32 for ...,2,1,10,BUF,TEN,...,0.83629,0.16371,0.001053,-0.001053,3.686428,-0.167903,,,,


生成表格信息统计图

In [9]:
# 调用函数并打印生成的表格
# Summarize every column of plays_data; the helper displays the table and returns it.
plays_data_summary = generate_summary_table(plays_data)

表格信息统计:


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,gameId,int64,0,0.00%,136,2022090800.0,2022110700.0,2022098953.855598,2022100903.0
1,playId,int64,0,0.00%,3974,54.0,5096.0,1986.603476,1990.5
2,ballCarrierId,int64,0,0.00%,480,25511.0,55158.0,48072.271664,47789.0
3,ballCarrierDisplayName,object,0,0.00%,480,,,,
4,playDescription,object,0,0.00%,12486,,,,
5,quarter,int64,0,0.00%,5,1.0,5.0,2.550136,3.0
6,down,int64,0,0.00%,4,1.0,4.0,1.727054,2.0
7,yardsToGo,int64,0,0.00%,32,1.0,38.0,8.469085,10.0
8,possessionTeam,object,0,0.00%,32,,,,
9,defensiveTeam,object,0,0.00%,32,,,,


4. 铲球数据tackles.csv分析

In [10]:
# Preview the first rows of the tackles table.
tackles_data.head()

Unnamed: 0,gameId,playId,nflId,tackle,assist,forcedFumble,pff_missedTackle
0,2022090800,101,42816,1,0,0,0
1,2022090800,393,46232,1,0,0,0
2,2022090800,486,40166,1,0,0,0
3,2022090800,646,47939,1,0,0,0
4,2022090800,818,40107,1,0,0,0


生成表格信息统计图

In [11]:
# 调用函数并打印生成的表格
# Summarize every column of tackles_data; the helper displays the table and returns it.
tackles_data_summary = generate_summary_table(tackles_data)

表格信息统计:


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,gameId,int64,0,0.00%,136,2022090800,2022110700,2022098971.441123,2022100903.0
1,playId,int64,0,0.00%,3943,54,5096,1982.974578,1991.0
2,nflId,int64,0,0.00%,800,33131,55241,47602.719442,46669.0
3,tackle,int64,0,0.00%,2,0,1,0.569207,1.0
4,assist,int64,0,0.00%,2,0,1,0.315276,0.0
5,forcedFumble,int64,0,0.00%,2,0,1,0.005681,0.0
6,pff_missedTackle,int64,0,0.00%,2,0,1,0.119936,0.0


# 2. 可视化数据 <a class="anchor"  id="2"></a>

1. games.csv

首先对该表格内容进行可视化

由之前信息可得此表中没有出现数据缺失的情况，因此可以直接处理

得分分布 - 直方图展示主队和客队得分的分布情况

In [12]:
# Re-display the games summary table before plotting.
# NOTE(review): this repeats the earlier generate_summary_table(games_data) call — confirm it is still needed.
games_data_summary = generate_summary_table(games_data)

表格信息统计:


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,gameId,int64,0,0.00%,136,2022090800.0,2022110700.0,2022098922.117647,2022100902.5
1,season,int64,0,0.00%,1,2022.0,2022.0,2022.0,2022.0
2,week,int64,0,0.00%,9,1.0,9.0,4.845588,5.0
3,gameDate,object,0,0.00%,27,,,,
4,gameTimeEastern,object,0,0.00%,8,,,,
5,homeTeamAbbr,object,0,0.00%,32,,,,
6,visitorTeamAbbr,object,0,0.00%,32,,,,
7,homeFinalScore,int64,0,0.00%,38,3.0,49.0,22.669118,22.5
8,visitorFinalScore,int64,0,0.00%,35,0.0,48.0,20.948529,20.0


In [13]:
# Overlaid histograms of the home and visitor final scores.
fig = px.histogram(
    games_data,
    x=["homeFinalScore", "visitorFinalScore"],
    nbins=20,
    barmode="overlay",
    color_discrete_map={"homeFinalScore": "blue", "visitorFinalScore": "red"},
    labels={"value": "Score", "variable": "Team"},
    title="Score Distribution",
)

# Translucent bars so both overlaid distributions stay visible.
fig.update_traces(opacity=0.7)

# All layout settings in one call: unified hover, near-white backgrounds,
# transition duration and font styling for the title and axis titles.
fig.update_layout(
    hovermode="x unified",
    hoverlabel=dict(bgcolor="white", font_size=16),
    plot_bgcolor='rgba(255,255,255,0.9)',
    paper_bgcolor='rgba(255,255,255,0.9)',
    transition_duration=500,
    font=dict(family="Arial", size=14, color="black"),
    title=dict(font=dict(size=20)),
    xaxis_title=dict(font=dict(size=16)),
    yaxis_title=dict(font=dict(size=16)),
)

fig.show()

赛季中的队伍表现 - 折线图显示每支球队在赛季不同周次的得分情况

In [14]:
# Build one long table of per-team weekly scores. A team takes part in a game
# either as the home side or as the visitor, so both score columns are stacked.
# Aggregating only the home columns would drop every away game and make teams
# look absent in weeks where they actually played on the road.
home_scores = games_data[["season", "week", "homeTeamAbbr", "homeFinalScore"]].rename(
    columns={"homeTeamAbbr": "Team", "homeFinalScore": "Score"}
)
visitor_scores = games_data[["season", "week", "visitorTeamAbbr", "visitorFinalScore"]].rename(
    columns={"visitorTeamAbbr": "Team", "visitorFinalScore": "Score"}
)

# Total points per season, week and team; groupby sorts by these keys, so each
# team's line is drawn in week order.
team_scores = (
    pd.concat([home_scores, visitor_scores], ignore_index=True)
    .groupby(["season", "week", "Team"])["Score"]
    .sum()
    .reset_index()
)

# List of all teams present in the data.
all_teams = team_scores["Team"].unique().tolist()

# Interactive line chart: one line per team, score by week.
fig = px.line(team_scores, x="week", y="Score", color="Team", title="Team Performance in Different Weeks",
              labels={"week": "Week", "Score": "Score"}, width=1000, height=600)

# Layout: horizontal legend below the plot, white background with light grid lines.
fig.update_layout(
    legend=dict(orientation="h", y=-0.2),
    margin=dict(l=20, r=20, t=80, b=20),
    plot_bgcolor="white",
    xaxis=dict(gridcolor='lightgray'),
    yaxis=dict(gridcolor='lightgray'),
    font=dict(family="Arial", size=12),
)

fig.show()

从结果可以看出，并不是所有的队伍每个星期都参赛

赛季中比赛结果统计 - 饼图展示每个赛季主队和客队的胜利次数和比赛结果

In [15]:
# Flag the winner of each game; a tied game is flagged for neither side.
home_pts = games_data['homeFinalScore']
visitor_pts = games_data['visitorFinalScore']
games_data['home_win'] = home_pts > visitor_pts
games_data['visitor_win'] = home_pts < visitor_pts

# Count home and visitor wins per season.
season_results = (
    games_data
    .groupby(['season'])
    .agg({'home_win': 'sum', 'visitor_win': 'sum'})
    .reset_index()
)

# Long format (one row per season and result type) as expected by the pie chart.
season_results_melted = season_results.melt(
    id_vars='season',
    var_name='Result',
    value_name='Wins',
)

# Pie chart of home wins versus visitor wins.
fig = px.pie(
    season_results_melted,
    values='Wins',
    names='Result',
    title='Season Results: Home Team vs Visitor Team Wins',
    hover_data=['Wins'],
    labels={'Result': 'Game Result'},
)
fig.show()

看来主场确实有优势啊

至此，该表的数据可视化完成

2. players.csv

首先对该表格内容进行公制化转换，转换成我们可以处理的数据

身高：厘米  体重：千克  出生日期：转化为年龄

In [16]:
# Conversion factors and the reference year used to turn a birth year into an age.
CM_PER_FOOT = 30.48
CM_PER_INCH = 2.54
KG_PER_POUND = 0.453592
REFERENCE_YEAR = 2022

# The raw height column holds "feet-inches" strings and becomes numeric once
# converted. Guarding on that keeps the cell re-runnable: without the guard a
# second run fails when it tries to split already-converted float heights.
if not pd.api.types.is_numeric_dtype(players_data['height']):
    # Height: "feet-inches" -> centimetres.
    feet_inches = players_data['height'].str.split('-', expand=True).astype(int)
    players_data['height'] = feet_inches[0] * CM_PER_FOOT + feet_inches[1] * CM_PER_INCH

    # Weight: pounds -> kilograms.
    players_data['weight'] = players_data['weight'] * KG_PER_POUND

    # Age in years (reference year minus birth year). It is stored in the
    # existing 'birthDate' column; missing birth dates stay missing.
    players_data['birthDate'] = REFERENCE_YEAR - pd.to_datetime(players_data['birthDate']).dt.year

# Show the converted table.
print(players_data)

      nflId  height      weight  birthDate       collegeName position  \
0     25511  193.04  102.058200       45.0          Michigan       QB   
1     29550  193.04  148.778176       40.0          Arkansas        T   
2     29851  187.96  102.058200       39.0        California       QB   
3     30842  198.12  121.109064       38.0              UCLA       TE   
4     33084  193.04   98.429464       37.0    Boston College       QB   
...     ...     ...         ...        ...               ...      ...   
1678  55200  198.12  120.655472        NaN           Indiana       DT   
1679  55212  182.88  104.326160        NaN        Iowa State      ILB   
1680  55239  187.96  136.077600        NaN      Pennsylvania       DT   
1681  55240  185.42   83.914520        NaN           Buffalo       CB   
1682  55241  187.96  127.005760        NaN  Coastal Carolina       DT   

           displayName  
0            Tom Brady  
1         Jason Peters  
2        Aaron Rodgers  
3       Marcedes Lewis 

接下来我们进行可视化分析

In [17]:
# Frequency of each distinct height value, ordered by height.
height_counts = (
    players_data['height']
    .value_counts()
    .sort_index()
    .reset_index()
)
height_counts.columns = ['Height (cm)', 'Frequency']

fig = px.bar(height_counts, x='Height (cm)', y='Frequency', title='Height Distribution')

# Bar colour and transparency.
fig.update_traces(marker_color='darkblue', opacity=0.7)

# Axis titles, light template and transition duration in a single layout call.
fig.update_layout(
    xaxis_title='Height (cm)',
    yaxis_title='Frequency',
    template='plotly_white',
    transition_duration=500,
)

fig.show()

In [18]:
# Histogram of player weight, styled through one chained expression:
# x-axis title, light template with transition duration, then bar colour and transparency.
fig_weight = (
    px.histogram(players_data, x='weight', title='Weight Distribution')
    .update_xaxes(title='Weight')
    .update_layout(template='plotly_white', transition_duration=500)
    .update_traces(marker_color='darkblue', opacity=0.7)
)

fig_weight.show()


In [19]:
# Number of players per position, styled through one chained expression:
# x-axis title, light template with transition duration, then bar colour and transparency.
fig_position = (
    px.histogram(players_data, x='position', title='Player Position Distribution')
    .update_xaxes(title='Position')
    .update_layout(template='plotly_white', transition_duration=500)
    .update_traces(marker_color='darkblue', opacity=0.7)
)

fig_position.show()

In [20]:
# Box plot of weight per position, with explicit axis titles.
fig_box = (
    px.box(players_data, x='position', y='weight', title='Position vs Weight')
    .update_xaxes(title='Position')
    .update_yaxes(title='Weight')
)
fig_box.show()

In [21]:
# Mean height and mean weight for every position.
avg_height_weight = (
    players_data
    .groupby('position')[['height', 'weight']]
    .mean()
    .reset_index()
)

# Two-colour palette: one colour per plotted measure.
colors = ['#1f77b4', '#ff7f0e']

# Grouped bars: height and weight side by side for each position.
fig_avg = px.bar(
    avg_height_weight,
    x='position',
    y=['height', 'weight'],
    barmode='group',
    title='Average Height and Weight by Position',
    labels={'position': 'Position', 'value': 'Value'},
    color_discrete_sequence=colors,
)

# Shared font for axis and legend titles; the chart title uses a larger size.
label_font = dict(size=14, color='black', family='Arial')
fig_avg.update_layout(
    xaxis_title_font=label_font,
    yaxis_title_font=label_font,
    title_font=dict(size=18, color='black', family='Arial'),
    legend_title_font=label_font,
    legend=dict(title='', title_font=label_font),
    plot_bgcolor='white',
    font=dict(family='Arial', size=12),
)

fig_avg.show()

In [22]:
# BMI = weight / height^2, with the height column divided by 100 first
# (assumes height is in centimetres and weight in kilograms at this point).
height_in_m = players_data['height'] / 100
players_data['bmi'] = players_data['weight'] / (height_in_m ** 2)

# Dark-blue histogram of BMI with marker opacity 0.7.
fig_bmi = px.histogram(
    players_data,
    x='bmi',
    title='BMI Distribution',
    color_discrete_sequence=['darkblue'],
    opacity=0.7,
)

# Axis titles, fonts and a hidden legend.
fig_bmi.update_layout(
    xaxis_title='BMI',
    yaxis_title='Count',
    title_font=dict(size=18, family='Arial'),
    xaxis_title_font=dict(size=14, family='Arial'),
    yaxis_title_font=dict(size=14, family='Arial'),
    font=dict(family='Arial', size=12),
    showlegend=False,
)

# Zero-width outline, i.e. no visible border around the bars.
fig_bmi.update_traces(marker_line_color='skyblue', marker_line_width=0)

fig_bmi.show()


3. plays.csv

首先对该表格内容进行数据清洗与可视化

In [23]:
# Preview the first rows of the plays table again before visualizing it.
plays_data.head()

Unnamed: 0,gameId,playId,ballCarrierId,ballCarrierDisplayName,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,...,preSnapHomeTeamWinProbability,preSnapVisitorTeamWinProbability,homeTeamWinProbabilityAdded,visitorTeamWinProbilityAdded,expectedPoints,expectedPointsAdded,foulName1,foulName2,foulNFLId1,foulNFLId2
0,2022100908,3537,48723,Parker Hesse,(7:52) (Shotgun) M.Mariota pass short middle t...,4,1,10,ATL,TB,...,0.976785,0.023215,-0.00611,0.00611,2.360609,0.981955,,,,
1,2022091103,3126,52457,Chase Claypool,(7:38) (Shotgun) C.Claypool right end to PIT 3...,4,1,10,PIT,CIN,...,0.160485,0.839515,-0.010865,0.010865,1.733344,-0.263424,,,,
2,2022091111,1148,42547,Darren Waller,(8:57) D.Carr pass short middle to D.Waller to...,2,2,5,LV,LAC,...,0.756661,0.243339,-0.037409,0.037409,1.312855,1.133666,,,,
3,2022100212,2007,46461,Mike Boone,(13:12) M.Boone left tackle to DEN 44 for 7 ya...,3,2,10,DEN,LV,...,0.620552,0.379448,-0.002451,0.002451,1.641006,-0.04358,,,,
4,2022091900,1372,47857,Devin Singletary,(8:33) D.Singletary right guard to TEN 32 for ...,2,1,10,BUF,TEN,...,0.83629,0.16371,0.001053,-0.001053,3.686428,-0.167903,,,,


In [24]:
# Re-display the plays summary table; the helper displays the table and returns it.
plays_data_summary = generate_summary_table(plays_data)

表格信息统计:


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,gameId,int64,0,0.00%,136,2022090800.0,2022110700.0,2022098953.855598,2022100903.0
1,playId,int64,0,0.00%,3974,54.0,5096.0,1986.603476,1990.5
2,ballCarrierId,int64,0,0.00%,480,25511.0,55158.0,48072.271664,47789.0
3,ballCarrierDisplayName,object,0,0.00%,480,,,,
4,playDescription,object,0,0.00%,12486,,,,
5,quarter,int64,0,0.00%,5,1.0,5.0,2.550136,3.0
6,down,int64,0,0.00%,4,1.0,4.0,1.727054,2.0
7,yardsToGo,int64,0,0.00%,32,1.0,38.0,8.469085,10.0
8,possessionTeam,object,0,0.00%,32,,,,
9,defensiveTeam,object,0,0.00%,32,,,,


不同节次中进攻回合（play）的分布情况

In [25]:
# Histogram of plays per quarter. Every row of plays_data is one play, so the
# bars count plays and the title and y-axis are labelled accordingly.
fig = px.histogram(plays_data, x="quarter", title="Distribution of Plays by Quarter")

# Bar colour and transparency.
fig.update_traces(marker=dict(color='darkblue', opacity=0.7))

# White outline to separate adjacent bars.
fig.update_traces(marker=dict(line=dict(color='white', width=2)))

# Larger title font and a consistent base font.
fig.update_layout(title_font=dict(size=24, family="Arial"), font=dict(size=14, family="Arial"))

# Axis titles, no grid lines, even margins.
fig.update_layout(
    xaxis_title="Quarter",
    yaxis_title="Number of Plays",
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    margin=dict(l=80, r=80, t=80, b=80),
)

fig.show()

进攻档数（down）的分布情况

In [26]:
# Distinct down values, sorted, used as explicit axis ticks.
down_values = sorted(plays_data['down'].unique())

# Histogram of plays per down.
fig = px.histogram(
    plays_data,
    x='down',
    title='Distribution of Plays by Down',
    labels={'down': 'Down', 'count': 'Number of Plays'},
)

# One tick per down value, plus base font and title font, in a single layout call.
fig.update_layout(
    xaxis=dict(tickvals=down_values, ticktext=list(map(str, down_values))),
    font=dict(family="Arial", size=12, color="black"),
    title_font=dict(family="Times New Roman", size=24, color="navy"),
)

# Dark-blue, slightly transparent bars.
fig.update_traces(marker_color='darkblue', opacity=0.7)

# White outline around each bar.
fig.update_traces(marker=dict(line=dict(width=2, color='white')))

fig.show()


前进码数（yards to go）的分布情况

In [27]:
# Histogram of the yardsToGo column.
fig = px.histogram(plays_data, x='yardsToGo', title='Distribution of Yards to Go')

# Fonts shared by both axes.
axis_title_font = dict(size=16, color='black')
axis_tick_font = dict(size=14, color='black')

# Background colour, title styling and axis styling.
fig.update_layout(
    plot_bgcolor='lightgray',
    title=dict(text='Distribution of Yards to Go', font=dict(size=24, color='navy')),
    xaxis=dict(title='Yards to Go', title_font=axis_title_font, tickfont=axis_tick_font),
    yaxis=dict(title='Frequency', title_font=axis_title_font, tickfont=axis_tick_font),
)

# Bar colour and transparency, probability normalisation and 1-yard bins from 0 to 20.
# NOTE(review): the bins stop at 20 although the summary table reports a maximum
# yardsToGo of 38 — confirm that leaving longer distances out is intended.
# NOTE(review): the y-axis is titled 'Frequency' while histnorm is 'probability' — confirm the label.
fig.update_traces(
    marker_color='darkblue',
    opacity=0.7,
    histnorm='probability',
    xbins=dict(start=0, end=20, size=1),
)

fig.show()


常见的情形包括：第三档进攻且距离首攻所需码数很短（3rd and short），或者第一档进攻但所需码数很长（1st and long）。

基于持球队伍的比赛分布情况possessionTeam

In [28]:
# Count the rows of plays_data per possession team. Every row is one play, so
# these are play counts and the labels say so.
possession_team_count = plays_data['possessionTeam'].value_counts().reset_index()
possession_team_count.columns = ['Possession Team', 'Number of Plays']

# Bar chart of plays per possession team, one colour per team.
# 'Possession Team' is categorical, so the qualitative Pastel palette must be
# passed as color_discrete_sequence; color_continuous_scale only applies to
# numeric colour columns and left the palette unused.
fig = px.bar(possession_team_count, x='Possession Team', y='Number of Plays',
             labels={'Possession Team': 'Possession Team', 'Number of Plays': 'Number of Plays'},
             title='Distribution of Plays by Possession Team',
             color='Possession Team',
             template='plotly',
             color_discrete_sequence=px.colors.qualitative.Pastel,
             text='Number of Plays')  # show the count on each bar

# Sort teams from most to fewest plays.
fig.update_xaxes(categoryorder='total descending')

fig.show()

预期分值增加（Expected Points Added，EPA）的分布情况

In [29]:
# Histogram of expected points added (EPA), styled through one chained expression:
# bar colour and transparency first, then explicit axis titles.
fig = (
    px.histogram(
        plays_data,
        x='expectedPointsAdded',
        title='Distribution of Expected Points Added (EPA)',
        labels={'expectedPointsAdded': 'Expected Points Added (EPA)'},
    )
    .update_traces(marker_color='darkblue', opacity=0.7)
    .update_layout(xaxis_title='Expected Points Added (EPA)', yaxis_title='Count')
)

fig.show()

EPA衡量了一次进攻回合（play）对球队预期得分的影响，有助于评估不同进攻回合的效果。

4. tackles.csv

首先对该表格内容进行数据清洗与可视化

In [30]:
# Preview the first rows of the tackles table again before visualizing it.
tackles_data.head()

Unnamed: 0,gameId,playId,nflId,tackle,assist,forcedFumble,pff_missedTackle
0,2022090800,101,42816,1,0,0,0
1,2022090800,393,46232,1,0,0,0
2,2022090800,486,40166,1,0,0,0
3,2022090800,646,47939,1,0,0,0
4,2022090800,818,40107,1,0,0,0


In [31]:
# Re-display the tackles summary table; the helper displays the table and returns it.
tackles_data_summary = generate_summary_table(tackles_data)

表格信息统计:


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,gameId,int64,0,0.00%,136,2022090800,2022110700,2022098971.441123,2022100903.0
1,playId,int64,0,0.00%,3943,54,5096,1982.974578,1991.0
2,nflId,int64,0,0.00%,800,33131,55241,47602.719442,46669.0
3,tackle,int64,0,0.00%,2,0,1,0.569207,1.0
4,assist,int64,0,0.00%,2,0,1,0.315276,0.0
5,forcedFumble,int64,0,0.00%,2,0,1,0.005681,0.0
6,pff_missedTackle,int64,0,0.00%,2,0,1,0.119936,0.0


可以看到该表全为数值项，并且没有任何缺失数据，大大减轻了笔者的工作！！！
直接进行可视化分析！！！

铲球（tackles）的分布情况

In [32]:
# Number of rows for each value of the 'tackle' flag.
tackle_success_counts = tackles_data['tackle'].value_counts()

# Bar chart of the counts. The x-axis is treated as categorical, and each bar
# carries its count as a text label placed outside the bar.
fig = (
    px.bar(
        x=tackle_success_counts.index,
        y=tackle_success_counts.values,
        labels={'x': 'Tackle Success', 'y': 'Count'},
        title='Distribution of Tackle Success',
        category_orders={'x': [0, 1]},
    )
    .update_xaxes(type='category')
    .update_traces(text=tackle_success_counts.values, textposition='outside')
)

fig.show()

助攻（assists）的分布情况

In [33]:
# Number of rows for each value of the 'assist' flag.
assist_counts = tackles_data['assist'].value_counts()

# Bar chart of the counts, ordered 0 then 1.
fig = px.bar(
    x=assist_counts.index,
    y=assist_counts.values,
    labels={'x': 'Assist Success', 'y': 'Count'},
    title='Distribution of Assist Success',
    category_orders={'x': [0, 1]},
)

# Treat the x-axis values as categories.
fig.update_xaxes(type='category')

# Annotate every bar with its count (arrow pointing at the bar top).
for flag_value, count in zip(assist_counts.index, assist_counts.values):
    fig.add_annotation(
        x=flag_value,
        y=count,
        text=str(count),
        showarrow=True,
        font=dict(size=12),
    )

fig.show()

强制性失误（forced fumbles）的分布情况

In [34]:
# Number of rows for each value of the 'forcedFumble' flag.
forced_fumbles_counts = tackles_data['forcedFumble'].value_counts()

# Bar chart of the counts, ordered 0 then 1.
fig = px.bar(
    x=forced_fumbles_counts.index,
    y=forced_fumbles_counts.values,
    labels={'x': 'Forced Fumble', 'y': 'Count'},
    title='Distribution of Forced Fumbles',
    category_orders={'x': [0, 1]},
)

# Treat the x-axis values as categories.
fig.update_xaxes(type='category')

# Annotate every bar with its count (arrow pointing at the bar top).
for flag_value, count in zip(forced_fumbles_counts.index, forced_fumbles_counts.values):
    fig.add_annotation(
        x=flag_value,
        y=count,
        text=str(count),
        showarrow=True,
        font=dict(size=12),
    )

fig.show()

Pro Football Focus（PFF）失误铲球（missed tackles）的分布情况

In [35]:
# Number of rows for each value of the 'pff_missedTackle' flag.
missed_tackles_counts = tackles_data['pff_missedTackle'].value_counts()

# Bar chart of the counts, ordered 0 then 1.
fig = px.bar(
    x=missed_tackles_counts.index,
    y=missed_tackles_counts.values,
    labels={'x': 'PFF Missed Tackle', 'y': 'Count'},
    title='Distribution of PFF Missed Tackles',
    category_orders={'x': [0, 1]},
)

# Treat the x-axis values as categories.
fig.update_xaxes(type='category')

# Annotate every bar with its count (arrow pointing at the bar top).
for flag_value, count in zip(missed_tackles_counts.index, missed_tackles_counts.values):
    fig.add_annotation(
        x=flag_value,
        y=count,
        text=str(count),
        showarrow=True,
        font=dict(size=12),
    )

fig.show()