### <font color='#289C4E'>目录</font><a class='anchor' id='top'></a>
- [导入数据](#1)
- [标准化方向数据](#2)
- [合并数据](#3)
- [创建比赛摘要](#4)
- [探索性数据分析](#5)
- [建模](#6)
- [GIF/动画](#7)

# 1. 导入数据 <a class="anchor"  id="1"></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Load the four source tables (games, players, plays, tackles) in one pass;
# the paths are listed in the same order as the target variables.
games_data, players_data, plays_data, tackles_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../Dataset/games.csv',
        '../../Dataset/players.csv',
        '../../Dataset/plays.csv',
        '../../Dataset/tackles.csv',
    )
)

建立一个函数用于统计表格相关列的值信息

In [3]:
def generate_summary_table(data):
    """Build, show and return a per-column summary of a DataFrame.

    For every column the summary records its dtype, the count and percentage
    of missing values and the number of distinct non-null values.  Columns
    that do not hold text additionally get min / max / mean / median; text
    columns (object or pandas string dtype) get empty strings for those four
    statistics.

    Args:
        data: The pandas DataFrame to summarise.

    Returns:
        A DataFrame with one row per column of ``data`` and the columns
        'Column', 'Data Type', 'Missing Values', 'Missing %',
        'Unique Values', 'Min', 'Max', 'Mean' and 'Median'.
    """
    summary_columns = ['Column', 'Data Type', 'Missing Values', 'Missing %',
                       'Unique Values', 'Min', 'Max', 'Mean', 'Median']
    row_count = len(data)

    # Collect plain dicts and build the frame once at the end:
    # DataFrame.append no longer exists in pandas 2.x and copied the whole
    # table on every call in earlier versions.
    rows = []
    for column in data.columns:
        series = data[column]

        missing_values = series.isnull().sum()
        # Guard the zero-row case so the percentage is 0 instead of NaN.
        missing_percentage = (missing_values / row_count) * 100 if row_count else 0.0

        is_text = (pd.api.types.is_object_dtype(series)
                   or pd.api.types.is_string_dtype(series))
        if is_text:
            min_value, max_value, mean_value, median_value = '', '', '', ''
        else:
            min_value = series.min()
            max_value = series.max()
            mean_value = series.mean()
            median_value = series.median()

        rows.append({
            'Column': column,
            'Data Type': str(series.dtype),
            'Missing Values': missing_values,
            'Missing %': f'{missing_percentage:.2f}%',
            'Unique Values': series.nunique(),  # distinct non-null values
            'Min': min_value,
            'Max': max_value,
            'Mean': mean_value,
            'Median': median_value
        })

    # Passing ``columns`` fixes the column order and keeps the header even
    # when ``data`` has no columns at all.
    summary_table = pd.DataFrame(rows, columns=summary_columns)

    print("表格信息统计:")
    try:
        # ``display`` is only a global inside IPython/Jupyter, and the Styler
        # needs jinja2; import/build both here so a plain interpreter can
        # fall back to a text printout instead of failing.
        from IPython.display import display
        formatted_table = summary_table.style.set_properties(**{'text-align': 'center'})
    except ImportError:
        print(summary_table)
    else:
        display(formatted_table)

    return summary_table


1. 比赛数据games.csv分析

In [4]:
# Preview the first rows of the games table.
games_data.head()

Unnamed: 0,gameId,season,week,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,homeFinalScore,visitorFinalScore
0,2022090800,2022,1,09/08/2022,20:20:00,LA,BUF,10,31
1,2022091100,2022,1,09/11/2022,13:00:00,ATL,NO,26,27
2,2022091101,2022,1,09/11/2022,13:00:00,CAR,CLE,24,26
3,2022091102,2022,1,09/11/2022,13:00:00,CHI,SF,19,10
4,2022091103,2022,1,09/11/2022,13:00:00,CIN,PIT,20,23


生成表格统计信息汇总表

In [5]:
# Build and display the per-column summary of the games table.
games_data_summary = generate_summary_table(games_data)

表格信息统计:


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,gameId,int64,0,0.00%,136,2022090800.0,2022110700.0,2022098922.117647,2022100902.5
1,season,int64,0,0.00%,1,2022.0,2022.0,2022.0,2022.0
2,week,int64,0,0.00%,9,1.0,9.0,4.845588,5.0
3,gameDate,object,0,0.00%,27,,,,
4,gameTimeEastern,object,0,0.00%,8,,,,
5,homeTeamAbbr,object,0,0.00%,32,,,,
6,visitorTeamAbbr,object,0,0.00%,32,,,,
7,homeFinalScore,int64,0,0.00%,38,3.0,49.0,22.669118,22.5
8,visitorFinalScore,int64,0,0.00%,35,0.0,48.0,20.948529,20.0


2. 球员数据players.csv分析

In [6]:
# Preview the first rows of the players table.
players_data.head()

Unnamed: 0,nflId,height,weight,birthDate,collegeName,position,displayName
0,25511,6-4,225,1977-08-03,Michigan,QB,Tom Brady
1,29550,6-4,328,1982-01-22,Arkansas,T,Jason Peters
2,29851,6-2,225,1983-12-02,California,QB,Aaron Rodgers
3,30842,6-6,267,1984-05-19,UCLA,TE,Marcedes Lewis
4,33084,6-4,217,1985-05-17,Boston College,QB,Matt Ryan


生成表格统计信息汇总表

In [7]:
# Build and display the per-column summary of the players table.
players_data_summary = generate_summary_table(players_data)

表格信息统计:


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,nflId,int64,0,0.00%,1683,25511.0,55241.0,48221.702317,47872.0
1,height,object,0,0.00%,16,,,,
2,weight,int64,0,0.00%,179,153.0,380.0,245.724302,236.0
3,birthDate,object,479,28.46%,985,,,,
4,collegeName,object,0,0.00%,226,,,,
5,position,object,0,0.00%,19,,,,
6,displayName,object,0,0.00%,1672,,,,


3. 每次进攻（回合）数据plays.csv分析

In [8]:
# Preview the first rows of the plays table.
plays_data.head()

Unnamed: 0,gameId,playId,ballCarrierId,ballCarrierDisplayName,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,...,preSnapHomeTeamWinProbability,preSnapVisitorTeamWinProbability,homeTeamWinProbabilityAdded,visitorTeamWinProbilityAdded,expectedPoints,expectedPointsAdded,foulName1,foulName2,foulNFLId1,foulNFLId2
0,2022100908,3537,48723,Parker Hesse,(7:52) (Shotgun) M.Mariota pass short middle t...,4,1,10,ATL,TB,...,0.976785,0.023215,-0.00611,0.00611,2.360609,0.981955,,,,
1,2022091103,3126,52457,Chase Claypool,(7:38) (Shotgun) C.Claypool right end to PIT 3...,4,1,10,PIT,CIN,...,0.160485,0.839515,-0.010865,0.010865,1.733344,-0.263424,,,,
2,2022091111,1148,42547,Darren Waller,(8:57) D.Carr pass short middle to D.Waller to...,2,2,5,LV,LAC,...,0.756661,0.243339,-0.037409,0.037409,1.312855,1.133666,,,,
3,2022100212,2007,46461,Mike Boone,(13:12) M.Boone left tackle to DEN 44 for 7 ya...,3,2,10,DEN,LV,...,0.620552,0.379448,-0.002451,0.002451,1.641006,-0.04358,,,,
4,2022091900,1372,47857,Devin Singletary,(8:33) D.Singletary right guard to TEN 32 for ...,2,1,10,BUF,TEN,...,0.83629,0.16371,0.001053,-0.001053,3.686428,-0.167903,,,,


生成表格统计信息汇总表

In [9]:
# Build and display the per-column summary of the plays table.
plays_data_summary = generate_summary_table(plays_data)

表格信息统计:


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,gameId,int64,0,0.00%,136,2022090800.0,2022110700.0,2022098953.855598,2022100903.0
1,playId,int64,0,0.00%,3974,54.0,5096.0,1986.603476,1990.5
2,ballCarrierId,int64,0,0.00%,480,25511.0,55158.0,48072.271664,47789.0
3,ballCarrierDisplayName,object,0,0.00%,480,,,,
4,playDescription,object,0,0.00%,12486,,,,
5,quarter,int64,0,0.00%,5,1.0,5.0,2.550136,3.0
6,down,int64,0,0.00%,4,1.0,4.0,1.727054,2.0
7,yardsToGo,int64,0,0.00%,32,1.0,38.0,8.469085,10.0
8,possessionTeam,object,0,0.00%,32,,,,
9,defensiveTeam,object,0,0.00%,32,,,,


4. 擒抱数据tackles.csv分析

In [10]:
# Preview the first rows of the tackles table.
tackles_data.head()

Unnamed: 0,gameId,playId,nflId,tackle,assist,forcedFumble,pff_missedTackle
0,2022090800,101,42816,1,0,0,0
1,2022090800,393,46232,1,0,0,0
2,2022090800,486,40166,1,0,0,0
3,2022090800,646,47939,1,0,0,0
4,2022090800,818,40107,1,0,0,0


生成表格统计信息汇总表

In [11]:
# Build and display the per-column summary of the tackles table.
tackles_data_summary = generate_summary_table(tackles_data)

表格信息统计:


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,gameId,int64,0,0.00%,136,2022090800,2022110700,2022098971.441123,2022100903.0
1,playId,int64,0,0.00%,3943,54,5096,1982.974578,1991.0
2,nflId,int64,0,0.00%,800,33131,55241,47602.719442,46669.0
3,tackle,int64,0,0.00%,2,0,1,0.569207,1.0
4,assist,int64,0,0.00%,2,0,1,0.315276,0.0
5,forcedFumble,int64,0,0.00%,2,0,1,0.005681,0.0
6,pff_missedTackle,int64,0,0.00%,2,0,1,0.119936,0.0


# 2. 标准化数据 <a class="anchor"  id="2"></a>

1. games.csv

首先对该表格内容进行数据清洗与可视化

由之前信息可得此表中没有出现数据缺失的情况，因此可以直接处理

得分分布 - 直方图展示主队和客队得分的分布情况

In [12]:
# Build and display the per-column summary of the games table again
# before plotting (same call as in the import section).
games_data_summary = generate_summary_table(games_data)

表格信息统计:


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,gameId,int64,0,0.00%,136,2022090800.0,2022110700.0,2022098922.117647,2022100902.5
1,season,int64,0,0.00%,1,2022.0,2022.0,2022.0,2022.0
2,week,int64,0,0.00%,9,1.0,9.0,4.845588,5.0
3,gameDate,object,0,0.00%,27,,,,
4,gameTimeEastern,object,0,0.00%,8,,,,
5,homeTeamAbbr,object,0,0.00%,32,,,,
6,visitorTeamAbbr,object,0,0.00%,32,,,,
7,homeFinalScore,int64,0,0.00%,38,3.0,49.0,22.669118,22.5
8,visitorFinalScore,int64,0,0.00%,35,0.0,48.0,20.948529,20.0


In [13]:
# Overlaid histograms of the home-team and visiting-team final scores.
fig = px.histogram(games_data, x=["homeFinalScore", "visitorFinalScore"], nbins=20, barmode="overlay",
                   labels={"value": "Score", "variable": "Team"}, title="Score Distribution")
fig.update_layout(xaxis_title="Score", yaxis_title="Frequency")
fig.show()


赛季中的队伍表现 - 折线图显示每支球队在赛季不同周次的得分情况

In [14]:
# Collect every team's weekly score from BOTH sides of the schedule.
# Grouping on homeTeamAbbr alone silently dropped every away game, so a team
# only appeared in the weeks it played at home.
home_scores = games_data[["season", "week", "homeTeamAbbr", "homeFinalScore"]].rename(
    columns={"homeTeamAbbr": "Team", "homeFinalScore": "Score"}
)
visitor_scores = games_data[["season", "week", "visitorTeamAbbr", "visitorFinalScore"]].rename(
    columns={"visitorTeamAbbr": "Team", "visitorFinalScore": "Score"}
)

# Sum per season / week / team; the result is sorted by those keys, so each
# team's points are already in week order for plotting.
team_scores = (
    pd.concat([home_scores, visitor_scores], ignore_index=True)
    .groupby(["season", "week", "Team"])["Score"]
    .sum()
    .reset_index()
)

# List of all teams that appear in the data.
all_teams = team_scores["Team"].unique().tolist()

# Interactive line chart with one trace per team.
fig = go.Figure()

for team in all_teams:
    team_data = team_scores[team_scores["Team"] == team]
    fig.add_trace(go.Scatter(x=team_data["week"], y=team_data["Score"], mode='lines', name=team))

# Layout: horizontal legend placed below the plot area.
fig.update_layout(
    title="Team Performance in Different Weeks",
    xaxis_title="Week",
    yaxis_title="Score",
    legend=dict(orientation="h", y=-0.2),
    margin=dict(l=20, r=20, t=80, b=20)
)

fig.show()

从结果可以看出，并不是所有的队伍每个星期都参赛

赛季中比赛结果统计 - 饼图展示每个赛季主队和客队的胜利次数和比赛结果

In [15]:
# Flag the winner of each game; a tie leaves both flags False.
home_points = games_data['homeFinalScore']
visitor_points = games_data['visitorFinalScore']
games_data['home_win'] = home_points > visitor_points
games_data['visitor_win'] = home_points < visitor_points

# Count home and visitor wins per season.
win_columns = ['home_win', 'visitor_win']
season_results = (
    games_data
    .groupby(['season'])
    .agg({win_column: 'sum' for win_column in win_columns})
    .reset_index()
)

# Reshape to long format (one row per season / result type) for the pie chart.
season_results_melted = season_results.melt(
    id_vars='season',
    var_name='Result',
    value_name='Wins',
)

# Pie chart of home wins versus visitor wins.
fig = px.pie(
    season_results_melted,
    values='Wins',
    names='Result',
    title='Season Results: Home Team vs Visitor Team Wins',
    hover_data=['Wins'],
    labels={'Result': 'Game Result'},
)
fig.show()

看来主场确实有优势啊

接下来进行数据清理的工作

In [16]:
# Build and display the per-column summary of the games table, which now
# also carries the home_win / visitor_win flag columns.
games_data_summary = generate_summary_table(games_data)

表格信息统计:


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,gameId,int64,0,0.00%,136,2022090800,2022110700,2022098922.117647,2022100902.5
1,season,int64,0,0.00%,1,2022,2022,2022.0,2022.0
2,week,int64,0,0.00%,9,1,9,4.845588,5.0
3,gameDate,object,0,0.00%,27,,,,
4,gameTimeEastern,object,0,0.00%,8,,,,
5,homeTeamAbbr,object,0,0.00%,32,,,,
6,visitorTeamAbbr,object,0,0.00%,32,,,,
7,homeFinalScore,int64,0,0.00%,38,3,49,22.669118,22.5
8,visitorFinalScore,int64,0,0.00%,35,0,48,20.948529,20.0
9,home_win,bool,0,0.00%,2,False,True,0.536765,1.0


分析结果发现这里面存在着大量的非数值文本，这会使数据很难处理，因此我们要适当的舍弃或者对这些文本进行重新编码。接下来我们看看这些列分别代表了什么数据

这些字段的含义如下：

- **gameId（比赛ID）**：比赛的唯一标识符（数字）
- **season（赛季）**：比赛所属的赛季（数字）
- **week（周次）**：比赛所在的周次（数字）
- **gameDate（比赛日期）**：比赛日期（时间，月/日/年）
- **gameTimeEastern（比赛开始时间）**：比赛开始时间（时间，时:分:秒，东部标准时间）
- **homeTeamAbbr（主队三字母代码）**：主队的三字母代码（文本）
- **visitorTeamAbbr（客队三字母代码）**：客队的三字母代码（文本）
- **homeFinalScore（主队最终得分）**：主队在比赛中的总得分（数字）
- **visitorFinalScore（客队最终得分）**：客队在比赛中的总得分（数字）

这些字段描述了每场比赛的详细信息，包括比赛的标识、赛季、比赛日期和时间、参赛队伍及其得分情况。

* 其中赛季（season）肯定是一个无关值，因为这些数据都属于一个赛季，可以抛弃
* gameTimeEastern可能不太重要，因为是比赛举行的时间，稍后可以用热力图分析一下
* 而对主队客队，我们可以采用编码的方式来对这些队伍进行编码来剔除文本信息。

由于热力图只能处理数值信息，因此我们应先将队名，比赛日期和比赛时间进行编码

In [17]:
# All team abbreviations that occur as home or visiting team, alphabetically.
teams = sorted(set(games_data['homeTeamAbbr'].unique()) | set(games_data['visitorTeamAbbr'].unique()))

# Map each abbreviation to a 1-based integer code (alphabetical order).
team_encoding = {team: code for code, team in enumerate(teams, 1)}

# Replace the text abbreviations with their integer codes.
# NOTE: this overwrites the columns in place, so the cell must not be re-run
# on already-encoded data (the mapping would then yield NaN).
games_data['homeTeamAbbr'] = games_data['homeTeamAbbr'].map(team_encoding)
games_data['visitorTeamAbbr'] = games_data['visitorTeamAbbr'].map(team_encoding)

# Encode 'gameDate' as the 1-based day offset from the earliest game.
# The format is given explicitly (month/day/year) so that pandas never has to
# guess between MM/DD and DD/MM.
game_dates = pd.to_datetime(games_data['gameDate'], format='%m/%d/%Y')
games_data['gameDate'] = (game_dates - game_dates.min()).dt.days + 1

# Encode 'gameTimeEastern' as fractional hours since midnight.
# Parse once with an explicit HH:MM:SS format instead of three inferred parses.
kickoff_times = pd.to_datetime(games_data['gameTimeEastern'], format='%H:%M:%S')
games_data['gameTimeEastern'] = kickoff_times.dt.hour + \
                                kickoff_times.dt.minute / 60 + \
                                kickoff_times.dt.second / 3600

# Print the abbreviation -> code mapping.
for team, code in team_encoding.items():
    print(f"Team: {team} -> Code: {code}")

# Print the encoded columns.
print(games_data[['season', 'week', 'homeTeamAbbr', 'visitorTeamAbbr', 'homeFinalScore', 'visitorFinalScore', 'gameDate', 'gameTimeEastern']])

Team: ARI -> Code: 1
Team: ATL -> Code: 2
Team: BAL -> Code: 3
Team: BUF -> Code: 4
Team: CAR -> Code: 5
Team: CHI -> Code: 6
Team: CIN -> Code: 7
Team: CLE -> Code: 8
Team: DAL -> Code: 9
Team: DEN -> Code: 10
Team: DET -> Code: 11
Team: GB -> Code: 12
Team: HOU -> Code: 13
Team: IND -> Code: 14
Team: JAX -> Code: 15
Team: KC -> Code: 16
Team: LA -> Code: 17
Team: LAC -> Code: 18
Team: LV -> Code: 19
Team: MIA -> Code: 20
Team: MIN -> Code: 21
Team: NE -> Code: 22
Team: NO -> Code: 23
Team: NYG -> Code: 24
Team: NYJ -> Code: 25
Team: PHI -> Code: 26
Team: PIT -> Code: 27
Team: SEA -> Code: 28
Team: SF -> Code: 29
Team: TB -> Code: 30
Team: TEN -> Code: 31
Team: WAS -> Code: 32
     season  week  homeTeamAbbr  visitorTeamAbbr  homeFinalScore  \
0      2022     1            17                4              10   
1      2022     1             2               23              26   
2      2022     1             5                8              24   
3      2022     1             6          

对于队名我采用了一个编码规则，按照字母排列编码
比赛日期也采用了相似的编码（以最早的比赛日为第1天，按天数递增），比赛时间编码则是直接转化为带小数的小时数，比如20:00:00就编码为20，20:20:00就编码为约20.33

首先drop掉season，再画出热力图，看看比赛时间会不会对比赛的胜负结果产生影响。

In [18]:
# Work on a copy without 'season' and the two win-flag columns.
games_df_without_season = games_data.drop(columns=['season', 'home_win', 'visitor_win'])

# Pairwise correlation of the remaining columns.
correlation_matrix = games_df_without_season.corr()

# Heatmap of the correlation matrix.
fig = px.imshow(
    correlation_matrix,
    labels=dict(color="Correlation"),
    x=correlation_matrix.index,
    y=correlation_matrix.columns,
    color_continuous_scale='Viridis',
)

# One text annotation per cell showing the coefficient with two decimals.
annotations = [
    dict(
        x=col_label,
        y=row_label,
        text=f"{correlation_matrix.iloc[row_pos, col_pos]:.2f}",
        showarrow=False,
    )
    for row_pos, row_label in enumerate(correlation_matrix.index)
    for col_pos, col_label in enumerate(correlation_matrix.columns)
]

fig.update_layout(
    title='Correlation Heatmap of Columns (Season Column Excluded)',
    xaxis_title='Columns',
    yaxis_title='Columns',
    annotations=annotations,
)

fig.show()

发现gameId,week和gameDate强相关，这是因为比赛的时间安排决定了这几项必然强相关，相反，其他数据项的相关性并不大，我们可以放心剔除week和gameDate来减少数据维度，保留gameId，因为可能要用这项数据连接其他表

In [19]:
# Final cleaned games table: drop the constant 'season' column, the two
# win-flag helper columns, and 'week' / 'gameDate'; 'gameId' is kept.
games_data_cleaned = games_data.drop(columns=['season', 'home_win', 'visitor_win', 'week', 'gameDate'])
games_data_cleaned.head()  # data cleaning finished

Unnamed: 0,gameId,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,homeFinalScore,visitorFinalScore
0,2022090800,20.333333,17,4,10,31
1,2022091100,13.0,2,23,26,27
2,2022091101,13.0,5,8,24,26
3,2022091102,13.0,6,29,19,10
4,2022091103,13.0,7,27,20,23


至此，该表的数据清理完成

2. players.csv

首先对该表格内容进行公制化转换，转换成我们可以处理的数据

In [20]:
# Preview the players table before any unit conversion.
players_data.head()

Unnamed: 0,nflId,height,weight,birthDate,collegeName,position,displayName
0,25511,6-4,225,1977-08-03,Michigan,QB,Tom Brady
1,29550,6-4,328,1982-01-22,Arkansas,T,Jason Peters
2,29851,6-2,225,1983-12-02,California,QB,Aaron Rodgers
3,30842,6-6,267,1984-05-19,UCLA,TE,Marcedes Lewis
4,33084,6-4,217,1985-05-17,Boston College,QB,Matt Ryan


身高：厘米  体重：千克  出生日期：转化为年龄

In [21]:
# Height arrives as "feet-inches" text; split it once and convert to centimetres.
height_parts = players_data['height'].str.split('-', expand=True)
players_data['height'] = height_parts[0].astype(int) * 30.48 + height_parts[1].astype(int) * 2.54

# Weight: pounds -> kilograms.
players_data['weight'] = players_data['weight'].mul(0.453592)

# Replace the birth date with an age: 2022 minus the birth year
# (missing birth dates stay missing).
players_data['birthDate'] = pd.to_datetime(players_data['birthDate']).dt.year.rsub(2022)

# Show the converted table.
print(players_data)

      nflId  height      weight  birthDate       collegeName position  \
0     25511  193.04  102.058200       45.0          Michigan       QB   
1     29550  193.04  148.778176       40.0          Arkansas        T   
2     29851  187.96  102.058200       39.0        California       QB   
3     30842  198.12  121.109064       38.0              UCLA       TE   
4     33084  193.04   98.429464       37.0    Boston College       QB   
...     ...     ...         ...        ...               ...      ...   
1678  55200  198.12  120.655472        NaN           Indiana       DT   
1679  55212  182.88  104.326160        NaN        Iowa State      ILB   
1680  55239  187.96  136.077600        NaN      Pennsylvania       DT   
1681  55240  185.42   83.914520        NaN           Buffalo       CB   
1682  55241  187.96  127.005760        NaN  Coastal Carolina       DT   

           displayName  
0            Tom Brady  
1         Jason Peters  
2        Aaron Rodgers  
3       Marcedes Lewis 

In [22]:
# Build and display the per-column summary of the converted players table.
players_data_summary = generate_summary_table(players_data)

表格信息统计:


Unnamed: 0,Column,Data Type,Missing Values,Missing %,Unique Values,Min,Max,Mean,Median
0,nflId,int64,0,0.00%,1683,25511.0,55241.0,48221.702317,47872.0
1,height,float64,0,0.00%,16,167.64,205.74,188.604433,190.5
2,weight,float64,0,0.00%,179,69.399576,172.36496,111.458578,107.047712
3,birthDate,float64,479,28.46%,19,23.0,45.0,27.467608,27.0
4,collegeName,object,0,0.00%,226,,,,
5,position,object,0,0.00%,19,,,,
6,displayName,object,0,0.00%,1672,,,,


由之前的统计信息可知，此表在生日值上是有缺失的，因此很多人的年龄不知道，并且缺失值达到了28.46%，不能采用中值或者均值进行替换，因此我们不得不舍弃该数据列，尽管在体育上年龄对于球员表现是重要因素。

而且球员姓名也有重名，因此我们不能对球员姓名进行编码，只能使用nflId来标识球员

In [23]:
# Drop the 'birthDate' column in place (it holds the derived age at this
# point and has too many missing values to impute).
players_data.drop(['birthDate'], axis=1, inplace=True)

# Show the result.
print(players_data)

      nflId  height      weight       collegeName position       displayName
0     25511  193.04  102.058200          Michigan       QB         Tom Brady
1     29550  193.04  148.778176          Arkansas        T      Jason Peters
2     29851  187.96  102.058200        California       QB     Aaron Rodgers
3     30842  198.12  121.109064              UCLA       TE    Marcedes Lewis
4     33084  193.04   98.429464    Boston College       QB         Matt Ryan
...     ...     ...         ...               ...      ...               ...
1678  55200  198.12  120.655472           Indiana       DT    Ryder Anderson
1679  55212  182.88  104.326160        Iowa State      ILB       Jake Hummel
1680  55239  187.96  136.077600      Pennsylvania       DT      Prince Emili
1681  55240  185.42   83.914520           Buffalo       CB  Ja'Marcus Ingram
1682  55241  187.96  127.005760  Coastal Carolina       DT       C.J. Brewer

[1683 rows x 6 columns]


接下来我们进行可视化分析

In [24]:
# Count how many players share each distinct height value and plot the
# counts as a bar chart, ordered by height.
height_counts = players_data['height'].value_counts().sort_index()

fig = go.Figure(data=[go.Bar(x=height_counts.index, y=height_counts.values)])
fig.update_layout(
    title='Height Distribution',
    xaxis_title='Height (cm)',
    yaxis_title='Frequency'
)
fig.show()

In [25]:
# Histogram of player weight.
fig_weight = px.histogram(players_data, x='weight', title='Weight Distribution')
fig_weight.update_xaxes(title='Weight')
fig_weight.show()

In [26]:
# Histogram of the number of players at each position.
fig_position = px.histogram(players_data, x='position', title='Player Position Distribution')
fig_position.update_xaxes(title='Position')
fig_position.show()

In [27]:
# Box plot of weight grouped by position.
fig_box = px.box(players_data, x='position', y='weight', title='Position vs Weight')
fig_box.update_xaxes(title='Position')
fig_box.update_yaxes(title='Weight')
fig_box.show()

In [28]:
# Mean height and weight for every position.
avg_height_weight = (
    players_data
    .groupby('position')[['height', 'weight']]
    .mean()
    .reset_index()
)

# Grouped bar chart: one pair of bars (height, weight) per position.
fig_avg = px.bar(
    avg_height_weight,
    x='position',
    y=['height', 'weight'],
    barmode='group',
    title='Average Height and Weight by Position',
)
fig_avg.update_xaxes(title='Position')
fig_avg.update_yaxes(title='Value')
fig_avg.show()

In [29]:
# Body-mass index: weight divided by the square of (height / 100).
players_data['bmi'] = players_data['weight'] / ((players_data['height'] / 100) ** 2)

# Histogram of the BMI values.
fig_bmi = px.histogram(players_data, x='bmi', title='BMI Distribution')
fig_bmi.update_xaxes(title='BMI')
fig_bmi.show()

可视化分析完成后，接下来对其collegeName和position两列进行编码，将姓名drop掉使用nflId标识球员

In [30]:
# Distinct college names and positions, each sorted alphabetically.
college_names = sorted(players_data['collegeName'].unique())
positions = sorted(players_data['position'].unique())

# Assign 1-based integer codes following the sorted order.
college_encoding = dict(zip(college_names, range(1, len(college_names) + 1)))
position_encoding = dict(zip(positions, range(1, len(positions) + 1)))

# Replace the text values with their integer codes.
for text_column, encoding in (('collegeName', college_encoding),
                              ('position', position_encoding)):
    players_data[text_column] = players_data[text_column].map(encoding)

# Players are identified by nflId from here on, so drop the display name.
players_data.drop(columns=['displayName'], inplace=True)

# Show the encoded table.
print(players_data)

      nflId  height      weight  collegeName  position        bmi
0     25511  193.04  102.058200          109        14  27.387557
1     29550  193.04  148.778176            8        17  39.924971
2     29851  187.96  102.058200           29        14  28.887970
3     30842  198.12  121.109064          196        18  30.854606
4     33084  193.04   98.429464           22        14  26.413777
...     ...     ...         ...          ...       ...        ...
1678  55200  198.12  120.655472           82         5  30.739046
1679  55212  182.88  104.326160           85         9  31.193262
1680  55239  187.96  136.077600          146         5  38.517294
1681  55240  185.42   83.914520           28         2  24.407537
1682  55241  187.96  127.005760           39         5  35.949474

[1683 rows x 6 columns]
