In [7]:
# Imported locally so this cell also runs on a freshly restarted kernel: it sits
# before the notebook's main import cell, so `pd` would otherwise be undefined here.
import pandas as pd

# Load the "TeamStats" sheet; file lines 1 and 2 (0-indexed) are skipped so that
# line 0 supplies the column headers.
df = pd.read_excel("Team Stats Arsenal.xlsx", sheet_name="TeamStats", skiprows=[1, 2])



In [52]:
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# Load the "TeamStats" sheet; file lines 1 and 2 (0-indexed) are skipped so that
# line 0 supplies the column headers.
df = pd.read_excel("Team Stats Arsenal.xlsx", sheet_name="TeamStats", skiprows=[1, 2])

# Name the untitled ("Unnamed: N") columns that follow each titled header.
new_cols = []
current_title = None
count = 0

# Naming rules: for each group title, the names given to the untitled columns that
# follow it, in order.
# NOTE(review): the titled column itself keeps the group title, so names[0] goes to
# the first *untitled* column after it. If the titled column actually holds the
# first metric of the group (e.g. total shots), every name below is shifted by one
# column -- confirm against the spreadsheet layout.
# NOTE(review): "Accuracy" is listed for three different groups, so duplicate column
# names are possible if those groups have three untitled columns -- confirm.
semantic_map = {
    "Shots / on target": ["Shots", "On_Target", "Shots_on_target_Accuracy"],
    "Passes / accurate": ["Passes", "Accurate", "Passes_accurate_Accuracy"],
    "Losses / Low / Medium / High": ["Losses", "Losses_Low", "Losses_Medium", "Losses_High"],
    "Recoveries / Low / Medium / High": ["Recoveries", "Recoveries_Low", "Recoveries_Medium", "Recoveries_High"],
    "Duels / won": ["Duels", "Won", "Duels / won_Accuracy"],
    "Shots from outside penalty area / on target": ["Shots_from_outside_penalty_area", "Shots_from_outside_penalty_area_On_Target", "Shots from outside penalty area/on target_Accuracy"],
    "Positional attacks / with shots": ["Positional_attacks", "Positional_attacks_WithShots", "Accuracy"],
    "Counterattacks / with shots": ["Counterattacks", "Counterattacks_WithShots", "Accuracy"],
    "Set pieces / with shots": ["Set_pieces", "Set_pieces_WithShots", "Set pieces/with shots_Accuracy"],
    "Corners / with shots": ["Corners", "Corners_WithShots", "Corners/with shots_Accuracy"],
    "Free kicks / with shots": ["Free_kicks", "Free_kicks_WithShots", "Free kicks/with shots_Accuracy"],
    "Penalties / converted": ["Penalties", "Converted", "Penalties/converted_Accuracy"],
    "Crosses / accurate": ["Crosses", "Crosses_Accurate", "Crosses/accurate_Accuracy"],
    "Penalty area entries (runs / crosses)": ["Penalty_area_entries", "Penalty_area_entries_Runs", "Penalty_area_entries_Crosses"],
    "Offensive duels / won": ["Offensive_duels", "Offensive duels_Won", "Offensive duels/won_Accuracy"],
    "Shots against / on target": ["Shots_against", "Shots_against_OnTarget", "Shots against/on target_Accuracy"],
    "Defensive duels / won": ["Defensive_duels", "Defensive duels_Won", "Defensive duels/won_Accuracy"],
    "Aerial duels / won": ["Aerial_duels", "Aerial duels_Won", "Aerial duels/won_Accuracy"],
    "Sliding tackles / successful": ["Sliding_tackles", "Successful", "Sliding tackles/successful_Accuracy"],
    "Forward passes / accurate": ["Forward_passes", "Forward_passes_Accurate", "Forward passes/accurate_Accuracy"],
    "Back passes / accurate": ["Back_passes", "Back_passes_Accurate", "Back passes/accurate_Accuracy"],
    "Lateral passes / accurate": ["Lateral_passes", "Lateral_passes_Accurate", "Lateral passes/accurate_Accuracy"],
    "Long passes / accurate": ["Long_passes", "Long_passes_Accurate", "Long passes/accurate_Accuracy"],
    "Passes to final third / accurate": ["Passes_to_final_third", "Passes_to_final_third_Accurate", "Accuracy"],
    "Progressive passes / accurate": ["Progressive_passes", "Progressive_passes_Accurate", "Progressive passes/accurate_Accuracy"],
    "Smart passes / accurate": ["Smart_passes", "Smart_passes_Accurate", "Smart passes/accurate_Accuracy"],
    "Throw ins / accurate": ["Throw_ins", "Throw_ins_Accurate", "Throw ins/accurate_Accuracy"],
}

for col in df.columns:
    if "Unnamed" not in str(col):
        # A titled column starts a new group and keeps its own title as its name.
        current_title = str(col).strip()
        count = 0
        new_cols.append(current_title)
    else:
        count += 1
        names = semantic_map.get(current_title, [])
        if count - 1 < len(names):
            new_cols.append(names[count - 1])
        else:
            # Every source column must receive exactly one name; otherwise the
            # assignment to df.columns below fails with a length mismatch.
            # Fall back to "<group title>_<position>" for unmapped columns.
            new_cols.append(f"{current_title or 'Unnamed'}_{count}")

# Normalise the names: spaces, "/", "," and "%" become "_", runs of "_" collapse
# to one, and leading/trailing "_" are removed.
cleaned = []
for c in new_cols:
    name = re.sub(r"[ /,%]", "_", c).strip()
    name = re.sub(r"_+", "_", name)
    name = name.strip("_")
    cleaned.append(name)
df.columns = cleaned

# Classify each fixture as a home or away game
def get_home_away(match_str):
    """Return "Home", "Away" or "Unknown" for a match description.

    A string starting with "Arsenal - " is "Home"; otherwise a string that
    contains " - Arsenal" is "Away". Any other string, and any non-string
    value, is "Unknown".
    """
    if not isinstance(match_str, str):
        return "Unknown"
    if match_str.startswith("Arsenal - "):
        return "Home"
    return "Away" if " - Arsenal" in match_str else "Unknown"

# Tag every row with its venue, derived from the "Match" text.
df["HomeAway"] = df["Match"].apply(get_home_away)

# Split the rows into Arsenal's own rows and everyone else's.
is_arsenal = df["Team"] == "Arsenal"
arsenal_df = df[is_arsenal].copy()
opp_df = df[df["Team"] != "Arsenal"].copy()

# Columns present on both sides, excluding the team label.
common_cols = list(set(arsenal_df.columns) & set(opp_df.columns))
common_cols.remove("Team")

# Pair each Arsenal row with the opponent row of the same match; columns that
# exist on both sides get "_Arsenal" / "_Opponents" suffixes.
merged = arsenal_df.merge(
    opp_df,
    on="Match",
    suffixes=("_Arsenal", "_Opponents"),
)

# Target: Arsenal goals scaled to a 90-minute match. When no duration column is
# available the raw goal count is used instead.
if "Duration_Arsenal" in merged.columns:
    merged["Goals_per_90"] = merged["Goals_Arsenal"] / (merged["Duration_Arsenal"] / 90)
else:
    merged["Goals_per_90"] = merged["Goals_Arsenal"]
# A zero duration produces +/-inf, which fillna() would leave in place and which
# the regressor cannot accept as a target; treat it as missing before filling.
merged["Goals_per_90"] = (
    merged["Goals_per_90"].replace([np.inf, -np.inf], np.nan).fillna(0)
)
target_col = "Goals_per_90"

# Date columns are converted to day offsets further down. They must be kept out of
# the label encoding: once encoded to integers, pd.to_datetime would read those
# integers as epoch offsets and every day offset would collapse to 0.
date_cols = ["Date_Arsenal", "Date_Opponents"]

# Encode the remaining text columns as integer labels.
# NOTE(review): "Match" is a text column and stays in X after encoding; if it
# embeds the scoreline it leaks the target -- confirm and drop it if so.
for col in merged.select_dtypes(include=['object']).columns:
    if col in date_cols:
        continue
    merged[col] = LabelEncoder().fit_transform(merged[col].astype(str))

# y is the prediction target; every other column is a feature. Columns whose name
# contains "Goal" or "Conceded" are removed from X to avoid leaking the answer.
X = merged.drop(
    columns=[c for c in merged.columns
             if "Goal" in c
             or "Conceded" in c],
    errors="ignore"
)
y = merged["Goals_per_90"]

# Express each date as the number of days since the earliest date in that column;
# unparseable values become NaT and end up as 0 after the fillna below.
for col in date_cols:
    if col in X.columns:
        X[col] = pd.to_datetime(X[col], errors='coerce')
        X[col] = (X[col] - X[col].min()).dt.days
# Force everything to numeric; anything non-convertible or missing becomes 0.
X = X.apply(pd.to_numeric, errors='coerce').fillna(0)

# Fit a random forest (fixed seed for repeatable results) on all features.
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X, y)

# Rank the features by the forest's importance scores, largest first.
importance_by_feature = pd.Series(data=rf.feature_importances_, index=X.columns)
feature_importances = importance_by_feature.sort_values(ascending=False)
top20 = feature_importances.iloc[:20]

# Show the 20 most important features.
print("Top 20 Features:", top20, sep="\n")

# Totals: all features versus the top 20 only.
print("Sum of all feature importances:", feature_importances.sum())
print("Sum of top20 feature importances:", top20.sum())

Top 20 Features:
Set_pieces_with_shots_Arsenal            0.113032
Shots_Arsenal                            0.091079
On_Target_Arsenal                        0.084017
Recoveries_Low_Medium_High_Opponents     0.080468
Shots_against_OnTarget_Opponents         0.065563
Long_passes_Accurate_Opponents           0.061135
Shots_against_Opponents                  0.058651
xG_Arsenal                               0.039378
Aerial_duels_Opponents                   0.028844
Progressive_passes_Accurate_Arsenal      0.022057
Progressive_passes_accurate_Opponents    0.013059
Interceptions_Opponents                  0.010822
Forward_passes_Accurate_Opponents        0.010034
xG_Opponents                             0.009250
Long_passes_Accurate_Arsenal             0.007965
Lateral_passes_Accurate_Opponents        0.007349
Corners_WithShots_Arsenal                0.007268
Match_tempo_Opponents                    0.006510
Back_passes_Accurate_Arsenal             0.006499
Average_pass_length_Arsenal      