In [6]:
import requests
from bs4 import BeautifulSoup
import re

In [7]:
def get_page_source(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9',
            'Connection': 'keep-alive'
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status()  
        response.encoding = "utf-8"
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

<br>

# 获取球队

whoscored.com/robots.txt:

```
User-agent: *
Disallow: /Accounts/
Disallow: /Predictions/
Disallow: /Users/
Disallow: /accounts/
Disallow: /predictions/
Disallow: /users/
```

In [115]:
url_laliga = 'https://www.whoscored.com/teams/52'
html_laliga = get_page_source(url_laliga)
print(html_laliga)

# 粗略发现：连续请求会被403

Error fetching https://www.whoscored.com/teams/52: 403 Client Error: Forbidden for url: https://www.whoscored.com/teams/52
None


In [116]:
url1 = 'https://www.whoscored.com/teams/52'      # laliga: real madrid page
url2 = 'https://www.whoscored.com/teams/304/'   # ligue 1: psg page
url3 = 'https://www.whoscored.com/teams/37/'   # bundesliga: bayern page
url4 = 'https://www.whoscored.com/teams/26/'   # premier league: liverpool page
url5 = 'https://www.whoscored.com/teams/75/'   # serie a: inter page

top5_url = [url1, url2, url3, url4, url5]
top5_dict = {'LaLiga':0, 'Ligue 1':0, 'Bundesliga':0, 'Premier League':0, 'Serie A':0}

import time
for url, key in zip(top5_url, top5_dict.keys()):
    print(url, '\n', key, '\n\n...')
    html = get_page_source(url)
    soup = BeautifulSoup(html, 'html.parser')
    top5_dict[key] = {option.text.strip(): option['value'] for option in soup.select('select#teams option')}
    time.sleep(60)

https://www.whoscored.com/teams/52 
 LaLiga 

...
Error fetching https://www.whoscored.com/teams/52: 403 Client Error: Forbidden for url: https://www.whoscored.com/teams/52


TypeError: object of type 'NoneType' has no len()

### whoscored的反爬机制太狠了，如何解决？
1. 使用selenium, 参考 https://github.com/cboutaud/whoscraped
2. 换一个网站
| 网站            | 数据类型     | 是否需要模拟请求头 | 防爬强度 | 是否有 API | 
| ------------- | -------- | --------- | ---- | ----------- |
| Sofascore     | 全面       | 是（轻度）     | 低    | ✅（隐藏）   | 
| Transfermarkt | 转会、估值    | 是         | 中低   | ❌       |
| FBref         | 技术统计     | 否（基本不需要）  | 极低   | ❌       | 
| Flashscore    | 实时比分     | 是         | 中    | ❌       |
| football-data | 基础数据 API | 有 token   | 无    | ✅       |


In [135]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

options = Options()
options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"  # 实际路径
options.add_argument("--headless")
options.add_argument("window-size=1920x1080")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36")

service = Service(executable_path="chromedriver.exe")  # 指定 chromedriver 路径
driver = webdriver.Chrome(service=service, options=options)

# 以下代码运行后会陷入长时间运行，可能是连接不上
# driver.get("https://www.whoscored.com/teams/52")
# html = driver.page_source
# print(html)
# driver.quit()

<br>

fbref.com/robots.txt 文件没有禁止：
```
/en/players/（球员页）
/en/comps/（联赛数据页）
/en/squads/（球队页）
```

In [196]:
url_fbref = 'https://fbref.com/en/'

url1 = 'https://fbref.com/en/comps/9/Premier-League-Stats'  # Premier League
url2 = 'https://fbref.com/en/comps/12/La-Liga-Stats'   # La Liga
url3 = 'https://fbref.com/en/comps/20/Bundesliga-Stats' # Bundesliga
url4 = 'https://fbref.com/en/comps/11/Serie-A-Stats'  # Serie A
url5 = 'https://fbref.com/en/comps/13/Ligue-1-Stats'  # Ligue 1

In [137]:
# print(get_page_source(url1))

    
      
<!DOCTYPE html>
<html data-version="klecko-" data-root="/home/fb/deploy/www/base" lang="en" class="no-js" >
<head>
    <meta charset="utf-8">
    <meta http-equiv="x-ua-compatible" content="ie=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=2.0" />
    <link rel="dns-prefetch" href="https://cdn.ssref.net/req/202505073" />
<script>
/* https://docs.osano.com/hc/en-us/articles/22469433444372-Google-Consent-Mode-v2  */
  window.dataLayer = window.dataLayer ||[];
      function gtag(){dataLayer.push(arguments);}
      gtag('consent','default',{
        'ad_storage':'denied',
        'analytics_storage':'denied',
        'ad_user_data':'denied',
        'ad_personalization':'denied',
        'personalization_storage':'denied',
        'functionality_storage':'granted',
        'security_storage':'granted',
        'wait_for_update': 500
      });
      gtag("set", "ads_data_redaction", true);
</script>
<script src="https://cmp.osano.

``` html
<div class="table_container tabbed current is_setup" id="div_results2024-2025111_overall">  // 意甲表格
<div class="table_container tabbed current is_setup" id="div_results2024-202591_overall">  // 英超表格  
```
div中套着table
``` html
<table class="stats_table sortable min_width force_mobilize now_sortable" id="results2024-202591_overall" data-cols-to-freeze=",2">
```
这才是表格本体，直接让`soup`找它就行

In [None]:
top5_url = [url1, url2, url3, url4, url5]
top5_dict = {'Premier League':0, 'LaLiga':0, 'Bundesliga':0, 'Serie A':0, 'Ligue 1':0}

In [215]:
league_codes = [9,12,20,11,13]

for code, key in zip(league_codes, top5_dict.keys()):
    url = 'https://fbref.com/en/comps/'+str(code)
    print(url, '\n', key, '\n\n...')
    html = get_page_source(url)
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', id='results2024-2025'+str(code)+'1_overall')
    top5_dict[key] = {
        (a := row.find('td', {'data-stat': 'team'}).find('a')).text.strip(): 
        'https://fbref.com' + a['href'].strip()
        for row in table.tbody.find_all('tr')
        if row.find('td', {'data-stat': 'team'}) and row.find('td', {'data-stat': 'team'}).find('a')
    }

print('Done')

https://fbref.com/en/comps/9 
 Premier League 

...
https://fbref.com/en/comps/12 
 LaLiga 

...
https://fbref.com/en/comps/20 
 Bundesliga 

...
https://fbref.com/en/comps/11 
 Serie A 

...
https://fbref.com/en/comps/13 
 Ligue 1 

...
Done


In [216]:
print([len(l) for l in top5_dict.values()])
top5_dict

[20, 20, 18, 20, 18]


{'Premier League': {'Liverpool': 'https://fbref.com/en/squads/822bd0ba/Liverpool-Stats',
  'Arsenal': 'https://fbref.com/en/squads/18bb7c10/Arsenal-Stats',
  'Newcastle Utd': 'https://fbref.com/en/squads/b2b47a98/Newcastle-United-Stats',
  'Manchester City': 'https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats',
  'Chelsea': 'https://fbref.com/en/squads/cff3d9bb/Chelsea-Stats',
  'Aston Villa': 'https://fbref.com/en/squads/8602292d/Aston-Villa-Stats',
  "Nott'ham Forest": 'https://fbref.com/en/squads/e4a775cb/Nottingham-Forest-Stats',
  'Brentford': 'https://fbref.com/en/squads/cd051869/Brentford-Stats',
  'Brighton': 'https://fbref.com/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
  'Bournemouth': 'https://fbref.com/en/squads/4ba7cbea/Bournemouth-Stats',
  'Fulham': 'https://fbref.com/en/squads/fd962109/Fulham-Stats',
  'Crystal Palace': 'https://fbref.com/en/squads/47c64c55/Crystal-Palace-Stats',
  'Everton': 'https://fbref.com/en/squads/d3fd31cc/Everton-Stats',
  'Wolve

---
TEST

In [144]:
code = 9
html = get_page_source('https://fbref.com/en/comps/'+str(code))
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', id='results2024-2025'+str(code)+'1_overall')

In [184]:
teams = []

for row in table.tbody.find_all('tr'):
    team_cell = row.find('td', {'data-stat': 'team'})  
    if team_cell:
        team_name = team_cell.text.strip()
        teams.append(team_name)
        
print(teams)

<td class="left" data-stat="team"><img alt="Club Crest" height="13" itemscope="image" src="https://cdn.ssref.net/req/202505073/tlogo/fb/mini.33c895d4.png" style="vertical-align:text-top" width="13"/> <a href="/en/squads/33c895d4/Southampton-Stats">Southampton</a></td>


以下代码用于理解上一块代码：

In [192]:
def pt(x):
    print(type(x))

In [227]:
print(table)
pt(table)

# print(table.tbody.prettify())
# pt(table.tbody)

<table class="stats_table sortable min_width force_mobilize" data-cols-to-freeze=",2" id="results2024-2025131_overall"> <caption>Ligue 1 Table</caption> <colgroup><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/></colgroup> <thead> <tr> <th aria-label="Rank" class="poptip sort_default_asc center" data-stat="rank" data-tip="&lt;strong&gt;Rank&lt;/strong&gt;&lt;br&gt;Squad finish in competition&lt;br&gt;Finish within the league or competition.&lt;br&gt;For knockout competitions may show final round reached.&lt;br&gt;Colors and arrows represent promotion/relegation or qualifiation for continental cups.&lt;br&gt;Trophy indicates team won league whether by playoffs or by leading the table.&lt;br&gt;Star indicates topped table in league USING another means of naming champion." scope="col">Rk</th> <th aria-label="Squad" class="poptip sort_default_asc center" data-stat="team" scope="col">Squad</th> <th aria-label="Matches P

In [195]:
# pt(table.tbody.find_all('tr'))
# for p in table.tbody.find_all('tr'):
#     print(p, '\n')
    
row = table.tbody.find_all('tr')[0]
# pt(row)
print(row.prettify())

cell = row.find('td', {'data-stat': 'team'})
print(cell.prettify())

<tr>
 <th class="right" csk="1" data-stat="rank" scope="row">
  1
 </th>
 <td class="left" data-stat="team">
  <img alt="Club Crest" height="13" itemscope="image" src="https://cdn.ssref.net/req/202505073/tlogo/fb/mini.822bd0ba.png" style="vertical-align:text-top" width="13"/>
  <a href="/en/squads/822bd0ba/Liverpool-Stats">
   Liverpool
  </a>
 </td>
 <td class="right" data-stat="games">
  36
 </td>
 <td class="right" data-stat="wins">
  25
 </td>
 <td class="right" data-stat="ties">
  8
 </td>
 <td class="right" data-stat="losses">
  3
 </td>
 <td class="right" data-stat="goals_for">
  83
 </td>
 <td class="right" data-stat="goals_against">
  37
 </td>
 <td class="right" data-stat="goal_diff">
  +46
 </td>
 <td class="right" data-stat="points">
  83
 </td>
 <td class="right" data-stat="points_avg">
  2.31
 </td>
 <td class="right" data-stat="xg_for">
  77.8
 </td>
 <td class="right" data-stat="xg_against">
  34.6
 </td>
 <td class="right" data-stat="xg_diff">
  +43.2
 </td>
 <td class

tr: table row   
td: table data

row就是：
<figure>
    <left> <img src="images/liverpoolrow.png"  alt='missing' width="800"  ><left/>
<figure/>
cell就是：
<figure>
    <left> <img src="images/liverpoolcell.png"  alt='missing' width="100"  ><left/>
<figure/>
        其中包含三个元素：Liverpool文字，超链接，队标图片

In [174]:
teams_dict = {
    (a := row.find('td', {'data-stat': 'team'}).find('a')).text.strip(): 
    'https://fbref.com' + a['href'].strip()
    for row in table.tbody.find_all('tr')
    if row.find('td', {'data-stat': 'team'}) and row.find('td', {'data-stat': 'team'}).find('a')
}

`(a := row.find('td', {'data-stat': 'team'})`<br>
整个表达式的值为`:=`右边的表达式的值，且该值赋给了`a`

In [157]:
teams_dict

{'Liverpool': 'https://fbref.com/en/squads/822bd0ba/Liverpool-Stats',
 'Arsenal': 'https://fbref.com/en/squads/18bb7c10/Arsenal-Stats',
 'Newcastle Utd': 'https://fbref.com/en/squads/b2b47a98/Newcastle-United-Stats',
 'Manchester City': 'https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats',
 'Chelsea': 'https://fbref.com/en/squads/cff3d9bb/Chelsea-Stats',
 'Aston Villa': 'https://fbref.com/en/squads/8602292d/Aston-Villa-Stats',
 "Nott'ham Forest": 'https://fbref.com/en/squads/e4a775cb/Nottingham-Forest-Stats',
 'Brentford': 'https://fbref.com/en/squads/cd051869/Brentford-Stats',
 'Brighton': 'https://fbref.com/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
 'Bournemouth': 'https://fbref.com/en/squads/4ba7cbea/Bournemouth-Stats',
 'Fulham': 'https://fbref.com/en/squads/fd962109/Fulham-Stats',
 'Crystal Palace': 'https://fbref.com/en/squads/47c64c55/Crystal-Palace-Stats',
 'Everton': 'https://fbref.com/en/squads/d3fd31cc/Everton-Stats',
 'Wolves': 'https://fbref.com/en/squads

<br>

# 获取球员信息

In [229]:
print(get_page_source("https://fbref.com/en/squads/d48ad4ff/Napoli-Stats"))

    
      
<!DOCTYPE html>
<html data-version="klecko-" data-root="/home/fb/deploy/www/base" lang="en" class="no-js" >
<head>
    <meta charset="utf-8">
    <meta http-equiv="x-ua-compatible" content="ie=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=2.0" />
    <link rel="dns-prefetch" href="https://cdn.ssref.net/nocdn" />
<script>
/* https://docs.osano.com/hc/en-us/articles/22469433444372-Google-Consent-Mode-v2  */
  window.dataLayer = window.dataLayer ||[];
      function gtag(){dataLayer.push(arguments);}
      gtag('consent','default',{
        'ad_storage':'denied',
        'analytics_storage':'denied',
        'ad_user_data':'denied',
        'ad_personalization':'denied',
        'personalization_storage':'denied',
        'functionality_storage':'granted',
        'security_storage':'granted',
        'wait_for_update': 500
      });
      gtag("set", "ads_data_redaction", true);
</script>
<script src="https://cmp.osano.com/16CG

``` html
<div class="table_container tabbed current is_setup" id="div_stats_standard_11"> // 那不勒斯球员表
<div class="table_container tabbed current is_setup" id="div_stats_standard_9">  // 利物浦球员表
<div class="table_container tabbed current is_setup" id="div_stats_standard_9">  // 阿森纳球员表
```

同上

In [266]:
player_dict = {'Premier League':0, 'LaLiga':0, 'Bundesliga':0, 'Serie A':0, 'Ligue 1':0}
league_codes = [9,12,20,11,13]

for league_name, league_dict, code in zip(top5_dict.keys(), top5_dict.values(), league_codes):
    if code == 9:
        continue
    print(league_name, ':')
    s = set()  # 用集合储存一个联赛的所有球员
    for team, url in league_dict.items():
        print(f"\t{team}...")
#         print('\t', code, url)
        html = get_page_source(url)
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', id='stats_standard_'+str(code))
        rows = table.tbody.find_all('tr')
        for r in rows:
            cell = r.find('th')
            s.add(cell.text.strip())
        time.sleep(5)
    player_dict[league_name] = s

LaLiga :
	Barcelona...
	Real Madrid...
	Atlético Madrid...
	Athletic Club...
	Villarreal...
	Betis...
	Celta Vigo...
	Rayo Vallecano...
	Mallorca...
	Osasuna...
	Valencia...
	Real Sociedad...
	Girona...
	Sevilla...
	Getafe...
	Espanyol...
	Alavés...
	Leganés...
	Las Palmas...
	Valladolid...
Bundesliga :
	Bayern Munich...
	Leverkusen...
	Eint Frankfurt...
	Freiburg...
	Dortmund...
	Mainz 05...
	RB Leipzig...
	Werder Bremen...
	Stuttgart...
	Gladbach...
	Augsburg...
	Wolfsburg...
	Union Berlin...
	St. Pauli...
	Hoffenheim...
	Heidenheim...
	Holstein Kiel...
	Bochum...
Serie A :
	Napoli...
	Inter...
	Atalanta...
	Juventus...
	Lazio...
	Roma...
	Bologna...
	Milan...
	Fiorentina...
	Como...
	Torino...
	Udinese...
	Genoa...
	Cagliari...
	Hellas Verona...
	Parma...
	Venezia...
	Lecce...
	Empoli...
	Monza...
Ligue 1 :
	Paris S-G...
	Marseille...
	Monaco...
	Nice...
	Lille...
	Strasbourg...
	Lyon...
	Brest...
	Lens...
	Auxerre...
	Rennes...
	Toulouse...
	Angers...
	Reims...
	Nantes...
	Le Havre

In [9]:
player_dict

NameError: name 'player_dict' is not defined

---
TEST

In [253]:
url = "https://fbref.com/en/squads/d48ad4ff/Napoli-Stats"
code = 11

html = get_page_source(url)
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', id='stats_standard_'+str(code))

In [254]:
# print(table.prettify())

rows = table.tbody.find_all('tr')
# print(rows[0].prettify())
cell = rows[0].find('th')
# print(cell)
cell.text.strip()

'Amir Rrahmani'

In [260]:
s = {1,3,4}
pt(s)
s.add(1)
s.add(66)
s
s1 = {}
pt(s1)
s2 = set()
pt(s2)

<class 'set'>
<class 'dict'>
<class 'set'>


<br>

# 德转网站上搜索球员

In [4]:
from urllib.parse import quote_plus

def search_on_transfermarkt(query):
    encoded_query = quote_plus(query)
    url = f'https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query={encoded_query}'
    return get_page_source(url)

In [8]:
query = "Kylian Mbappe"

s = search_on_transfermarkt(query)
print(s)

<!DOCTYPE html>
<html lang="en">

<head>
    
<script type="text/javascript" data-description="sourcepoint stub code">
    !function () { var e = function () { var e, t = "__tcfapiLocator", a = [], n = window; for (; n;) { try { if (n.frames[t]) { e = n; break } } catch (e) { } if (n === window.top) break; n = n.parent } e || (!function e() { var a = n.document, r = !!n.frames[t]; if (!r) if (a.body) { var i = a.createElement("iframe"); i.style.cssText = "display:none", i.name = t, a.body.appendChild(i) } else setTimeout(e, 5); return !r }(), n.__tcfapi = function () { for (var e, t = arguments.length, n = new Array(t), r = 0; r < t; r++)n[r] = arguments[r]; if (!n.length) return a; if ("setGdprApplies" === n[0]) n.length > 3 && 2 === parseInt(n[1], 10) && "boolean" == typeof n[3] && (e = n[3], "function" == typeof n[2] && n[2]("set", !0)); else if ("ping" === n[0]) { var i = { gdprApplies: e, cmpLoaded: !1, cmpStatus: "stub" }; "function" == typeof n[2] && n[2](i) } else a.push(n) }, 

---
TEST

In [272]:
from urllib.parse import quote_plus

query = "Kylian Mbappe"
encoded_query = quote_plus(query)

print(encoded_query)
pt(encoded_query)

# 构造搜索 URL
url = f'https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query={encoded_query}'
get_page_source(url)

Kylian+Mbappe
<class 'str'>


'<!DOCTYPE html>\n<html lang="en">\n\n<head>\n    \n<script type="text/javascript" data-description="sourcepoint stub code">\n    !function () { var e = function () { var e, t = "__tcfapiLocator", a = [], n = window; for (; n;) { try { if (n.frames[t]) { e = n; break } } catch (e) { } if (n === window.top) break; n = n.parent } e || (!function e() { var a = n.document, r = !!n.frames[t]; if (!r) if (a.body) { var i = a.createElement("iframe"); i.style.cssText = "display:none", i.name = t, a.body.appendChild(i) } else setTimeout(e, 5); return !r }(), n.__tcfapi = function () { for (var e, t = arguments.length, n = new Array(t), r = 0; r < t; r++)n[r] = arguments[r]; if (!n.length) return a; if ("setGdprApplies" === n[0]) n.length > 3 && 2 === parseInt(n[1], 10) && "boolean" == typeof n[3] && (e = n[3], "function" == typeof n[2] && n[2]("set", !0)); else if ("ping" === n[0]) { var i = { gdprApplies: e, cmpLoaded: !1, cmpStatus: "stub" }; "function" == typeof n[2] && n[2](i) } else a.push