In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from urllib.parse import urljoin
import logging
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from webdriver_manager.chrome import ChromeDriverManager

# --- Cấu hình Logging ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [None]:
def init_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920x1080")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )

    # Đường dẫn đến ChromeDriver trong thư mục chrome-linux64
    chrome_driver_path = "/home/minh/codeproject/epl_crawl/chrome-linux64/chrome"

    try:
        service = Service(chrome_driver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)
        logging.info("WebDriver đã được khởi tạo.")
        return driver
    except Exception as e:
        logging.error(f"Lỗi khởi tạo WebDriver: {e}")
        return None

In [3]:
# --- Hàm lấy link Match Report từ trang lịch thi đấu --- (Giữ nguyên bản Selenium)
def get_match_report_links_selenium(driver, schedule_url, base_url="https://fbref.com"):
    links = []
    if not driver: return links
    try:
        logging.info(f"Đang lấy link từ: {schedule_url} bằng Selenium")
        driver.get(schedule_url)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "table[id^='sched_'] , table.stats_table")))
        time.sleep(1)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        table = soup.find('table', id=re.compile(r'^sched_.*'))
        if not table: table = soup.find('table', class_='stats_table')
        if not table: logging.error(f"Không tìm thấy bảng lịch thi đấu: {schedule_url}"); return []

        match_report_cells = table.find_all('td', {'data-stat': 'match_report'})
        for cell in match_report_cells:
            link_tag = cell.find('a', href=True)
            if link_tag:
                relative_url = link_tag['href']
                if '/en/matches/' in relative_url:
                    full_url = urljoin(base_url, relative_url)
                    links.append(full_url)
        logging.info(f"Tìm thấy {len(links)} link Match Report.")
        return list(dict.fromkeys(links)) # Loại bỏ trùng lặp

    except Exception as e: logging.error(f"Lỗi lấy link từ {schedule_url}: {e}", exc_info=False); return []

# --- Hàm lấy League và Season --- (Giữ nguyên)
def get_league_season_info(driver, schedule_url):
    league_name, season_str = None, None
    if not driver: return league_name, season_str
    try:
        logging.info(f"Đang lấy thông tin League/Season từ: {schedule_url}")
        driver.get(schedule_url)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "h1")))
        time.sleep(1)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        h1_tag = soup.find('h1')
        if h1_tag:
            season_match = re.search(r'(\d{4}-\d{4}|\d{4})', h1_tag.text)
            if season_match: season_str = season_match.group(1)
            league_name_raw = h1_tag.text
            if season_str: league_name_raw = league_name_raw.replace(season_str, '')
            league_name_raw = re.sub(r'Scores?.+Fixtures?', '', league_name_raw, flags=re.IGNORECASE).strip()
            league_name = league_name_raw
        logging.info(f"League: {league_name}, Season: {season_str}")
        return league_name, season_str
    except Exception as e: logging.error(f"Lỗi lấy league/season từ {schedule_url}: {e}", exc_info=False); return None, None

In [4]:
# --- Hàm phụ: Lấy đội hình, đội hình dự bị và sơ đồ chiến thuật ---
def get_lineup_formation_bench(soup, team_id_char, match_url):
    starting_lineup = []; bench = []; formation = None
    try:
        lineup_div = soup.find('div', class_='lineup', id=team_id_char)
        if lineup_div:
            lineup_table = lineup_div.find('table')
            if lineup_table:
                header_th = lineup_table.find('th', colspan="2")
                if header_th:
                    header_text = header_th.get_text(strip=True); formation_match = re.search(r'\(([\d\-]+)\)', header_text)
                    if formation_match: formation = formation_match.group(1)

                bench_header_row_found = None
                bench_header_th = lineup_table.find('th', string='Bench')
                if bench_header_th: bench_header_row_found = bench_header_th.find_parent('tr')

                tbody = lineup_table.find('tbody')
                if tbody:
                    player_rows = tbody.find_all('tr')
                    bench_started = False; starter_count = 0
                    for row in player_rows:
                        if row == bench_header_row_found: bench_started = True; continue
                        if row.find('th') and not row.find('a', href=lambda href: href and '/players/' in href): continue
                        player_link = row.find('a', href=lambda href: href and '/players/' in href)
                        if player_link:
                            player_name = player_link.text.strip()
                            if player_name:
                                if not bench_started and starter_count < 11: starting_lineup.append(player_name); starter_count += 1
                                elif bench_started or starter_count >= 11: bench.append(player_name)
                    if not bench_header_row_found and len(starting_lineup) == 11:
                        all_player_links = lineup_table.find_all('a', href=lambda href: href and '/players/' in href)
                        all_player_names = [link.text.strip() for link in all_player_links if link.text.strip()]
                        if len(all_player_names) > 11: bench = all_player_names[11:]
    except Exception as e:
        logging.warning(f"[{match_url}] Lỗi khi lấy lineup/bench team '{team_id_char}': {e}", exc_info=False) # Giảm độ chi tiết log lỗi

    lineup_output = [formation] + starting_lineup if formation else starting_lineup
    # Đảm bảo lineup_output là một list duy nhất để ghi vào CSV
    lineup_str = ", ".join(filter(None, lineup_output)) # Chuyển list thành string
    bench_str = ", ".join(filter(None, bench)) # Chuyển list thành string

    # Trả về dạng string để ghi CSV dễ hơn, hoặc giữ nguyên list nếu muốn xử lý sau
    return lineup_str, bench_str
    # return lineup_output, bench # Nếu muốn trả về list

In [5]:
# --- Hàm phụ: Lấy tổng Shots, SoT, Cards từ tfoot bảng Player Stats ---
def get_totals_from_player_stats_tfoot(soup, specific_table_id, match_url):
    totals = {'shots': 0, 'shots_on_target': 0, 'yellow_cards': 0, 'red_cards': 0}
    if not specific_table_id: return totals
    try:
        table = soup.find('table', id=specific_table_id)
        if not table: logging.warning(f"[{match_url}] Không tìm thấy bảng player stats ID: {specific_table_id}"); return totals
        thead = table.find('thead'); tfoot = table.find('tfoot')
        if thead and tfoot:
            header_row = thead.find_all('tr')[-1]; headers_th = header_row.find_all('th')
            headers = [th.get('data-stat') for th in headers_th]
            total_row = tfoot.find('tr')
            if total_row:
                cells = total_row.find_all(['th', 'td'])
                stat_map = {'shots': 'shots', 'shots_on_target': 'shots_on_target', 'cards_yellow': 'yellow_cards', 'cards_red': 'red_cards'}
                for data_stat_name, target_key in stat_map.items():
                    try:
                        if data_stat_name in headers:
                            col_index = headers.index(data_stat_name)
                            if col_index < len(cells):
                                value_str = cells[col_index].text.strip()
                                totals[target_key] = int(value_str) if value_str.isdigit() else 0
                            else: logging.warning(f"[{match_url}] Index cột '{data_stat_name}' không hợp lệ tfoot {specific_table_id}."); totals[target_key] = 0
                        else: logging.warning(f"[{match_url}] Không tìm thấy header '{data_stat_name}' trong {specific_table_id}."); totals[target_key] = 0
                    except ValueError: logging.warning(f"[{match_url}] Giá trị không hợp lệ tfoot {specific_table_id}: '{cells[col_index].text.strip()}'. Gán 0."); totals[target_key] = 0
                    except Exception as e_inner: logging.warning(f"[{match_url}] Lỗi lấy tfoot {target_key} {specific_table_id}: {e_inner}"); totals[target_key] = 0
            else: logging.warning(f"[{match_url}] Không thấy hàng trong tfoot {specific_table_id}")
        else: logging.warning(f"[{match_url}] Không thấy thead/tfoot trong {specific_table_id}")
    except Exception as e: logging.error(f"[{match_url}] Lỗi nghiêm trọng xử lý tfoot {specific_table_id}: {e}", exc_info=False)
    return totals

In [6]:
# --- Hàm phụ: Lấy Saves từ bảng Goalkeeper Stats ---
def get_saves_from_keeper_stats(soup, specific_div_id, match_url):
    saves = 0
    if not specific_div_id: return saves
    try:
        target_div = soup.find('div', id=specific_div_id)
        if not target_div: logging.warning(f"[{match_url}] Không tìm thấy div keeper stats: {specific_div_id}"); return saves
        keeper_table = target_div.find('table', id=re.compile(r'^keeper_stats_'))
        if keeper_table:
            tbody = keeper_table.find('tbody')
            if tbody:
                keeper_rows = tbody.find_all('tr'); total_saves = 0
                for keeper_row in keeper_rows:
                    saves_cell = keeper_row.find('td', {'data-stat': 'gk_saves'})
                    if saves_cell:
                        try: saves_val = saves_cell.text.strip(); total_saves += int(saves_val) if saves_val.isdigit() else 0
                        except ValueError: logging.warning(f"[{match_url}] Giá trị saves không hợp lệ {specific_div_id}: '{saves_val}'.")
                saves = total_saves
    except Exception as e: logging.warning(f"[{match_url}] Lỗi lấy saves từ {specific_div_id}: {e}")
    return saves

In [7]:
# --- Hàm chính: Crawl dữ liệu chi tiết từ một trang Match Report ---
def scrape_match_data_selenium(driver, match_url, league_name, season_str, source_url):
    """Crawl dữ liệu chi tiết từ một link Match Report dùng Selenium."""
    match_data = {'match_report_url': match_url, 'league': league_name, 'season': season_str, 'source': source_url}
    if not driver: return None

    try:
        logging.info(f"Đang crawl: {match_url}")
        driver.get(match_url)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.scorebox")))
        time.sleep(1) # Ngắn hơn để tăng tốc
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        # -- Round (Week) --
        match_data['round'] = 0 # Mặc định 0
        try:
            header_info_div = soup.find('h1').find_next_sibling('div')
            if header_info_div:
                week_text = header_info_div.text
                week_match = re.search(r'\((?:Matchweek|Round)\s+(\d+)\)', week_text, re.IGNORECASE)
                if week_match: match_data['round'] = int(week_match.group(1))
        except Exception: logging.warning(f"[{match_url}] Không tìm thấy round.")

        # -- Scorebox Data (Teams, Scores, Date, Venue) --
        match_data['home'] = ""; match_data['away'] = ""; match_data['home_score'] = 0; match_data['away_score'] = 0
        match_data['date'] = ""; match_data['venue'] = ""
        try:
            scorebox = soup.find('div', class_='scorebox')
            if scorebox:
                teams_divs = scorebox.find_all('div', recursive=False)[:2]
                if len(teams_divs) == 2:
                    home_a = teams_divs[0].select_one('strong > a'); away_a = teams_divs[1].select_one('strong > a')
                    if home_a: match_data['home'] = home_a.text.strip()
                    if away_a: match_data['away'] = away_a.text.strip()
                    home_score_div = teams_divs[0].select_one('div.scores > div.score'); away_score_div = teams_divs[1].select_one('div.scores > div.score')
                    if home_score_div:
                        try: match_data['home_score'] = int(home_score_div.text.strip())
                        except ValueError: pass # Bỏ qua nếu score không phải số
                    if away_score_div:
                        try: match_data['away_score'] = int(away_score_div.text.strip())
                        except ValueError: pass
                scorebox_meta = scorebox.find('div', class_='scorebox_meta')
                if scorebox_meta:
                    date_tag = scorebox_meta.find('strong'); date_link = date_tag.find('a') if date_tag else None
                    if date_link : match_data['date'] = date_link.text.strip()
                    elif date_tag: match_data['date'] = date_tag.text.split(',')[0].strip()
                    venue_strong = scorebox_meta.find('strong', string='Venue')
                    if venue_strong:
                        venue_parent = venue_strong.find_parent('div')
                        if venue_parent:
                            venue_small = venue_parent.find('small', string=lambda t: t and t.strip() != 'Venue')
                            if venue_small: match_data['venue'] = venue_small.text.strip()
                            else:
                                next_sibling = venue_strong.next_sibling
                                if next_sibling and isinstance(next_sibling, str):
                                    venue_cleaned = next_sibling.strip(':').strip()
                                    if venue_cleaned: match_data['venue'] = venue_cleaned
        except Exception as e: logging.warning(f"[{match_url}] Lỗi scorebox: {e}", exc_info=False)

        # -- Lineup, Formation, Bench --
        match_data['home_lineup'], match_data['home_missing'] = get_lineup_formation_bench(soup, 'a', match_url)
        match_data['away_lineup'], match_data['away_missing'] = get_lineup_formation_bench(soup, 'b', match_url)

        # -- Possession & Pass Accuracy --
        home_stats = {}; away_stats = {}
        try:
            team_stats_div = soup.find('div', id='team_stats')
            if team_stats_div:
                for header_text, key in {'Possession': 'Possession', 'Passing Accuracy': 'Passing Accuracy'}.items():
                    header_th = team_stats_div.find('th', string=header_text)
                    if header_th:
                        data_row = header_th.find_parent('tr').find_next_sibling('tr')
                        if data_row:
                            values_td = data_row.find_all('td');
                            if len(values_td) == 2:
                                try:
                                    home_text_raw = values_td[0].get_text(strip=True); away_text_raw = values_td[1].get_text(strip=True)
                                    home_match = re.search(r'(\d{1,3}(?:\.\d+)?%)', home_text_raw); away_match = re.search(r'(\d{1,3}(?:\.\d+)?%)', away_text_raw)
                                    if home_match: home_stats[key] = home_match.group(1)
                                    if away_match: away_stats[key] = away_match.group(1)
                                except Exception: pass
        except Exception: pass # Bỏ qua nếu lỗi
        match_data['home_possession'] = home_stats.get('Possession', '')
        match_data['away_possession'] = away_stats.get('Possession', '')
        match_data['home_pass_completion'] = home_stats.get('Passing Accuracy', '')
        match_data['away_pass_completion'] = away_stats.get('Passing Accuracy', '')

        # -- Fouls & Corners --
        home_extra = {}; away_extra = {}
        try:
            team_stats_extra_div = soup.find('div', id='team_stats_extra')
            if team_stats_extra_div:
                stat_rows = team_stats_extra_div.find_all('div', recursive=False)
                for row_div in stat_rows:
                    inner_divs = row_div.find_all('div')
                    if len(inner_divs) == 3:
                        stat_name = inner_divs[1].text.strip()
                        if stat_name == "Fouls" or stat_name == "Corners":
                            try:
                                home_val_str = inner_divs[0].text.strip(); away_val_str = inner_divs[2].text.strip()
                                home_extra[stat_name] = int(home_val_str) if home_val_str.isdigit() else 0
                                away_extra[stat_name] = int(away_val_str) if away_val_str.isdigit() else 0
                            except Exception: pass
        except Exception: pass
        match_data['home_fouls'] = home_extra.get('Fouls', 0); match_data['away_fouls'] = away_extra.get('Fouls', 0)
        match_data['home_corners'] = home_extra.get('Corners', 0); match_data['away_corners'] = away_extra.get('Corners', 0)

        # -- Shots, SoT, Cards --
        home_table_id = None; away_table_id = None
        player_stat_tables = soup.find_all('table', id=re.compile(r'^stats_.*_summary'))
        if len(player_stat_tables) >= 1: home_table_id = player_stat_tables[0].get('id')
        if len(player_stat_tables) >= 2: away_table_id = player_stat_tables[1].get('id')
        home_totals = get_totals_from_player_stats_tfoot(soup, home_table_id, match_url)
        away_totals = get_totals_from_player_stats_tfoot(soup, away_table_id, match_url)
        match_data['home_shots'] = home_totals.get('shots', 0); match_data['away_shots'] = away_totals.get('shots', 0)
        match_data['home_shots_on_target'] = home_totals.get('shots_on_target', 0); match_data['away_shots_on_target'] = away_totals.get('shots_on_target', 0)
        match_data['home_yellow_cards'] = home_totals.get('yellow_cards', 0); match_data['away_yellow_cards'] = away_totals.get('yellow_cards', 0)
        match_data['home_red_cards'] = home_totals.get('red_cards', 0); match_data['away_red_cards'] = away_totals.get('red_cards', 0)

        # -- Saves --
        home_keeper_div_id = None; away_keeper_div_id = None
        keeper_stat_divs = soup.find_all('div', id=re.compile(r'^all_keeper_stats_'))
        if len(keeper_stat_divs) >= 1: home_keeper_div_id = keeper_stat_divs[0]['id']
        if len(keeper_stat_divs) >= 2: away_keeper_div_id = keeper_stat_divs[1]['id']
        match_data['home_saves'] = get_saves_from_keeper_stats(soup, home_keeper_div_id, match_url)
        match_data['away_saves'] = get_saves_from_keeper_stats(soup, away_keeper_div_id, match_url)

        # --- Final Data Dict ---
        default_numeric = 0; default_string = ""; default_list_str = "" # Thay default_list bằng default_list_str
        final_data_template = {
            'date': default_string, 'home': default_string, 'away': default_string, 'home_score': default_numeric, 'away_score': default_numeric,
            'league': default_string, 'season': default_string, 'source': default_string, 'match_report_url': default_string,
            'round': default_numeric, 'venue': default_string,
            'home_lineup': default_list_str, 'away_lineup': default_list_str, 'home_missing': default_list_str, 'away_missing': default_list_str, # Sử dụng default_list_str
            'home_possession': default_string, 'away_possession': default_string, 'home_shots': default_numeric, 'away_shots': default_numeric,
            'home_shots_on_target': default_numeric, 'away_shots_on_target': default_numeric, 'home_pass_completion': default_string, 'away_pass_completion': default_string,
            'home_red_cards': default_numeric, 'away_red_cards': default_numeric, 'home_yellow_cards': default_numeric, 'away_yellow_cards': default_numeric,
            'home_saves': default_numeric, 'away_saves': default_numeric, 'home_fouls': default_numeric, 'away_fouls': default_numeric,
            'home_corners': default_numeric, 'away_corners': default_numeric
        }
        final_data = {key: match_data.get(key, default_value) for key, default_value in final_data_template.items()}

        # Kiểm tra thiếu dữ liệu (log ít hơn)
        # for key, value in final_data.items():
        #     if value == final_data_template[key] and value is not 0 and value != "" and value != []:
        #         logging.debug(f"[{match_url}] Dữ liệu mặc định cho: {key}")

        return final_data

    except TimeoutException: logging.error(f"Timeout khi tải trang: {match_url}"); return None
    except WebDriverException as e: logging.error(f"Lỗi WebDriver: {match_url}: {e}"); return None
    except Exception as e: logging.error(f"Lỗi không xác định: {match_url}: {e}", exc_info=False); return None


In [8]:
# --- Main --- (Giữ nguyên)
if __name__ == "__main__":
    schedule_page_url = "https://fbref.com/en/comps/9/2014-2015/schedule/2014-2015-Premier-League-Scores-and-Fixtures"
    output_csv_file = "fbref_premier_league_2014_2015_v18.csv"
    max_retries = 1; retry_delay = 10

    driver = init_driver()

    if driver:
        league_name, season_str = get_league_season_info(driver, schedule_page_url)
        if not league_name or not season_str:
            logging.error("Không lấy được League/Season. Dừng.")
        else:
            match_links = get_match_report_links_selenium(driver, schedule_page_url)
            if not match_links: logging.error("Không lấy được link. Dừng.")
            else:
                all_matches_data = []
                processed_links_count = 0
                for i, link in enumerate(match_links):
                    attempt = 0; data = None
                    while attempt <= max_retries:
                        data = scrape_match_data_selenium(driver, link, league_name, season_str, schedule_page_url)
                        if data:
                            all_matches_data.append(data)
                            processed_links_count += 1
                            logging.info(f"Đã crawl thành công {processed_links_count}/{len(match_links)} links.")
                            break
                        else:
                            attempt += 1
                            logging.warning(f"[{link}] Lần thử {attempt}/{max_retries + 1} lỗi.")
                            if attempt <= max_retries: time.sleep(retry_delay)
                            else: logging.error(f"[{link}] Bỏ qua link sau {max_retries} lần thử lại.")
                    time.sleep(2) # Tạm dừng giữa các link chính

2025-04-18 13:56:47,761 - ERROR - Lỗi khởi tạo WebDriver: Message: Can not connect to the Service /home/minh/codeproject/epl_crawl/chrome-linux64/chrome



In [9]:
if all_matches_data:
    df = pd.DataFrame(all_matches_data)
    columns_order = [
        'date', 'home', 'away', 'home_score', 'away_score', 'league', 'season',
        'round', 'venue', 'home_lineup', 'away_lineup', 'home_missing', 'away_missing',
        'home_possession', 'away_possession', 'home_shots', 'away_shots',
        'home_shots_on_target', 'away_shots_on_target', 'home_pass_completion', 'away_pass_completion',
        'home_red_cards', 'away_red_cards', 'home_yellow_cards', 'away_yellow_cards',
        'home_saves', 'away_saves', 'home_fouls', 'away_fouls',
        'home_corners', 'away_corners', 'match_report_url', 'source'
    ]
    columns_to_write = [col for col in columns_order if col in df.columns]
    df = df[columns_to_write]
    try:
        df.to_csv(output_csv_file, index=False, encoding='utf-8-sig')
        logging.info(f"Đã lưu vào: {output_csv_file}")
    except Exception as e: logging.error(f"Lỗi ghi CSV: {e}")
else:
    logging.info("Không có dữ liệu để ghi.")
driver.quit()
logging.info("WebDriver đã đóng.")


NameError: name 'all_matches_data' is not defined