In [2]:
# Thêm thư mục gốc dự án vào sys.path
import sys
import os
from pathlib import Path

# Xác định thư mục gốc dự án
notebook_dir = Path(os.getcwd())
project_root = notebook_dir.parent  # Đi lên 1 cấp từ scripts
sys.path.append(str(project_root))

print(f"Added {project_root} to Python path")

# Import các module
from data.collector import F1Collector
import config

# Tạo instance của F1Collector
collector = F1Collector(cache_enabled=True)

# Test thu thập dữ liệu cho một Grand Prix cụ thể
year = 2021
gp_name = "Monaco Grand Prix"
print(f"Collecting data for {gp_name} {year}...")
result = collector.collect_event_data(year, gp_name)
print(f"Result: {result}")

# Kiểm tra kết quả
print("\nFiles created:")
for root, dirs, files in os.walk(config.RAW_DATA_DIR):
    for file in files:
        print(os.path.join(root, file))


Added d:\VSCODE\UEH_BigData_Final\UEH_BigData_Final to Python path


2025-04-27 19:06:41,795 - data.collector - INFO - FastF1 cache enabled at D:\VSCODE\UEH_BigData_Final\UEH_BigData_Final\data\cache
2025-04-27 19:06:41,796 - data.collector - INFO - F1Collector initialized successfully
2025-04-27 19:06:41,797 - data.collector - INFO - Loading R session for Monaco Grand Prix 2021


Collecting data for Monaco Grand Prix 2021...


core           INFO 	Loading data for Monaco Grand Prix - Race [v3.5.3]
2025-04-27 19:06:42,404 - fastf1.fastf1.core - INFO - Loading data for Monaco Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
2025-04-27 19:06:42,405 - fastf1.fastf1.req - INFO - Using cached data for session_info
req            INFO 	Using cached data for driver_info
2025-04-27 19:06:42,408 - fastf1.fastf1.req - INFO - Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
2025-04-27 19:06:43,536 - fastf1.fastf1.req - INFO - Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
2025-04-27 19:06:43,538 - fastf1.fastf1.req - INFO - Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
2025-04-27 19:06:43,540 - fastf1.fastf1.req - INFO - Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
2025-04-27 19:06:43,55

Result: {'r_laps': WindowsPath('D:/VSCODE/UEH_BigData_Final/UEH_BigData_Final/data/raw/2021/monaco_grand_prix_r_laps.parquet'), 'r_drivers': WindowsPath('D:/VSCODE/UEH_BigData_Final/UEH_BigData_Final/data/raw/2021/monaco_grand_prix_r_drivers.parquet'), 'r_results': WindowsPath('D:/VSCODE/UEH_BigData_Final/UEH_BigData_Final/data/raw/2021/monaco_grand_prix_r_results.parquet'), 'r_weather': WindowsPath('D:/VSCODE/UEH_BigData_Final/UEH_BigData_Final/data/raw/2021/monaco_grand_prix_r_weather.parquet'), 'q_laps': WindowsPath('D:/VSCODE/UEH_BigData_Final/UEH_BigData_Final/data/raw/2021/monaco_grand_prix_q_laps.parquet'), 'q_drivers': WindowsPath('D:/VSCODE/UEH_BigData_Final/UEH_BigData_Final/data/raw/2021/monaco_grand_prix_q_drivers.parquet'), 'q_results': WindowsPath('D:/VSCODE/UEH_BigData_Final/UEH_BigData_Final/data/raw/2021/monaco_grand_prix_q_results.parquet'), 'q_weather': WindowsPath('D:/VSCODE/UEH_BigData_Final/UEH_BigData_Final/data/raw/2021/monaco_grand_prix_q_weather.parquet')}

Fi

In [1]:
# Cell tiền xử lý dữ liệu

# Import thư viện cần thiết
import sys
import os
from pathlib import Path
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Đảm bảo đã thêm project root vào sys.path
if 'project_root' not in locals():
    notebook_dir = Path(os.getcwd())
    project_root = notebook_dir.parent
    sys.path.append(str(project_root))
    print(f"Added {project_root} to Python path")

# Import module processor
try:
    from data.processor import F1Processor
except ImportError:
    # Nếu không tìm thấy module, thử import trực tiếp
    from data.processor import F1Processor

# Thêm cấu hình này khi khởi tạo Spark session
spark = SparkSession.builder \
    .appName("F1DataProcessor") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.driver.memory", "4g") \
    .config("spark.sql.legacy.parquet.nanosAsLong", "true") \
    .getOrCreate()



print("Spark session initialized")

# Khởi tạo processor
processor = F1Processor(use_spark=True)

# Xử lý dữ liệu cho năm 2021 (năm mà bạn đã crawl)
year = 2021
print(f"Processing data for {year}...")

# Xử lý dữ liệu race
output_path = processor.process_race_data(year)

print(f"Data processing completed. Output saved to: {output_path}")

# Đọc dữ liệu đã xử lý để kiểm tra
if output_path:
    processed_df = spark.read.parquet(str(output_path))
    
    # Hiển thị schema
    print("\nProcessed data schema:")
    processed_df.printSchema()
    
    # Hiển thị số lượng bản ghi
    print(f"\nTotal records: {processed_df.count()}")
    
    # Hiển thị một số bản ghi mẫu
    print("\nSample data:")
    processed_df.select([
        "driver", "team", "lapnumber", "position", 
        "laptime", "sector1time", "sector2time", "sector3time",
        "compound", "tyrelife", "tyre_degradation",
        "delta_previous", "delta_optimal",
        "pit_stop_count", "gap_ahead", "gap_behind"
    ]).show(5)

# Đóng Spark session khi hoàn thành
spark.stop()
print("Spark session stopped")


Added d:\VSCODE\UEH_BigData_Final\UEH_BigData_Final to Python path


KeyboardInterrupt: 

Crawl multi years

In [2]:
# Thêm thư mục gốc dự án vào sys.path
import sys
import os
from pathlib import Path

# Xác định thư mục gốc dự án
notebook_dir = Path(os.getcwd())
project_root = notebook_dir.parent  # Đi lên 1 cấp từ scripts
sys.path.append(str(project_root))

print(f"Added {project_root} to Python path")

# Import các module
from data.collector import F1Collector
import config

# Tạo instance của F1Collector
collector = F1Collector(cache_enabled=True)

# Danh sách các năm cần thu thập dữ liệu
years = range(2019, 2026)  # 2019 đến 2025

# Thu thập dữ liệu cho từng năm
for year in years:
    print(f"\n{'='*50}")
    print(f"Collecting data for {year} season...")
    print(f"{'='*50}")
    
    try:
        # Thu thập dữ liệu cho toàn bộ mùa giải
        results = collector.collect_season_data(year)
        
        # Hiển thị tổng số Grand Prix đã thu thập
        num_gps = len(results)
        print(f"\nCollected data for {num_gps} Grand Prix in {year}")
        
        # Hiển thị tên các Grand Prix đã thu thập
        print(f"Grand Prix collected: {', '.join(results.keys())}")
    except Exception as e:
        print(f"Error collecting data for {year}: {str(e)}")
        continue

print("\nData collection completed!")

# Kiểm tra kết quả
print("\nFiles created:")
for year in years:
    year_dir = config.RAW_DATA_DIR / str(year)
    if year_dir.exists():
        files = list(year_dir.glob("*.parquet"))
        print(f"\nYear {year}: {len(files)} files")


Added d:\VSCODE\UEH_BigData_Final\UEH_BigData_Final to Python path


2025-04-28 00:22:25,561 - data.collector - INFO - FastF1 cache enabled at D:\VSCODE\UEH_BigData_Final\UEH_BigData_Final\data\cache
2025-04-28 00:22:25,562 - data.collector - INFO - F1Collector initialized successfully
2025-04-28 00:22:25,563 - data.collector - INFO - Starting raw data collection for 2019 season
2025-04-28 00:22:25,563 - data.collector - INFO - Fetching event schedule for 2019



Collecting data for 2019 season...


2025-04-28 00:22:26,044 - data.collector - INFO - Found 21 official events for 2019
2025-04-28 00:22:26,045 - data.collector - INFO - Collecting raw data for Australian Grand Prix 2019
2025-04-28 00:22:26,046 - data.collector - INFO - Loading R session for Australian Grand Prix 2019
core           INFO 	Loading data for Australian Grand Prix - Race [v3.5.3]
2025-04-28 00:22:26,064 - fastf1.fastf1.core - INFO - Loading data for Australian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
2025-04-28 00:22:26,073 - fastf1.fastf1.req - INFO - Using cached data for session_info
req            INFO 	Using cached data for driver_info
2025-04-28 00:22:26,082 - fastf1.fastf1.req - INFO - Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
2025-04-28 00:22:28,286 - fastf1.fastf1.req - INFO - Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
2025-04-28 00:22:28,296 - fastf1.f


Collected data for 21 Grand Prix in 2019
Grand Prix collected: Australian Grand Prix, Bahrain Grand Prix, Chinese Grand Prix, Azerbaijan Grand Prix, Spanish Grand Prix, Monaco Grand Prix, Canadian Grand Prix, French Grand Prix, Austrian Grand Prix, British Grand Prix, German Grand Prix, Hungarian Grand Prix, Belgian Grand Prix, Italian Grand Prix, Singapore Grand Prix, Russian Grand Prix, Japanese Grand Prix, Mexican Grand Prix, United States Grand Prix, Brazilian Grand Prix, Abu Dhabi Grand Prix

Collecting data for 2020 season...


2025-04-28 00:25:17,077 - data.collector - INFO - Found 17 official events for 2020
2025-04-28 00:25:17,078 - data.collector - INFO - Collecting raw data for Austrian Grand Prix 2020
2025-04-28 00:25:17,078 - data.collector - INFO - Loading R session for Austrian Grand Prix 2020
core           INFO 	Loading data for Austrian Grand Prix - Race [v3.5.3]
2025-04-28 00:25:17,097 - fastf1.fastf1.core - INFO - Loading data for Austrian Grand Prix - Race [v3.5.3]
req            INFO 	No cached data found for session_info. Loading data...
2025-04-28 00:25:17,099 - fastf1.fastf1.req - INFO - No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
2025-04-28 00:25:17,100 - fastf1.api - INFO - Fetching session info data...
req            INFO 	Data has been written to cache!
2025-04-28 00:25:17,896 - fastf1.fastf1.req - INFO - Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
2025-04-28 00:2


Collected data for 17 Grand Prix in 2020
Grand Prix collected: Austrian Grand Prix, Styrian Grand Prix, Hungarian Grand Prix, British Grand Prix, 70th Anniversary Grand Prix, Spanish Grand Prix, Belgian Grand Prix, Italian Grand Prix, Tuscan Grand Prix, Russian Grand Prix, Eifel Grand Prix, Portuguese Grand Prix, Emilia Romagna Grand Prix, Turkish Grand Prix, Bahrain Grand Prix, Sakhir Grand Prix, Abu Dhabi Grand Prix

Collecting data for 2021 season...


2025-04-28 00:41:22,438 - data.collector - INFO - Found 19 official events for 2021
2025-04-28 00:41:22,440 - data.collector - INFO - Collecting raw data for Bahrain Grand Prix 2021
2025-04-28 00:41:22,440 - data.collector - INFO - Loading R session for Bahrain Grand Prix 2021
core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.5.3]
2025-04-28 00:41:22,458 - fastf1.fastf1.core - INFO - Loading data for Bahrain Grand Prix - Race [v3.5.3]
req            INFO 	No cached data found for session_info. Loading data...
2025-04-28 00:41:22,460 - fastf1.fastf1.req - INFO - No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
2025-04-28 00:41:22,461 - fastf1.api - INFO - Fetching session info data...
req            INFO 	Data has been written to cache!
2025-04-28 00:41:23,112 - fastf1.fastf1.req - INFO - Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
2025-04-28 00:41:23


Collected data for 19 Grand Prix in 2021
Grand Prix collected: Bahrain Grand Prix, Emilia Romagna Grand Prix, Portuguese Grand Prix, Spanish Grand Prix, Monaco Grand Prix, Azerbaijan Grand Prix, French Grand Prix, Styrian Grand Prix, Austrian Grand Prix, Hungarian Grand Prix, Belgian Grand Prix, Dutch Grand Prix, Russian Grand Prix, Turkish Grand Prix, United States Grand Prix, Mexico City Grand Prix, Qatar Grand Prix, Saudi Arabian Grand Prix, Abu Dhabi Grand Prix

Collecting data for 2022 season...


2025-04-28 00:50:13,612 - fastf1.fastf1.events - DEBUG - Traceback for failure in FastF1 schedule
Traceback (most recent call last):
  File "c:\Users\vanhu\AppData\Local\Programs\Python\Python312\Lib\site-packages\fastf1\logger.py", line 151, in __wrapped
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vanhu\AppData\Local\Programs\Python\Python312\Lib\site-packages\fastf1\events.py", line 584, in _get_schedule_ff1
    response = Cache.requests_get(
               ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vanhu\AppData\Local\Programs\Python\Python312\Lib\site-packages\fastf1\req.py", line 303, in requests_get
    return cls._cached_request('GET', url, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vanhu\AppData\Local\Programs\Python\Python312\Lib\site-packages\fastf1\req.py", line 347, in _cached_request
    response = func(url, **kwargs)
               ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vanhu\AppData\Local\Programs\Python

Error collecting data for 2022: Failed to load any schedule data.

Collecting data for 2023 season...


2025-04-28 00:50:25,383 - fastf1.fastf1.events - DEBUG - Traceback for failure in FastF1 schedule
Traceback (most recent call last):
  File "c:\Users\vanhu\AppData\Local\Programs\Python\Python312\Lib\site-packages\fastf1\logger.py", line 151, in __wrapped
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vanhu\AppData\Local\Programs\Python\Python312\Lib\site-packages\fastf1\events.py", line 584, in _get_schedule_ff1
    response = Cache.requests_get(
               ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vanhu\AppData\Local\Programs\Python\Python312\Lib\site-packages\fastf1\req.py", line 303, in requests_get
    return cls._cached_request('GET', url, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vanhu\AppData\Local\Programs\Python\Python312\Lib\site-packages\fastf1\req.py", line 347, in _cached_request
    response = func(url, **kwargs)
               ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vanhu\AppData\Local\Programs\Python

Error collecting data for 2023: Failed to load any schedule data.

Collecting data for 2024 season...


2025-04-28 00:50:37,179 - fastf1.fastf1.events - DEBUG - Traceback for failure in FastF1 schedule
Traceback (most recent call last):
  File "c:\Users\vanhu\AppData\Local\Programs\Python\Python312\Lib\site-packages\fastf1\logger.py", line 151, in __wrapped
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vanhu\AppData\Local\Programs\Python\Python312\Lib\site-packages\fastf1\events.py", line 584, in _get_schedule_ff1
    response = Cache.requests_get(
               ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vanhu\AppData\Local\Programs\Python\Python312\Lib\site-packages\fastf1\req.py", line 303, in requests_get
    return cls._cached_request('GET', url, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vanhu\AppData\Local\Programs\Python\Python312\Lib\site-packages\fastf1\req.py", line 347, in _cached_request
    response = func(url, **kwargs)
               ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vanhu\AppData\Local\Programs\Python

Error collecting data for 2024: Failed to load any schedule data.

Collecting data for 2025 season...


2025-04-28 00:50:48,956 - fastf1.fastf1.events - DEBUG - Traceback for failure in FastF1 schedule
Traceback (most recent call last):
  File "c:\Users\vanhu\AppData\Local\Programs\Python\Python312\Lib\site-packages\fastf1\logger.py", line 151, in __wrapped
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vanhu\AppData\Local\Programs\Python\Python312\Lib\site-packages\fastf1\events.py", line 584, in _get_schedule_ff1
    response = Cache.requests_get(
               ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vanhu\AppData\Local\Programs\Python\Python312\Lib\site-packages\fastf1\req.py", line 303, in requests_get
    return cls._cached_request('GET', url, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vanhu\AppData\Local\Programs\Python\Python312\Lib\site-packages\fastf1\req.py", line 347, in _cached_request
    response = func(url, **kwargs)
               ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vanhu\AppData\Local\Programs\Python

Error collecting data for 2025: Failed to load any schedule data.

Data collection completed!

Files created:

Year 2019: 168 files

Year 2020: 136 files

Year 2021: 12 files
