In [2]:
# Add the project root directory to sys.path
import sys
import os
from pathlib import Path

# Locate the project root: one level above the notebook's working directory
# (the original note says the notebook lives in "scripts" — this assumes the
# kernel's working directory is that folder; TODO confirm).
notebook_dir = Path(os.getcwd())
project_root = notebook_dir.parent

# Register the path only once, so re-running this cell in the same kernel does
# not keep appending duplicate entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
    print(f"Added {project_root} to Python path")

# Import the project modules (placed after the sys.path update above).
from data.collector import F1Collector
import config

# Create the F1Collector instance
collector = F1Collector(cache_enabled=True)

# Try collecting data for one specific Grand Prix
year = 2021
gp_name = "Monaco Grand Prix"
print(f"Collecting data for {gp_name} {year}...")
result = collector.collect_event_data(year, gp_name)
print(f"Result: {result}")

# Check the outcome. NOTE: this walks the whole raw-data directory, so it lists
# every file present there, including files left over from earlier runs.
print("\nFiles created:")
for root, dirs, files in os.walk(config.RAW_DATA_DIR):
    for file in files:
        print(os.path.join(root, file))


Added d:\VSCODE\UEH_BigData_Final\UEH_BigData_Final to Python path


2025-04-27 19:06:41,795 - data.collector - INFO - FastF1 cache enabled at D:\VSCODE\UEH_BigData_Final\UEH_BigData_Final\data\cache
2025-04-27 19:06:41,796 - data.collector - INFO - F1Collector initialized successfully
2025-04-27 19:06:41,797 - data.collector - INFO - Loading R session for Monaco Grand Prix 2021


Collecting data for Monaco Grand Prix 2021...


core           INFO 	Loading data for Monaco Grand Prix - Race [v3.5.3]
2025-04-27 19:06:42,404 - fastf1.fastf1.core - INFO - Loading data for Monaco Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
2025-04-27 19:06:42,405 - fastf1.fastf1.req - INFO - Using cached data for session_info
req            INFO 	Using cached data for driver_info
2025-04-27 19:06:42,408 - fastf1.fastf1.req - INFO - Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
2025-04-27 19:06:43,536 - fastf1.fastf1.req - INFO - Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
2025-04-27 19:06:43,538 - fastf1.fastf1.req - INFO - Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
2025-04-27 19:06:43,540 - fastf1.fastf1.req - INFO - Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
2025-04-27 19:06:43,55

Result: {'r_laps': WindowsPath('D:/VSCODE/UEH_BigData_Final/UEH_BigData_Final/data/raw/2021/monaco_grand_prix_r_laps.parquet'), 'r_drivers': WindowsPath('D:/VSCODE/UEH_BigData_Final/UEH_BigData_Final/data/raw/2021/monaco_grand_prix_r_drivers.parquet'), 'r_results': WindowsPath('D:/VSCODE/UEH_BigData_Final/UEH_BigData_Final/data/raw/2021/monaco_grand_prix_r_results.parquet'), 'r_weather': WindowsPath('D:/VSCODE/UEH_BigData_Final/UEH_BigData_Final/data/raw/2021/monaco_grand_prix_r_weather.parquet'), 'q_laps': WindowsPath('D:/VSCODE/UEH_BigData_Final/UEH_BigData_Final/data/raw/2021/monaco_grand_prix_q_laps.parquet'), 'q_drivers': WindowsPath('D:/VSCODE/UEH_BigData_Final/UEH_BigData_Final/data/raw/2021/monaco_grand_prix_q_drivers.parquet'), 'q_results': WindowsPath('D:/VSCODE/UEH_BigData_Final/UEH_BigData_Final/data/raw/2021/monaco_grand_prix_q_results.parquet'), 'q_weather': WindowsPath('D:/VSCODE/UEH_BigData_Final/UEH_BigData_Final/data/raw/2021/monaco_grand_prix_q_weather.parquet')}

Fi

In [3]:
# Data preprocessing cell

# Required libraries
# (pandas and pyspark.sql.functions are not used in this cell; the imports are
# kept in case other cells rely on the names `pd` and `F`.)
import sys
import os
from pathlib import Path
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Make sure the project root is known and registered on sys.path, even when
# this cell is run on its own in a fresh kernel.
if 'project_root' not in locals():
    notebook_dir = Path(os.getcwd())
    project_root = notebook_dir.parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
    print(f"Added {project_root} to Python path")

# Import the processor module. The previous try/except retried the exact same
# import in its handler, so it could never recover; a plain import fails with
# the same ImportError and a cleaner traceback.
from data.processor import F1Processor

# Spark session configuration:
# - Arrow is enabled for pandas <-> Spark conversion.
# - nanosAsLong: presumably needed so nanosecond-precision parquet columns are
#   read as long (the logged raw schema shows the time columns as long) —
#   TODO confirm against the Spark version in use.
spark = (
    SparkSession.builder
    .appName("F1DataProcessor")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.legacy.parquet.nanosAsLong", "true")
    .getOrCreate()
)

print("Spark session initialized")

# Everything that uses the session runs inside try/finally so the session is
# always stopped, even if processing or inspection raises.
try:
    # Initialise the processor
    processor = F1Processor(use_spark=True)

    # Process the data for 2021 (the season that was collected earlier)
    year = 2021
    print(f"Processing data for {year}...")

    # Process the race data
    output_path = processor.process_race_data(year)

    if output_path:
        print(f"Data processing completed. Output saved to: {output_path}")

        # Read the processed data back to verify it
        processed_df = spark.read.parquet(str(output_path))

        # Show the schema
        print("\nProcessed data schema:")
        processed_df.printSchema()

        # Show the number of records
        print(f"\nTotal records: {processed_df.count()}")

        # Show a few sample records
        print("\nSample data:")
        processed_df.select([
            "driver", "team", "lapnumber", "position",
            "laptime", "sector1time", "sector2time", "sector3time",
            "compound", "tyrelife", "tyre_degradation",
            "delta_previous", "delta_optimal",
            "pit_stop_count", "gap_ahead", "gap_behind"
        ]).show(5)
    else:
        # Do not claim success when the processor returned nothing to read.
        print(f"Data processing produced no output for {year}")
finally:
    # Stop the Spark session when done (or on failure)
    spark.stop()
    print("Spark session stopped")


2025-04-27 20:33:22,627 - data.processor - INFO - Spark session initialized successfully
2025-04-27 20:33:22,629 - data.processor - INFO - F1Processor initialized with Spark: True
2025-04-27 20:33:22,630 - data.processor - INFO - Processing race data for 2021
2025-04-27 20:33:22,632 - data.processor - INFO - Found 1 r_laps files for 2021


Spark session initialized
Processing data for 2021...


2025-04-27 20:33:23,104 - data.processor - INFO - Read 1420 rows from 1 files
2025-04-27 20:33:23,107 - data.processor - INFO - Original DataFrame columns:
2025-04-27 20:33:23,110 - data.processor - INFO - ['Time', 'Driver', 'DriverNumber', 'LapTime', 'LapNumber', 'Stint', 'PitOutTime', 'PitInTime', 'Sector1Time', 'Sector2Time', 'Sector3Time', 'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime', 'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'IsPersonalBest', 'Compound', 'TyreLife', 'FreshTyre', 'Team', 'LapStartTime', 'LapStartDate', 'TrackStatus', 'Position', 'Deleted', 'DeletedReason', 'FastF1Generated', 'IsAccurate']
2025-04-27 20:33:23,318 - data.processor - INFO - Renamed DataFrame columns:


root
 |-- Time: long (nullable = true)
 |-- Driver: string (nullable = true)
 |-- DriverNumber: string (nullable = true)
 |-- LapTime: long (nullable = true)
 |-- LapNumber: double (nullable = true)
 |-- Stint: double (nullable = true)
 |-- PitOutTime: long (nullable = true)
 |-- PitInTime: long (nullable = true)
 |-- Sector1Time: long (nullable = true)
 |-- Sector2Time: long (nullable = true)
 |-- Sector3Time: long (nullable = true)
 |-- Sector1SessionTime: long (nullable = true)
 |-- Sector2SessionTime: long (nullable = true)
 |-- Sector3SessionTime: long (nullable = true)
 |-- SpeedI1: double (nullable = true)
 |-- SpeedI2: double (nullable = true)
 |-- SpeedFL: double (nullable = true)
 |-- SpeedST: double (nullable = true)
 |-- IsPersonalBest: boolean (nullable = true)
 |-- Compound: string (nullable = true)
 |-- TyreLife: double (nullable = true)
 |-- FreshTyre: boolean (nullable = true)
 |-- Team: string (nullable = true)
 |-- LapStartTime: long (nullable = true)
 |-- LapStartDa

2025-04-27 20:33:23,320 - data.processor - INFO - ['time', 'driver', 'drivernumber', 'laptime', 'lapnumber', 'stint', 'pitouttime', 'pitintime', 'sector1time', 'sector2time', 'sector3time', 'sector1sessiontime', 'sector2sessiontime', 'sector3sessiontime', 'speedi1', 'speedi2', 'speedfl', 'speedst', 'ispersonalbest', 'compound', 'tyrelife', 'freshtyre', 'team', 'lapstarttime', 'lapstartdate', 'trackstatus', 'position', 'deleted', 'deletedreason', 'fastf1generated', 'isaccurate']
2025-04-27 20:33:24,769 - data.processor - INFO - Processed race data saved to D:\VSCODE\UEH_BigData_Final\UEH_BigData_Final\data\processed\2021\race_features.parquet


Data processing completed. Output saved to: D:\VSCODE\UEH_BigData_Final\UEH_BigData_Final\data\processed\2021\race_features.parquet

Processed data schema:
root
 |-- driver: string (nullable = true)
 |-- drivernumber: string (nullable = true)
 |-- team: string (nullable = true)
 |-- lapnumber: double (nullable = true)
 |-- position: double (nullable = true)
 |-- laptime: long (nullable = true)
 |-- sector1time: long (nullable = true)
 |-- sector2time: long (nullable = true)
 |-- sector3time: long (nullable = true)
 |-- pitintime: long (nullable = true)
 |-- pitouttime: long (nullable = true)
 |-- compound: string (nullable = true)
 |-- tyrelife: double (nullable = true)
 |-- stint: double (nullable = true)
 |-- freshtyre: boolean (nullable = true)
 |-- speedi1: double (nullable = true)
 |-- speedi2: double (nullable = true)
 |-- speedfl: double (nullable = true)
 |-- speedst: double (nullable = true)
 |-- trackstatus: string (nullable = true)
 |-- ispersonalbest: boolean (nullable = tr