In [3]:
import os
import glob
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [4]:
def get_day_folders(base_path, prefix="2024"):
    """Return the sorted names of the day folders directly under ``base_path``.

    A day folder is any immediate subdirectory of ``base_path`` whose name
    starts with ``prefix``. Plain files and non-matching directories are
    ignored.

    Args:
        base_path: Directory whose immediate children are scanned.
        prefix: Leading string a folder name must have to be included.
            Defaults to ``"2024"`` (the value that used to be hard-coded);
            pass another year, or ``""`` to accept every subdirectory.

    Returns:
        list[str]: Matching folder names (not full paths) in ascending
        lexical order.

    Raises:
        OSError: If ``base_path`` cannot be listed (e.g. it does not exist).
    """
    # scandir yields entries that already know their type, so no extra
    # os.path.join + os.path.isdir call is needed per child.
    with os.scandir(base_path) as entries:
        return sorted(
            entry.name
            for entry in entries
            if entry.name.startswith(prefix) and entry.is_dir()
        )

def load_file(file):
    """Read a single parquet file without ever raising on failure.

    Args:
        file: Path of the parquet file, handed to ``pd.read_parquet``.

    Returns:
        The object returned by ``pd.read_parquet`` on success, or ``None``
        if reading raised an exception (the error is printed to stdout).
    """
    frame = None
    try:
        frame = pd.read_parquet(file)
    except Exception as exc:
        # Best-effort load: report the failure and let the caller skip this file.
        print(f"Error reading {file}: {exc}")
    return frame

def load_all_data(base_path, subfolder_name, max_workers=8):
    """Load every parquet file of one raw table into a single DataFrame.

    Files are looked up under
    ``<base_path>/<day folder>/data/raw/<subfolder_name>/*.parquet`` for each
    day folder returned by ``get_day_folders(base_path)`` and read in parallel
    with ``load_file``.

    The row order of the result is deterministic: files are processed in
    day-folder order, then sorted path order, regardless of which worker
    thread finishes first.

    Args:
        base_path: Root directory containing the day folders.
        subfolder_name: Name of the directory below ``data/raw`` to read.
        max_workers: Number of reader threads. Defaults to 8, the value that
            used to be hard-coded.

    Returns:
        The concatenation (with a fresh index) of all files that could be
        read, or ``None`` when no file was found or none could be read.
    """
    all_files = []
    for folder in get_day_folders(base_path):
        path_pattern = os.path.join(base_path, folder, 'data', 'raw', subfolder_name, '*.parquet')
        # glob returns matches in arbitrary (filesystem) order; sort them so
        # repeated runs see the files in the same sequence.
        all_files.extend(sorted(glob.glob(path_pattern)))

    # Use a thread pool to read files in parallel. executor.map submits every
    # task up front and yields results in input order, so the concatenated
    # frame does not depend on thread completion order.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # tqdm wraps the result iterator to display progress.
        results = tqdm(
            executor.map(load_file, all_files),
            total=len(all_files),
            desc="Loading files",
        )
        # load_file returns None for unreadable files; drop those.
        dfs = [frame for frame in results if frame is not None]

    return pd.concat(dfs, ignore_index=True) if dfs else None

In [None]:
# Root directory that holds the per-day data folders.
base_path = "../data/tennis_data"

# Read every match-level parquet file into one DataFrame.
print("Loading all tennis data...")
matches_df = load_all_data(base_path=base_path, subfolder_name='raw_match_parquet')
print("Loading complete.")


Loading all tennis data...


Loading files:  11%|█         | 33843/316802 [00:53<07:31, 627.29it/s] 


In [None]:
# Load the remaining raw tables: statistics, tennis power, votes, odds and
# point-by-point data. Each call scans the same day folders under base_path.
print("Loading all tennis data...")
stats_df = load_all_data(base_path=base_path, subfolder_name='raw_statistics_parquet')
p_df = load_all_data(base_path=base_path, subfolder_name='raw_tennis_power_parquet')
v_df = load_all_data(base_path=base_path, subfolder_name='raw_votes_parquet')
o_df = load_all_data(base_path=base_path, subfolder_name='raw_odds_parquet')
pbp_df = load_all_data(base_path=base_path, subfolder_name='raw_point_by_point_parquet')
print("Loading complete.")


Loading all tennis data...


Loading files: 100%|██████████| 23291/23291 [00:48<00:00, 484.71it/s]
Loading files: 100%|██████████| 22724/22724 [00:38<00:00, 582.71it/s]
Loading files: 100%|██████████| 35658/35658 [00:56<00:00, 630.14it/s]
Loading files: 100%|██████████| 22065/22065 [00:38<00:00, 570.92it/s]
Loading files: 100%|██████████| 22272/22272 [00:46<00:00, 480.76it/s]


Loading complete.


In [16]:
# Inspect the column names of the match table (loaded from 'raw_match_parquet').
matches_df.columns

Index(['match_id', 'name', 'slug', 'gender', 'user_count', 'residence',
       'birthplace', 'height', 'weight', 'plays', 'turned_pro',
       'current_prize', 'total_prize', 'player_id', 'current_rank',
       'name_code', 'country', 'full_name', 'current_score', 'display_score',
       'period_1', 'period_2', 'period_3', 'period_4', 'period_5',
       'period_1_tie_break', 'period_2_tie_break', 'period_3_tie_break',
       'period_4_tie_break', 'period_5_tie_break', 'normal_time',
       'first_to_serve', 'home_team_seed', 'away_team_seed', 'custom_id',
       'winner_code', 'default_period_count', 'start_datetime', 'match_slug',
       'final_result_only', 'round_id', 'cup_round_type', 'season_id', 'year',
       'current_period_start_timestamp', 'tournament_id', 'tournament_name',
       'tournament_slug', 'tournament_unique_id', 'tournament_category_name',
       'tournament_category_slug', 'ground_type', 'tennis_points',
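       'has_event_player_statistics', 'crowd_sourcing_enabled',
       'has_performance_graph_feature', 'display_inverse_home_away_teams',
       'priority', 'competition_type', 'city', 'stadium', 'venue_id'],
      dtype='object')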

In [24]:
# Inspect the column names of the statistics table (loaded from 'raw_statistics_parquet').
stats_df.columns

Index(['match_id', 'period', 'statistic_category_name', 'statistic_name',
       'home_stat', 'away_stat', 'compare_code', 'statistic_type',
       'value_type', 'home_value', 'away_value', 'home_total', 'away_total'],
      dtype='object')

In [25]:
# Inspect the column names of the tennis-power table (loaded from 'raw_tennis_power_parquet').
p_df.columns

Index(['match_id', 'set_num', 'game_num', 'value', 'break_occurred'], dtype='object')

In [26]:
# Inspect the column names of the votes table (loaded from 'raw_votes_parquet').
v_df.columns

Index(['match_id', 'home_vote', 'away_vote'], dtype='object')

In [27]:
# Inspect the column names of the odds table (loaded from 'raw_odds_parquet').
o_df.columns

Index(['match_id', 'market_id', 'market_name', 'is_live', 'suspended',
       'initial_fractional_value', 'fractional_value', 'choice_name',
       'choice_source_id', 'winnig', 'change'],
      dtype='object')

In [28]:
# Inspect the column names of the point-by-point table (loaded from 'raw_point_by_point_parquet').
pbp_df.columns

Index(['match_id', 'set_id', 'game_id', 'point_id', 'home_point', 'away_point',
       'point_description', 'home_point_type', 'away_point_type', 'home_score',
       'away_score', 'serving', 'scoring'],
      dtype='object')

In [5]:
# Preview the first rows of the match table.
matches_df.head()

Unnamed: 0,match_id,name,slug,gender,user_count,residence,birthplace,height,weight,plays,...,tennis_points,has_event_player_statistics,crowd_sourcing_enabled,has_performance_graph_feature,display_inverse_home_away_teams,priority,competition_type,city,stadium,venue_id
0,11998780,Kozlov S.,kozlov-stefan,M,752.0,"Pembroke Pines, FL, USA","Skopje, Macedonia",1.83,79.0,right-handed,...,,,,,,,,,,
1,11998672,Sekulic P.,sekulic-philip,M,514.0,,Subiaco,1.91,,right-handed,...,,,,,,,,,,
2,11998774,Maloney P.,maloney-patrick,M,101.0,,Oyster Bay,1.93,,,...,,,,,,,,,,
3,12017514,Baptiste H.,baptiste-hailey,F,1106.0,,,,,right-handed,...,,,,,,,,,,
4,12002055,Tauson C.,tauson-clara,F,6796.0,,"Copenhagen, Denmark",1.82,,right-handed,...,,,,,,,,,,
