In [None]:
import asyncio
import aiohttp
import pandas as pd
import json
import os
from pathlib import Path
import logging
from typing import Set, Tuple
from concurrent.futures import ThreadPoolExecutor
import time
# Configure root logging: INFO and above go to both a log file and the console.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
_LOG_HANDLERS = [
    logging.FileHandler('wowy_scraper.log'),  # persistent record of the run
    logging.StreamHandler(),                  # live output on the console
]
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, handlers=_LOG_HANDLERS)

class RateLimiter:
    """Token-bucket rate limiter for asyncio code.

    The bucket starts full with ``rate_limit`` tokens and refills continuously
    at ``rate_limit / time_period`` tokens per second, never exceeding
    ``rate_limit``. Each successful ``acquire()`` consumes one token.
    """

    def __init__(self, rate_limit, time_period):
        """Create a limiter allowing ``rate_limit`` acquisitions per ``time_period`` seconds.

        Raises:
            ValueError: if ``rate_limit`` or ``time_period`` is not positive.
                A non-positive value would otherwise surface later as a
                ZeroDivisionError or a bucket that never refills.
        """
        if rate_limit <= 0 or time_period <= 0:
            raise ValueError("rate_limit and time_period must be positive")
        self.rate_limit = rate_limit
        self.time_period = time_period
        self.tokens = rate_limit
        self.updated_at = time.monotonic()

    async def acquire(self):
        """Wait until a token is available, then consume it."""
        refill_per_second = self.rate_limit / self.time_period
        # Time needed for exactly one token to accumulate.
        wait_for_one_token = self.time_period / self.rate_limit
        while True:
            now = time.monotonic()
            elapsed = now - self.updated_at
            # Credit the tokens earned since the last check, capped at the bucket size.
            self.tokens = min(self.rate_limit, self.tokens + elapsed * refill_per_second)
            self.updated_at = now

            if self.tokens >= 1:
                self.tokens -= 1
                return
            await asyncio.sleep(wait_for_one_token)

async def fetch_wowy_data(session, url, params):
    """GET ``url`` with ``params`` and return the decoded JSON body.

    Args:
        session: client session whose ``get()`` returns an async context
            manager yielding the response.
        url: endpoint to request.
        params: query-string parameters.

    Returns:
        The parsed JSON payload, or None if the request failed, the server
        answered with a status >= 400, or the body was not valid JSON. Every
        failure is logged; no exception is propagated to the caller.
    """
    try:
        async with session.get(url, params=params) as response:
            if response.status >= 400:
                # Read the body while the response is still open: once the
                # context manager exits the connection is released and the
                # body can no longer be fetched for the log message.
                body = await response.text()
                logging.error(f"HTTP error {response.status} for URL {response.url}. Response text: {body}")
                return None
            try:
                return await response.json()
            except json.JSONDecodeError as e:
                content = await response.text()
                logging.error(f"JSONDecodeError: {e}. Content: {content}")
                return None
    except aiohttp.ClientResponseError as e:
        # Raised e.g. when the payload has an unexpected content type; the
        # response may already be closed here, so log only what the exception carries.
        logging.error(f"HTTP error {e.status} for URL {e.request_info.url}. Response text: {e.message}")
        return None
    except Exception as e:
        logging.error(f"General error fetching data: {e}")
        return None

async def wowy_shift(session, team_id, player1_id, seasons, ps=False):
    """Fetch on-court and off-court WOWY rows for one player on one team.

    Args:
        session: client session passed through to ``fetch_wowy_data``.
        team_id: value sent as the ``TeamId`` query parameter.
        player1_id: player id sent as ``0Exactly1OnFloor`` (on-court query)
            and ``0Exactly0OnFloor`` (off-court query).
        seasons: iterable of season strings; joined with commas for ``Season``.
        ps: falsy selects 'Regular Season', truthy selects 'Playoffs', and the
            string 'all' selects 'All'.

    Returns:
        A DataFrame stacking the ``multi_row_table_data`` rows of both
        responses with a boolean ``on`` column (True for the on-court rows),
        or None if either request failed.
    """
    if ps == 'all':
        s_type = 'All'
    elif ps:
        s_type = 'Playoffs'
    else:
        s_type = 'Regular Season'

    wowy_url = "https://api.pbpstats.com/get-wowy-stats/nba"

    # Courtesy pause before hitting the API. It must be awaited rather than
    # time.sleep(): a blocking sleep inside a coroutine freezes the whole event
    # loop, stalling every other in-flight request for the duration.
    await asyncio.sleep(2)

    shared_params = {
        "TeamId": team_id,
        "Season": ",".join(seasons),
        "SeasonType": s_type,
        "Type": "Player",
    }
    params_on = {"0Exactly1OnFloor": player1_id, **shared_params}
    params_off = {"0Exactly0OnFloor": player1_id, **shared_params}

    # The two queries are independent, so issue them concurrently.
    wowy_on, wowy_off = await asyncio.gather(
        fetch_wowy_data(session, wowy_url, params_on),
        fetch_wowy_data(session, wowy_url, params_off)
    )

    if wowy_on is None or wowy_off is None:
        logging.warning(f"Skipping {player1_id} - {team_id} due to data fetch failure.")
        return None

    df_on = pd.DataFrame(wowy_on["multi_row_table_data"])
    df_on['on'] = True
    df_off = pd.DataFrame(wowy_off["multi_row_table_data"])
    df_off['on'] = False

    return pd.concat([df_on, df_off])

# Setup folders function (unchanged)
def setup_folders(base_year: int, end_year: int, ps=False) -> None:
    """Create one output directory per season from base_year through end_year inclusive.

    Directories are named ``data/<year>``, or ``data/<year>ps`` when ``ps`` is
    truthy. Missing parents are created and existing directories are left alone.
    """
    suffix = 'ps' if ps else ''
    season_dirs = (
        Path(f"data/{season}{suffix}")
        for season in range(base_year, end_year + 1)
    )
    for season_dir in season_dirs:
        season_dir.mkdir(parents=True, exist_ok=True)

def get_processed_combinations(year: int, ps=False) -> Set[Tuple[str, str]]:
    """Collect the (player id, team id) pairs already saved for a season.

    Scans every ``*.csv`` in ``data/<year>`` (``data/<year>ps`` when ``ps`` is
    truthy). The file stem is taken as the player id and each distinct value
    of the file's ``TeamId`` column as a team id, both as strings. Files that
    cannot be read are logged and skipped; a missing directory yields an
    empty set.
    """
    suffix = 'ps' if ps else ''
    season_dir = Path(f"data/{year}{suffix}")
    done = set()
    if not season_dir.exists():
        return done

    for csv_path in season_dir.glob("*.csv"):
        player_id = csv_path.stem
        try:
            team_column = pd.read_csv(csv_path, usecols=['TeamId'])['TeamId']
            done.update((player_id, str(team)) for team in team_column.unique())
        except Exception as e:
            logging.error(f"Error reading file {csv_path}: {e}")
    return done

async def process_player(session, nba_id, team_id, year, is_postseason, seasons, rate_limiter):
    """Fetch one player/team WOWY table and merge it into the player's CSV.

    Makes up to three attempts. Each attempt waits for a rate-limiter token,
    calls ``wowy_shift`` and, on success, writes the rows to
    ``data/<year>[ps]/<nba_id>.csv``, appending to and de-duplicating against
    any rows already in that file. Failed attempts are logged and followed by
    an exponential backoff (1s, then 2s).

    Args:
        session: client session passed through to ``wowy_shift``.
        nba_id: player id; must be convertible with ``int()``.
        team_id: team id forwarded to the API.
        year: season year used for the output folder name.
        is_postseason: selects the ``ps`` folder suffix and the season type.
        seasons: season strings forwarded to ``wowy_shift``.
        rate_limiter: object exposing an awaitable ``acquire()``.

    Returns:
        ``(player id, team id)`` as a tuple of strings on success, in the same
        form the processed-combination set uses for lookups, or None once all
        attempts have failed.
    """
    trail = 'ps' if is_postseason else ''
    player_key = str(int(nba_id))
    output_file = Path(f"data/{int(year)}{trail}/{player_key}.csv")

    max_retries = 3
    for attempt in range(max_retries):
        try:
            await rate_limiter.acquire()  # Wait for rate limit token
            logging.info(f"Processing {nba_id} - {team_id} for {year} (Attempt {attempt + 1})")
            result = await wowy_shift(
                session,
                team_id=team_id,
                player1_id=player_key,
                seasons=seasons,
                ps=is_postseason
            )

            if result is None:
                raise RuntimeError("wowy_shift failed")

            # Make sure the season folder exists so a missing directory does
            # not burn every retry (and API call) on the same write error.
            output_file.parent.mkdir(parents=True, exist_ok=True)

            if output_file.exists():
                existing_data = pd.read_csv(output_file)
                combined_data = pd.concat([existing_data, result], ignore_index=True)
                combined_data.drop_duplicates().to_csv(output_file, index=False)
            else:
                result.to_csv(output_file, index=False)

            # Both elements are strings so the tuple matches the keys used
            # when checking whether a combination was already processed.
            return player_key, str(team_id)
        except Exception as e:
            logging.error(f"Error processing {nba_id} - {team_id} for {year} (Attempt {attempt + 1}): {e}")
            if attempt < max_retries - 1:
                await asyncio.sleep(2 ** attempt)  # Exponential backoff

    return None

async def process_season_data(year: int, is_postseason: bool, index_df: pd.DataFrame, 
                              processed_combinations: Set[Tuple[str, str]], rate_limiter: RateLimiter) -> None:
    """Scrape every not-yet-processed player/team pair listed for ``year``.

    Args:
        year: season end year; the API season label is built as
            ``"<year-1>-<last two digits of year>"`` (2025 -> "2024-25").
        is_postseason: forwarded to ``process_player``.
        index_df: table with at least ``year``, ``nba_id`` and ``team_id``
            columns. It is not modified.
        processed_combinations: ``(player id, team id)`` string pairs to skip;
            pairs completed during this call are added to it.
        rate_limiter: shared limiter forwarded to ``process_player``.
    """
    season_start = str(year - 1)
    season_end = str(year)
    seasons = [f"{season_start}-{season_end[-2:]}"]

    # Work on a private copy of this season's rows so the caller's DataFrame
    # is left untouched by the integer conversion.
    season_rows = index_df.loc[index_df['year'] == year, ['nba_id', 'team_id']].copy()
    season_rows['nba_id'] = season_rows['nba_id'].astype(int)

    timeout = aiohttp.ClientTimeout(total=300)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        tasks = []
        for nba_id, team_id in zip(season_rows['nba_id'], season_rows['team_id']):
            if (str(nba_id), str(team_id)) in processed_combinations:
                continue
            if tasks:
                # Stagger task start-up by one second. This must be awaited:
                # a blocking time.sleep() here freezes the event loop, so the
                # tasks already created could not start and nothing would
                # actually be spaced out.
                await asyncio.sleep(1)
            task = asyncio.create_task(
                process_player(session, nba_id, team_id, year, is_postseason, seasons, rate_limiter)
            )
            tasks.append(task)

        # return_exceptions=True keeps one failed task from cancelling the rest.
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for result in results:
            if isinstance(result, Exception):
                logging.error(f"Task failed with exception: {result}")
            elif result:
                processed_combinations.add(result)


async def main():
    """Run the scraper for the configured regular-season and postseason years.

    Loads the two index files, creates the output folders, then processes the
    regular season followed by the postseason, sharing one rate limiter.
    Returns early (after logging) if an index file cannot be loaded.
    """
    # Load data
    try:
        # Rows with team == 'TOT' are dropped (presumably combined-season
        # totals rather than a real team - confirm against the index source).
        # .copy() detaches the filtered frames from the originals.
        index_reg = pd.read_csv('data/index_master.csv')
        index_reg = index_reg[index_reg.team != 'TOT'].copy()
        index_ps = pd.read_csv('data/index_master_ps.csv')
        index_ps = index_ps[index_ps.team != 'TOT'].copy()
    except Exception as e:
        logging.error(f"Error loading index files: {e}")
        return

    # Create folders for every season that may be written, including the
    # postseason folder for 2025, which is processed below.
    setup_folders(2001, 2025)
    setup_folders(2001, 2025, ps=True)

    # Create rate limiter (100 acquisitions per 60 seconds)
    rate_limiter = RateLimiter(100, 60)

    # Process regular season (currently 2025 only; widen the range to backfill)
    for year in range(2025, 2026):
        logging.info(f"Processing regular season {year}")
        processed = get_processed_combinations(year)
        await process_season_data(year, False, index_reg, processed, rate_limiter)

    # Process postseason (currently 2025 only). The postseason flag must be
    # set on both calls; otherwise this pass re-fetches regular-season data
    # and writes it into the regular-season folders.
    for year in range(2025, 2026):
        logging.info(f"Processing postseason {year}")
        processed = get_processed_combinations(year, ps=True)
        await process_season_data(year, True, index_ps, processed, rate_limiter)


if __name__ == "__main__":
    # nest_asyncio patches asyncio so asyncio.run() can be called while an
    # event loop is already running (e.g. inside a notebook kernel).
    import nest_asyncio

    nest_asyncio.apply()
    asyncio.run(main())




2025-03-01 14:58:37,284 - INFO - Processing regular season 225
2025-03-01 14:58:37,286 - INFO - Processing regular season 226
2025-03-01 14:58:37,289 - INFO - Processing regular season 227
2025-03-01 14:58:37,291 - INFO - Processing regular season 228
2025-03-01 14:58:37,294 - INFO - Processing regular season 229
2025-03-01 14:58:37,296 - INFO - Processing regular season 230
2025-03-01 14:58:37,297 - INFO - Processing regular season 231
2025-03-01 14:58:37,298 - INFO - Processing regular season 232
2025-03-01 14:58:37,300 - INFO - Processing regular season 233
2025-03-01 14:58:37,302 - INFO - Processing regular season 234
2025-03-01 14:58:37,304 - INFO - Processing regular season 235
2025-03-01 14:58:37,306 - INFO - Processing regular season 236
2025-03-01 14:58:37,308 - INFO - Processing regular season 237
2025-03-01 14:58:37,309 - INFO - Processing regular season 238
2025-03-01 14:58:37,311 - INFO - Processing regular season 239
2025-03-01 14:58:37,313 - INFO - Processing regular sea