In [18]:
import asyncio
import aiohttp
import time
import pandas as pd
import json
import parsing_functions as pf

# HTTP headers sent with every page request, loaded from a JSON file.
# The `with` block closes the file handle as soon as it has been parsed.
with open('headers.json') as headers_file:
    HEADERS = json.load(headers_file)

# Team table; the cells below read its `division`, `year`, `team_id`,
# `school_id` and `team_url` columns.
all_teams = pd.read_csv('all_team_histories.csv')
division = "D-I"
# Season labels "2000-01" through "2021-22".
years = [f"{start}-{(start + 1) % 100:02d}" for start in range(2000, 2022)]

# `folders` and `parsing_functions` are matched by position: rows returned by
# parsing_functions[k] are written to "<division>/<folders[k]>/<year>.csv".
# Keep both lists the same length and in the same order.
folders = [
    "legends",
    "coaches",
    "links",
    "records",
    "schedules",
    "team_page_stats",
    "venues",
]
parsing_functions = [
    pf.parse_team_legend,
    pf.parse_head_coaches,
    pf.parse_links,
    pf.parse_records,
    pf.parse_schedule,
    pf.parse_team_stats,
    pf.parse_venues,
]

In [None]:

# Imports this cell needs on a fresh kernel (none of them are imported by an
# earlier cell).
import csv
from contextlib import ExitStack

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

MAX_ATTEMPTS = 3  # tries per team page before the team is skipped

# For every season in `years`, download each matching team page and write the
# rows returned by each parser to "<division>/<folder>/<year>.csv".
# NOTE(review): assumes the `year` column of `all_teams` holds labels in the
# same "2000-01" format as `years` -- confirm against the CSV.
for year in years:
    print(f"Fetching {year}")

    teams_df = all_teams.loc[
        (all_teams.division == division) & (all_teams.year == year)
    ]

    # Files are opened in "w" mode, which truncates; skip seasons with no
    # matching teams so existing output is not replaced by empty files.
    if teams_df.empty:
        print(f"No {division} teams found for {year}; skipping")
        continue

    file_names = [division + "/" + folder + "/" + year + ".csv" for folder in folders]

    # ExitStack closes every output file even if a request or a parser raises.
    with ExitStack() as stack, requests.Session() as session:
        files = [
            stack.enter_context(open(file_name, "w", newline=""))
            for file_name in file_names
        ]
        writers = [csv.writer(file) for file in files]

        for _, row in tqdm(teams_df.iterrows(), total=teams_df.shape[0]):
            team_id = row["team_id"]
            school_id = row["school_id"]
            team_url = row["team_url"]

            response = None
            for attempt in range(1, MAX_ATTEMPTS + 1):
                try:
                    response = session.get(team_url, headers=HEADERS)
                except requests.RequestException as error:
                    # A dropped connection counts as a failed attempt instead
                    # of aborting the whole scrape.
                    print(f"Error fetching team {team_id} on attempt {attempt}: {error}")
                else:
                    if response.status_code == 200:
                        break
                    print(f"Error fetching team {team_id} on attempt {attempt}")
                time.sleep(1)
            else:
                # The loop never hit `break`: every attempt failed.
                print(f"Failed to fetch {team_id}")
                continue

            soup = BeautifulSoup(response.content, "lxml")

            # writers[k] receives the rows produced by parsing_functions[k].
            for writer, function in zip(writers, parsing_functions):
                rows = function(soup, school_id, team_id)
                if rows:
                    writer.writerows(rows)

            # Pause between teams to keep the request rate low.
            time.sleep(1)

In [5]:
# Load the team table from disk; this is the same CSV the configuration cell
# at the top of the notebook already reads into `all_teams`.
all_teams = pd.read_csv('all_team_histories.csv')

In [10]:
# First 50 entries of the `team_url` column; read by `main()` below.
urls = all_teams["team_url"].iloc[:50]

In [None]:
import asyncio
import aiohttp
import csv


async def make_request(session, url):
    """GET ``url`` through ``session`` and return the response body as text.

    Args:
        session: An object whose ``get(url, headers=...)`` returns an async
            context manager yielding a response with an awaitable ``text()``
            (``main`` passes an ``aiohttp.ClientSession``).
        url: Address to request.

    Returns:
        The decoded response body.

    The request is sent with the module-level ``HEADERS`` loaded in the
    configuration cell. The HTTP status code is not checked here.
    """
    async with session.get(url, headers=HEADERS) as response:
        return await response.text()


async def parse_response_and_write_to_csv(response, csv_writer):
    """Write a one-column CSV row holding the first 100 characters of ``response``.

    The slice is placeholder parsing; replace it with real extraction logic.

    Args:
        response: Response body (any sliceable value, normally a ``str``).
        csv_writer: Object exposing ``writerow``, e.g. a ``csv.writer``.
    """
    preview_length = 100  # placeholder: keep only the head of the body
    csv_writer.writerow([response[:preview_length]])


async def main():
    """Fetch every URL in the module-level ``urls`` and write the results to ``output.csv``.

    Requests are awaited one at a time. A single-token bucket paces them: the
    token taken before a request is returned ``delay`` seconds after that
    request's response has been read, so the next request cannot start sooner.
    Each response is handed to ``parse_response_and_write_to_csv`` as a task,
    and all of those tasks are awaited before the output file is closed.
    """
    # Read the global under a different local name; assigning to `urls` here
    # would make it a local variable and raise UnboundLocalError.
    request_urls = list(urls)
    delay = 0.25  # Minimum delay between requests in seconds

    # Start with ONE token. Pre-filling one token per URL would let every
    # `get()` return immediately and the delay would never apply.
    token_bucket = asyncio.Queue()
    await token_bucket.put(None)

    write_tasks = []
    # Keep references to the refill tasks: the event loop only holds weak
    # references, so an unreferenced task can be garbage-collected mid-flight.
    refill_tasks = []

    try:
        async with aiohttp.ClientSession() as session:
            # Regular file objects are not async context managers, so this
            # must be a plain `with`.
            with open("output.csv", "w", newline="") as csvfile:
                csv_writer = csv.writer(csvfile)

                for url in request_urls:
                    # Consume a token from the token bucket
                    await token_bucket.get()

                    # Make the request
                    response = await make_request(session, url)

                    # Parse the response and write to CSV as a task
                    write_tasks.append(
                        asyncio.create_task(
                            parse_response_and_write_to_csv(response, csv_writer)
                        )
                    )

                    # Add a token back to the bucket after the delay
                    refill_tasks.append(
                        asyncio.create_task(
                            fill_token_bucket_after_delay(delay, token_bucket)
                        )
                    )

                # Wait for all writes to finish while the file is still open
                await asyncio.gather(*write_tasks)
    finally:
        # The refill scheduled after the last request is no longer needed;
        # cancelling already-finished tasks is a no-op.
        for refill_task in refill_tasks:
            refill_task.cancel()


async def fill_token_bucket_after_delay(delay, token_bucket):
    """Sleep for ``delay`` seconds, then put one token back into ``token_bucket``.

    Args:
        delay: Seconds to wait before refilling.
        token_bucket: Queue with an awaitable ``put``; the token value is ``None``.
    """
    refill_token = None
    await asyncio.sleep(delay)
    await token_bucket.put(refill_token)


if __name__ == "__main__":
    # A Jupyter kernel already runs an event loop, and asyncio.run() raises
    # RuntimeError when called from inside one. Detect that case and schedule
    # main() on the running loop; in a plain script no loop is running and
    # asyncio.run() is used.
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        running_loop = None

    if running_loop is None:
        asyncio.run(main())
    else:
        # Runs in the background of the kernel's loop; `await main_task` in a
        # later cell to wait for completion and surface any exception.
        main_task = running_loop.create_task(main())