In [1]:
from parkrun_scraper_sdk import Country, Course, Result, Event, ParkrunDataExtractionOrchestrator



In [None]:
import os
from datetime import datetime
from upath import UPath

def create_folder_structure(base_path: UPath):
    """Ensure one sub-folder per data domain exists beneath ``base_path``.

    A folder is created (and reported via ``print``) only when it is not
    already present; existing folders are left untouched and not reported.
    """
    domain_names = ('countries', 'courses', 'events', 'results', 'runners')

    for domain in domain_names:
        path = os.path.join(base_path, domain)
        if os.path.exists(path):
            # Already there: nothing to create, nothing to report.
            continue
        os.makedirs(path, exist_ok=True)
        print(f"Created folder: {path}")

# Root of the Parquet store. NOTE(review): this is a machine-specific absolute
# path; later cells reuse the `base_path` variable defined here.
base_path = UPath("/home/nathanielramm/parkrun_data")  # You can change this to your preferred base path
create_folder_structure(base_path)
print("Folder structure created successfully.")



In [14]:
import os
from datetime import datetime
import pyarrow as pa
import pyarrow.parquet as pq
from upath import UPath





def write_countries_parquet(countries, base_path: UPath):
    """Serialise ``countries`` via ``to_dict()`` into ``countries/countries.parquet``.

    The target file is written in one go with ``pq.write_table``, so any file
    already at that location is replaced.
    """
    rows = []
    for entry in countries:
        rows.append(entry.to_dict())
    table = pa.Table.from_pylist(rows)

    output_path = os.path.join(base_path, "countries", "countries.parquet")
    pq.write_table(table, output_path)
    print(f"Countries data written to {output_path}")

def write_courses_parquet(courses, base_path: UPath):
    """Serialise ``courses`` via ``to_dict()`` into ``courses/courses.parquet``.

    The target file is written in one go with ``pq.write_table``, so any file
    already at that location is replaced.
    """
    rows = []
    for entry in courses:
        rows.append(entry.to_dict())
    table = pa.Table.from_pylist(rows)

    output_path = os.path.join(base_path, "courses", "courses.parquet")
    pq.write_table(table, output_path)
    print(f"Courses data written to {output_path}")

def write_events_parquet(events, base_path: UPath, course_id: str, date: str):
    """Write ``events`` to ``events/<course_id>/events_<date>.parquet``.

    ``course_id`` may also be a ``Course`` (its ``id`` attribute is used) or an
    int (converted to ``str``) so it can serve as a folder name.
    """
    folder_key = course_id.id if isinstance(course_id, Course) else course_id
    if isinstance(folder_key, int):
        folder_key = str(folder_key)

    rows = [entry.to_dict() for entry in events]
    table = pa.Table.from_pylist(rows)

    output_path = os.path.join(base_path, "events", folder_key, f"events_{date}.parquet")
    parent_dir = os.path.dirname(output_path)
    # The per-course folder may not exist yet on the first write.
    os.makedirs(parent_dir, exist_ok=True)
    pq.write_table(table, output_path)
    print(f"Events data written to {output_path}")

def write_results_parquet(results, base_path: UPath, course_id: str, date: str):
    """Write ``results`` to ``results/<course_id>/results_<date>.parquet``.

    ``course_id`` may also be a ``Course`` (its ``id`` attribute is used) or an
    int (converted to ``str``) so it can serve as a folder name.
    """
    folder_key = course_id.id if isinstance(course_id, Course) else course_id
    if isinstance(folder_key, int):
        folder_key = str(folder_key)

    rows = [entry.to_dict() for entry in results]
    table = pa.Table.from_pylist(rows)

    output_path = os.path.join(base_path, "results", folder_key, f"results_{date}.parquet")
    parent_dir = os.path.dirname(output_path)
    # The per-course folder may not exist yet on the first write.
    os.makedirs(parent_dir, exist_ok=True)
    pq.write_table(table, output_path)
    print(f"Results data written to {output_path}")

def write_runners_parquet(runners, base_path: UPath):
    """Write each runner to its own file, ``runners/<first two id chars>/<athlete_id>.parquet``.

    The summary line is printed once after the loop, including when
    ``runners`` is empty.
    """
    for entry in runners:
        single_row = pa.Table.from_pylist([entry.to_dict()])
        runner_key = str(entry.athlete_id)
        # Bucket by the first two characters of the stringified id.
        bucket = runner_key[:2]
        output_path = os.path.join(base_path, "runners", bucket, f"{runner_key}.parquet")
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        pq.write_table(single_row, output_path)
    print(f"Runners data written to {base_path}/runners/")



In [15]:
import os
import pyarrow.parquet as pq

def get_processed_countries(base_path: UPath):
    """Return the ``id`` column of ``countries/countries.parquet`` as a list.

    An empty list is returned when the file does not exist.
    """
    countries_file = os.path.join(base_path, "countries", "countries.parquet")
    if not os.path.exists(countries_file):
        return []
    stored = pq.read_table(countries_file)
    return stored['id'].to_pylist()

def get_processed_courses(base_path: UPath):
    """Return the ``id`` column of ``courses/courses.parquet`` as a list.

    An empty list is returned when the file does not exist.
    """
    courses_file = os.path.join(base_path, "courses", "courses.parquet")
    if not os.path.exists(courses_file):
        return []
    stored = pq.read_table(courses_file)
    return stored['id'].to_pylist()

def get_processed_events(base_path: UPath, course_id: str, date: str):
    """Return the ``event_number`` column stored for a course on a given date.

    Reads ``events/<course_id>/events_<date>.parquet``; returns ``[]`` when that
    file does not exist. ``course_id`` may also be a ``Course`` (its ``id`` is
    used) or an int (converted to ``str``).
    """
    folder_key = course_id.id if isinstance(course_id, Course) else course_id
    if isinstance(folder_key, int):
        folder_key = str(folder_key)

    events_file = os.path.join(base_path, "events", folder_key, f"events_{date}.parquet")
    if not os.path.exists(events_file):
        return []
    stored = pq.read_table(events_file)
    return stored['event_number'].to_pylist()

def get_processed_results(base_path: UPath, course_id: str, date: str):
    """Report whether ``results/<course_id>/results_<date>.parquet`` exists.

    ``course_id`` may also be a ``Course`` (its ``id`` is used) or an int
    (converted to ``str``).
    """
    folder_key = course_id.id if isinstance(course_id, Course) else course_id
    if isinstance(folder_key, int):
        folder_key = str(folder_key)

    results_file = os.path.join(base_path, "results", folder_key, f"results_{date}.parquet")
    already_written = os.path.exists(results_file)
    return already_written

def get_processed_runner(base_path: UPath, athlete_id):
    """Check if a runner with the given athlete_id has been processed.

    Args:
        base_path: Root of the Parquet store.
        athlete_id: The runner's id, as a ``str`` or an ``int``.

    Returns:
        ``True`` when ``runners/<first two id chars>/<athlete_id>.parquet``
        exists under ``base_path``, otherwise ``False``.
    """
    # Stringify before slicing, mirroring write_runners_parquet; a raw int id
    # would otherwise raise TypeError on ``athlete_id[:2]``.
    athlete_id = str(athlete_id)
    file_path = os.path.join(base_path, "runners", athlete_id[:2], f"{athlete_id}.parquet")
    return os.path.exists(file_path)


def get_latest_processing_date(base_path: UPath, course_id: str):
    """Get the latest processing date for a specific course.

    Looks at the ``.parquet`` file names in ``events/<course_id>`` and takes the
    lexicographically greatest one; the date is the text between the first
    underscore and the following dot of that name.

    Args:
        base_path: Root of the Parquet store.
        course_id: Course id as ``str``; a ``Course`` (its ``id`` is used) or an
            int (converted to ``str``) is also accepted.

    Returns:
        The extracted date string, or ``None`` when the folder is missing or
        holds no ``.parquet`` files.
    """
    if isinstance(course_id, Course):
        course_id = course_id.id
    if isinstance(course_id, int):
        course_id = str(course_id)

    events_path = os.path.join(base_path, "events", course_id)

    if os.path.exists(events_path):
        event_files = [f for f in os.listdir(events_path) if f.endswith('.parquet')]
        if event_files:
            # String max picks the newest file as long as the names sort by date
            # (true for zero-padded YYYY-MM-DD names).
            latest_file = max(event_files)
            return latest_file.split('_')[1].split('.')[0]  # Extract date from filename
    return None

In [16]:
import os
from datetime import datetime, timedelta
from parkrun_scraper_sdk import ParkrunDataExtractionOrchestrator
# from parquet_writer import (
#     write_countries_parquet, write_courses_parquet,
#     write_events_parquet, write_results_parquet, write_runners_parquet
# )
# from file_checker import (
#     get_processed_countries, get_processed_courses,
#     get_processed_events, get_processed_results,
#     get_processed_runner, get_latest_processing_date
# )

def main(base_path: UPath, country_ids=None, course_ids=None, processing_date: str="2014-01-01", max_events_per_run=10000):
    """Scrape countries, courses, events and results into the Parquet store.

    Args:
        base_path: Root of the Parquet store.
        country_ids: Optional collection of country ids to restrict to. A bare
            ``str``/``int`` is treated as a single id.
        course_ids: Optional collection of course ids to restrict to. A bare
            ``str``/``int`` is treated as a single id.
        processing_date: ``YYYY-MM-DD`` date handed to the orchestrator and used
            as the last date of the per-course date loop.
        max_events_per_run: Upper bound on the number of events written across
            all courses in one call.

    Side effects:
        Writes Parquet files through the ``write_*_parquet`` helpers and prints
        a summary line at the end.
    """
    # A bare id would otherwise be used as the container in ``x.id in ids``:
    # substring matching for a str, TypeError for an int.
    if isinstance(country_ids, (str, int)):
        country_ids = [country_ids]
    if isinstance(course_ids, (str, int)):
        course_ids = [course_ids]

    # Initialize the ParkrunDataExtractionOrchestrator
    orchestrator = ParkrunDataExtractionOrchestrator(processing_date)

    # Process countries
    countries = orchestrator.extract_raw_countries()
    if country_ids:
        countries = [country for country in countries if country.id in country_ids]
    processed_countries = get_processed_countries(base_path)
    new_countries = [country for country in countries if country.id not in processed_countries]

    if new_countries:
        write_countries_parquet(new_countries, base_path)

    # Process courses
    courses = orchestrator.extract_raw_courses()
    if course_ids:
        courses = [course for course in courses if course.id in course_ids]
    processed_courses = get_processed_courses(base_path)
    new_courses = [course for course in courses if course.id not in processed_courses]
    if new_courses:
        write_courses_parquet(new_courses, base_path)

    # Process events and results
    total_events_processed = 0

    for course in courses:

        if total_events_processed >= max_events_per_run:
            break

        latest_processing_date = get_latest_processing_date(base_path, course.id)
        end_date = datetime.strptime(processing_date, "%Y-%m-%d")

        # Resume the day after the newest stored events file; with nothing
        # stored, only the processing date itself is visited.
        if latest_processing_date:
            current_date = datetime.strptime(latest_processing_date, "%Y-%m-%d") + timedelta(days=1)
        else:
            current_date = end_date

        while current_date <= end_date and total_events_processed < max_events_per_run:
            date_str = current_date.strftime("%Y-%m-%d")

            processed_events = get_processed_events(base_path, course.id, date_str)
            # Guard against a None return so the loop below can iterate safely.
            new_events = orchestrator.extract_course_new_event_history(course.id, processed_events) or {}

            # Pick the batch ONCE, before the counter changes, so the results
            # loop covers exactly the events that were written. Slicing again
            # after incrementing the counter dropped results for those events.
            remaining_budget = max_events_per_run - total_events_processed
            selected_events = list(new_events.items())[:remaining_budget]

            if selected_events:
                events_to_write = [event for _, event in selected_events]
                write_events_parquet(events_to_write, base_path, course.id, date_str)
                total_events_processed += len(events_to_write)

            for event_id, _event in selected_events:
                # NOTE(review): the results file is keyed by date, not by event,
                # so once one event's results exist for this date the remaining
                # events are skipped here — confirm this is intended.
                if not get_processed_results(base_path, course.id, date_str):
                    results = orchestrator.extract_raw_event_results(course.id, event_id)
                    write_results_parquet(results, base_path, course.id, date_str)

                    # # Process runners
                    # for result in results:
                    #     if not get_processed_runner(base_path, result.athlete_id):
                    #         runner = orchestrator.extract_runner(result.athlete_id)
                    #         write_runners_parquet([runner], base_path)

            current_date += timedelta(days=1)

    print(f"Processed {total_events_processed} events in this run.")

# if __name__ == "__main__":
#     # Example usage:
#     # Process all countries and courses, with a limit of 1000 events per run
#     main()

    # Process specific countries and courses, with a limit of 500 events per run
    # main(country_ids=['1', '2', '3'], course_ids=['10', '20', '30'], max_events_per_run=500)

In [17]:
# Orchestrator instance constructed with the date string "2024-10-22".
obj = ParkrunDataExtractionOrchestrator( "2024-10-22")

In [18]:
# obj.country_num_courses

In [None]:
# Pass the country filter as a list: main() evaluates `country.id in country_ids`,
# which on a bare string is a substring test rather than an exact id match.
main(base_path, country_ids=["54"])

In [1]:
# from msilib import schema
from cmath import polar
import os
from abc import ABC, abstractmethod
from tkinter import E
from typing import List, Dict, Any, Union
import pyarrow as pa
import pyarrow.parquet as pq
from upath import UPath
import polars

from parkrun_scraper_sdk import Country, Course, Result, Event, ParkrunDataExtractionOrchestrator


class BaseParquetHandler(ABC):
    """Base class for reading and writing one data domain's Parquet files.

    Subclasses provide three class-level settings:

    * ``domain_folder`` - sub-folder of ``base_path`` for the domain;
    * ``file_name_template`` - ``str.format`` template for the file name;
    * ``partition_keys`` - ordered keys that become ``key=value`` folders.

    All path-building keyword arguments are passed through ``**kwargs`` and are
    used both for the partition folders and for the file-name template.
    """

    def __init__(self, base_path: UPath):
        self.base_path = base_path

    @property
    @abstractmethod
    def domain_folder(self) -> str:
        """Name of the sub-folder of ``base_path`` holding this domain's files."""

    @property
    @abstractmethod
    def file_name_template(self) -> str:
        """``str.format`` template for the file name, filled from ``**kwargs``."""

    @property
    @abstractmethod
    def partition_keys(self) -> List[str]:
        """Ordered keys that are turned into ``key=value`` path segments."""

    def get_file_path(self, **kwargs) -> UPath:
        """Build ``<base>/<domain>/<key=value>/.../<file name>`` for ``kwargs``.

        Partition keys missing from ``kwargs`` are skipped; the remaining ones
        keep the order declared in ``partition_keys``.
        """
        # Add the partition keys to the file path. Preserve order!
        ordered_partitions = [
            f"{key}={kwargs[key]}" for key in self.partition_keys if key in kwargs
        ]
        partition_path = '/'.join(ordered_partitions)

        file_name = self.file_name_template.format(**kwargs)
        file_path = os.path.join(self.base_path, self.domain_folder, partition_path, file_name)

        return UPath(file_path)

    def read_parquet(self, **kwargs) -> Union[polars.LazyFrame, None]:
        """Lazily scan the file addressed by ``kwargs``.

        Returns:
            A ``polars.LazyFrame``, or ``None`` when ``scan_parquet`` raises
            (the error is printed, not propagated).
        """
        file_path: UPath = self.get_file_path(**kwargs)

        try:
            return polars.scan_parquet(file_path)
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
            return None

    def get_processed_ids(self, id_column: str, **kwargs) -> List[str]:
        """Return the values of ``id_column`` in the addressed file(s) as strings.

        Best-effort: any failure while scanning or collecting (for example a
        file that has not been written yet) is printed and yields ``[]``.
        """
        lazy_frame = self.read_parquet(**kwargs)
        if lazy_frame is None:
            return []

        try:
            # Project to the single id column before collecting so the other
            # columns are never materialised.
            id_values = lazy_frame.select(id_column).collect().get_column(id_column).to_list()
            return [str(item) for item in id_values]
        except Exception as e:
            print(f"Error getting processed IDs: {e}")
            return []

    def file_exists(self, **kwargs) -> bool:
        """Report whether the file addressed by ``kwargs`` exists."""
        file_path: UPath = self.get_file_path(**kwargs)
        return file_path.exists()

    def write_parquet(self, data: List[Any], **kwargs):
        """Serialise ``data`` and write it to the file addressed by ``kwargs``.

        Args:
            data: Objects exposing ``to_dict()``; each becomes one row.
            **kwargs: Partition / file-name values for ``get_file_path``.

        The file is written with ``pq.write_table``, so an existing file at the
        same path is replaced, not appended to.
        """
        rows = [item.to_dict() for item in data]
        table = pa.Table.from_pylist(rows)

        output_path = self.get_file_path(**kwargs)
        # Create missing partition folders through the path object itself
        # rather than via os string functions.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        pq.write_table(table, output_path)

        print(f"Data written to {output_path}")


class CountriesHandler(BaseParquetHandler):
    """Handler for the single, unpartitioned ``countries/countries.parquet`` file."""

    domain_folder = "countries"
    # Alternative naming kept for reference: "countries_{processing_date}.parquet"
    file_name_template = "countries.parquet"
    partition_keys = []

    def get_stored_countries(self) -> List[str]:
        """Return the stored ``country_id`` values as strings."""
        stored_ids = self.get_processed_ids('country_id')
        return stored_ids

class CoursesHandler(BaseParquetHandler):
    """Handler for the single, unpartitioned ``courses/courses.parquet`` file."""

    domain_folder = "courses"
    # Alternative naming kept for reference: "courses_{processing_date}.parquet"
    file_name_template = "courses.parquet"
    partition_keys = []

    def get_stored_courses(self) -> List[str]:
        """Return the stored ``course_id`` values as strings."""
        stored_ids = self.get_processed_ids('course_id')
        return stored_ids

class EventsHandler(BaseParquetHandler):
    """Handler for per-course event files under a ``country_id=`` partition folder."""

    domain_folder = "events"
    file_name_template = "events_{course_id}.parquet"
    partition_keys = ['country_id']

    def get_processed_event_ids(self, course_id: str, country_id: str) -> List[str]:
        """Return the ``event_id`` values stored for the given course, as strings."""
        path_values = {"course_id": course_id, "country_id": country_id}
        return self.get_processed_ids('event_id', **path_values)


class ResultsHandler(BaseParquetHandler):
    """Handler for per-event result files under ``country_id=``/``course_id=`` folders."""

    domain_folder = "results"
    file_name_template = "results_{course_id}_{event_id}.parquet"
    partition_keys = ['country_id', 'course_id']

    def get_processed_result_event_ids(self, course_id: str, country_id: str ) -> List[str]:
        """Return the distinct ``event_id`` values found across a course's result files.

        ``event_id="*"`` is substituted into the file-name template so the scan
        path contains a wildcard in place of the event id.
        """
        all_event_ids = self.get_processed_ids(
            'event_id',
            course_id=course_id,
            country_id=country_id,
            event_id="*",
        )
        # Each result row repeats its event id; collapse to unique values.
        distinct_event_ids = set(all_event_ids)
        return list(distinct_event_ids)


class RunnersHandler(BaseParquetHandler):
    """Handler for one-file-per-runner storage under an ``athlete_id=`` folder."""

    domain_folder = "runners"
    file_name_template = "{athlete_id}.parquet"
    partition_keys = ['athlete_id']

    def get_processed_runner(self, athlete_id: str) -> bool:
        """Report whether a file for ``athlete_id`` already exists."""
        already_stored = self.file_exists(athlete_id=athlete_id)
        return already_stored

# Usage example: one handler per data domain, all rooted at the same store.
# NOTE(review): base_path is a machine-specific absolute path.
base_path = UPath("/home/nathanielramm/parkrun_data")
countries_handler = CountriesHandler(base_path)
courses_handler = CoursesHandler(base_path)
events_handler = EventsHandler(base_path)
results_handler = ResultsHandler(base_path)
runners_handler = RunnersHandler(base_path)

In [2]:
import subprocess
import os

def nordvpn_reconnect():
    """Run the local NordVPN reconnect script under ``sudo`` and print the outcome.

    A non-zero exit status prints the script's stderr; any other exception is
    reported with its message. Nothing is returned and nothing is re-raised.
    """
    script_path = os.path.expanduser("~/git/nordvpn-connect/nordvpn_reconnect.sh")
    command = ['sudo', script_path]
    marker = "Reconnected to:"

    try:
        # check=True turns a non-zero exit status into CalledProcessError.
        completed = subprocess.run(command, capture_output=True, text=True, check=True)
        script_output = completed.stdout

        print("Reconnection successful!")
        print("Output:", script_output)

        # The server name is whatever follows the last marker in the output.
        if marker in script_output:
            new_server = script_output.rpartition(marker)[2].strip()
            print(f"Connected to new server: {new_server}")

    except subprocess.CalledProcessError as e:
        print("Reconnection failed!")
        print("Error output:", e.stderr)
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")

In [6]:
import os
from datetime import datetime, timedelta
from dateutil import parser
from parkrun_scraper_sdk import ParkrunDataExtractionOrchestrator, course
import concurrent.futures
from typing import List

# from domain_handlers import (
#     CountriesHandler, CoursesHandler, EventsHandler, ResultsHandler, RunnersHandler
# )

# def process_event(orchestrator, course, event):
#     event_result = orchestrator.extract_raw_event_results(course, event)
#     results_handler.write_parquet(event_result, course_id=course.course_id, country_id=course.country_id, event_id=event.event_id)


def _normalise_ids(ids) -> List[str]:
    """Coerce ``None``, a bare ``str``/``int``, or an iterable of ids into a list of ``str``."""
    if ids is None:
        return []
    if isinstance(ids, (str, int)):
        return [str(ids)]
    return [str(item) for item in ids]


def _resolve_ids_to_process(kind: str, requested_ids: List[str], live_ids: List[str], stored_ids: List[str]):
    """Decide which ids to process and whether the stored list needs refreshing.

    Args:
        kind: Plural noun used in messages (``"countries"`` or ``"courses"``).
        requested_ids: Caller's filter; empty means "everything live".
        live_ids: Ids currently known to the orchestrator.
        stored_ids: Ids already present in the Parquet store.

    Returns:
        ``(ids_to_process, needs_refresh)``. With a filter, the stored list is
        never refreshed; without one, ``needs_refresh`` is ``True`` when some
        live id is not stored yet.

    Raises:
        ValueError: When a requested id is not live, or is live but has never
            been stored.
    """
    live_set = set(live_ids)
    stored_set = set(stored_ids)

    if requested_ids:
        unsupported_ids = [item for item in requested_ids if item not in live_set]
        if unsupported_ids:
            raise ValueError(f"You requested {unsupported_ids}: {kind} that are not supported by parkrun.")

        requested_set = set(requested_ids)
        filtered_live_ids = [item for item in live_ids if item in requested_set]
        never_stored_ids = [item for item in filtered_live_ids if item not in stored_set]
        if never_stored_ids:
            raise ValueError(f"You requested {never_stored_ids}: {kind} that have not been previously stored. Run a full update to store these {kind}.")

        return filtered_live_ids, False

    new_ids = [item for item in live_ids if item not in stored_set]
    if new_ids:
        print(f"Identified {new_ids}: {kind} that have not been previously stored.")
    return list(live_ids), bool(new_ids)


def do_it(base_path: UPath, processing_date: str = "2014-01-01", country_ids: List[str]=None, course_ids: List[str]=None):
    """Incrementally scrape parkrun events and results into the Parquet store.

    Args:
        base_path: Root of the Parquet store.
        processing_date: Date string handed to the orchestrator; courses whose
            first event is after it are skipped.
        country_ids: Optional country filter (list, or a single str/int).
        course_ids: Optional course filter (list, or a single str/int).

    Raises:
        ValueError: When a requested country/course is not live or has never
            been stored (previously a bare ``Exception``; ``ValueError`` is a
            subclass, so existing ``except Exception`` callers still work).

    Side effects:
        Writes countries/courses/events/results Parquet files through the
        handlers and prints progress information.
    """
    # Preprocess the country_ids and course_ids
    country_ids = _normalise_ids(country_ids)
    course_ids = _normalise_ids(course_ids)

    # Initialize handlers
    countries_handler = CountriesHandler(base_path)
    courses_handler = CoursesHandler(base_path)
    events_handler = EventsHandler(base_path)
    results_handler = ResultsHandler(base_path)

    # Initialize the ParkrunDataExtractionOrchestrator
    orchestrator = ParkrunDataExtractionOrchestrator(processing_date)

    ########################################################
    # Process countries
    ########################################################
    live_country_ids = list(orchestrator.live_countries_lookup.keys())
    stored_country_ids = countries_handler.get_stored_countries()

    country_ids_to_process, refresh_countries = _resolve_ids_to_process(
        "countries", country_ids, live_country_ids, stored_country_ids
    )
    if refresh_countries:
        # No filtering and something new is live: rewrite the stored countries.
        countries_handler.write_parquet(orchestrator.extract_raw_countries())

    ########################################################
    # Process courses
    ########################################################
    live_course_ids = list(orchestrator.live_courses_lookup.keys())
    stored_course_ids = courses_handler.get_stored_courses()

    course_ids_to_process, refresh_courses = _resolve_ids_to_process(
        "courses", course_ids, live_course_ids, stored_course_ids
    )
    if refresh_courses:
        # No filtering and something new is live: rewrite the stored courses.
        courses_handler.write_parquet(orchestrator.extract_raw_courses())

    # Keep only courses that belong to one of the selected countries.
    if len(country_ids_to_process) > 0:
        selected_countries = set(country_ids_to_process)

        course_ids_not_in_countries = [
            course_id for course_id in course_ids_to_process
            if orchestrator.live_courses_lookup[course_id].country_id not in selected_countries
        ]
        if len(course_ids_not_in_countries) > 0:
            print(f"Excluding {course_ids_not_in_countries}: courses that are not in the specified countries.")

        course_ids_to_process = [
            course_id for course_id in course_ids_to_process
            if orchestrator.live_courses_lookup[course_id].country_id in selected_countries
        ]

    print(f"Processing {len(country_ids_to_process)} countries and {len(course_ids_to_process)} courses.")
    print(f"Processing course_ids: {course_ids_to_process}")

    # Process events and results
    for course_id in course_ids_to_process:

        # nordvpn_reconnect()

        obj_course = orchestrator.live_courses_lookup[course_id]

        ############################
        # Events
        ############################
        processed_event_ids = events_handler.get_processed_event_ids(course_id=obj_course.course_id, country_id=obj_course.country_id)

        new_events = orchestrator.extract_course_new_event_history(obj_course, processed_event_ids)
        course_first_event_date = orchestrator.get_course_first_event_date(obj_course)

        if parser.parse(course_first_event_date).date() > parser.parse(processing_date).date():
            print(f"Course {obj_course.course_id} has its first event on {course_first_event_date}. Processing is only being performed up to {processing_date}.")
            continue

        print(f"processed_events: {processed_event_ids}")
        print(f"new_events: {new_events}")

        # Update the event history file - a single file for each course
        if new_events:

            events_to_write = list(new_events.values())
            print(f"events_to_write: {events_to_write}")
            # NOTE(review): write_parquet replaces the course's events file. If
            # new_events holds only the events missing from the store, the
            # previously stored rows are lost here - confirm against the SDK.
            events_handler.write_parquet(data=events_to_write, course_id=obj_course.course_id, country_id=obj_course.country_id)

        ############################
        # Results
        ############################

        # TODO: Based on how many events there are to process, divide them into
        # smaller chunks to avoid memory issues. Ideally parallelise this step.

        # Event ids known before this run plus whatever the events file holds
        # now. The re-read list used to be computed and then ignored, leaving
        # only the stale pre-write list in use.
        stored_event_ids = events_handler.get_processed_event_ids(course_id=obj_course.course_id, country_id=obj_course.country_id)
        known_event_ids = list(dict.fromkeys([*processed_event_ids, *stored_event_ids]))

        processed_result_event_ids = results_handler.get_processed_result_event_ids(course_id=obj_course.course_id, country_id = obj_course.country_id)
        result_event_id_set = set(processed_result_event_ids)
        fully_processed_events = [event_id for event_id in known_event_ids if event_id in result_event_id_set]

        print(f"processed_result_event_ids: {processed_result_event_ids}")

        # Unprocessed results that are before the processing date. This gets all of them!
        unprocessed_results: List[Event] = orchestrator.extract_course_unprocessed_result_events(obj_course, fully_processed_events)

        print(f"unprocessed_results: {unprocessed_results}")

        for event in unprocessed_results:
            event_result = orchestrator.extract_raw_event_results(obj_course, event)
            results_handler.write_parquet(event_result, course_id=obj_course.course_id, country_id=obj_course.country_id, event_id=event.event_id)

        #Parallelise the processing of the events
        # def process_event(course, event):
        #     event_result = orchestrator.extract_raw_event_results(course, event)
        #     results_handler.write_parquet(event_result, course_id=course.course_id, country_id=course.country_id, event_id=event.event_id)


        # unprocessed_results: List[Event] = orchestrator.extract_course_unprocessed_result_events(course, fully_processed_events)
        # print(f"unprocessed_results: {unprocessed_results}")

        # with concurrent.futures.ThreadPoolExecutor() as executor:
        #     futures = [executor.submit(process_event, course, event) for event in unprocessed_results]
        #     for future in concurrent.futures.as_completed(futures):
        #         try:
        #             future.result()  # To raise exceptions if any
        #         except Exception as e:
        #             print(f"Error processing event: {e}")


        # for event_id in unprocessed_results.keys():

        #     results_handler.write_parquet(unprocessed_results[event_id], course_id=course.course_id, country_id=course.country_id, event_id=event_id)

                    
                    # # Process runners
                    # new_runners = []
                    # for result in results:
                    #     if not runners_handler.get_processed_runner(result.athlete_id):
                    #         runner = orchestrator.extract_runner(result.athlete_id)
                    #         new_runners.append(runner)
                    
                    # if new_runners:
                    #     for runner in new_runners:
                    #         runners_handler.write_parquet([runner], athlete_id=runner.athlete_id)
            
            # current_date += timedelta(days=1)

    # print(f"Processed {total_events_processed} events in this run.")

# if __name__ == "__main__":
    # Example usage:
    # Process all countries and courses, with a limit of 1000 events per run
    # main()

    # Process specific countries and courses, with a limit of 500 events per run
    # main(country_ids=['1', '2', '3'], course_ids=['10', '20', '30'], max_events_per_run=500)

In [9]:
# Run the pipeline for a single course (3042) in a single country (3),
# with "2023-01-01" passed as the processing date.
processing_date = "2023-01-01"

do_it(base_path, processing_date=processing_date, country_ids=["3"], course_ids=["3042"])

Processing 1 countries and 1 courses.
Processing course_ids: ['3042']
Error getting processed IDs: No such file or directory (os error 2): /home/nathanielramm/parkrun_data/events/country_id=3/events_3042.parquet
Course 3042 has an event registered for 2024-10-19. However, event processing is only being performed up to the processing date: 2023-01-01
Course 3042 has an event registered for 2024-10-12. However, event processing is only being performed up to the processing date: 2023-01-01
Course 3042 has an event registered for 2024-10-05. However, event processing is only being performed up to the processing date: 2023-01-01
Course 3042 has an event registered for 2024-09-28. However, event processing is only being performed up to the processing date: 2023-01-01
Course 3042 has an event registered for 2024-09-21. However, event processing is only being performed up to the processing date: 2023-01-01
Course 3042 has an event registered for 2024-09-14. However, event processing is only be

In [10]:
import polars as pl
# Lazily scan every stored results file for course 3042, across all country
# partitions. Uses the `pl` alias imported in this cell so the cell does not
# depend on another cell's `import polars`.
res = ( pl.scan_parquet("/home/nathanielramm/parkrun_data/results/country_id=*/course_id=3042/results_*.parquet") 
        # .filter(pl.col("age_grade") > "80.00" )
)

res.collect()#['event_id']

event_id,course_id,country_id,event_date,name,age_group,club,gender,gender_position,position,runs,age_grade,achievement,volunteer_count,time,personal_best,athlete_id,is_pb,club_membership
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""1""","""3042""","""3""","""2022-06-04 00:00:00""","""Paul PAYNE""","""VM45-49""","""""","""Male""","""None""","""1""","""26""","""78.29""","""First Timer!""","""0""","""18:07""","""None""","""7500485""","""False""","""Member of the 25 Club"""
"""1""","""3042""","""3""","""2022-06-04 00:00:00""","""James SCHLOEFFEL""","""VM40-44""","""""","""Male""","""None""","""2""","""90""","""68.79""","""First Timer!""","""7""","""20:08""","""None""","""602616""","""False""","""Member of the 50 Club"""
"""1""","""3042""","""3""","""2022-06-04 00:00:00""","""Mikey SINCLAIR""","""VM40-44""","""No Homers""","""Male""","""None""","""3""","""52""","""65.31""","""First Timer!""","""8""","""20:54""","""None""","""2403550""","""False""","""Member of the 50 Club"""
"""1""","""3042""","""3""","""2022-06-04 00:00:00""","""Jon GEE""","""VM45-49""","""""","""Male""","""None""","""4""","""140""","""67.69""","""First Timer!""","""11""","""21:06""","""None""","""1751870""","""False""","""Member of the 100 Club"""
"""1""","""3042""","""3""","""2022-06-04 00:00:00""","""Cameron COOK""","""VM45-49""","""""","""Male""","""None""","""5""","""50""","""68.27""","""First Timer!""","""5""","""21:26""","""None""","""2840392""","""False""","""Member of the 50 Club"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""9""","""3042""","""3""","""2022-07-30 00:00:00""","""Rebecca RYAN""","""SW30-34""","""The Adventurers""","""Female""","""None""","""61""","""163""","""24.83""","""First Timer!""","""11""","""59:36""","""None""","""3580708""","""False""","""Member of the 100 Club"""
"""9""","""3042""","""3""","""2022-07-30 00:00:00""","""Tracey BANNAN""","""VW55-59""","""""","""Female""","""None""","""62""","""112""","""30.34""","""First Timer!""","""53""","""59:39""","""None""","""6921042""","""False""","""Member of the 100 Club"""
"""9""","""3042""","""3""","""2022-07-30 00:00:00""","""Helen ANDREWS""","""VW45-49""","""Westerfoldians""","""Female""","""None""","""63""","""217""","""27.42""","""First Timer!""","""175""","""59:41""","""None""","""4100966""","""False""","""Member of the 100 Club"""
"""9""","""3042""","""3""","""2022-07-30 00:00:00""","""Laura MACRAILD""","""JW10""","""Lalor Running Club""","""Female""","""None""","""64""","""36""","""26.80""",""" ""","""11""","""1:12:53""","""58:19""","""6068287""","""True""","""Member of the 25 Club"""


In [3]:
# Notebook-level orchestrator constructed with "2024-09-12" as its processing date.
processing_date = "2024-09-12"
orchestrator = ParkrunDataExtractionOrchestrator(processing_date)

In [12]:
obj_course
# orchestrator.country_course_ids

Course(course_id='25', country_id='3', coordinates=(151.18219, -33.907658), eventname='stpeters', long_name='St Peters parkrun', short_name='St Peters', local_long_name='None', country_code='3', series_id='1', location='Sydney Park, St Peters, Sydney', course_url='https://www.parkrun.com.au/stpeters/', event_history=[])

In [10]:
# obj_course = Course



# Look up course "25" in the orchestrator's live course table.
obj_course= orchestrator.live_courses_lookup["25"]

# Fetch that course's event history directly through the Event class.
events = Event.get_event_history(obj_course.course_id, obj_course.course_url, obj_course.country_id)

# # events = orchestrator.extract_raw_course_event_history(obj_course)
# events
# orchestrator.get_course_first_event_date(obj_course)



<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title></title>
    <style>
        body {
            font-family: "Arial";
        }
    </style>
    <script type="text/javascript">
    window.awsWafCookieDomainList = [];
    window.gokuProps = {
"key":"AQIDAHjcYu/GjX+QlghicBgQ/7bFaQZ+m5FKCMDnO+vTbNg96AFDxrLLqKbG+O6B3OC1eNBlAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMQe1lcHaWHC02OTa/AgEQgDuFbA1awdsEibEeTEOW0AFglEIlS0tatndqCSOu8yIRQ3uH2+R8sAS60ASIdyJKTNiA3U97kuyBbD9WqQ==",
          "iv":"CgAEwh8x0QAAACSn",
          "context":"PKfsqZ+0dD59KrAfdxPiy5pd5FEIpbrwioccmU+Ivcms2JSH6w9EtWuDmYEGM8NLcwYnrFyxkZE9LhnOnUeacz3KABDGIfavs8vvfrROY1tnNCpLFC+76PtxnuN6l8RGRrH7tELd24gmtM4QZl952Z+krRf/LtQyytnCyAiG3rC4Edl0Gyb5/65uaXmhQcVDThr5jmZ3Fv/m51CnOQ8afndLoaAZSKNuAbXit99uzwE7aGuIaUTxfTYQ8TiKdpq6HvGNBI5DJpsg0wmT2/JPS23nEBScxqzl0srCQrD/wenXhe84R6TTZQIiUVDh1jkd9IlkUwBRyPAMYqLD6WyNy

In [5]:
obj_course

Course(course_id='2857', country_id='4', coordinates=(13.065462, 47.761065), eventname='hellbrunn', long_name='Hellbrunn parkrun', short_name='Hellbrunn', local_long_name='None', country_code='4', series_id='1', location='', course_url='https://www.parkrun.co.at/hellbrunn/', event_history=[])

In [1]:
from parkrun_scraper_sdk import Country, Course, Result, Event, ParkrunDataExtractionOrchestrator

from dataclasses import dataclass, asdict, fields
import stat
from typing import Any, Dict, get_origin, get_args, Optional, Type
from datetime import datetime
import pyarrow as pa
import polars as pl


In [None]:
# Build an Event with no arguments and display the result of its polars_schema() call.
# NOTE(review): assumes Event() is constructible without arguments and that
# polars_schema() is provided by the SDK — neither is visible in this notebook; confirm.
Event().polars_schema()

In [None]:
from typing import get_origin, get_args, Union, Optional, List, Dict, Any
from collections.abc import Sequence

def get_first_concrete_type(type_hint: Any) -> type:
    """
    Recursively explore a type hint and return the first concrete type inside it.

    Unions (``Union[...]``, ``Optional[...]`` and PEP 604 ``X | Y``) are searched
    after dropping ``None``, whatever position ``None`` appears in. Any other
    parameterised hint (``List[T]``, ``list[T]``, ``Dict[K, V]``,
    ``Annotated[T, ...]``, ...) is searched through its first argument.

    :param type_hint: The type hint to explore
    :return: The first concrete type found. Hints that are not parameterised
        (plain classes such as ``int``, but also ``Any``) are returned unchanged;
        ``Any`` is returned when a parameterised hint exposes no usable argument.
    """
    import types  # local import: only needed to recognise PEP 604 unions

    origin = get_origin(type_hint)

    # No origin means the hint is not parameterised (a plain class, Any, ...):
    # there is nothing to unwrap. Checking the origin first, rather than
    # isinstance(type_hint, type), also unwraps builtin generics such as list[int],
    # which some Python versions report as instances of `type`.
    if origin is None:
        return type_hint

    args = get_args(type_hint)

    # `X | Y` reports types.UnionType as its origin on Python 3.10+; the attribute
    # does not exist on older versions, hence the getattr fallback.
    pep604_union = getattr(types, "UnionType", Union)
    if origin is Union or origin is pep604_union:
        # Optional[T] is Union[T, None]: ignore the None member wherever it sits.
        args = tuple(arg for arg in args if arg is not type(None))

    # Every parameterised hint is explored through its first (remaining) argument.
    # No issubclass() test on the origin is needed: it selected the same action
    # and raised TypeError for origins that are not classes.
    if args:
        return get_first_concrete_type(args[0])

    # If we can't determine a concrete type, return Any
    return Any

# Test the function
from typing import Optional, Union, List

def test_get_first_concrete_type():
    """Check get_first_concrete_type against a table of (hint, expected type) pairs."""
    expectations = [
        (int, int),
        (str, str),
        (Optional[int], int),
        (Union[int, str], int),
        (Optional[Union[int, str, list]], int),
        (List[str], str),
        (Dict[str, int], str),
    ]
    for hint, expected in expectations:
        assert get_first_concrete_type(hint) == expected
    print("All tests passed!")


# Run the checks as soon as the cell executes.
test_get_first_concrete_type()

In [None]:
import polars as pl
# Lazily scan the parquet files of every course_id partition under country_id=54,
# then materialise the scan; the collected frame is the value this cell displays.
results_glob = "/home/nathanielramm/parkrun_data/results/country_id=54/course_id=*/*.parquet"
pl.scan_parquet(results_glob).collect()

# Disabled follow-up: filter a lazy frame `t` (not defined in this cell) to one athlete.
# x = t.filter(pl.col("athlete_id") == 5215943)
# x.collect()



In [4]:
# Latest-results page of the "coburg" event on parkrun.com.au; the Selenium cell loads it.
url = "https://www.parkrun.com.au/coburg/results/latestresults/"

In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time




import os
import sys

chrome_options = Options()

# The saved run of this cell failed with "DevToolsActivePort file doesn't exist",
# the error ChromeDriver reports when Chrome itself could not start. A headed browser
# cannot launch on a Linux host without a display server, so fall back to headless
# mode there. NOTE(review): presumed cause — confirm on the machine running the kernel.
has_display = bool(os.environ.get("DISPLAY") or os.environ.get("WAYLAND_DISPLAY"))
run_headless = sys.platform.startswith("linux") and not has_display
if run_headless:
    chrome_options.add_argument("--headless=new")
# Keep Chrome from relying on /dev/shm, which can be too small in containers and VMs.
chrome_options.add_argument("--disable-dev-shm-usage")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

try:
    driver.get(url)
    if run_headless:
        print("No display detected: Chrome is running headless, so there is no browser window.")
    else:
        print("Please complete any manual tests in the browser window.")
    # Pause so manual steps (or any in-page challenge script) can finish before reading.
    input("Press Enter when you have completed any necessary actions...")
    page_source = driver.page_source
finally:
    # Always shut the browser down so no Chrome/chromedriver process is left behind.
    driver.quit()

# Last expression of the cell: display the captured HTML.
page_source


WebDriverException: Message: unknown error: DevToolsActivePort file doesn't exist
Stacktrace:
#0 0x5586e4cea4e3 <unknown>
#1 0x5586e4a19c76 <unknown>
#2 0x5586e4a46019 <unknown>
#3 0x5586e4a422fa <unknown>
#4 0x5586e4a3f029 <unknown>
#5 0x5586e4a7dccc <unknown>
#6 0x5586e4a7d47f <unknown>
#7 0x5586e4a74de3 <unknown>
#8 0x5586e4a4a2dd <unknown>
#9 0x5586e4a4b34e <unknown>
#10 0x5586e4caa3e4 <unknown>
#11 0x5586e4cae3d7 <unknown>
#12 0x5586e4cb8b20 <unknown>
#13 0x5586e4caf023 <unknown>
#14 0x5586e4c7d1aa <unknown>
#15 0x5586e4cd36b8 <unknown>
#16 0x5586e4cd3847 <unknown>
#17 0x5586e4ce3243 <unknown>
#18 0x7ffbf9560ac3 <unknown>


In [1]:
import subprocess

def ping_ip(ip_address: str, timeout: float = 10.0) -> bool:
    """Ping an IP address to check if it is responding.

    Sends a single echo request with the system ``ping`` command.

    Args:
        ip_address: Address (or host name) to ping. Empty values and values starting
            with ``-`` are rejected without running ``ping``, because ``ping`` would
            parse the latter as a command-line option.
        timeout: Maximum number of seconds to wait for the ``ping`` process.

    Returns:
        True if ``ping`` exits with status 0; False if it fails, times out, the
        target is rejected, or the command cannot be run.
    """
    import platform  # local import: not provided by the notebook's import cells

    # Refuse empty targets and anything ping would interpret as an option.
    if not ip_address or ip_address.startswith('-'):
        return False

    # Windows spells the packet-count option -n; Unix-like systems use -c.
    count_flag = '-n' if platform.system().lower() == 'windows' else '-c'

    try:
        result = subprocess.run(
            ['ping', count_flag, '1', ip_address],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        return result.returncode == 0
    except subprocess.TimeoutExpired:
        # No answer within the allowed time is an ordinary "not responding" result.
        return False
    except Exception as e:
        # Best effort: e.g. the ping executable is missing. Report and treat as down.
        print(f"An error occurred: {e}")
        return False

# Example usage
ip_address = "8.8.8.8"  # Replace with the IP address you want to check
if ping_ip(ip_address):
    print(f"{ip_address} is responding to ping.")
else:
    print(f"{ip_address} is not responding to ping.")

8.8.8.8 is responding to ping.
