In [None]:
import requests
import json


def get_runner_ids(eventCode="M2024", place1=48000, place2=48010):
    """Fetch one page of finishers for an NYRR event, selected by overall place.

    POSTs to the NYRR ``finishers-filter`` endpoint, asking for finishers whose
    overall place lies between ``place1`` and ``place2``, sorted by overall
    time. A short summary of the response is printed as a progress/debug aid.

    Args:
        eventCode: Event identifier sent as ``eventCode`` (e.g. ``"M2024"``).
        place1: Value sent as ``overallPlaceFrom``.
        place2: Value sent as ``overallPlaceTo``.

    Returns:
        The decoded JSON response as a dict. This function reads its
        ``"totalItems"`` and ``"items"`` keys.

    Raises:
        AssertionError: If the HTTP status code is not 200.
    """
    url = 'https://rmsprodapi.nyrr.org/api/v2/runners/finishers-filter'
    header = {"content-type": "application/json;charset=UTF-8"}
    payload = {
        "eventCode": eventCode,
        "overallPlaceFrom": place1,
        "overallPlaceTo": place2,
        "sortColumn": "overallTime",
        "sortDescending": "false",
    }
    response = requests.post(url, headers=header, json=payload, allow_redirects=True)
    # Same message format as get_splits, so a failing status is visible in the error.
    assert response.status_code == 200, f"Response Code = {response.status_code}"
    print(response)
    d = json.loads(response.text)
    print(d["totalItems"])
    items = d["items"]
    print(len(items))
    # The API can answer with an empty "items" list (presumably when the place
    # window is past the last finisher). Indexing [0] unconditionally raised
    # IndexError in that case, before the caller ever received the result.
    if items:
        print(items[0]["runnerId"])
    return d

def get_splits(runner_id=42948497):
    """Request the result details of a single runner from the NYRR API.

    Args:
        runner_id: Runner identifier; it is converted to ``str`` before being
            sent as ``runnerId`` in the JSON body.

    Returns:
        The decoded JSON response as a dict whose ``"details"`` entry is truthy.

    Raises:
        AssertionError: If the HTTP status code is not 200, or if the
            ``"details"`` entry of the response is empty/falsy.
    """
    endpoint = 'https://rmsprodapi.nyrr.org/api/v2/runners/resultDetails'
    request_headers = {"content-type": "application/json;charset=UTF-8"}
    body = {"runnerId": str(runner_id)}
    reply = requests.post(endpoint, headers=request_headers, json=body, allow_redirects=True)
    assert reply.status_code == 200, f"Response Code = {reply.status_code}"
    parsed = json.loads(reply.text)
    # Callers treat AssertionError as "no data for this runner right now".
    assert parsed["details"], f"No data specified, d = {parsed}"
    return parsed

In [2]:
# Try the finishers lookup on a narrow window of overall places
# (51340 through 51348) of event M2023.
x = get_runner_ids(
    eventCode="M2023",
    place1=51340,
    place2=51348,
)

<Response [200]>
9
9
42323398


In [43]:
# get_splits(runner_id=43928778)

In [2]:
import time
import pandas as pd
# Columns kept from every finisher record returned by get_runner_ids.
feats = ["runnerId", "bib", "firstName", "lastName", "age", "gender", "city", "countryCode"]

def load_data(last_runner=522, eventCode="M2024"):
    """Download finisher records for overall places 1..last_runner.

    Places are requested in windows of 100 via ``get_runner_ids``. Each
    non-empty page is turned into a DataFrame restricted to the ``feats``
    columns. After every window whose start is 1 + a multiple of 1000, the
    function sleeps one second and prints a progress line.

    Args:
        last_runner: Highest overall place to request (inclusive).
        eventCode: Event identifier forwarded to ``get_runner_ids``.

    Returns:
        A list of DataFrames, one per non-empty page, in request order.
        Pass it to ``pd.concat`` to obtain a single table.
    """
    page_size = 100
    tables = []
    # range() excludes its stop value, so the stop must be last_runner + 1;
    # with the bare last_runner, a value of the form 100*k + 1 never had its
    # final place requested (and last_runner=1 requested nothing at all).
    for i in range(1, last_runner + 1, page_size):
        place1 = i
        # Clamp the final window so no places beyond last_runner are requested.
        place2 = min(i + page_size - 1, last_runner)
        data = get_runner_ids(eventCode, place1, place2)
        items = data['items']
        if items:
            # An empty page would give a column-less frame, and selecting
            # feats from it raises KeyError, so only non-empty pages are kept.
            tables.append(pd.DataFrame(items)[feats])
        if (i - 1) % 1000 == 0:
            time.sleep(1)
            print("loaded", i - 1)

    return tables

In [9]:
# tables = load_data(last_runner=24949, eventCode="M2021")
# tables = load_data(last_runner=55270, eventCode="M2024")
# tables = load_data(last_runner=47745, eventCode="M2022")
# tables = load_data(last_runner=51348, eventCode="M2023")
# pd.concat(tables).to_csv("raw_data/nyc/nyc_names21.csv", index=False)

In [18]:
def loop_attempts(a_dict, n, year=23):
    """Run up to ``n`` scraping passes, checkpointing and pausing between them.

    After every pass the rows collected so far are written to
    ``raw_data/nyc/nyc_times{year}.csv``. The function returns as soon as no
    ``None`` values remain; otherwise it sleeps 30 seconds before the next pass.

    Args:
        a_dict: Mapping of runner id -> scraped row, where ``None`` marks ids
            that still have to be fetched.
        n: Maximum number of passes.
        year: Used in the checkpoint file name and forwarded to
            ``attempt_scrape_iteration``.

    Returns:
        The mapping returned by the last ``attempt_scrape_iteration`` call
        (or ``a_dict`` itself when ``n`` is 0).
    """
    checkpoint_path = f"raw_data/nyc/nyc_times{year}.csv"
    for attempt in range(n):
        a_dict = attempt_scrape_iteration(a_dict, year=year)
        collected = [row for row in a_dict.values() if row is not None]
        # Every entry is either collected or still None.
        remaining = len(a_dict) - len(collected)
        print(f"{attempt}th cooldown, 30 seconds..........num left = {remaining}")
        pd.DataFrame(collected).to_csv(checkpoint_path)
        if remaining == 0:
            print("done!!!!")
            return a_dict
        time.sleep(30)

    return a_dict

def attempt_scrape_iteration(adict, year=23):
    """Make one pass over the runner ids whose value is still ``None``.

    For each pending id the split results are fetched with ``get_splits`` and
    stored in ``adict`` as a ``pd.Series`` mapping split code -> time, plus an
    ``"id"`` entry holding the runner id. The pass stops at the first
    ``AssertionError`` (which ``get_splits`` raises for a non-200 status or an
    empty result), leaving the remaining ids as ``None`` for a later pass.

    Args:
        adict: Mapping of runner id -> row or ``None``; it is updated in place.
        year: Used in the checkpoint file name
            ``raw_data/nyc/nyc_times{year}.csv``.

    Returns:
        The same ``adict`` object that was passed in.
    """
    pending = [runner for runner, row in adict.items() if row is None]
    checkpoint_path = f"raw_data/nyc/nyc_times{year}.csv"
    # (interval, seconds): pause `seconds` whenever the position in this pass
    # is a multiple of `interval`. The pauses stack when several intervals match.
    pauses = ((100, 6.6), (44, 4.4), (55, 5.5))
    for position, runner in enumerate(pending):
        try:
            split_rows = get_splits(runner)['details']['splitResults']
            record = pd.Series({entry["splitCode"]: entry["time"] for entry in split_rows})
            record["id"] = runner
            adict[runner] = record

            # Periodic checkpoint of everything fetched so far.
            if position % 1000 == 0:
                print("loaded", position)
                fetched = [row for row in adict.values() if row is not None]
                pd.DataFrame(fetched).to_csv(checkpoint_path)

            for interval, seconds in pauses:
                if position % interval == 0:
                    time.sleep(seconds)

        except AssertionError as err:
            print("idx=", position, ":", err)
            break

    return adict

In [4]:
# Scrape split times for every runner listed in the 2021 names file.
year = 21
data_load21 = pd.read_csv(f"raw_data/nyc/nyc_names{year}.csv")
# Every runner id starts out mapped to None, i.e. "not fetched yet".
init_dict21 = dict.fromkeys(list(data_load21["runnerId"]))
# Up to 100 passes; loop_attempts returns early once nothing is left to fetch.
adding_dict21 = loop_attempts(init_dict21, n=100, year=year)

loaded 0
loaded 1000
loaded 2000
loaded 3000
loaded 4000
loaded 5000
loaded 6000
loaded 7000
loaded 8000
loaded 9000
loaded 10000
loaded 11000
loaded 12000
loaded 13000
loaded 14000
loaded 15000
loaded 16000
loaded 17000
loaded 18000
loaded 19000
loaded 20000
loaded 21000
loaded 22000
loaded 23000
loaded 24000
0th cooldown, 30 seconds..........num left = 0
done!!!!


In [20]:
# One row per runner whose splits were fetched; runners still at None are skipped.
times21 = pd.DataFrame(
    [splits for splits in adding_dict21.values() if splits is not None]
)
# Join the name records with the split times on the runner id and save the result.
(
    data_load21
    .merge(times21, left_on="runnerId", right_on="id")
    .to_csv("raw_data/nyc/nyc21.csv")
)

In [23]:
# Scrape split times for runners listed in the 2023 names file.
year = 23
# NOTE(review): only the first 100 rows are kept here - confirm whether this
# slice is a leftover from a trial run before relying on the output file.
data_load23 = pd.read_csv(f"raw_data/nyc/nyc_names{year}.csv")[:100]
# Every runner id starts out mapped to None, i.e. "not fetched yet".
init_dict23 = dict.fromkeys(list(data_load23["runnerId"]))
# Up to 100 passes; loop_attempts returns early once nothing is left to fetch.
adding_dict23 = loop_attempts(init_dict23, n=100, year=year)

loaded 0
0th cooldown, 30 seconds..........num left = 0
done!!!!


In [24]:
# One row per runner whose splits were fetched; runners still at None are skipped.
times23 = pd.DataFrame(
    [splits for splits in adding_dict23.values() if splits is not None]
)
# Join the name records with the split times on the runner id and save the result.
(
    data_load23
    .merge(times23, left_on="runnerId", right_on="id")
    .to_csv("raw_data/nyc/nyc23.csv")
)

In [25]:
# Scrape split times for runners listed in the 2024 names file.
year = 24
# NOTE(review): only the first 100 rows are kept here - confirm whether this
# slice is a leftover from a trial run before relying on the output file.
data_load24 = pd.read_csv(f"raw_data/nyc/nyc_names{year}.csv")[:100]
# Every runner id starts out mapped to None, i.e. "not fetched yet".
init_dict24 = dict.fromkeys(list(data_load24["runnerId"]))
# Up to 100 passes; loop_attempts returns early once nothing is left to fetch.
adding_dict24 = loop_attempts(init_dict24, n=100, year=year)

loaded 0
0th cooldown, 30 seconds..........num left = 0
done!!!!


In [26]:
# One row per runner whose splits were fetched; runners still at None are skipped.
times24 = pd.DataFrame(
    [splits for splits in adding_dict24.values() if splits is not None]
)
# Join the name records with the split times on the runner id and save the result.
(
    data_load24
    .merge(times24, left_on="runnerId", right_on="id")
    .to_csv("raw_data/nyc/nyc24.csv")
)

In [15]:
# Build a Series mapping split code -> time for a single runner.
# NOTE(review): `id` is not assigned at top level in any visible cell, so on a
# fresh kernel this passes the builtin `id` function to get_splits - confirm
# which runner id was intended.
s = pd.Series({dct["splitCode"]: dct["time"] for dct in get_splits(id)['details']['splitResults']})
# NOTE(review): the Series above is keyed by split codes only, so unless a split
# code equals "id" this lookup raises KeyError; the recorded output looks like
# it came from displaying `s` itself - verify.
s['id']

5K       0:18:38
10K      0:37:09
15K      0:55:29
20K      1:13:54
HALF     1:17:57
25K      1:32:41
30K      1:51:19
20M      1:59:58
35K      2:11:26
40K      2:32:25
25.2M    2:34:45
26M      2:40:02
MAR      2:41:25
dtype: object

In [82]:
# Split codes of interest, listed from the 5K mark through to the finish ("MAR").
marks = [
    "5K", "10K", "15K", "20K", "HALF",
    "25K", "30K", "35K", "40K", "MAR",
]

In [9]:
# Fetch the result details of the first runner in a previously loaded page.
# NOTE(review): `d2` is not defined in any visible cell, and the IndexError
# recorded as this cell's output presumably means its "items" list was empty
# when the cell last ran - this cell does not survive Restart & Run All.
test_id =d2['items'][0]["runnerId"]
d3 = get_splits(runner_id = test_id)
d3

IndexError: list index out of range

In [52]:
# Display only the "details" entry of the response stored in d3.
d3["details"]

{'runnerId': 42939236,
 'bib': '1300',
 'teamName': None,
 'iaaf': 'USA',
 'placeOverall': 230,
 'gunPlaceOverall': 237,
 'netPlaceOverall': 230,
 'timeOverall': '2:34:54',
 'gunTime': '2:35:34',
 'netTime': '2:34:54',
 'scoreByNetTime': True,
 'pace': '05:55',
 'placeGender': 209,
 'placeAgeGroup': 67,
 'ageGroupFromTo': '30-34',
 'timeAgeGrade': '2:34:54',
 'placeAgeGrade': 456,
 'percentAgeGrade': 79.38,
 'placeCountry': 137,
 'speed': 10.2,
 'photoUrl': 'https://www.marathonfoto.com/In?RaceOID=27042024F2&LastName=Harper&BibNumber=1300',
 'basnoPhotoUrl': '',
 'splitResults': [{'splitCode': '5K',
   'splitName': None,
   'time': '0:18:29',
   'pace': '05:57',
   'speed': 10.1,
   'distance': 3.11},
  {'splitCode': '10K',
   'splitName': None,
   'time': '0:36:37',
   'pace': '05:54',
   'speed': 10.2,
   'distance': 6.21},
  {'splitCode': '15K',
   'splitName': None,
   'time': '0:54:51',
   'pace': '05:54',
   'speed': 10.2,
   'distance': 9.32},
  {'splitCode': '20K',
   'splitNam

In [35]:
# Ad-hoc request to the resultDetails endpoint that identifies the runner by
# event code and bib instead of by runnerId (compare get_splits).
# `json` is already imported in the first cell; this import is redundant there.
import json
# url = 'https://rmsprodapi.nyrr.org/api/v2/runners/eventRunner'
url = 'https://rmsprodapi.nyrr.org/api/v2/runners/resultDetails'
j = {"eventCode": "M2024", "bib": "4"}
header = {"content-type": "application/json;charset=UTF-8"}
response = requests.post(url, headers=header, json=j, allow_redirects=True)
# Printing the Response object shows only its status, e.g. <Response [200]>.
print(response)

<Response [200]>


In [23]:
# Decode the body of the last `response` and display its "details" entry.
# NOTE(review): the output recorded for this cell shows bib '7', whereas the
# request cell sends bib "4", and the execution counts are out of order - the
# displayed output may be stale; re-run to confirm.
json.loads(response.text)["details"]

{'runnerId': 42948497,
 'bib': '7',
 'teamName': 'NIKE',
 'iaaf': 'NED',
 'placeOverall': 1,
 'gunPlaceOverall': 1,
 'netPlaceOverall': 1,
 'timeOverall': '2:07:39',
 'gunTime': '2:07:39',
 'netTime': '2:07:39',
 'scoreByNetTime': True,
 'pace': '04:53',
 'placeGender': 1,
 'placeAgeGroup': 1,
 'ageGroupFromTo': '35-39',
 'timeAgeGrade': '2:06:57',
 'placeAgeGrade': 1,
 'percentAgeGrade': 96.86,
 'placeCountry': 1,
 'speed': 12.3,
 'photoUrl': 'https://www.marathonfoto.com/In?RaceOID=27042024F2&LastName=Nageeye&BibNumber=7',
 'basnoPhotoUrl': '',
 'splitResults': [{'splitCode': '5K',
   'splitName': None,
   'time': '0:16:02',
   'pace': '05:10',
   'speed': 11.6,
   'distance': 3.11},
  {'splitCode': '10K',
   'splitName': None,
   'time': '0:31:28',
   'pace': '05:04',
   'speed': 11.9,
   'distance': 6.21},
  {'splitCode': '15K',
   'splitName': None,
   'time': '0:46:55',
   'pace': '05:02',
   'speed': 11.9,
   'distance': 9.32},
  {'splitCode': '20K',
   'splitName': None,
   'ti