-
Notifications
You must be signed in to change notification settings - Fork 911
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
refactor(parser-validation): refactor BO parser #6271
Conversation
PR Analysis
PR Feedback
How to useInstructions
|
parsers/CNDC.py
Outdated
DATA_URL = "https://www.cndc.bo/gene/dat/gene.php?fechag={0}" | ||
SOURCE = "cndc.bo" | ||
|
||
|
||
def extract_xsrf_token(html):
    """Extract the XSRF token from the source code of the generation graph page.

    Args:
        html: Page source expected to contain a ``var ttoken = "<hex>";``
            JavaScript assignment.

    Returns:
        The token string (lowercase hexadecimal characters) captured from
        the assignment.

    Raises:
        ValueError: If no token assignment is present in ``html``.
    """
    match = re.search(r'var ttoken = "([a-f0-9]+)";', html)
    if match is None:
        # Fail with an explicit message instead of the opaque AttributeError
        # that calling .group() on None would produce.
        raise ValueError("Could not find the XSRF token in the page source.")
    return match.group(1)
|
||
|
||
def get_timestamp(query_date: datetime, hour: int) -> datetime:
    """Build a datetime at ``hour`` o'clock on the calendar day of ``query_date``.

    Only the year, month and day of ``query_date`` are used; the result is
    tagged with the module-level ``tz_bo`` timezone.
    """
    day_parts = (query_date.year, query_date.month, query_date.day)
    return datetime(*day_parts, hour, tzinfo=tz_bo)
|
||
|
||
def fetch_data( | ||
session: Session | None = None, target_datetime: datetime | None = None | ||
) -> tuple[list[dict], datetime]: | ||
if session is None: | ||
session = Session() | ||
|
||
if target_datetime is None: | ||
target_datetime = datetime.now() | ||
target_datetime = target_datetime.astimezone(tz_bo) | ||
# Define actual and previous day (for midnight data). | ||
formatted_dt = target_datetime.strftime("%Y-%m-%d") | ||
|
||
# XSRF token for the initial request | ||
xsrf_token = extract_xsrf_token(session.get(INDEX_URL).text) | ||
|
||
resp = session.get( | ||
DATA_URL.format(formatted_dt), headers={"x-csrf-token": xsrf_token} | ||
) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Suggestion: Use f-string for formatting the DATA_URL instead of using the format method.
DATA_URL = "https://www.cndc.bo/gene/dat/gene.php?fechag={0}" | |
SOURCE = "cndc.bo" | |
def extract_xsrf_token(html): | |
"""Extracts XSRF token from the source code of the generation graph page.""" | |
return re.search(r'var ttoken = "([a-f0-9]+)";', html).group(1) | |
def get_timestamp(query_date: datetime, hour: int) -> datetime: | |
return datetime( | |
year=query_date.year, | |
month=query_date.month, | |
day=query_date.day, | |
hour=hour, | |
tzinfo=tz_bo, | |
) | |
def fetch_data( | |
session: Session | None = None, target_datetime: datetime | None = None | |
) -> tuple[list[dict], datetime]: | |
if session is None: | |
session = Session() | |
if target_datetime is None: | |
target_datetime = datetime.now() | |
target_datetime = target_datetime.astimezone(tz_bo) | |
# Define actual and previous day (for midnight data). | |
formatted_dt = target_datetime.strftime("%Y-%m-%d") | |
# XSRF token for the initial request | |
xsrf_token = extract_xsrf_token(session.get(INDEX_URL).text) | |
resp = session.get( | |
DATA_URL.format(formatted_dt), headers={"x-csrf-token": xsrf_token} | |
) | |
DATA_URL = "https://www.cndc.bo/gene/dat/gene.php?fechag={formatted_dt}" | |
... | |
resp = session.get( | |
DATA_URL, headers={"x-csrf-token": xsrf_token} | |
) |
parsers/CNDC.py
Outdated
session = Session() | ||
|
||
if target_datetime is None: | ||
target_datetime = datetime.now() | ||
target_datetime = target_datetime.astimezone(tz_bo) | ||
# Define actual and previous day (for midnight data). | ||
formatted_dt = target_datetime.strftime("%Y-%m-%d") | ||
|
||
# XSRF token for the initial request | ||
xsrf_token = extract_xsrf_token(session.get(INDEX_URL).text) | ||
|
||
resp = session.get( | ||
DATA_URL.format(formatted_dt), headers={"x-csrf-token": xsrf_token} | ||
) | ||
|
||
hour_rows = json.loads(resp.text.replace("", ""))["data"] | ||
return hour_rows, target_datetime | ||
|
||
|
||
def parse_generation_forecast(
    zone_key: ZoneKey, date: datetime, raw_data: list[dict], logger: Logger
) -> TotalProductionList:
    """Turn the forecast column of each hourly row into forecasted events.

    Args:
        zone_key: Zone the events are recorded for.
        date: Day the rows belong to; its tzinfo must equal ``tz_bo``.
        raw_data: Hourly rows, each unpacking into exactly eight values
            (hour, forecast, total, thermo, hydro, wind, solar, bagasse).
        logger: Logger handed to the resulting event list.

    Returns:
        A TotalProductionList with one appended entry per row.
    """
    forecasts = TotalProductionList(logger)
    assert date.tzinfo == tz_bo
    for row in raw_data:
        # Only the hour and the forecast value are used here; the remaining
        # columns are unpacked so that the row length is still enforced.
        hour, forecast, _total, _thermo, _hydro, _wind, _solar, _bagasse = row

        forecasts.append(
            zoneKey=zone_key,
            # "hour" is one-indexed in the source rows.
            datetime=get_timestamp(query_date=date, hour=hour - 1),
            value=forecast,
            source=SOURCE,
            sourceType=EventSourceType.forecasted,
        )

    return forecasts
|
||
|
||
def parser_production_breakdown(
    zone_key: ZoneKey, date: datetime, raw_data: list[dict], logger: Logger
) -> ProductionBreakdownList:
    """Convert hourly rows into a per-mode production breakdown.

    Args:
        zone_key: Zone the events are recorded for.
        date: Day the rows belong to; its tzinfo must equal ``tz_bo``.
        raw_data: Hourly rows, each unpacking into exactly eight values
            (hour, forecast, total, thermo, hydro, wind, solar, bagasse).
        logger: Logger handed to the resulting event list.

    Returns:
        A ProductionBreakdownList with one entry per row whose total and
        hydro/solar/wind/bagasse values are all present; other rows are
        skipped.
    """
    breakdowns = ProductionBreakdownList(logger)
    assert date.tzinfo == tz_bo
    for row in raw_data:
        hour, _forecast, total, _thermo, hydro, wind, solar, bagasse = row

        # "hour" is one-indexed in the source rows.
        timestamp = get_timestamp(query_date=date, hour=hour - 1)

        # Rows with a missing total or a missing extracted mode are dropped.
        if total is None or None in (hydro, solar, wind, bagasse):
            continue

        # NOTE: thermo includes gas + oil mixed, so everything that is not one
        # of the extracted modes is reported as unknown for now. The modes
        # summed here should match the ones set on the ProductionMix below.
        extracted_sum = hydro + solar + wind + bagasse
        mix = ProductionMix(
            hydro=hydro,
            solar=solar,
            wind=wind,
            biomass=bagasse,
            unknown=total - extracted_sum,
        )
        breakdowns.append(
            zoneKey=zone_key,
            datetime=timestamp,
            production=mix,
            source=SOURCE,
        )

    return breakdowns
|
||
|
||
def fetch_production( | ||
zone_key: ZoneKey = ZoneKey("BO"), | ||
session: Session | None = None, | ||
target_datetime: datetime | None = None, | ||
logger: Logger = getLogger(__name__), | ||
) -> list: | ||
"""Requests the last known production mix (in MW) of a given country.""" | ||
production = ProductionBreakdownList(logger) | ||
raw_data, query_date = fetch_data(session=session, target_datetime=target_datetime) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Suggestion: Use a context manager to handle the session object. This ensures that the session is properly closed after use.
session = Session() | |
if target_datetime is None: | |
target_datetime = datetime.now() | |
target_datetime = target_datetime.astimezone(tz_bo) | |
# Define actual and previous day (for midnight data). | |
formatted_dt = target_datetime.strftime("%Y-%m-%d") | |
# XSRF token for the initial request | |
xsrf_token = extract_xsrf_token(session.get(INDEX_URL).text) | |
resp = session.get( | |
DATA_URL.format(formatted_dt), headers={"x-csrf-token": xsrf_token} | |
) | |
hour_rows = json.loads(resp.text.replace("", ""))["data"] | |
return hour_rows, target_datetime | |
def parse_generation_forecast( | |
zone_key: ZoneKey, date: datetime, raw_data: list[dict], logger: Logger | |
) -> TotalProductionList: | |
result = TotalProductionList(logger) | |
assert date.tzinfo == tz_bo | |
for hour_row in raw_data: | |
[hour, forecast, total, thermo, hydro, wind, solar, bagasse] = hour_row | |
# "hour" is one-indexed | |
timestamp = get_timestamp(query_date=date, hour=hour - 1) | |
result.append( | |
zoneKey=zone_key, | |
datetime=timestamp, | |
value=forecast, | |
source=SOURCE, | |
sourceType=EventSourceType.forecasted, | |
) | |
return result | |
def parser_production_breakdown( | |
zone_key: ZoneKey, date: datetime, raw_data: list[dict], logger: Logger | |
) -> ProductionBreakdownList: | |
result = ProductionBreakdownList(logger) | |
assert date.tzinfo == tz_bo | |
for hour_row in raw_data: | |
[hour, forecast, total, thermo, hydro, wind, solar, bagasse] = hour_row | |
# "hour" is one-indexed | |
timestamp = get_timestamp(query_date=date, hour=hour - 1) | |
modes_extracted = [hydro, solar, wind, bagasse] | |
if total is None or None in modes_extracted: | |
continue | |
result.append( | |
zoneKey=zone_key, | |
datetime=timestamp, | |
production=ProductionMix( | |
hydro=hydro, | |
solar=solar, | |
wind=wind, | |
biomass=bagasse, | |
# NOTE: thermo includes gas + oil mixed, so we set these as unknown for now | |
# The modes here should match the ones we extract in the production payload | |
unknown=total - (hydro + solar + wind + bagasse), | |
), | |
source=SOURCE, | |
) | |
return result | |
def fetch_production( | |
zone_key: ZoneKey = ZoneKey("BO"), | |
session: Session | None = None, | |
target_datetime: datetime | None = None, | |
logger: Logger = getLogger(__name__), | |
) -> list: | |
"""Requests the last known production mix (in MW) of a given country.""" | |
production = ProductionBreakdownList(logger) | |
raw_data, query_date = fetch_data(session=session, target_datetime=target_datetime) | |
with Session() as session: | |
raw_data, query_date = fetch_data(session=session, target_datetime=target_datetime) |
def fetch_data( | ||
session: Session | None = None, target_datetime: datetime | None = None | ||
) -> tuple[list[dict], datetime]: | ||
if session is None: | ||
session = Session() | ||
|
||
if target_datetime is None: | ||
target_datetime = datetime.now() | ||
target_datetime = target_datetime.astimezone(tz_bo) | ||
# Define actual and previous day (for midnight data). | ||
formatted_dt = target_datetime.strftime("%Y-%m-%d") | ||
|
||
# XSRF token for the initial request | ||
xsrf_token = extract_xsrf_token(session.get(INDEX_URL).text) | ||
|
||
resp = session.get( | ||
DATA_URL.format(formatted_dt), headers={"x-csrf-token": xsrf_token} | ||
) | ||
|
||
hour_rows = json.loads(resp.text.replace("", ""))["data"] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Suggestion: Use a more descriptive function name than 'fetch_data'. The function name should reflect what kind of data it fetches.
def fetch_data( | |
session: Session | None = None, target_datetime: datetime | None = None | |
) -> tuple[list[dict], datetime]: | |
if session is None: | |
session = Session() | |
if target_datetime is None: | |
target_datetime = datetime.now() | |
target_datetime = target_datetime.astimezone(tz_bo) | |
# Define actual and previous day (for midnight data). | |
formatted_dt = target_datetime.strftime("%Y-%m-%d") | |
# XSRF token for the initial request | |
xsrf_token = extract_xsrf_token(session.get(INDEX_URL).text) | |
resp = session.get( | |
DATA_URL.format(formatted_dt), headers={"x-csrf-token": xsrf_token} | |
) | |
hour_rows = json.loads(resp.text.replace("", ""))["data"] | |
def fetch_hourly_data( | |
session: Session | None = None, target_datetime: datetime | None = None | |
) -> tuple[list[dict], datetime]: | |
... |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Everything works as expected but I have a few comments before approving.
parsers/CNDC.py
Outdated
[hour, forecast, total, thermo, hydro, wind, solar, bagasse] = hour_row | ||
|
||
# "hour" is one-indexed | ||
timestamp = get_timestamp(query_date=date, hour=hour - 1) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure if you changed this part but I would prefer if it was called something like get_datetime instead as it actually returns a datetime.
# NOTE: thermo includes gas + oil mixed, so we set these as unknown for now | ||
# The modes here should match the ones we extract in the production payload | ||
unknown=total - (hydro + solar + wind + bagasse), | ||
), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why are we not assigning thermo directly to unknown?
Or is there additional production that would otherwise be missing?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think there might be additional unknowns we want to account for. This was already in the original parser.
result.append( | ||
zoneKey=zone_key, | ||
datetime=timestamp, | ||
production=ProductionMix( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
By not using the add_value function we introduce floating-point errors like this: 'unknown': 777.4199999999998.
We could possibly change how the ProductionMix class works to avoid this, or use the add_value function everywhere. Either way, I think we should avoid introducing these.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think I would rather update the __setattr__ method of the Mix class so that it's coherent with the add method. I will do a follow-up PR to address this.
Description
Refactor the BO parser to use the new data structure. Add test coverage and minor improvements.
Double check
poetry run test_parser "zone_key"
pnpx prettier --write .
and poetry run format
to format my changes.