In [1]:
from collections import defaultdict, ChainMap
import itertools as it
import re
from string import Template

from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# One entry per saved HTML page, listed from the highest week to the lowest.
# 'week' is the week number the page's current scores are recorded under;
# 'collected' is the date string used to build the page's file name.
html_files = [
    dict(week=13, collected='2021-07-14'),
    dict(week=11, collected='2021-06-29'),
    dict(week=9, collected='2021-06-25'),
    dict(week=4, collected='2021-05-20'),
    dict(week=2, collected='2021-04-26'),
    dict(week=1, collected='2021-04-19'),
]

In [3]:
# Builds the per-week column/key name, e.g. week=3 -> 'week_3_score'.
score_key_template = Template('week_${week}_score')

def get_current_player_scores(soup, week):
    """Yield one score record per player row found in a parsed page.

    A player row is any ``<tr>`` that carries a ``player_id`` attribute and has
    ``player_role="overall"``.

    Args:
        soup: Parsed document exposing ``find_all`` (a BeautifulSoup object).
        week: Week number the scores on this page are recorded under.

    Returns:
        A generator of dicts with the keys ``'player_id'``, ``'player_name'``
        and ``'week_<week>_score'`` (the score converted to ``int``).

    Raises:
        ValueError: If a rating cell does not contain an integer.
    """
    current_player_score_rows = soup.find_all('tr', player_id=True, player_role='overall')
    score_key = score_key_template.substitute(week=week)
    return (
        {
            'player_id': row['player_id'],
            # Convert to a plain str: the parser's string objects keep a reference
            # to the whole parse tree, which would otherwise stay alive for as
            # long as the name is stored (e.g. as a DataFrame index label).
            'player_name': str(row.find(class_='ibm-player-name').contents[0]),
            score_key: int(row.find(class_='ibm-rating').contents[0]),
        } for row in current_player_score_rows
    )

# Matches the id of a player's "overall" score-history element and captures the
# numeric part as ``player_id``. Written as a raw string so that ``\d`` reaches the
# regex engine as-is instead of relying on an invalid string escape.
score_history_regex = re.compile(r'^player_impact_Chart_(?P<player_id>\d+)_overall$')

def transform_to_score(scorestr):
    """Convert a score string to ``int``; return ``None`` if it is not an integer.

    Only ``ValueError`` is handled, so text such as ``''`` or ``'n/a'`` maps to
    ``None`` while other exception types still propagate.
    """
    try:
        parsed_score = int(scorestr)
    except ValueError:
        parsed_score = None
    return parsed_score


def get_player_score_history(canvas):
    """Turn one score-history element into a record of weekly scores.

    ``canvas['impactdata']`` is split on commas; the value at position ``i``
    (0-based) is stored under ``'week_<i+1>_score'`` after conversion with
    ``transform_to_score``. The player id is taken from ``canvas['id']`` via
    ``score_history_regex``.

    Returns:
        A dict whose first key is ``'player_id'``, followed by one key per week.
    """
    weekly_scores = {}
    for week_number, raw_score in enumerate(canvas['impactdata'].split(','), start=1):
        weekly_scores[score_key_template.substitute(week=week_number)] = transform_to_score(raw_score)

    id_match = score_history_regex.match(canvas['id'])
    return {'player_id': id_match.group('player_id'), **weekly_scores}

def get_player_score_histories(soup):
    """Lazily produce a score-history record for each matching ``<canvas>``.

    Canvases are selected by matching their ``id`` against ``score_history_regex``;
    each one is converted with ``get_player_score_history`` as the result is iterated.
    """
    history_canvases = soup.find_all('canvas', id=score_history_regex)
    return map(get_player_score_history, history_canvases)

def consolidate_score_snapshots(score_snapshots):
    """Merge per-player score snapshots into one dict per player.

    Snapshots are grouped by their ``'player_id'``. Within a group, when several
    snapshots define the same key, the value from the snapshot that appeared
    earliest in ``score_snapshots`` wins. The ``'player_id'`` key itself is not
    included in the merged dicts.

    The input dicts are left unmodified.

    Args:
        score_snapshots: Iterable of dicts, each containing a ``'player_id'`` key.

    Returns:
        A list with one merged dict per distinct player id, in order of first
        appearance.

    Raises:
        KeyError: If a snapshot has no ``'player_id'`` key.
    """
    player_scores = defaultdict(ChainMap)
    for score_snapshot in score_snapshots:
        player_id = score_snapshot['player_id']
        # Work on a filtered copy rather than pop() so the caller's dict is not mutated.
        snapshot_fields = {key: value for key, value in score_snapshot.items() if key != 'player_id'}
        # ChainMap looks maps up front to back, so earlier snapshots take priority.
        # NOTE(review): a None value in an earlier snapshot therefore masks a real
        # value for the same key in a later one - confirm this is intended.
        player_scores[player_id].maps.append(snapshot_fields)
    return [dict(chained_score_data) for chained_score_data in player_scores.values()]

In [4]:
# Path of a saved page, keyed by its collection date string.
html_file_template = Template('raw-html/${collected}.html')

def get_score_snapshots_from_file(week, collected):
    """Yield every score snapshot contained in one saved HTML page.

    The page at ``raw-html/<collected>.html`` is parsed once; the file is closed
    before any snapshot is yielded. Current scores (recorded under ``week``) are
    yielded first, followed by the per-player score histories.
    """
    page_path = html_file_template.substitute(collected=collected)
    # NOTE(review): no encoding is passed to open() and no parser is named for
    # BeautifulSoup, so both fall back to environment defaults - consider pinning them.
    with open(page_path) as page_file:
        page_soup = BeautifulSoup(page_file)

    for current_snapshot in get_current_player_scores(page_soup, week):
        yield current_snapshot
    for history_snapshot in get_player_score_histories(page_soup):
        yield history_snapshot

def get_complete_score_history():
    """Build the full score table from every page listed in ``html_files``.

    Returns:
        A DataFrame indexed by ``player_name`` whose remaining columns are cast
        to the nullable ``UInt32`` dtype.
    """
    # The order shouldn't matter, but pages are visited from the highest week down so
    # that, where snapshots overlap, the value read first (from the later page) is kept
    # in case there were score presentation errors that were corrected.
    pages_latest_first = sorted(html_files, key=lambda m: m['week'], reverse=True)
    per_file_snapshots = [
        get_score_snapshots_from_file(**file_metadata)
        for file_metadata in pages_latest_first
    ]
    consolidated_scores = consolidate_score_snapshots(it.chain.from_iterable(per_file_snapshots))

    score_table = pd.DataFrame.from_records(consolidated_scores)
    return score_table.set_index('player_name').astype(pd.UInt32Dtype())

In [5]:
# Parse all pages listed in html_files and build the combined per-player score table.
scores = get_complete_score_history()

In [6]:
# Count the missing weekly scores per player, then show only the players that
# have more gaps than the player with the fewest gaps.
missing_score_count = scores.isna().sum(axis=1)
scores.loc[missing_score_count > missing_score_count.min()]

Unnamed: 0_level_0,week_1_score,week_2_score,week_3_score,week_4_score,week_5_score,week_6_score,week_7_score,week_8_score,week_9_score,week_10_score,week_11_score,week_12_score,week_13_score
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
LIP,107.0,,130.0,135.0,,136.0,139.0,121.0,124.0,,122.0,126.0,125.0
JinMu,157.0,140.0,125.0,122.0,,115.0,,,,,,,121.0
Decay,,124.0,129.0,122.0,,116.0,113.0,120.0,120.0,,119.0,118.0,118.0
Edison,48.0,,,,,,,,,,,,118.0
skewed,72.0,109.0,,,,,,,,,,,117.0
MAG,,139.0,134.0,128.0,,115.0,111.0,112.0,112.0,,111.0,112.0,116.0
shy,,,94.0,,,110.0,108.0,114.0,114.0,,113.0,113.0,110.0
smurf,100.0,,,,,,,110.0,109.0,,112.0,112.0,110.0
MCD,,70.0,88.0,89.0,,103.0,101.0,108.0,108.0,,111.0,111.0,109.0
Guxue,,82.0,92.0,94.0,,108.0,106.0,108.0,108.0,,109.0,109.0,108.0


In [7]:
# Number of players with no recorded score, per week column.
scores.isna().sum(axis='index')

week_1_score      34
week_2_score       8
week_3_score       7
week_4_score       9
week_5_score     112
week_6_score      14
week_7_score      13
week_8_score      10
week_9_score      12
week_10_score    112
week_11_score     12
week_12_score     15
week_13_score     12
dtype: int64

In [8]:
# Save the score table as CSV; the relative path resolves against the working directory.
scores.to_csv('ibm_power_scores.csv')