In [None]:
import requests

# Download the season's episode listing from the bilibili web API and cache the
# raw response body in tmp_playlist.json, which the next cell parses.
url = 'https://api.bilibili.com/pgc/view/web/season?season_id=33378'
# A timeout keeps the cell from hanging forever if the server never answers.
res = requests.get(url, headers={'User-Agent':'Chrome/87.0.4280.88'}, timeout=30)
# Stop on an HTTP error instead of caching an error body as the playlist.
res.raise_for_status()

with open('tmp_playlist.json', 'w', encoding='utf8') as f:
    f.write(res.text)

In [None]:
import json

# Read the cached API response back from disk.
with open('tmp_playlist.json', 'r', encoding='utf8') as f:
    data = json.load(f)

# Refuse to continue unless the cached response carries code 0 / 'success'.
assert data['code'] == 0
assert data['message'] == 'success'

# One dict per episode: play id ('ep' + numeric id), the episode title parsed
# as an integer, and the long title.
videos = []

for ep in data['result']['episodes']:
    videos.append({
        'playId': 'ep' + str(ep['id']),
        'tvid': int(ep['title']),
        'title': ep['long_title'],
        # 'pub': ep['pub_time'],
    })

# ensure_ascii=False keeps non-ASCII titles readable in the output file.
with open('tvlist.json', 'w', encoding='utf8') as f:
    json.dump(videos, f, ensure_ascii=False, indent=4)

In [None]:
# zh.wikipedia.org/wiki/名偵探柯南動畫集數列表

from bs4 import BeautifulSoup
from typing import List
from itertools import islice
import re

# Parse the locally saved wiki page. The `with` block closes the file handle
# once BeautifulSoup has read the markup.
# NOTE(review): no parser is named here, so bs4 chooses one itself; consider
# pinning a parser so results do not depend on what is installed.
with open('wiki-zh.html', 'r', encoding='utf8') as f:
    s = BeautifulSoup(f)

def extract(table, cols:List[int], skip_rows=2) -> List:
    """Collect the text of selected cells from each row of a table.

    Args:
        table: object whose ``select('tr')`` yields rows; each row's
            ``select('td')`` yields cells exposing a ``.text`` attribute.
        cols: indices into a row's ``td`` cells, in the order they should
            appear in each output record.
        skip_rows: number of leading ``tr`` rows to ignore.

    Returns:
        A list with one entry per remaining row; each entry is the list of
        cell texts picked by ``cols``.

    Raises:
        IndexError: if a row has fewer ``td`` cells than an index in ``cols``.
    """
    data_rows = islice(table.select('tr'), skip_rows, None)
    collected = []
    for row in data_rows:
        cells = row.select('td')
        collected.append([cells[idx].text for idx in cols])
    return collected

# Number of leading `.wikitable` tables that are read from the page.
table_count = 13

# Per table, the td indices to extract: index 0 plus a second column, which is
# index 4 for the first table and index 3 for all the others.
col_list = [[0, 4 if i == 0 else 3] for i in range(table_count)]

tables = s.select('.wikitable')
# Fail with a clear message rather than an IndexError further down.
assert len(tables) >= table_count, \
    'expected at least %d .wikitable tables, found %d' % (table_count, len(tables))

# zip stops after table_count tables because col_list has that many entries.
records = []
for table, cols in zip(tables, col_list):
    records += extract(table, cols)

# Characters stripped from titles before comparison (also used when the
# bilibili titles are normalized in the matching cell).
rep_pat = r'[ \r\n“”《》！·!]'

# process records, remove mark like [2]
for r in records:
    title = re.sub(r'\[\w+?\]', '', r[1])
    title = re.sub('\n','',title)
    # drop a full-width-parenthesised note ending in 特别版
    title = re.sub(r'（[^（]+?特别版）','',title)
    title = re.sub(rep_pat,'',title)
    title = title.upper()
    r[1] = re.sub(r'KUDOUSHINICHI', '工藤新一', title)

with open('wiki-zh.json', 'w', encoding='utf8') as f:
    json.dump(records, f, indent=4, ensure_ascii=False)

In [None]:
# match bilibili with wiki

import json
from datetime import datetime

# load wiki: each record is a two-item list whose second item is the title
with open('wiki-zh.json', 'r', encoding='utf8') as f:
    wiki_data = json.load(f)
names = {name for [t, name] in wiki_data}
name_and_date = {name: t for [t, name] in wiki_data}
# Titles must be unique, otherwise the title -> first-item lookup is ambiguous.
assert len(wiki_data) == len(names)

# load bilibili
with open('tvlist.json', 'r', encoding='utf8') as f:
    tvlist = json.load(f)

# wiki to full text search

from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib SequenceMatcher similarity ratio of a and b."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
    
def closest(s: str):
    """Return the entry of the module-level `names` most similar to `s`.

    Similarity is measured with `similar`. When several entries tie for the
    best score, the first one met while iterating `names` wins. Returns None
    when `names` is empty.
    """
    return max(names, key=lambda candidate: similar(s, candidate), default=None)

# Count of bilibili titles that could not be resolved to a wiki title.
not_match = 0

# load manual ref list: one "<normalized bilibili title> -- <wiki title>" per line
manual_ref = dict()
with open('manual-ref.txt', 'r', encoding='utf8') as f:
    lines = f.read().splitlines(keepends=False)
for line in lines:
    ss = line.split(' -- ')
    assert len(ss) == 2, 'malformed manual-ref line: %r' % line
    manual_ref[ss[0]] = ss[1]

# Titles found neither in the wiki names nor in the manual ref list.
unresolved = []

for tv in tvlist:
    # normalize title: strip the rep_pat characters and upper-case, as was
    # done for the wiki titles
    title = re.sub(rep_pat, '', tv['title']).upper()

    if title not in names:
        if title not in manual_ref:
            # Keep going so that every missing title is reported in one run.
            not_match += 1
            unresolved.append(title)
            continue
        # use manual ref
        title = manual_ref[title]
        assert title in names, 'manual-ref target is not a wiki title: %r' % title

    tv['wikiPub'] = name_and_date[title]

if unresolved:
    # Print the closest wiki title for each miss in the manual-ref.txt line
    # format, so the output can be reviewed and pasted into that file.
    for title in unresolved:
        print(title, '--', closest(title))
    raise KeyError('%d title(s) have no wiki match; add them to manual-ref.txt' % not_match)

with open('tv-with-time.json','w', encoding='utf8') as f:
    json.dump(tvlist, f, ensure_ascii=False, indent=4)

dtfmt = '%Y年%m月%d日'

# check date: parse every wikiPub string (this happens after the JSON dump
# above, so the file keeps the original text) ...
for tv in tvlist:
    tv['wikiPub'] = datetime.strptime(tv['wikiPub'], dtfmt)

# ... then print each adjacent pair whose dates go backwards.
for tv1, tv2 in zip(tvlist, tvlist[1:]):
    if tv1['wikiPub'] > tv2['wikiPub']:
        print(tv1, tv2)
# if no output, then ok