In [1]:
import csv
from pathlib import Path

from tqdm import tqdm

In [2]:
def list_sjis_chars(byte_seq):
    """Split Shift-JIS encoded bytes into per-character byte sequences.

    The first byte of each character decides its width:

    * 0x00-0x7F or 0xA1-0xDF: the byte is a complete single-byte character.
    * 0x81-0x9F or 0xE0-0xFC: the byte starts a two-byte character.

    Only the lead byte is inspected; the second byte of a two-byte
    character is not range-checked, so a yielded pair is not guaranteed
    to be decodable.

    Args:
        byte_seq: Bytes to split (indexing must yield ints, slicing must
            yield the byte sequences to return, as with ``bytes``).

    Yields:
        One- or two-byte slices of ``byte_seq``, in input order.

    Raises:
        ValueError: If a lead byte falls outside all ranges above, or if
            the input ends right after the lead byte of a two-byte
            character.
    """
    i = 0
    length = len(byte_seq)
    while i < length:
        byte1 = byte_seq[i]
        if (0x00 <= byte1 <= 0x7F) or (0xA1 <= byte1 <= 0xDF):
            width = 1
        elif (0x81 <= byte1 <= 0x9F) or (0xE0 <= byte1 <= 0xFC):
            width = 2
        else:
            raise ValueError(f'Invalid byte: {bytes([byte1]).hex()} at {i}')

        # A slice past the end would silently come back one byte short and be
        # reported as a complete character, so reject truncated input instead.
        if i + width > length:
            raise ValueError(
                f'Truncated two-byte sequence: {bytes([byte1]).hex()} at {i}'
            )

        yield byte_seq[i:i + width]
        i += width

In [3]:
%%time

# Every path below cards/ whose name ends in .txt, in sorted order.
paths = sorted(Path('cards').glob('**/*.txt'))

# Distinct per-character byte sequences seen across all files.
sjis_chars = set()

for path in tqdm(paths):
    assert path.suffix == '.txt'

    with open(path, 'rb') as file:
        data = file.read()

    try:
        # Sequences yielded before a failure are still kept in the set.
        sjis_chars.update(list_sjis_chars(data))
    except ValueError as e:
        # Show which file held the offending byte, then propagate the error.
        print(path)
        raise e


100%|██████████| 17436/17436 [01:03<00:00, 275.96it/s] 

CPU times: user 1min 2s, sys: 640 ms, total: 1min 3s
Wall time: 1min 3s





In [4]:
# Number of distinct byte sequences collected in sjis_chars.
len(sjis_chars)

6728

In [5]:
# Write one CSV row per collected byte sequence:
#   Shift-JIS hex, UTF-8 hex, decoded character.
# encoding='utf-8' keeps the output independent of the platform default
# encoding; newline='' stops text-mode newline translation from altering
# CR/LF characters that are themselves data.
with open('sjis_chars.csv', 'w', encoding='utf-8', newline='') as output_file:
    # csv.writer quotes fields that contain ',', '"', CR or LF, any of which can
    # appear among the collected single-byte characters. lineterminator='\n'
    # keeps the original one-'\n'-per-row layout.
    writer = csv.writer(output_file, lineterminator='\n')

    for sjis_seq in sorted(sjis_chars):
        sjis_hex = sjis_seq.hex()
        try:
            char = sjis_seq.decode('cp932')
        except UnicodeDecodeError:
            # The sequence has no cp932 mapping: report its hex and skip the row.
            # Any other error (e.g. a failed write) is left to propagate.
            print(sjis_hex)
            continue

        utf8_hex = char.encode('utf8').hex()
        writer.writerow([sjis_hex, utf8_hex, char])

eb81
