Filter garbage strings from a new stringdump

In [1]:
from collections import Counter
import math
from operator import itemgetter
from collections.abc import Iterator


def triplets(s: bytes) -> Iterator[memoryview]:
    """Yield every 3-byte sliding window of *s* as a zero-copy view.

    Leading and trailing whitespace is stripped from *s* and a single
    space is then added on each side, so the first and last windows
    carry a boundary marker. A string that is empty after stripping
    yields nothing.
    """
    padded = b' ' + s.strip() + b' '
    view = memoryview(padded)
    # `end` is the exclusive right edge of each window.
    for end in range(3, len(padded) + 1):
        yield view[end - 3:end]


def all_triplets_from_many_lines(lines: Iterator[bytes]) -> Iterator[memoryview]:
    """Yield the triplets of every line in *lines*, one line after another."""
    for current_line in lines:
        for window in triplets(current_line):
            yield window


def load_file(filename: str) -> dict[bytes, int]:
    """Read *filename* in binary mode and map each line to its line index.

    Trailing ``\\r`` and ``\\n`` bytes are removed from every line. If a
    line occurs more than once, the index of its last occurrence is kept.
    """
    line_numbers: dict[bytes, int] = {}
    with open(filename, 'rb') as file:
        for index, raw_line in enumerate(file):
            line_numbers[raw_line.rstrip(b'\r\n')] = index
    return line_numbers


def account_triplets(lines: Iterator[bytes]):
    """Count triplet frequencies over *lines*, scaled to the range (0, 1].

    Returns a ``Counter`` in which the most frequent triplet has the
    value 1.0 and every other triplet its count divided by that maximum.
    Raises ``ValueError`` (from ``max``) when *lines* produce no triplets.
    """
    counts = Counter(all_triplets_from_many_lines(lines))
    peak = max(counts.values())
    # Only existing keys are reassigned, so the mapping's size is stable
    # while it is being iterated.
    for triplet, occurrences in counts.items():
        counts[triplet] = occurrences / peak
    return counts


def get_score(s: bytes, trained: dict[bytes, float]) -> float:
    """Score how closely *s* resembles the strings used for training.

    The score is ``sqrt(sum / log(len(s) + 1))`` where ``sum`` adds up
    the trained weight of every triplet of *s*. Triplets absent from
    *trained* contribute 0, so a string made only of unseen triplets
    scores 0.0.

    Args:
        s: The raw string to score.
        trained: Mapping from triplet to its normalized frequency, as
            produced by ``account_triplets``.

    Returns:
        A non-negative score; lower means more likely garbage. An empty
        *s* scores 0.0.
    """
    # log(0 + 1) == 0 would make the division below fail; an empty string
    # has no triplets anyway, so it gets the minimum score.
    if not s:
        return 0.0

    # .get() keeps this working for a plain dict as well as for a Counter
    # (which already returns 0 for missing keys).
    total = sum(trained.get(t, 0.0) for t in triplets(s))
    return math.sqrt(total / math.log(len(s) + 1))

In [2]:
# Previously curated dumps; the union of their lines is the training corpus.
old_dump_paths = [
    '../stringdumps/stringdump_0_47_04.txt',
    '../stringdumps/stringdump_0_47_05.txt',
    '../stringdumps/stringdump_0_47_03.txt',
    '../stringdumps/stringdump_0_47_02.txt',
    '../stringdumps/stringdump_0_47_01.txt',
    '../stringdumps/stringdump_0_44_12.txt',
    '../stringdumps/stringdump_steam_50_01.txt',
    '../stringdumps/stringdump_steam_50_02.txt',
    '../stringdumps/stringdump_steam_50_05.txt',
    '../stringdumps/stringdump_steam_50_06.txt',
    '../stringdumps/stringdump_steam_50_08.txt',
    '../stringdumps/stringdump_steam_50_09.txt',
    '../stringdumps/stringdump_steam_50_10.txt',
    '../stringdumps/stringdump_steam_50_11.txt',
    '../stringdumps/stringdump_steam_50_12.txt',
    '../stringdumps/stringdump_steam_50_13.txt',
    '../stringdumps/stringdump_steam_50_14.txt',
    '../stringdumps/stringdump_steam_51_01.txt',
]

# Set of every distinct line seen in any of the old dumps.
old_file = set()
for dump_path in old_dump_paths:
    old_file.update(load_file(dump_path))

trained = account_triplets(old_file)  # Train on the old files

In [3]:
# Load the dump to be filtered; maps each line to its line index, which is
# used later to write the lines back in file order.
new_file = load_file('../dfint64_patch/stringdump.txt')

In [4]:
# Lines that appear only in the new dump, ordered from lowest to highest score.
diff = list(new_file.keys() - old_file)
diff.sort(key=lambda line: get_score(line, trained))

In [5]:
# Show the 200 lowest-scoring new lines to help pick a threshold by eye.
for item, score in ((line, get_score(line, trained)) for line in diff[:200]):
    print(f"{item!r:40} {score:.10f}")

b'7G'                                    0.0000000000
b'0KG'                                   0.0000000000
b'k'                                     0.0000000000
b'+0x%llX'                               0.0000000000
b'&tC'                                   0.0000000000
b'`Y0'                                   0.0000000000
b'%u> '                                  0.0000000000
b'KG'                                    0.0000000000
b'j0KG'                                  0.0000000000
b'ltP'                                   0.0000000000
b'0!p'                                   0.0000000000
b'cg'                                    0.0000000000
b'`j0KG'                                 0.0000000000
b'6V'                                    0.0000000000
b'Xt4'                                   0.0000000000
b'P>L'                                   0.0000000000
b'/F'                                    0.0000000000
b'YGy'                                   0.0000000000
b"'4M"                      

In [6]:
# Score cutoff: newly added lines scoring below this value are left out
# when the filtered dump is written.
threshold = 0.016

In [7]:
# `diff` is a list, so testing `line in diff` would rescan it for every line
# of the dump; building a set once makes each membership test constant-time.
added_lines = set(diff)

with open('../stringdumps/stringdump_steam_51_02.txt', 'wb') as output:
    # Write the lines back in file order (the dict values are line indices).
    for line, _ in sorted(new_file.items(), key=itemgetter(1)):
        # Filter out only newly added lines that score below the threshold;
        # every other line is written unchanged.
        if line in added_lines and get_score(line, trained) < threshold:
            continue

        output.write(line)
        output.write(b'\n')