Filter garbage strings from a new stringdump

In [1]:
from collections import Counter
import math
from operator import itemgetter


def triplets(s: bytes):
    """Yield every 3-byte window of *s* as a zero-copy ``memoryview`` slice.

    Surrounding whitespace is stripped from *s* and a single space is then
    added on each side, so the first and last windows capture how the
    string begins and ends. An empty (or whitespace-only) input yields
    nothing, because the padded buffer is only two bytes long.
    """
    padded = b' ' + s.strip() + b' '
    view = memoryview(padded)
    # Slide the *end* of the window from 3 up to the buffer length.
    for stop in range(3, len(padded) + 1):
        yield view[stop - 3:stop]


def all_triplets_from_many_lines(lines):
    """Yield the triplets of every line in *lines*, one line after another.

    *lines* is any iterable of ``bytes``; each item is passed to
    ``triplets`` and its windows are emitted in order.
    """
    for current_line in lines:
        for window in triplets(current_line):
            yield window


def load_file(filename):
    """Read *filename* in binary mode and map each line to its line index.

    Trailing ``\\r`` / ``\\n`` bytes are removed from every line. Indices
    are zero-based. When the same line occurs more than once, the index of
    its last occurrence is stored (the key keeps its first-seen position
    in the dict's iteration order).
    """
    line_numbers = {}
    with open(filename, 'rb') as file:
        for index, raw_line in enumerate(file):
            line_numbers[raw_line.rstrip(b'\r\n')] = index
    return line_numbers


def account_triplets(lines):
    """Count triplet frequencies over *lines* and normalize them to (0, 1].

    Every triplet produced by ``all_triplets_from_many_lines(lines)`` is
    counted, then each count is divided by the largest count, so the most
    common triplet gets weight 1.0. The result is a ``Counter``, which
    means looking up a triplet that was never seen gives 0.

    If *lines* produces no triplets at all (empty input, or only blank
    lines), an empty ``Counter`` is returned instead of raising.
    """
    counts = Counter(all_triplets_from_many_lines(lines))
    if not counts:
        # max() of an empty sequence would raise ValueError.
        return counts

    peak = max(counts.values())
    # Only existing keys are reassigned, so the Counter's size does not
    # change while it is being iterated.
    for key in counts:
        counts[key] /= peak  # Normalize by max value
    return counts


def get_score(s: bytes, trained: dict):
    """Score how closely *s* resembles the strings *trained* was built from.

    The score is the sum of the trained weights of all triplets of *s*,
    divided by ``log(len(s) + 1)``. Dividing by the logarithm rather than
    by the length itself prioritizes long strings.

    *trained* maps triplets to weights (as returned by
    ``account_triplets``); triplets missing from it contribute 0.

    An empty *s* has no triplets and scores 0.0. Without this guard the
    divisor would be ``log(1) == 0`` and the call would raise
    ``ZeroDivisionError``.
    """
    if not s:
        return 0.0
    total = sum(trained.get(t, 0) for t in triplets(s))
    return total / math.log(len(s) + 1)  # Prioritize long strings

In [2]:
# Dumps from earlier releases, in the order they are merged. Later files
# overwrite the stored line index of any string they share with earlier ones.
old_dump_paths = (
    '../stringdumps/stringdump_0_47_04.txt',
    '../stringdumps/stringdump_0_47_05.txt',
    '../stringdumps/stringdump_0_47_03.txt',
    '../stringdumps/stringdump_0_47_02.txt',
    '../stringdumps/stringdump_0_47_01.txt',
    '../stringdumps/stringdump_0_44_12.txt',
    '../stringdumps/stringdump_steam_50_01.txt',
    '../stringdumps/stringdump_steam_50_02.txt',
    '../stringdumps/stringdump_steam_50_05.txt',
    '../stringdumps/stringdump_steam_50_06.txt',
    '../stringdumps/stringdump_steam_50_08.txt',
    '../stringdumps/stringdump_steam_50_09.txt',
    '../stringdumps/stringdump_steam_50_10.txt',
    '../stringdumps/stringdump_steam_50_11.txt',
    '../stringdumps/stringdump_steam_50_12.txt',
    '../stringdumps/stringdump_steam_50_13.txt',
)

old_file = {}
for dump_path in old_dump_paths:
    old_file.update(load_file(dump_path))

# Train the triplet weights on the old files
trained = account_triplets(old_file)

In [3]:
# The new dump to be cleaned: maps each string to its line index in the file.
new_dump_path = '../dfint64_patch/stringdump.txt'
new_file = load_file(new_dump_path)

In [4]:
# Strings present in the new dump but in none of the old ones,
# ordered from the lowest score to the highest.
added_strings = new_file.keys() - old_file.keys()


def _score_against_trained(candidate):
    return get_score(candidate, trained)


diff = sorted(added_strings, key=_score_against_trained)

In [14]:
# Show the 200 lowest-scoring new strings next to their scores.
lowest_scoring = diff[:200]
for item in lowest_scoring:
    print(f"{item!r:40} {get_score(item, trained):.10f}")

b'%K5'                                   0.0000000000
b'46J'                                   0.0000000000
b'ww'                                    0.0000000000
b',Iz?'                                  0.0000000000
b'>.u'                                   0.0000000000
b'\x7fH'                                 0.0000000000
b'<+N'                                   0.0000000000
b"'dF"                                   0.0000000000
b'4pX'                                   0.0000000000
b'4.J'                                   0.0000000000
b"'Vm"                                   0.0000000000
b'L3M'                                   0.0000000000
b'H\\'                                   0.0000000000
b'tV'                                    0.0000000000
b'&tT'                                   0.0000000000
b'+0x%llX'                               0.0000000000
b'AO5'                                   0.0000000000
b'h'                                     0.0000000000
b'\\C'                      

In [15]:
# Score cutoff: newly added strings scoring below this value are left out
# of the written dump.
# NOTE(review): the value was presumably picked by inspecting the sorted
# preview — confirm before reusing it for another dump.
threshold = 0.005034

In [16]:
# Write the new dump in its original line order, leaving out newly added
# strings whose score falls below the threshold.
# `diff` is a sorted list; testing membership against it for every line
# would rescan the list each time, so build a set once up front.
added_lines = set(diff)

with open('../stringdumps/stringdump_steam_51_01_beta_22.txt', 'wb') as output:
    for line, _line_index in sorted(new_file.items(), key=itemgetter(1)):
        # Only newly added strings are filtered out; strings already known
        # from the old dumps are always kept.
        if line in added_lines and get_score(line, trained) < threshold:
            continue

        output.write(line)
        output.write(b'\n')