Filter garbage strings from a new stringdump

In [1]:
from collections import Counter
import math
from operator import itemgetter


def triplets(s: bytes):
    """Yield every 3-byte window of *s* as a zero-copy ``memoryview`` slice.

    Surrounding whitespace is stripped from *s* and a single space is added
    on each side, so the first and last windows mark the string boundaries.
    A string that is empty after stripping produces no windows.
    """
    padded = b' ' + s.strip() + b' '
    view = memoryview(padded)
    # Walk the end offset of the window instead of its start offset.
    for stop in range(3, len(padded) + 1):
        yield view[stop - 3:stop]


def all_triplets_from_many_lines(lines):
    """Lazily yield the triplets of every line in *lines*, in line order."""
    for entry in lines:
        for window in triplets(entry):
            yield window


def load_file(filename):
    """Read *filename* in binary mode and map each line to its line index.

    Trailing ``\\r`` and ``\\n`` bytes are removed from every line. When the
    same line occurs more than once, the index of its last occurrence wins.
    """
    line_numbers = {}
    with open(filename, 'rb') as stream:
        for index, raw_line in enumerate(stream):
            line_numbers[raw_line.rstrip(b'\r\n')] = index
    return line_numbers


def account_triplets(lines):
    """Count the triplets of all *lines* and scale the counts by the maximum.

    Returns a ``Counter`` mapping each triplet to its frequency relative to
    the most frequent triplet, so the most frequent one gets the value 1.0.
    When *lines* produces no triplets at all, an empty ``Counter`` is
    returned instead of failing on ``max()`` of an empty sequence.
    """
    counts = Counter(all_triplets_from_many_lines(lines))
    peak = max(counts.values(), default=0)
    if not peak:
        # Nothing to normalize (empty input or only blank lines).
        return counts
    for key in counts:
        counts[key] /= peak  # Normalize by max value
    return counts


def get_score(s: bytes, trained: dict) -> float:
    """Score how closely *s* resembles the strings *trained* was built from.

    The weights of all triplets of *s* are summed; a triplet missing from
    *trained* contributes 0. The sum is divided by ``log(len(s) + 1)`` rather
    than by ``len(s)``, which favors long strings over short ones.

    An empty *s* scores 0.0; without this guard the divisor would be
    ``log(1) == 0`` and the call would raise ``ZeroDivisionError``.
    """
    if not s:
        return 0.0
    # .get() keeps this working for a plain dict as well as for a Counter.
    total = sum(trained.get(t, 0) for t in triplets(s))
    return total / math.log(len(s) + 1)  # Prioritize long strings

In [2]:
# Dumps of earlier versions, merged in this order. A string present in
# several dumps keeps the line index from the last dump that contains it.
old_dump_paths = (
    '../stringdumps/stringdump_0_47_04.txt',
    '../stringdumps/stringdump_0_47_05.txt',
    '../stringdumps/stringdump_0_47_03.txt',
    '../stringdumps/stringdump_0_47_02.txt',
    '../stringdumps/stringdump_0_47_01.txt',
    '../stringdumps/stringdump_0_44_12.txt',
    '../stringdumps/stringdump_steam_50_01.txt',
    '../stringdumps/stringdump_steam_50_02.txt',
    '../stringdumps/stringdump_steam_50_05.txt',
    '../stringdumps/stringdump_steam_50_06.txt',
    '../stringdumps/stringdump_steam_50_08.txt',
    '../stringdumps/stringdump_steam_50_09.txt',
    '../stringdumps/stringdump_steam_50_10.txt',
    '../stringdumps/stringdump_steam_50_11.txt',
    '../stringdumps/stringdump_steam_50_12.txt',
    '../stringdumps/stringdump_steam_50_13.txt',
)

old_file = {}
for dump_path in old_dump_paths:
    old_file.update(load_file(dump_path))

trained = account_triplets(old_file)  # Train on the old dumps

In [3]:
# Load the new dump to be filtered: maps each line to its line index.
new_file = load_file('../dfint64_patch/stringdump.txt')

In [4]:
# Lines that appear in the new dump but in none of the old ones,
# ordered from the lowest score to the highest.
added_only = new_file.keys() - old_file.keys()
diff = sorted(added_only, key=lambda s: get_score(s, trained))

In [6]:
# Preview the 200 lowest-scoring new lines together with their scores.
for candidate in diff[:200]:
    candidate_score = get_score(candidate, trained)
    print(candidate, candidate_score)

b'u-' 0.0
b'(xC' 0.0
b'%u> ' 0.0
b'!G<' 0.0
b'!dn' 0.0
b'9[G' 0.0
b'%f' 0.0
b'L9' 0.0
b'Rn' 0.0
b"'dX" 0.0
b"'dD" 0.0
b'VR' 0.0
b'p|' 0.0
b'c7D' 0.0
b'r+b' 0.0
b'H,' 0.0
b"'4n" 0.0
b'wv1' 0.0
b'h' 0.0
b'cg' 0.0
b'<z.' 0.0
b'PfC' 0.0
b'RF>' 0.0
b'!h<' 0.0
b'*X' 0.0
b')bZ' 0.0
b"'dF" 0.0
b'SR' 0.0
b'+0x%llX' 0.0
b'f1(' 0.0
b"'4E" 0.0
b'k' 0.0
b'uc' 0.0
b'ww' 0.0
b"'4y" 0.0
b'*h5' 0.0
b' d6' 0.0
b'2U' 0.0
b'!7U)' 0.0
b'k;(' 0.0
b'%Y-%m-%d-%H-%M-%S' 0.0
b"'4o" 0.0
b'`R>' 0.0
b'-F' 0.0
b'`U' 0.0
b"'tk" 0.00011984507732920448
b'=E1' 0.00011984507732920448
b' XP' 0.00011984507732920448
b"'4C" 0.00011984507732920448
b"'tW" 0.00011984507732920448
b"'tU" 0.00011984507732920448
b"'tG" 0.00011984507732920448
b':E.' 0.00011984507732920448
b'&dj' 0.00011984507732920448
b"'tO" 0.00011984507732920448
b' v ' 0.00011984507732920448
b'UTF-8' 0.00037089923678434777
b'AEAAXXZ' 0.0006391737457557573
b'& p' 0.0007190704639752269
b'(V' 0.0007561382510606723
b' F' 0.0007561382510606723
b' N' 0.0007561382510606

In [7]:
# Score cut-off: newly added lines scoring below this value are not written
# to the output file.
# NOTE(review): presumably picked by inspecting the printed scores - confirm.
threshold = 0.0049

In [8]:
# `diff` is a list, so testing membership against it would scan it once per
# line of the new dump; build a set once for constant-time lookups.
added_lines = set(diff)

with open('../stringdumps/stringdump_steam_51_01_beta_20.txt', 'wb') as output:
    # Write the lines back in their original order (sorted by line index).
    for line, number in sorted(new_file.items(), key=itemgetter(1)):
        # Filter out only newly added lines; lines already known from the
        # old dumps are always kept, whatever their score.
        if line in added_lines and get_score(line, trained) < threshold:
            continue

        output.write(line)
        output.write(b'\n')