In [1]:
from collections import Counter
import math
from operator import itemgetter


def triplets(s: bytes):
    """Yield every 3-byte window of ``s`` after trimming and space-padding it.

    Surrounding whitespace is stripped from ``s`` and one space is added on
    each side, so the first and last windows contain a padding space.
    Windows are yielded as ``memoryview`` slices over the padded buffer, not
    as ``bytes`` copies. Input that is empty after stripping yields nothing.
    """
    padded = b' ' + s.strip() + b' '
    view = memoryview(padded)
    window_count = len(padded) - 2
    yield from (view[start:start + 3] for start in range(window_count))
        

def all_triplets_from_many_lines(lines):
    """Yield the triplets of every item in ``lines``, one line after another.

    ``lines`` is any iterable whose items are accepted by ``triplets``.
    """
    for line in lines:
        for triplet in triplets(line):
            yield triplet
        

def load_file(filename):
    """Read ``filename`` in binary mode and map each line to its line index.

    Trailing carriage-return and newline bytes are removed from every line.
    When a line occurs more than once, the index of its last occurrence wins.
    """
    line_numbers = {}
    with open(filename, 'rb') as file:
        for index, raw_line in enumerate(file):
            line_numbers[raw_line.rstrip(b'\r\n')] = index
    return line_numbers
    
    
def account_triplets(lines):
    """Count the triplets of all ``lines`` and scale the counts by the maximum.

    Every count is divided by the largest count, so the most frequent
    triplet gets weight 1.0. Returns a ``Counter`` mapping triplet -> weight.
    Raises ``ValueError`` (from ``max``) when ``lines`` produce no triplets.
    """
    counts = Counter(all_triplets_from_many_lines(lines))
    peak = max(counts.values())
    # Normalize by the maximum value.
    return Counter({triplet: count / peak for triplet, count in counts.items()})


def get_score(s: bytes):
    """Score ``s`` by the summed weights of its triplets, damped by its length.

    Weights are looked up in the module-level ``c`` (the result of
    ``account_triplets``). The sum is divided by ``log(len(s) + 1)`` rather
    than by ``len(s)``, which gives an advantage to longer strings.

    Returns 0.0 for an empty ``s``: it has no triplets, and ``log(1) == 0``
    would otherwise make the division raise ``ZeroDivisionError``.
    """
    if len(s) == 0:
        return 0.0
    # Alternative normalisation that was tried: sum(...) / len(s)
    return sum(c[t] for t in triplets(s)) / math.log(len(s) + 1)  # Favour long strings

In [2]:
# Earlier string dumps that form the training corpus. The order is kept as-is:
# a later file overwrites the stored line index of any line it shares with an
# earlier one.
old_dump_paths = [
    '../stringdumps/stringdump_0_47_04.txt',
    '../stringdumps/stringdump_0_47_05.txt',
    '../stringdumps/stringdump_0_47_03.txt',
    '../stringdumps/stringdump_0_47_02.txt',
    '../stringdumps/stringdump_0_47_01.txt',
    '../stringdumps/stringdump_0_44_12.txt',
    '../stringdumps/stringdump_steam_50_01.txt',
    '../stringdumps/stringdump_steam_50_02.txt',
    '../stringdumps/stringdump_steam_50_05.txt',
    '../stringdumps/stringdump_steam_50_06.txt',
    '../stringdumps/stringdump_steam_50_08.txt',
    '../stringdumps/stringdump_steam_50_09.txt',
    '../stringdumps/stringdump_steam_50_10.txt',
]

old_file = {}
for dump_path in old_dump_paths:
    old_file.update(load_file(dump_path))

c = account_triplets(old_file)  # Train on the old files

In [7]:
# Load the dump that is filtered below; maps each line (bytes) to its line index.
new_file = load_file('../dfint64_patch/stringdump.txt')

In [8]:
# Lines that occur only in the new dump, ordered from lowest to highest score.
added_lines = new_file.keys() - old_file.keys()
diff = sorted(added_lines, key=get_score)

In [9]:
# Show every newly added line next to its score, lowest score first.
for new_line in diff:
    score = get_score(new_line)
    print(new_line, score)

b'\xe2\x89\xa1A' 0.0
b'\xe2\x94\x90F' 0.0
b'zP\xe2\x95\xa3' 0.0
b'\xe2\x95\x9ec' 0.0
b'ZP\xe2\x95\xa3' 0.0
b'_\xc2\xacP\xe2\x95\xa3' 0.0
b'\xe2\x96\xa0G' 0.0
b'u\xc2\xbdP\xe2\x95\xa3' 0.0
b'\xe2\x96\x80\xe2\x96\x80I' 0.0
b'U1' 0.0
b'+0x%llX' 0.0
b'\xc6\x92\xe2\x94\xbc,P\xe2\x95\xa3' 0.0
b'U\xc3\xabP\xe2\x95\xa3' 0.0
b'Z!\xc3\xbf' 0.0
b'p/\xe2\x96\x91' 0.0
b'5\x7fP\xe2\x95\xa3' 0.0
b'PB\xce\xb1' 0.0
b'Rn' 0.0
b'\xe2\x95\xa1_P\xe2\x95\xa3' 0.0
b'\xc3\x87o"' 0.0
b'\xc3\x89y' 0.0
b'a\xc3\x87\xc3\x9c' 0.0
b' s\xe2\x81\xbf' 0.0
b'\xe2\x95\xa7Ry' 0.0
b'D\xe2\x95\xa1FP\xe2\x95\xa3' 0.0
b'\xe2\x94\xb4F' 0.0
b'h' 0.0
b'u\xe2\x95\xabP\xe2\x95\xa3' 0.0
b'\xe2\x89\xa5Q' 0.0
b'2P\xe2\x95\xa3' 0.0
b'\xc3\xbc>\xc3\xb2QP\xe2\x95\xa3' 0.0
b'\xc3\xa4-O' 0.0
b'R\xc2\xa0\xe2\x81\xbf' 0.0
b' y\xc3\xa2' 0.0
b'f?2' 0.0
b'm\xc3\xaam' 0.0
b'p;' 0.0
b'\xc3\xb9BP\xe2\x95\xa3' 0.0
b'r+b' 0.0
b'\xe2\x95\x94F\xc3\xa4' 0.0
b'\xc3\xa9P\xe2\x95\xa3' 0.0
b'[\xe2\x95\x9fo' 0.0
b'P6' 0.0
b'k\xe2\x95\x9f\xc3\x9c' 0.0
b'\xc

In [8]:
# Score cutoff: a newly added line whose get_score() is below this value is
# left out of the written dump.
# NOTE(review): the value looks hand-tuned from the score listing - confirm.
threshold = 0.014

In [9]:
# `diff` is a sorted list, so `line in diff` would be a linear scan for every
# line of the new dump. Build a set once for constant-time membership tests.
diff_lookup = set(diff)

# NOTE(review): this output path is also one of the files loaded into
# `old_file` for training, so a re-run trains on the previous output -
# confirm that this is intended.
with open('../stringdumps/stringdump_steam_50_10.txt', 'wb') as output:
    # Write the lines in their original file order (sorted by line index).
    for line, _index in sorted(new_file.items(), key=itemgetter(1)):
        # Filter out only the newly added lines; lines already present in the
        # old dumps are always kept, whatever their score.
        if line in diff_lookup and get_score(line) < threshold:
            continue

        output.write(line)
        output.write(b'\n')