In [1]:
from collections import Counter
from operator import itemgetter

In [2]:
def triplets(s: bytes):
    """Yield every 3-byte sliding window of a whitespace-stripped, space-padded line.

    The input is stripped of surrounding whitespace and then wrapped in a
    single space on each side, so the first and last windows carry a
    boundary marker. Each yielded item is a ``memoryview`` slice over the
    padded bytes rather than a copied ``bytes`` object.
    """
    padded = b' '.join((b'', s.strip(), b''))
    view = memoryview(padded)
    window_count = len(padded) - 2
    yield from (view[start:start + 3] for start in range(window_count))

In [3]:
def all_triplets_from_many_lines(lines):
    """Lazily yield the triplets of every line in ``lines``, in order.

    ``lines`` is any iterable of byte strings; each one is passed to
    ``triplets`` and its windows are yielded one after another.
    """
    for current_line in lines:
        for window in triplets(current_line):
            yield window

In [4]:
def load_file(filename):
    """Read ``filename`` in binary mode and map each line to its line index.

    Trailing ``\\r`` and ``\\n`` bytes are removed from every line. When the
    same line content appears more than once, the index of its last
    occurrence is kept.

    Returns:
        dict mapping line content (bytes) to a 0-based line index.
    """
    index_by_line = {}
    with open(filename, 'rb') as stream:
        for position, raw_line in enumerate(stream):
            index_by_line[raw_line.rstrip(b'\r\n')] = position
    return index_by_line

In [5]:
def account_triplets(lines):
    """Count triplet frequencies over ``lines`` and normalise them to (0, 1].

    Args:
        lines: iterable of byte strings; every triplet produced by
            ``all_triplets_from_many_lines`` is counted.

    Returns:
        A ``Counter`` keyed by triplet, where each value is the triplet's
        count divided by the largest count. An empty ``Counter`` is returned
        when the input yields no triplets at all.
    """
    c = Counter(all_triplets_from_many_lines(lines))
    if not c:
        # max() of an empty sequence raises ValueError; nothing to normalise.
        return c
    m = max(c.values())
    for key in c:
        c[key] /= m  # Normalise by the maximum value
    return c

In [6]:
# Merge all older dumps into one mapping. Files are processed in the order
# listed, so a line present in several dumps keeps the index from the last
# dump that contains it.
old_file = {
    line: index
    for dump_path in (
        '../stringdumps/stringdump_0_47_04.txt',
        '../stringdumps/stringdump_0_47_03.txt',
        '../stringdumps/stringdump_0_47_02.txt',
        '../stringdumps/stringdump_0_47_01.txt',
        '../stringdumps/stringdump_0_44_12.txt',
    )
    for line, index in load_file(dump_path).items()
}

In [7]:
# Load the dump treated as the "new" file; its lines are later compared
# against the keys of old_file.
new_file = load_file('../stringdumps/stringdump_0_47_05.txt')

In [8]:
# NOTE(review): presumably a manual switch for re-running the filter on a
# previously written output file - confirm it is still needed or remove it.
#new_file = load_file('output2.txt')

In [9]:
# Iterating the old_file dict yields its keys, i.e. the line contents.
c = account_triplets(old_file)  # Train on the old files

In [10]:
import math

def get_score(s: bytes):
    """Score a byte string by how familiar its triplets are.

    Sums the normalised frequency (from the module-level counter ``c``) of
    every triplet of ``s`` and divides by ``log(len(s) + 1)``, which
    penalises length less than dividing by ``len(s)`` and therefore favours
    longer strings. Triplets absent from ``c`` contribute 0.

    Args:
        s: the line to score.

    Returns:
        A non-negative float; 0.0 for an empty string.
    """
    if not s:
        # len(s) == 0 gives math.log(1) == 0 and would raise ZeroDivisionError;
        # an empty string has no triplets, so its score is zero.
        return 0.0
    # Alternative normalisation by plain length:
    # return sum(c[t] for t in triplets(s)) / len(s)
    return sum(c[t] for t in triplets(s)) / math.log(len(s)+1)  # Favours longer strings

In [12]:
# Lines present only in the new file, ordered from lowest to highest score.
diff = sorted(new_file.keys() - old_file.keys(), key=get_score)

In [13]:
# Print every line of diff together with its score, in the order diff holds them.
for item in diff:
    print(item, get_score(item))

b' wW' 0.0
b'\xff\xff\xff\xffp\\t\xff' 0.0
b'\xff\xff\xff\xff Y\xfd' 0.0
b' t\x98' 0.0
b'\xff\xff\xff\xffp3\xfd' 0.0
b'\xff\xff\xff\xffp\xd5\xff' 0.0
b'B\x94\xd7' 0.0
b'\xff\xff\xff\xff`k\xfd' 0.0
b'0\x9d\xaa' 0.0
b'\xff\xff\xff\xff\xc0H\xfd' 0.0
b'\xff\xff\xff\xffh\xfe\xff' 0.0
b'\xff\xff\xff\xffp\xa5\xfe' 0.0
b'\xff\xff\xff\xff\x80A\xfd' 0.0
b'\xff\xff\xff\xff\xd0Y\xff' 0.0
b'P,\xb3' 0.0
b'\xff\xff\xff\xffpq' 0.0
b'\xff\xff\xff\xffA\xcb\xfd' 0.0
b'H\x86\xd7' 0.0
b'\xc0\xa1F' 0.0
b'\xff\xff\xff\xff`H\xff' 0.0
b'\xe0\\A' 0.0
b'`B\xab' 0.0
b'p\xb3@' 0.0
b'\xff\xff\xff\xff@I\xfe' 0.0
b'\xd0\xf3L' 0.0
b'\xff\xff\xff\xff\xe0S\xfd' 0.0
b'0v\xbe' 0.0
b'\xff\xff\xff\xff`\xfe\xff' 0.0
b'\xff\xff\xff\xff\xb0I\xfd' 0.0
b'\xff\xff\xff\xff\x80s\xfe' 0.0
b'\xff\xff\xff\xff@m\xfe' 0.0
b'\xc0\xbaW' 0.0
b'\xff\xff\xff\xff\x80v\xff' 0.0
b'\xff\xff\xff\xffPC' 0.0
b'\x80{\xdd' 0.0
b'\xff\xff\xff\xffX\xd4\xfd' 0.0
b'\x90\xabW' 0.0
b'\xff\xff\xff\xffP2\xfd' 0.0
b'B\x91\xd7' 0.0
b'\xb0f\xdb' 0.0
b'\xff\xff\

In [14]:
# Score cutoff: lines from diff that score below this value are skipped
# when the output file is written.
threshold = 0.007

In [15]:
# Build a set once: testing membership against the diff list would scan the
# whole list for every line of the new file.
added_lines = set(diff)

# Write the new file's lines back in their original order (sorted by line
# index), using CRLF line endings.
with open('output2.txt', 'wb') as output:
    for line, number in sorted(new_file.items(), key=itemgetter(1)):
        # Filter out only newly added lines whose score is below the threshold;
        # lines already present in the old files are always kept.
        if line in added_lines and get_score(line) < threshold:
            continue

        output.write(line)
        output.write(b'\r\n')