In [3]:
from collections import Counter
import math
from operator import itemgetter


def triplets(s: bytes):
    """Yield every window of three consecutive bytes of a padded copy of `s`.

    Whitespace is stripped from both ends of `s`, then a single space is put
    on each side, so the first and last windows capture the string's
    boundaries. Each window is a slice of a `memoryview` over the padded
    bytes, not a `bytes` object. An input that is empty after stripping
    yields nothing.
    """
    padded = b''.join((b' ', s.strip(), b' '))
    view = memoryview(padded)
    # `end` is the exclusive right edge of each 3-byte window.
    for end in range(3, len(padded) + 1):
        yield view[end - 3:end]
        

def all_triplets_from_many_lines(lines):
    """Lazily yield the triplets of every line in `lines`, one line after another.

    Each element of `lines` is passed to `triplets`; the resulting windows are
    yielded in order, without any separator between lines.
    """
    for current_line in lines:
        for window in triplets(current_line):
            yield window
        

def load_file(filename):
    """Read `filename` in binary mode and map each line to its 0-based index.

    Trailing CR and LF bytes are removed from every line before it is used as
    a key. When the same line occurs more than once, the index of its last
    occurrence is kept.

    Returns:
        dict mapping `bytes` line content to `int` line index.
    """
    index_by_line = {}
    with open(filename, 'rb') as stream:
        for position, raw_line in enumerate(stream):
            index_by_line[raw_line.rstrip(b'\r\n')] = position
    return index_by_line
    
    
def account_triplets(lines):
    """Build a table of relative triplet frequencies for `lines`.

    Every triplet produced by `all_triplets_from_many_lines(lines)` is counted,
    then each count is divided by the largest count, so the most frequent
    triplet maps to 1.0 and every other triplet to a value in (0, 1].

    Args:
        lines: Iterable of byte strings to collect triplets from.

    Returns:
        A `Counter` mapping each triplet to its normalized frequency. The
        counter is empty when `lines` produces no triplets at all.
    """
    counts = Counter(all_triplets_from_many_lines(lines))
    if not counts:
        # Nothing to normalize; max() on an empty sequence would raise ValueError.
        return counts
    peak = max(counts.values())
    # Normalize by the maximum value. Only existing keys are reassigned, so
    # the counter's size does not change while it is being iterated.
    for key in counts:
        counts[key] /= peak
    return counts


def get_score(s: bytes):
    """Score how closely `s` resembles the strings the frequency table was built from.

    The normalized frequencies of all triplets of `s` are looked up in the
    module-level table `c` and summed; the sum is divided by
    ``log(len(s) + 1)``. Dividing by the logarithm instead of the plain length
    gives longer strings an advantage.

    Args:
        s: The byte string to score.

    Returns:
        The score as a float; 0.0 for an empty string.
    """
    if not s:
        # log(0 + 1) == 0 would make the division below raise
        # ZeroDivisionError; an empty string has no triplets to score anyway.
        return 0.0
    return sum(c[t] for t in triplets(s)) / math.log(len(s) + 1)

In [4]:
# Earlier string dumps, merged in this order. A string that appears in several
# dumps keeps the line index from the last dump in the list that contains it.
old_dump_paths = [
    '../stringdumps/stringdump_0_47_04.txt',
    '../stringdumps/stringdump_0_47_05.txt',
    '../stringdumps/stringdump_0_47_03.txt',
    '../stringdumps/stringdump_0_47_02.txt',
    '../stringdumps/stringdump_0_47_01.txt',
    '../stringdumps/stringdump_0_44_12.txt',
    '../stringdumps/stringdump_steam_0_50_01.txt',
    '../stringdumps/stringdump_steam_0_50_02.txt',
    '../stringdumps/stringdump_steam_0_50_05_sorted_by_xrefs.txt',
]

old_file = {}
for dump_path in old_dump_paths:
    old_file.update(load_file(dump_path))

c = account_triplets(old_file)  # Train on the old files

In [5]:
# Load the new string dump: maps each line (trailing CR/LF removed) to its line index.
new_file = load_file('../dfrus64/stringdump.txt')

In [6]:
# Strings present only in the new dump, ordered from lowest to highest score.
diff = sorted(new_file.keys() - old_file.keys(), key=get_score)

In [7]:
# Show every newly added string next to its score (lowest scores first).
for candidate in diff:
    print(candidate, get_score(candidate))

b'P;' 0.0
b'B\xc3\xa11' 0.0
b'\xe2\x94\x94a\xc2\xb1' 0.0
b'0\xe2\x95\xa3K' 0.0
b'\xe2\x94\x8cBp7' 0.0
b'\xe2\x94\x8cU9' 0.0
b'+N\xc3\xb7' 0.0
b'\xe2\x95\xa8l' 0.0
b'\xe2\x89\xa1Wu' 0.0
b'Z\xc3\x87\xe2\x95\xa2' 0.0
b'_Wu' 0.0
b'O\xc3\x86\xc3\x86' 0.0
b'/F' 0.0
b'gw}' 0.0
b'K+\xe2\x95\x97' 0.0
b'>H\xc3\xa8' 0.0
b'\xc2\xbfR.' 0.0
b'u\xc3\x91\xc3\x87\xe2\x95\xa2' 0.0
b'\xe2\x95\x9b{p' 0.0
b'>\xc2\xbdr' 0.0
b'pT' 0.0
b'\xc3\xa1\xe2\x94\x8cP\xe2\x95\x90c' 0.0
b']t\xc3\x87' 0.0
b'\xc3\xaa\xe2\x94\xbcz\xc3\x87\xe2\x95\xa2' 0.0
b'uW\xc3\x87\xe2\x95\xa2' 0.0
b'u\xe2\x94\x94\xc3\x87\xe2\x95\xa2' 0.0
b'B\xc3\x87\xe2\x95\xa2' 0.0
b'\xe2\x89\xa1\xe2\x95\xa3e' 0.0
b'\xe2\x94\xb4\xc2\xba\xc3\xb2q\xc3\x87\xe2\x95\xa2' 0.0
b'4\xc3\xa4\xe2\x95\x9e\xc3\xaa4\xc3\xa4\xc3\xa9\xc3\xa84\xc3\xa4\xc3\xa9\xc3\xae4\xc3\xa4x\xc3\x844\xc3\xa4\xe2\x95\x95\xc3\x894\xc3\xa4\xe2\x95\x93' 0.0
b'\xc3\xaa\xc3\xb4u' 0.0
b'\xc3\x96P\xc3\xb6' 0.0
b'\xe2\x95\xa1u\xc3\x87\xe2\x95\xa2' 0.0
b'`\xc3\x87A' 0.0
b'\xe2\x89\xa1s\xc3\x

In [9]:
# Score cutoff: newly added strings scoring below this are left out of the output file.
# NOTE(review): 0.011 looks hand-picked from the score listing above — confirm before reuse.
threshold = 0.011

In [10]:
# Write the new dump back out in its original line order, leaving out newly
# added strings whose score is below `threshold`.
added_lines = set(diff)  # `diff` is a list; a set makes each membership test O(1)
with open('../stringdumps/stringdump_steam_0_50_06_sorted_by_xrefs.txt', 'wb') as output:
    for line, _number in sorted(new_file.items(), key=itemgetter(1)):
        # Only newly added strings are filtered; strings already present in
        # the old dumps are always written.
        if line in added_lines and get_score(line) < threshold:
            continue

        output.write(line)
        output.write(b'\n')