In [1]:
from collections import Counter
import math
from operator import itemgetter


def triplets(s: bytes):
    """Yield every 3-byte window of ``s`` as a ``memoryview`` slice.

    Surrounding whitespace is stripped from ``s`` and a single space is added
    on each side before the windows are taken, so the first and last windows
    carry a marker for the string boundary.
    """
    padded = b' ' + s.strip() + b' '
    view = memoryview(padded)
    window_count = len(padded) - 2
    yield from (view[start:start + 3] for start in range(window_count))
        

def all_triplets_from_many_lines(lines):
    """Yield the triplets of every line in ``lines``, one line after another."""
    for current_line in lines:
        for triplet in triplets(current_line):
            yield triplet
        

def load_file(filename):
    """Read ``filename`` in binary mode and map each line to its line index.

    Trailing ``\\r`` and ``\\n`` bytes are removed from every line. The index is
    zero-based; when the same line occurs more than once, the index of its
    last occurrence is kept.
    """
    line_to_index = {}
    with open(filename, 'rb') as file:
        for index, raw_line in enumerate(file):
            line_to_index[raw_line.rstrip(b'\r\n')] = index
    return line_to_index
    
    
def account_triplets(lines):
    """Count the triplets of all ``lines`` and scale the counts into (0, 1].

    Args:
        lines: iterable of byte strings to collect triplets from.

    Returns:
        A ``Counter`` mapping each triplet to its occurrence count divided by
        the count of the most frequent triplet. The Counter is empty when the
        input produces no triplets at all.
    """
    c = Counter(all_triplets_from_many_lines(lines))
    if not c:
        # Nothing to normalize; max() on an empty sequence would raise ValueError.
        return c

    m = max(c.values())
    for key in c:
        c[key] /= m  # Normalize by the maximum value
    return c


def get_score(s: bytes):
    """Score how closely ``s`` resembles the strings the triplet table was built from.

    The score is the sum of the normalized frequencies (looked up in the
    module-level Counter ``c``) of all triplets of ``s``, divided by
    ``log(len(s) + 1)``. Dividing by the logarithm instead of by ``len(s)``
    gives an advantage to longer strings.

    Args:
        s: the byte string to score.

    Returns:
        The score as a number; ``0.0`` for an empty string.
    """
    if not s:
        # log(0 + 1) == 0, so the division below would raise ZeroDivisionError.
        return 0.0
    return sum(c[t] for t in triplets(s)) / math.log(len(s) + 1)

In [2]:
# Earlier string dumps, merged in the order listed. A string that occurs in
# several dumps ends up with the line index from the last dump containing it.
old_dump_paths = [
    '../stringdumps/stringdump_0_47_04.txt',
    '../stringdumps/stringdump_0_47_05.txt',
    '../stringdumps/stringdump_0_47_03.txt',
    '../stringdumps/stringdump_0_47_02.txt',
    '../stringdumps/stringdump_0_47_01.txt',
    '../stringdumps/stringdump_0_44_12.txt',
    '../stringdumps/stringdump_steam_50_01.txt',
    '../stringdumps/stringdump_steam_50_02.txt',
    '../stringdumps/stringdump_steam_50_05.txt',
    '../stringdumps/stringdump_steam_50_06.txt',
    '../stringdumps/stringdump_steam_50_08.txt',
    '../stringdumps/stringdump_steam_50_09.txt',
    '../stringdumps/stringdump_steam_50_10.txt',
    '../stringdumps/stringdump_steam_50_11.txt',
    '../stringdumps/stringdump_steam_50_12.txt',
    '../stringdumps/stringdump_steam_50_13.txt',
]

old_file = {}
for dump_path in old_dump_paths:
    old_file.update(load_file(dump_path))

c = account_triplets(old_file)  # Train the triplet statistics on the old files

In [3]:
# Load the dump whose newly added strings are going to be scored and filtered.
new_dump_path = '../dfint64_patch/stringdump.txt'
new_file = load_file(new_dump_path)

In [4]:
# Strings that appear only in the new dump, ordered by ascending score.
added_strings = new_file.keys() - old_file.keys()
diff = sorted(added_strings, key=get_score)

In [5]:
# Print each newly added string next to its score, lowest score first.
for candidate in diff:
    candidate_score = get_score(candidate)
    print(candidate, candidate_score)

b' +C' 0.0
b'SR' 0.0
b' t7' 0.0
b' |a' 0.0
b'1w2' 0.0
b' eG' 0.0
b' B;' 0.0
b'h' 0.0
b'yC,' 0.0
b'\x7fB' 0.0
b'%Y-%m-%d-%H-%M-%S' 0.0
b'`#F' 0.0
b"'dN" 0.0
b'AEu' 0.0
b"0'B" 0.0
b'ww' 0.0
b'pkK' 0.0
b"'dF" 0.0
b'%u> ' 0.0
b'p+Y' 0.0
b'-F' 0.0
b'JT7' 0.0
b'k' 0.0
b' eE' 0.0
b'}C' 0.0
b'%f' 0.0
b'S/h' 0.0
b'P]g' 0.0
b'`Mg' 0.0
b'cg' 0.0
b"'4S" 0.0
b'r+b' 0.0
b'+0x%llX' 0.0
b')Bu' 0.0
b'nC' 0.0
b'rH' 0.0
b'MSk ' 0.00010322893081236284
b"'tE" 0.00011984507732920448
b'Wd0' 0.00011984507732920448
b"'tM" 0.00011984507732920448
b' v ' 0.00011984507732920448
b"'tU" 0.00011984507732920448
b"'tW" 0.00011984507732920448
b"'t<" 0.00011984507732920448
b'"f' 0.00015122765021213447
b'N4' 0.00030245530042426893
b'Rm2' 0.00035953523198761344
b'UTF-8' 0.00037089923678434777
b' F' 0.0007561382510606723
b' N' 0.0007561382510606723
b'Oh ' 0.0008389155413044314
b'tEB' 0.0008389155413044314
b'pzy' 0.0008389155413044314
b'df9' 0.0008389155413044314
b'df3' 0.0008389155413044314
b'L' 0.0009587606186336359
b'J' 0

b'not removing the ' 1.851223011986709
b'Once a beautiful tree, and now?  It is a rude bauble, fit only for your kind.' 1.8522268100800963
b'Start a performance.  This can take several turns to complete.' 1.8545543147578252
b'Interact with this item in a different way (opens menu.)' 1.8578494632927014
b'removing the ' 1.8587947935832363
b'adding the ' 1.8655243099289303
b'attempted to play a wing sound too quickly.' 1.8968226301269624
b'starting medium walla for the ' 1.9261538783301457
b'View your companions and tactical settings, or switch party members.' 1.9261852981484955
b'Added fade point at beginning of track to 0% volume.' 1.9374665288592277
b'starting high walla for the ' 1.9491059926176464
b'no more playing layers, so abandoning moments from song.' 1.9591844653160986
b' ambiance playing, but it existed in the poll.' 1.9785502716233294
b"You truly despise life, don't you?  I am beside myself with grief.  Perhaps we will show you how they suffered." 1.9854126757707697
b'Default

In [6]:
# Score cutoff: newly added strings that score below this value are left out
# of the output dump written in the next cell.
threshold = 0.02

In [7]:
# Write the new dump in its original line order, dropping low-scoring strings.
# Only newly added strings are filtered; strings already present in the old
# dumps are always kept.
# Membership is tested against a set built once: `diff` is a sorted list, and
# `line in diff` on a list would rescan it for every line of the dump.
added_lines = set(diff)

with open('../stringdumps/stringdump_steam_51_01_beta14.txt', 'wb') as output:
    for line, number in sorted(new_file.items(), key=itemgetter(1)):
        if line in added_lines and get_score(line) < threshold:  # Filter out only the newly added strings
            continue

        output.write(line)
        output.write(b'\n')